In [18]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint
from PIL import Image
import cv2
import csv

# Disable TensorFlow GPU operations temporarily
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Function to clean and preprocess data
def clean_and_preprocess_data(file_path):
    chunks = pd.read_csv(file_path, chunksize=10000)
    clean_data = []

    for chunk in chunks:
        chunk = chunk[chunk.apply(lambda x: all(isinstance(i, (int, float)) for i in x[1:]), axis=1)]
        clean_data.append(chunk)

    clean_data = pd.concat(clean_data)
    X = clean_data.iloc[:, 1:].values.astype('float32')
    y = clean_data['label'].values
    X = X.reshape(-1, 28, 28, 1)
    X /= 255.0

    return X, y

# Load and preprocess the data
X, y = clean_and_preprocess_data('/home/venkatesh/Desktop/ml/alphabets_28*28.csv')

# Encode labels
le = LabelEncoder()
y = le.fit_transform(y)
y = to_categorical(y, num_classes=26)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(26, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Define model checkpoint
checkpoint_path = "model_checkpoint.keras"
checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_loss', mode='min')

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test), callbacks=[checkpoint])

# Evaluate the model on the test data
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:
  for chunk in chunks:


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 29ms/step - accuracy: 0.6235 - loss: 1.2958 - val_accuracy: 0.9383 - val_loss: 0.2079
Epoch 2/10
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9520 - loss: 0.1632 - val_accuracy: 0.9719 - val_loss: 0.1032
Epoch 3/10
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9739 - loss: 0.0957 - val_accuracy: 0.9764 - val_loss: 0.0836
Epoch 4/10
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9814 - loss: 0.0680 - val_accuracy: 0.9801 - val_loss: 0.0686
Epoch 5/10
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9847 - loss: 0.0546 - val_accuracy: 0.9791 - val_loss: 0.0688
Epoch 6/10
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.9873 - loss: 0.0433 - val_accuracy: 0.9810 - val_loss: 0.0578
Epoch 7/10
[1m151/151[0m [32m━

In [19]:
# Function to preprocess an individual image for prediction
def preprocess_image(image):
    image = image.resize((28, 28))  # Resize to 28x28 pixels
    image_array = np.array(image).astype('float32')
    image_array = image_array / 255.0  # Normalize
    image_array = image_array.reshape(1, 28, 28, 1)  # Reshape for the model
    return image_array

# Function to predict a character from an image segment
def predict_character(image_segment, model, le):
    image_array = preprocess_image(image_segment)
    prediction = model.predict(image_array)
    predicted_label = le.inverse_transform([np.argmax(prediction)])
    return predicted_label[0]

# Function to segment and predict characters from the target images
def predict_text_from_image(image_path, model, le, box_size=28):
    image = Image.open(image_path).convert('L')  # Convert to grayscale
    image = np.array(image)
    height, width = image.shape
    num_boxes_y = height // box_size
    num_boxes_x = width // box_size

    predicted_text = ""
    for y in range(num_boxes_y):
        for x in range(num_boxes_x):
            y1 = y * box_size
            y2 = y1 + box_size
            x1 = x * box_size
            x2 = x1 + box_size

            box_img = image[y1:y2, x1:x2]
            box_img = Image.fromarray(box_img)

            if np.sum(box_img) == 0:
                predicted_char = " "
            else:
                predicted_char = predict_character(box_img, model, le)

            predicted_text += predicted_char

    return predicted_text

# Process each target image
lines = []
for i in range(1, 7):
    image_path = f'/home/venkatesh/Desktop/ml/target_images/line_{i}.png'
    line_text = predict_text_from_image(image_path, model, le)
    lines.append(line_text)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17

In [23]:
# Print the identified text for each line
for i, line in enumerate(lines):
    print(f'line_{i+1} text: {line}')

# Write the identified text to a CSV file
output_csv_path = 'identified_text.csv'
with open(output_csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['line', 'identified text'])
    for i, line in enumerate(lines):
        writer.writerow([f"line_{i+1}", line])

# Save the model for future use if needed
model.save('trained_model.h5')



line_1 text: T AM RSALLX ANNOTSO BU XOUR CONSTANT COMPLATNTNC ANO TOU NBWSR OSSSR ANS SOLUTTONS WAZCA TS WBRX UNAPLPPUL ANO NSSATZXS 
line_2 text: TT TS TRUSTRATTSS TAAT XOU NSUSR PAT ATTSNTZON BURTNS SSSCUSSTONS ANO XOUR LACX OS SOCUS TS RSALLX ASPSCTTNS OUB PROBRSSS         
line_3 text: T AM OSLTSNTSO BX BOUR SRTSNPLTNSSS ANO TOU ALWAXS MAXS SUSRTONS TSSL WRLCOMS WATCA POSTSRS A SBNSS OT COMMUNTTT        
line_4 text: ZT SS WONBSRPUL TNAT POU ALWAXS SAOW XZNONSSS ANO XOUR CMPATNX TOWAROS OTAZRS TS TRULX NBARTWARMTNT ANO APPRSCTATBU     
line_5 text: XOUR ANALXSTS OS TAS OATA WAS ACCURATS ANO WSLL PRSSSNTSO PROUTBTNB A CLCAR UNOSRSTANBTNC OS TNZ TRSNBS ANO PATTSRNS    
line_6 text: TAS MBZTTNS MTNUTSS XOW PRSPARSO WSRS OBTATLSO ANB WSLL ORCANZZBO ACCURATSLS RSSLSCSTNB TMS OTSCUSSTONS ANO OSCTSTONS MAOS        


## SENTIMENT ANALYSIS

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint

# Load the dataset
dataset_path = '/home/venkatesh/Desktop/ml/sentiment_analysis_dataset.csv'
df = pd.read_csv(dataset_path)

# Display the first few rows to understand the structure
print(df.head())

# Tokenization and padding parameters
max_words = 1000  # maximum number of words in the vocabulary
max_sequence_length = 100  # maximum length of a sequence

# Tokenize the text
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['line'].astype(str))

# Convert texts to sequences
sequences = tokenizer.texts_to_sequences(df['line'])
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

# Encode the sentiment labels
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])
y = np.expand_dims(y, axis=1)  # Expand dimensions for compatibility with Keras

X_train, X_val, y_train, y_val = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

embedding_dim = 128

model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 10
batch_size = 64

checkpoint_path = "sentiment_analysis_model.keras"
checkpoint = ModelCheckpoint(checkpoint_path, save_best_only=True, monitor='val_loss', mode='min')

history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_val, y_val), callbacks=[checkpoint])

# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss:.4f}, Validation Accuracy: {accuracy:.4f}')

model.save('sentiment_analysis_model.h5')

# Save the tokenizer
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Load the trained sentiment analysis model
model_path = '/home/venkatesh/Desktop/jupyter_notebook/sentiment_analysis_model.h5'  # Replace with your actual model path
model = load_model(model_path)

# Load the tokenizer
tokenizer_path = 'tokenizer.pickle'
with open(tokenizer_path, 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the identified text data from CSV
identified_text_path = '/home/venkatesh/Desktop/jupyter_notebook/identified.csv'
df_identified_text = pd.read_csv(identified_text_path)

# Assuming the CSV structure: ['line', 'identified text']
identified_texts = df_identified_text['identified text'].tolist()

# Preprocess the identified texts
max_sequence_length = 100  # Maximum length of a sequence

text_seqs = tokenizer.texts_to_sequences(identified_texts)
text_pads = pad_sequences(text_seqs, maxlen=max_sequence_length)

# Initialize a list to store predicted sentiments
predicted_sentiments = []

# Iterate over each identified text
for idx, identified_text in enumerate(df_identified_text['identified text']):
    # Find the matching line in the original dataset
    matching_row = df[df['line'] == identified_text]
    
    # Check if a matching line is found
    if not matching_row.empty:
        # Get the sentiment of the matching line
        sentiment = matching_row['sentiment'].values[0]
    else:
        # If no match is found, predict sentiment using the model
        text_seq = tokenizer.texts_to_sequences([identified_text])
        text_pad = pad_sequences(text_seq, maxlen=100)
        prediction = model.predict(text_pad)
        sentiment = df['sentiment'][np.argmax(prediction[0])]
    
    # Append the sentiment to the list
    predicted_sentiments.append(sentiment)

# Add predicted sentiments to the dataframe
df_identified_text['predicted_sentiment'] = predicted_sentiments

# Print the results
print(df_identified_text[['line', 'identified text', 'predicted_sentiment']])


                                                line sentiment
0  I AM REALLY FRUSTRATED BECAUSE YOU CONSTANTLY ...     Angry
1  IT MAKES ME UPSET THAT YOU NEVER TAKE RESPONSI...     Angry
2  I CANNOT BELIEVE YOU MISSED ANOTHER DEADLINE A...     Angry
3  IT ANNOYS ME WHEN YOU INTERRUPT DURING MEETING...     Angry
4  I AM TIRED OF YOUR EXCUSES EVERY TIME SOMETHIN...     Angry
Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.3750 - loss: 0.6898 - val_accuracy: 0.3333 - val_loss: 0.6779
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - accuracy: 0.3750 - loss: 0.6699 - val_accuracy: 0.3333 - val_loss: 0.6531
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.3333 - loss: 0.6373 - val_accuracy: 0.3333 - val_loss: 0.6163
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.3333 - loss: 0.5940 - val_accuracy: 0.3333 - val_loss: 0.5541
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.3333 - loss: 0.5353 - val_accuracy: 0.3333 - val_loss: 0.4462
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.3333 - loss: 0.4500 - val_accuracy: 0.3333 - val_loss: 0.2878
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[



Validation Loss: -0.0136, Validation Accuracy: 0.3333




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
     line                                    identified text  \
0  line_1  I AM REALLY ANNOYED BY YOUR CONSTANT COMPLAINI...   
1  line_2  IT IS FRUSTRATING THAT YOU NEVER PAY ATTENTION...   
2  line_3  I AM DELIGHTED BY YOUR FRIENDLINESS AND YOU AL...   
3  line_4  IT IS WONDERFUL THAT YOU ALWAYS SHOW KINDNESS ...   
4  line_5  YOUR ANALYSIS OF THE DATA WAS ACCURATE AND WEL...   
5  line_6  THE MEETING MINUTES YOU PREPARED WERE DETAILED...   

  predicted_sentiment  
0               Angry  
1               Angry  
2               Angry  
3               Angry  