**AN LSTM MODEL TO DO SENTIMENT ANALYSIS ON MOVIE REVIEW**

Importing libraries and preprocessing the dataset

In [50]:
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

# reading the csv file using pandas
df = pd.read_csv('Test.csv')
texts = df["text"].astype(str)
labels = df["label"].astype(int)


# Basic cleaning function

def clean_text(text):
    text = text.lower()
    text = re.sub(r"<br\s*/?>", " ", text)     # remove HTML breaks
    text = re.sub(r"http\S+|www\.\S+", " ", text)  # remove URLs
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)    # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()       # normalize spaces
    return text

texts_clean = [clean_text(t) for t in texts]

#building voacbulary
vocab_size = 20000
oov_token = "<OOV>"
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(texts_clean)

#text -> integers
sequences = tokenizer.texts_to_sequences(texts_clean)

# truncate sequences
max_len = 200
X = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")
y = np.array(labels, dtype="int32")

Spliting the dataset for training(about 70%, used for training)

In [51]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train shape:", X_train.shape, y_train.shape)
print("Val shape:  ", X_val.shape, y_val.shape)
print("Test shape: ", X_test.shape, y_test.shape)

Train shape: (3500, 200) (3500,)
Val shape:   (750, 200) (750,)
Test shape:  (750, 200) (750,)


Training the dataset

In [52]:
vocab_size = min(vocab_size, len(tokenizer.word_index) + 1)
embed_dim = 128
lstm_units = 128

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")  # binary sentiment
])

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"]
)

# Train model
history = model.fit(
    X_train, y_train,
    batch_size=64,
    epochs= 25,
    validation_data=(X_val, y_val)
)

Epoch 1/25




[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 809ms/step - accuracy: 0.5123 - loss: 0.6921 - val_accuracy: 0.7200 - val_loss: 0.5876
Epoch 2/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 920ms/step - accuracy: 0.7662 - loss: 0.5142 - val_accuracy: 0.7587 - val_loss: 0.5530
Epoch 3/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 799ms/step - accuracy: 0.9053 - loss: 0.2593 - val_accuracy: 0.7267 - val_loss: 0.6447
Epoch 4/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 798ms/step - accuracy: 0.9497 - loss: 0.1351 - val_accuracy: 0.7533 - val_loss: 0.7309
Epoch 5/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 823ms/step - accuracy: 0.9731 - loss: 0.0785 - val_accuracy: 0.7613 - val_loss: 0.8523
Epoch 6/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 798ms/step - accuracy: 0.9884 - loss: 0.0362 - val_accuracy: 0.7293 - val_loss: 1.0358
Epoch 7/25
[1m55/55[0m [32m━━━

Evalutating the model using test data

In [56]:

#Evaluate on test set
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print(f"Test loss: {loss:.4f}, Test accuracy: {acc:.4f}")

#predict sentiment for test data
iter =  10
count = 0
for i in range(iter):
    x = X_test[i:i+1]
    true_label = y_test[i]

    prob = float(model.predict(x, verbose=0)[0][0])
    pred_label = 1 if prob >= 0.5 else 0
    if pred_label == true_label:
      count += 1

    print(f"True label: {true_label}  Pred prob (positive): {prob:.4f}")
    print("Predicted:", "POSITIVE" if pred_label == 1 else "NEGATIVE")
    print("")

print(f"Accuracy = {(100*count/iter) :.4f} %")


Test loss: 1.6173, Test accuracy: 0.7067
True label: 1  Pred prob (positive): 0.9927
Predicted: POSITIVE

True label: 0  Pred prob (positive): 0.0001
Predicted: NEGATIVE

True label: 1  Pred prob (positive): 0.0048
Predicted: NEGATIVE

True label: 0  Pred prob (positive): 0.9987
Predicted: POSITIVE

True label: 0  Pred prob (positive): 0.0000
Predicted: NEGATIVE

True label: 1  Pred prob (positive): 0.9994
Predicted: POSITIVE

True label: 0  Pred prob (positive): 0.1782
Predicted: NEGATIVE

True label: 0  Pred prob (positive): 0.9984
Predicted: POSITIVE

True label: 0  Pred prob (positive): 0.0000
Predicted: NEGATIVE

True label: 0  Pred prob (positive): 0.0001
Predicted: NEGATIVE

Accuracy = 70.0000 %
