# LSTM Sentiment Model with SASentimentModel Interface

In [None]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import matplotlib.pyplot as plt


In [None]:
# Load and clean dataset
# Read the raw review data from the current working directory.
# NOTE(review): read_csv treats the file's first row as the header by default,
# and the column names are overwritten positionally just below — confirm the
# CSV really has a header row, otherwise the first record is silently lost.
df = pd.read_csv("train_60K.csv")
# Rename the three columns by position.
df.columns = ["sentiment", "short_review", "full_review"]
# Drop rows whose full review text is missing (short_review may still be NaN).
df.dropna(subset=["full_review"], inplace=True)

def clean_text(text):
    """Return a normalised copy of *text* for tokenisation.

    The text is lower-cased, then URLs starting with ``http`` are removed,
    every character that is neither a letter nor whitespace is deleted, and
    runs of whitespace are collapsed to a single space. Leading and trailing
    whitespace is stripped from the result.
    """
    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r"http\S+", ""),
        (r"[^a-zA-Z\s]", ""),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Cast every review to str before cleaning, since clean_text calls str methods.
df["full_review"] = df["full_review"].astype(str).apply(clean_text)
# Remap the raw labels 1 -> 0 and 2 -> 1 so the targets are binary 0/1.
# NOTE(review): presumably 1 = negative and 2 = positive in the raw file — confirm.
df["sentiment"] = df["sentiment"].replace({1: 0, 2: 1})

# Features are the cleaned review texts; targets are the remapped labels.
X = df["full_review"]
y = df["sentiment"]

# First split: 70% train, 30% held out (fixed seed for reproducibility).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
# Second split: halve the held-out part into validation and test (15% of the total each).
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
from abc import ABC, abstractmethod

class SASentimentModel(ABC):
    """Template for sentiment models that share one fixed training pipeline.

    Subclasses implement the individual stages (``register``, ``preprocess``,
    ``fit``, ``predict``, ``evaluate`` and ``summary``); :meth:`run` executes
    them in a fixed order.
    """

    # Dataset attributes that ``run`` reads back from the instance.
    _DATA_ATTRS = ("X_train", "y_train", "X_val", "y_val", "X_test", "y_test")

    def __init__(self, vocab_size=10000, embedding_dim=64, sequence_length=200, batch_size=128, epochs=5):
        """Store the hyperparameters and create an unfitted tokenizer.

        Args:
            vocab_size: Passed to the Keras ``Tokenizer`` as ``num_words``.
            embedding_dim: Stored for subclasses to use when building the model.
            sequence_length: Stored for subclasses to use when padding inputs.
            batch_size: Stored for subclasses to use when training.
            epochs: Stored for subclasses to use when training.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.epochs = epochs

        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=self.vocab_size, oov_token="<OOV>")
        # Both are placeholders until a subclass's ``fit`` populates them.
        self.model = None
        self.history = None

    @abstractmethod
    def register(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Receive the train/validation/test inputs and labels."""

    @abstractmethod
    def preprocess(self):
        """Prepare the registered data for training."""

    @abstractmethod
    def fit(self):
        """Build and train the model."""

    @abstractmethod
    def predict(self):
        """Produce predictions with the trained model."""

    @abstractmethod
    def evaluate(self):
        """Report evaluation results for the trained model."""

    @abstractmethod
    def summary(self):
        """Describe the model."""

    def run(self):
        """Run register, preprocess, fit, summary, predict and evaluate in order.

        The six datasets are read back from the instance, so they must already
        have been stored on it (normally by an earlier ``register`` call).

        Raises:
            AttributeError: If any dataset attribute has not been set yet.
        """
        # Fail early with an actionable message instead of the opaque
        # "object has no attribute" error the attribute reads below would give.
        missing = [name for name in self._DATA_ATTRS if not hasattr(self, name)]
        if missing:
            raise AttributeError(
                f"{type(self).__name__}.run() needs data that has not been registered yet "
                f"(missing: {', '.join(missing)}); call register(...) first."
            )

        self.register(self.X_train, self.y_train, self.X_val, self.y_val, self.X_test, self.y_test)
        self.preprocess()
        self.fit()
        self.summary()
        self.predict()
        self.evaluate()


In [None]:
class LSTMSentimentModel(SASentimentModel):
    """Binary sentiment classifier built from an embedding and a bidirectional LSTM."""

    def register(self, X_train, y_train, X_val, y_val, X_test, y_test):
        """Attach the train, validation and test inputs and labels to the instance."""
        self.X_train = X_train
        self.y_train = y_train
        self.X_val = X_val
        self.y_val = y_val
        self.X_test = X_test
        self.y_test = y_test

    def _to_padded(self, texts):
        """Turn *texts* into integer sequences post-padded to ``sequence_length``."""
        token_ids = self.tokenizer.texts_to_sequences(texts)
        return tf.keras.preprocessing.sequence.pad_sequences(
            token_ids, maxlen=self.sequence_length, padding="post")

    def preprocess(self):
        """Fit the tokenizer on the training texts and encode all three splits."""
        # The vocabulary is built from the training split only.
        self.tokenizer.fit_on_texts(self.X_train)
        self.X_train_seq = self._to_padded(self.X_train)
        self.X_val_seq = self._to_padded(self.X_val)
        self.X_test_seq = self._to_padded(self.X_test)

    def fit(self):
        """Build, compile and train the network; the fit result is kept in ``history``."""
        layers = tf.keras.layers
        # Layers are created in network order: embedding -> BiLSTM -> dropout -> dense head.
        stack = [
            layers.Embedding(self.vocab_size, self.embedding_dim, input_length=self.sequence_length),
            layers.Bidirectional(layers.LSTM(64, return_sequences=False)),
            layers.Dropout(0.5),
            layers.Dense(64, activation="relu"),
            layers.Dense(1, activation="sigmoid"),
        ]
        self.model = tf.keras.Sequential(stack)
        self.model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

        validation_pair = (self.X_val_seq, self.y_val)
        self.history = self.model.fit(
            self.X_train_seq,
            self.y_train,
            validation_data=validation_pair,
            batch_size=self.batch_size,
            epochs=self.epochs,
            verbose=1,
        )

    def predict(self):
        """Predict test labels, store them in ``y_pred`` and print a classification report."""
        scores = self.model.predict(self.X_test_seq)
        # Scores strictly above 0.5 are labelled 1, everything else 0.
        self.y_pred = (scores > 0.5).astype(int).flatten()
        print(classification_report(self.y_test, self.y_pred))

    def evaluate(self):
        """Print test loss, accuracy, F1 score and confusion matrix.

        Reads ``y_pred``, so ``predict`` has to be called first.
        """
        loss, acc = self.model.evaluate(self.X_test_seq, self.y_test)
        print(f"Test Loss: {loss:.4f}, Test Accuracy: {acc:.4f}")
        f1 = f1_score(self.y_test, self.y_pred)
        print(f"F1 Score: {f1:.4f}")
        matrix = confusion_matrix(self.y_test, self.y_pred)
        print("Confusion Matrix:")
        print(matrix)

    def summary(self):
        """Print the Keras summary of the built model."""
        self.model.summary()


In [None]:
# Build the model with the default hyperparameters from SASentimentModel.__init__.
model = LSTMSentimentModel()
# Register the data before run(): run() reads the datasets back from the instance.
model.register(X_train, y_train, X_val, y_val, X_test, y_test)
# Runs register, preprocess, fit, summary, predict and evaluate, in that order.
model.run()

In [None]:
# Visualize training history
def plot_history(history):
    """Plot training and validation accuracy on one figure and show it.

    *history* must expose a ``history`` mapping containing the
    ``'accuracy'`` and ``'val_accuracy'`` series.
    """
    # (metric key, legend label) pairs, drawn in this order.
    curves = (
        ('accuracy', 'Train Accuracy'),
        ('val_accuracy', 'Validation Accuracy'),
    )
    for metric_key, legend_label in curves:
        plt.plot(history.history[metric_key], label=legend_label)

    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()

# model.history is assigned in fit(); it stays None until training has run.
plot_history(model.history)