# Sentiment Analysis Experiment: Custom Model vs. RoBERTa (Transfer Learning)

This notebook compares three sentiment analysis models trained from scratch (an LSTM, a CNN, and a hybrid CNN–LSTM) with a transfer learning approach based on RoBERTa. All experiments use the Sentiment140 dataset.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt


## Step 1: Load and Preprocess Sentiment140 Dataset

In [None]:
# Load dataset
# The CSV is read without a header row, so column names are assigned explicitly.
df = pd.read_csv('sentiment140.csv', encoding='latin-1', header=None)
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
# Remap the raw labels onto contiguous class ids: 0 -> 0, 2 -> 1, 4 -> 2.
# NOTE(review): some distributions of the Sentiment140 training file reportedly
# contain only labels 0 and 4 -- confirm that class 1 is actually present.
df['target'] = df['target'].replace({0: 0, 2: 1, 4: 2})

# Split row positions first (70% train / 15% validation / 15% test, same seeds and
# proportions as before) so that the tokenizer vocabulary can be learned from the
# training rows only. Fitting it on the full corpus would leak validation/test
# vocabulary statistics into the features.
row_ids = np.arange(len(df))
train_idx, temp_idx = train_test_split(row_ids, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

# Tokenization and Padding
# Vocabulary is capped via num_words=5000; words outside it map to the '<OOV>' token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'].iloc[train_idx])
X = tokenizer.texts_to_sequences(df['text'])
# Every sequence is padded/truncated to a fixed length of 50 tokens.
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=50)

# Encode target as one-hot vectors over the 3 classes
y = tf.keras.utils.to_categorical(df['target'], num_classes=3)

# Materialize the three partitions from the pre-computed row positions
X_train, y_train = X[train_idx], y[train_idx]
X_val, y_val = X[val_idx], y[val_idx]
X_test, y_test = X[test_idx], y[test_idx]


## Step 2: Custom Model Architectures (LSTM, CNN, Hybrid)

In [None]:
# LSTM classifier: token embeddings -> one LSTM layer -> dropout -> dense head ->
# 3-way softmax. The embedding's vocabulary size (5000) and input length (50) are
# the same values used for the tokenizer's num_words and the padding maxlen.
model_lstm = tf.keras.Sequential()
model_lstm.add(tf.keras.layers.Embedding(5000, 128, input_length=50))
model_lstm.add(tf.keras.layers.LSTM(128))
model_lstm.add(tf.keras.layers.Dropout(0.5))
model_lstm.add(tf.keras.layers.Dense(64, activation='relu'))
model_lstm.add(tf.keras.layers.Dense(3, activation='softmax'))

# Categorical cross-entropy pairs with the one-hot encoded targets.
model_lstm.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# CNN classifier: token embeddings -> 1D convolution (128 filters, width 5) ->
# global max pooling -> dense head with dropout -> 3-way softmax.
model_cnn = tf.keras.Sequential()
model_cnn.add(tf.keras.layers.Embedding(5000, 128, input_length=50))
model_cnn.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
model_cnn.add(tf.keras.layers.GlobalMaxPooling1D())
model_cnn.add(tf.keras.layers.Dense(64, activation='relu'))
model_cnn.add(tf.keras.layers.Dropout(0.5))
model_cnn.add(tf.keras.layers.Dense(3, activation='softmax'))

# Same optimizer, loss and metric as the other custom models.
model_cnn.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Hybrid classifier: token embeddings -> 1D convolution (64 filters, width 5) whose
# output sequence feeds an LSTM (64 units) -> dropout -> 3-way softmax.
model_hybrid = tf.keras.Sequential()
model_hybrid.add(tf.keras.layers.Embedding(5000, 128, input_length=50))
model_hybrid.add(tf.keras.layers.Conv1D(64, 5, activation='relu'))
model_hybrid.add(tf.keras.layers.LSTM(64))
model_hybrid.add(tf.keras.layers.Dropout(0.5))
model_hybrid.add(tf.keras.layers.Dense(3, activation='softmax'))

# Same optimizer, loss and metric as the other custom models.
model_hybrid.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


## Step 3: Model Training with Early Stopping

In [None]:
# Training callbacks for the LSTM model.
# - EarlyStopping halts training once val_loss has not improved for 3 epochs.
#   restore_best_weights=True rolls the in-memory model back to its best epoch;
#   without it, model_lstm would keep the weights of the last (worse) epoch run,
#   and those are the weights the evaluation cell scores.
# - ModelCheckpoint writes the best model (by val_loss) to 'best_model_lstm.h5'.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
    ),
    tf.keras.callbacks.ModelCheckpoint(
        'best_model_lstm.h5',
        monitor='val_loss',
        save_best_only=True,
    ),
]

# Train for at most 10 epochs; `history` keeps the per-epoch training/validation
# metrics returned by fit().
history = model_lstm.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=callbacks
)


## Step 4: Model Evaluation and Visualization

In [None]:
# Predict class probabilities on the held-out test set.
y_pred = model_lstm.predict(X_test)

# Convert one-hot targets and probability rows to class ids before scoring.
# `labels` is passed explicitly so the three target_names always line up with
# class ids 0/1/2: without it, classification_report raises a ValueError when a
# class is absent from both y_true and y_pred (name count != label count).
# zero_division=0 reports 0 for such empty classes instead of emitting warnings.
print(classification_report(
    np.argmax(y_test, axis=1),
    np.argmax(y_pred, axis=1),
    labels=[0, 1, 2],
    target_names=['Negative', 'Neutral', 'Positive'],
    zero_division=0,
))
