In [1]:
# Import neccesary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import json

# For Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# For LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

# For BERT
from datasets import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments

In [2]:
# 1. Data Loading & Preprocessing
# Load a small subset of dataset 
df = pd.read_csv('review.csv').sample(n=5000, random_state=42)

# Remove rows with missing values and duplicates
df.dropna(subset=['review', 'score'], inplace=True)
df.drop_duplicates(inplace=True)

# Cleaning function for text
def clean_text(text):
    text = text.strip()
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.lower()

df['review'] = df['review'].apply(clean_text)

# Map scores to sentiment labels (0: Negative, 1: Neutral, 2: Positive)
def score_to_label(score):
    if score <= 2:
        return 0
    elif score == 3:
        return 1
    else:
        return 2

df['label'] = df['score'].apply(score_to_label)

# For BERT, rename the review column to "text"
bert_df = df[['review', 'label']].rename(columns={'review': 'text'})

# Split the data for all
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print("Data loaded and preprocessed.")

Data loaded and preprocessed.


In [3]:
# Experiment 1: Logistic Regression + TF-IDF
print("\n--- Logistic Regression Experiment ---")
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(train_df['review'])
X_test_tfidf = tfidf.transform(test_df['review'])

# Train a simple Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, train_df['label'])

# Evaluate the model
lr_preds = lr_model.predict(X_test_tfidf)
print("Logistic Regression Classification Report:")
print(classification_report(test_df['label'], lr_preds))


--- Logistic Regression Experiment ---
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.11      0.20        73
           1       0.00      0.00      0.00        43
           2       0.89      1.00      0.94       874

    accuracy                           0.89       990
   macro avg       0.59      0.37      0.38       990
weighted avg       0.85      0.89      0.85       990



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
# Experiment 2: LSTM Model
print("\n--- LSTM Experiment ---")
# Tokenize the text 
vocab_size = 5000
max_length = 100
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['review'])

train_seq = tokenizer.texts_to_sequences(train_df['review'])
test_seq = tokenizer.texts_to_sequences(test_df['review'])
train_pad = pad_sequences(train_seq, maxlen=max_length, padding='post', truncating='post')
test_pad = pad_sequences(test_seq, maxlen=max_length, padding='post', truncating='post')

# Build a simple LSTM model
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_length),
    LSTM(64),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])
lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model 
lstm_model.fit(train_pad, train_df['label'].values, epochs=3, batch_size=32, validation_split=0.2)
loss, acc = lstm_model.evaluate(test_pad, test_df['label'].values, verbose=1)
print("LSTM Test Accuracy:", acc)


--- LSTM Experiment ---
Epoch 1/3
Epoch 2/3
Epoch 3/3
LSTM Test Accuracy: 0.8828282952308655


In [9]:
# Experiment 3: BERT 
print("\n--- BERT Experiment  ---")
small_train_df = train_df.sample(n=1000, random_state=42)
small_test_df = test_df.sample(n=100, random_state=42)

# Create Hugging Face Datasets; rename review to text
train_dataset = Dataset.from_pandas(small_train_df[['review', 'label']].rename(columns={'review': 'text'}))
test_dataset = Dataset.from_pandas(small_test_df[['review', 'label']].rename(columns={'review': 'text'}))

model_name = "distilbert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenization function 
def tokenize_fn(examples):
    return tokenizer(examples["text"], padding="max_length", max_length=64, truncation=True)

train_dataset = train_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)

# Set the format for PyTorch
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Initialize model for sequence classification
from transformers import DistilBertForSequenceClassification
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Define training arguments )
training_args = TrainingArguments(
    output_dir="./distilbert_fast",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_steps=10,
    report_to=[]
)

# Initialize Trainer 
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("BERT (Fast Mode) Evaluation Results:", results)
predictions_output = trainer.predict(test_dataset)
preds = np.argmax(predictions_output.predictions, axis=1)
print("BERT (Fast Mode) Classification Report:")
print(classification_report(predictions_output.label_ids, preds))


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.



--- BERT Experiment  ---


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4815,0.258341


BERT (Fast Mode) Evaluation Results: {'eval_loss': 0.25834140181541443, 'eval_runtime': 6.3216, 'eval_samples_per_second': 15.819, 'eval_steps_per_second': 2.056, 'epoch': 1.0}
BERT (Fast Mode) Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.75      0.71         8
           1       0.00      0.00      0.00         5
           2       0.95      0.99      0.97        87

    accuracy                           0.92       100
   macro avg       0.54      0.58      0.56       100
weighted avg       0.88      0.92      0.90       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
