In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import os
import json
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Data preprocessing

### 1. Data loading

In [None]:
# Load dataset
df = pd.read_csv('review.csv') # make sure the data is downloaded to the same folder as this file

# Print the structure and summary of the DataFrame.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
print("DataFrame Info:")
df.info()

# Display the first 5 rows of the data
print("\nFirst 5 rows:")
print(df.head())

### 2. Data Cleaning

In [None]:
# Report the size of the raw DataFrame before any rows are removed
print("Initial shape:", df.shape)

# A row is only usable when both of these columns hold a value
required_columns = ['review', 'score']

# Remove exact duplicate rows, then rows missing a required value.
# Both operations modify `df` in place.
df.drop_duplicates(inplace=True)
df.dropna(subset=required_columns, inplace=True)

# Define a cleaning function for the review text
def clean_text(text):
    """Normalize a single review for modelling.

    Non-ASCII characters are removed, surrounding whitespace is trimmed and
    the result is lower-cased.

    Args:
        text: The raw review. Values that are not already ``str`` are
            converted with ``str()`` first so a stray numeric cell does not
            raise ``AttributeError``.

    Returns:
        str: The cleaned, lower-case, ASCII-only review text.
    """
    if not isinstance(text, str):
        text = str(text)
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Trim only after the removal above: deleting a leading/trailing non-ASCII
    # run (e.g. an emoji followed by a space) can expose new edge whitespace.
    text = text.strip()
    # Convert text to lowercase
    return text.lower()

# Run every review through clean_text and store the result back in the column
df['review'] = df['review'].map(clean_text)

# Report the DataFrame size now that cleaning is done
print("Shape after cleaning:", df.shape)

# Preview the first rows of the cleaned DataFrame
print(df.head(5))

### 3. Feature engineering

In [None]:
# Create numeric labels from scores
def score_to_label(score):
    """Map a numeric review score to a sentiment class id.

    Args:
        score: Numeric review score.

    Returns:
        int: 0 (Negative) for scores below 3, 1 (Neutral) for a score of
        exactly 3, 2 (Positive) for scores above 3.

    Raises:
        ValueError: If the score cannot be ordered against 3 (e.g. NaN).
    """
    # Compare against the neutral midpoint so that fractional scores such as
    # 2.5 are not swept into the Positive class by a catch-all else branch.
    if score < 3:
        return 0  # Negative
    if score == 3:
        return 1  # Neutral
    if score > 3:
        return 2  # Positive
    # Only reached when every comparison is False, which is the case for NaN.
    raise ValueError(f"Cannot map score {score!r} to a sentiment label")

# Derive the sentiment class id of every row from its score
df['label'] = df['score'].map(score_to_label)

# Keep just the review text and its label, exposing the text under the name 'text'
column_renames = {'review': 'text'}
bert_df = df[['review', 'label']].rename(columns=column_renames)
print("\nData sample for BERT:")
print(bert_df.head(5))

# Tally how many rows fall into each label
label_counts = bert_df['label'].value_counts()
print("Label counts:")
print(label_counts)


In [None]:
# Set the desired sample size for class 2
set_class2_size = 200000

# Undersample class 2 (Positive) to the desired count.
# The requested size is capped at the number of available Positive rows:
# DataFrame.sample(n=...) without replacement raises ValueError when n exceeds
# the population, which would break this cell on smaller datasets.
positive_df = bert_df[bert_df['label'] == 2]
class2_sample_size = min(set_class2_size, len(positive_df))
df2 = positive_df.sample(n=class2_sample_size, random_state=42)

# Keep class 0 (Negative) and class 1 (Neutral) as they are
df0 = bert_df[bert_df['label'] == 0]
df1 = bert_df[bert_df['label'] == 1]

# Combine the datasets and shuffle (fixed seed keeps the order reproducible)
balanced_df = pd.concat([df0, df1, df2]).sample(frac=1, random_state=42)
print("\nBalanced label counts:")
print(balanced_df['label'].value_counts())

# Model Building

### Baseline Model 1 using Logistic Regression and TF-IDF

In [None]:
# Hold out 20% of the balanced data to evaluate the baseline
X_train, X_test, y_train, y_test = train_test_split(
    balanced_df['text'],
    balanced_df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vocabulary is fitted on the training split and then
# reused unchanged to transform the test split
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Fit the logistic-regression classifier on the TF-IDF features
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Score the held-out split and print the per-class metrics
y_pred = clf.predict(X_test_tfidf)
print("\nBaseline Model: Logistic Regression + TF-IDF")
print(classification_report(y_test, y_pred, digits=2))

# Draw the confusion matrix as an annotated heat map
cm_baseline = confusion_matrix(y_test, y_pred)
baseline_class_names = ['Negative (0)', 'Neutral (1)', 'Positive (2)']
tick_marks = np.arange(3)
fig, ax = plt.subplots(figsize=(5, 4))
heatmap = ax.imshow(cm_baseline, interpolation='nearest', cmap=plt.cm.Oranges)
ax.set_title('Baseline Confusion Matrix')
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(tick_marks)
ax.set_xticklabels(baseline_class_names)
ax.set_yticks(tick_marks)
ax.set_yticklabels(baseline_class_names)

# Cells darker than half the maximum get white text so the count stays readable
thresh = cm_baseline.max() / 2.
for (row, col), count in np.ndenumerate(cm_baseline):
    text_color = "white" if count > thresh else "black"
    ax.text(col, row, format(count, 'd'),
            horizontalalignment="center",
            color=text_color)
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
fig.tight_layout()
plt.show()


### Baseline Model 2 using LSTM and Embedding Layers with early stopping

In [None]:
# Hyperparameters
vocab_size = 20000       # vocabulary is capped at this many words
embedding_dim = 100      # size of each embedding vector
max_length = 100         # every review is padded/truncated to this many tokens
padding_type = 'post'    # short reviews are padded at the end
trunc_type = 'post'      # long reviews are cut at the end
oov_tok = "<OOV>"        # stand-in token for out-of-vocabulary words
num_epochs = 5
batch_size = 128

# Pull the review texts and their labels out of the balanced frame
texts = balanced_df['text'].tolist()
labels = balanced_df['label'].tolist()

# 80/20 train/test split with a fixed seed
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Build the word index from the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_texts)

# Encode both splits as integer sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Bring every sequence to the same length using one shared set of padding options
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

# Assemble the network layer by layer: embedding -> LSTM -> dropout -> dense head
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))  # one output per class: Negative, Neutral, Positive

# Labels are integer class ids, hence the sparse categorical cross-entropy loss
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

# Stop once val_loss has not improved for 3 epochs and restore the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Fit the model; validation_split=0.2 reserves part of the training data for validation
history = model.fit(
    train_padded,
    np.array(train_labels),
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stop],
)

# Measure loss and accuracy on the held-out test split
loss, accuracy = model.evaluate(test_padded, np.array(test_labels), verbose=1)
print("Test Accuracy:", accuracy)

In [None]:
# Predict class probabilities for the test split and keep the most likely class
predictions = model.predict(test_padded)
preds = np.argmax(predictions, axis=1)
true_labels = np.array(test_labels)

# Per-class precision / recall / F1
report = classification_report(true_labels, preds, digits=2)
print("\nClassification Report:")
print(report)

# Confusion matrix rendered as an annotated heat map
cm = confusion_matrix(true_labels, preds)
classes = ['Negative (0)', 'Neutral (1)', 'Positive (2)']
tick_marks = np.arange(len(classes))

fig, ax = plt.subplots(figsize=(6, 5))
heatmap = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title('Confusion Matrix', fontsize=14)
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes, fontsize=12)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes, fontsize=12)

# Write each count inside its square; dark squares get white text for contrast
thresh = cm.max() / 2.
for (row, col), count in np.ndenumerate(cm):
    text_color = "white" if count > thresh else "black"
    ax.text(col, row, format(count, 'd'),
            horizontalalignment="center",
            color=text_color,
            fontsize=12)

ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()
plt.show()

## Advanced Model: Fine-tuning BERT for Sentiment Classification

In [None]:
# This cell fine-tunes BERT through the Hugging Face Trainer on PyTorch tensors
# (see set_format('torch', ...) below), so make sure PyTorch is set up with GPU
# acceleration (CUDA) to keep the training time reasonable.
if __name__ == '__main__':
    # Split the data into training and testing sets
    train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    # Run with a subset to decrease run time. The requested sizes are capped at
    # the number of available rows so select() is never asked for indices that
    # do not exist when the balanced data is smaller than the defaults.
    train_subset_size = min(100000, len(train_dataset))  # Change 100K to higher/lower number based on your computing resource
    test_subset_size = min(10000, len(test_dataset))  # Change 10K to higher/lower number based on your computing resource
    train_dataset = train_dataset.shuffle(seed=42).select(range(train_subset_size))
    test_dataset = test_dataset.shuffle(seed=42).select(range(test_subset_size))

    # Load the pre-trained BERT tokenizer and model (3 output classes)
    model_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # Tokenize the datasets. The tokenizer is bound as a default argument so the
    # function carries it along when map() runs it in worker processes.
    def tokenize_function(examples, tokenizer=tokenizer):
        """Tokenize a batch of examples, padding/truncating to the model maximum length."""
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    # Use multiple CPU cores for tokenization
    num_cores = 8  # Check your CPU for the number of cores to maximize your computing resource
    train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=num_cores)
    test_dataset = test_dataset.map(tokenize_function, batched=True, num_proc=num_cores)

    # Set the format for PyTorch to use the necessary columns
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    # Define training arguments, enabling best model saving and early stopping.
    # The checkpoint-inspection and reload cells further down read from the same
    # output directory, so keep the paths in sync when changing them.
    training_args = TrainingArguments(
        output_dir=r"C:\Users\zapor\OneDrive\桌面\IE 7500\Project\bert_results_2", # Define a path of your own to store the model
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir=r"C:\Users\zapor\OneDrive\桌面\IE 7500\Project\bert_logs", # Define a path of your own to store the logs
        logging_steps=10,
        report_to=[],
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss"
    )

    # Initialize the Trainer with an early stopping callback.
    # NOTE(review): the test subset is also the evaluation set that drives early
    # stopping and best-checkpoint selection, so the test metrics reported below
    # are likely optimistic; consider a separate validation split.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model's performance on the test set
    results = trainer.evaluate()
    print("\nEvaluation Results:")
    print(results)


### Reload best model (use this section to reload the model after restarting the kernel)

In [None]:
# Find the best model out of the saved checkpoints
checkpoints = [
    r"C:\Users\zapor\OneDrive\桌面\IE 7500\Project\bert_results_2\checkpoint-6250", #Use the path to your model that has all the checkpoint folders
    r"C:\Users\zapor\OneDrive\桌面\IE 7500\Project\bert_results_2\checkpoint-12500", #Include all checkpoint folders to find best model
    r"C:\Users\zapor\OneDrive\桌面\IE 7500\Project\bert_results_2\checkpoint-18750"
]

# Print the "best_model_checkpoint" entry stored in each checkpoint's
# trainer_state.json. A checkpoint folder whose state file is missing is
# reported and skipped so the remaining checkpoints are still inspected.
best_ckpt = None
for ckpt_path in checkpoints:
    trainer_state_path = os.path.join(ckpt_path, "trainer_state.json")
    if not os.path.isfile(trainer_state_path):
        print(f"{ckpt_path} => trainer_state.json not found, skipping")
        continue
    # Explicit encoding so the read does not depend on the platform default
    with open(trainer_state_path, "r", encoding="utf-8") as f:
        trainer_state = json.load(f)
    best_ckpt = trainer_state.get("best_model_checkpoint", None)
    print(f"{ckpt_path} => best_model_checkpoint: {best_ckpt}")


In [None]:
# Split balanced DataFrame to obtain the test set
# (same test_size and random_state as the split in the training cell)
_, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

test_dataset = Dataset.from_pandas(test_df)

# Select a subset, capped at the number of available rows so select() is never
# asked for indices that do not exist on a smaller dataset
test_subset_size = min(10000, len(test_dataset))
test_dataset = test_dataset.shuffle(seed=42).select(range(test_subset_size))

# Load the pre-trained tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Define the tokenization function and include a tokenizer parameter
def tokenize_function(examples, tokenizer):
    """Tokenize a batch of examples, padding/truncating to the model maximum length."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Tokenize the test dataset using multiple CPU cores; pass the tokenizer via fn_kwargs
num_cores = 8 # Check your CPU for the number of cores to maximize your computing resource
test_dataset = test_dataset.map(tokenize_function, batched=True, num_proc=num_cores, fn_kwargs={'tokenizer': tokenizer})
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Define the training arguments
training_args = TrainingArguments(
    output_dir=r"C:\Users\zapor\OneDrive\桌面\IE 7500\Project\bert_results_2", # Define a new path as output
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir=r"C:\Users\zapor\OneDrive\桌面\IE 7500\Project\bert_logs", # Define a new path as logs
    logging_steps=10,
    report_to=[],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# Define the best checkpoint path from previous training
best_checkpoint = r"C:\Users\zapor\OneDrive\桌面\IE 7500\Project\bert_results_2\checkpoint-12500" #Use the path that was find previously to reload best model

# Reload the trained model from the best checkpoint and move it to the device
# selected by the training arguments. Unlike a hard-coded "cuda", this does not
# fail on a machine without a CUDA GPU.
model = BertForSequenceClassification.from_pretrained(best_checkpoint)
model.to(training_args.device)

# Initialize the Trainer with the reloaded model and the test dataset
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

results = trainer.evaluate()
print("\nEvaluation Results:")
print(results)

In [None]:
# Run the trainer over the test dataset and keep the highest-scoring class per example
predictions_output = trainer.predict(test_dataset)
preds = np.argmax(predictions_output.predictions, axis=1)
true_labels = predictions_output.label_ids

# Per-class precision / recall / F1
report = classification_report(true_labels, preds, digits=2)
print("\nClassification Report:")
print(report)

# Confusion matrix rendered with matplotlib as an annotated heat map
cm = confusion_matrix(true_labels, preds)
classes = ['Negative (0)', 'Neutral (1)', 'Positive (2)']
tick_marks = np.arange(len(classes))

fig, ax = plt.subplots(figsize=(6, 5))
heatmap = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.set_title('Confusion Matrix', fontsize=14)
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(tick_marks)
ax.set_xticklabels(classes, fontsize=12)
ax.set_yticks(tick_marks)
ax.set_yticklabels(classes, fontsize=12)

# Write each count inside its square; dark squares get white text for contrast
thresh = cm.max() / 2.
for (row, col), count in np.ndenumerate(cm):
    text_color = "white" if count > thresh else "black"
    ax.text(col, row, format(count, 'd'),
            horizontalalignment="center",
            color=text_color,
            fontsize=12)

ax.set_ylabel('True Label', fontsize=12)
ax.set_xlabel('Predicted Label', fontsize=12)
fig.tight_layout()
plt.show()