In [7]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [8]:
# Install required libraries
!pip install -q transformers datasets seaborn

In [9]:
# Mount Google Drive
# Later cells read from and write to paths under /content/drive/MyDrive
# (labels CSV, checkpoints, logs, plots, report), so the mount has to be in
# place before they run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [10]:
# Check GPU availability
# Informational only: prints whether PyTorch reports a usable CUDA device.
# Nothing below branches on this value.
print("GPU Available:", torch.cuda.is_available())

GPU Available: True


In [11]:
# Load the preprocessed dataset
# Only the 'text' and 'sentiment' columns are consumed by the cells below.
# NOTE(review): the file is read from /content, not from the mounted Drive —
# presumably it has to be placed there for each session; confirm.
df_reviews = pd.read_csv("/content/df_reviews.csv")
df_reviews.head()


Unnamed: 0,sentiment,text,tokens,length,processed_text
0,neutral,decide eat aware going take 2 hour beginning e...,"['decide', 'eat', 'aware', 'going', 'take', '2...",277,decide eat aware going take 2 hour beginning e...
1,positive,ive taken lot spin class year nothing compare ...,"['ive', 'taken', 'lot', 'spin', 'class', 'year...",540,ive taken lot spin class year nothing compare ...
2,neutral,family diner buffet eclectic assortment large ...,"['family', 'diner', 'buffet', 'eclectic', 'ass...",260,family diner buffet eclectic assortment large ...
3,positive,wow yummy different delicious favorite lamb cu...,"['wow', 'yummy', 'different', 'delicious', 'fa...",153,wow yummy different delicious favorite lamb cu...
4,positive,cute interior owner gave u tour upcoming patio...,"['cute', 'interior', 'owner', 'gave', 'u', 'to...",380,cute interior owner gave u tour upcoming patio...


In [12]:
# Split the data
# 80/20 train/test split of the review text (features) and sentiment (target),
# with a fixed random_state so the split is repeatable across runs.
# NOTE(review): no `stratify=` is passed, so the class proportions of the two
# splits are not forced to match.
X_train, X_test, y_train, y_test = train_test_split(
    df_reviews['text'],
    df_reviews['sentiment'],
    test_size=0.2,
    random_state=42
)

In [13]:
# Map sentiment labels to numeric values
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}

# Series.map() silently produces NaN for any value that is not a key of the
# dict (including missing values). Such a label would only surface much later
# as an obscure failure inside training, so fail fast and name the offenders.
unknown_labels = (set(y_train.unique()) | set(y_test.unique())) - set(label_map)
if unknown_labels:
    raise ValueError(
        f"Unmapped sentiment labels: {sorted(map(str, unknown_labels))}"
    )

# Plain Python lists of class ids, in the same row order as X_train / X_test.
y_train_numeric = y_train.map(label_map).tolist()
y_test_numeric = y_test.map(label_map).tolist()


In [14]:
import os
# Create the directory if it doesn't exist
# exist_ok=True makes this cell safe to re-run.
os.makedirs("/content/drive/MyDrive/data/processed", exist_ok=True)

# Save y_test_numeric to a CSV file
# Written as a single 'sentiment' column of numeric class ids, without the
# DataFrame index.
pd.DataFrame(y_test_numeric, columns=['sentiment']).to_csv("/content/drive/MyDrive/data/processed/y_test_numeric.csv", index=False)

In [15]:
# Define a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an
            indexable batch of values; ``encodings[key][idx]`` must yield the
            values for example ``idx``. Both tensors and nested lists work.
        labels: Indexable sequence with one label per example.

    Each item is a dict holding one tensor per encoding field plus a
    ``labels`` tensor built from ``labels[idx]``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that is independent of its source."""
        # torch.tensor(existing_tensor) emits a UserWarning on every call and
        # recommends clone().detach(); only non-tensor input needs conversion.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

In [16]:
# Tokenization for RoBERTa
# Both splits are encoded up front with the pretrained 'roberta-base'
# tokenizer. Sequences are truncated to at most 128 tokens, padded, and
# returned as PyTorch tensors (return_tensors="pt").
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
train_encodings_roberta = tokenizer_roberta(
    X_train.tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt"
)
test_encodings_roberta = tokenizer_roberta(
    X_test.tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt"
)

In [17]:
# Tokenization for XLNet
# Same texts and same settings as the RoBERTa cell, but encoded with the
# 'xlnet-base-cased' tokenizer, so each model gets inputs from its own
# vocabulary.
tokenizer_xlnet = XLNetTokenizer.from_pretrained('xlnet-base-cased')
train_encodings_xlnet = tokenizer_xlnet(
    X_train.tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt"
)
test_encodings_xlnet = tokenizer_xlnet(
    X_test.tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt"
)

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [18]:
# Create datasets
# One train/test pair per model. The label lists are shared between the two
# models; only the encodings differ because each model has its own tokenizer.
train_dataset_roberta = CustomDataset(train_encodings_roberta, y_train_numeric)
test_dataset_roberta = CustomDataset(test_encodings_roberta, y_test_numeric)
train_dataset_xlnet = CustomDataset(train_encodings_xlnet, y_train_numeric)
test_dataset_xlnet = CustomDataset(test_encodings_xlnet, y_test_numeric)


In [19]:
# Define models
# Pretrained encoders with a 3-way sequence-classification head; num_labels=3
# matches the three classes in label_map (negative / neutral / positive).
model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
model_xlnet = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=3)


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Define compute_metrics function for more detailed evaluation
def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 use weighted averaging across classes.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # Accuracy takes no averaging mode; the other three share 'weighted'.
    scores = {'accuracy': accuracy_score(y_true, y_hat)}
    weighted_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in weighted_scorers:
        scores[key] = scorer(y_true, y_hat, average='weighted')
    return scores

In [None]:
# Training arguments with early stopping and regularization
# This single TrainingArguments object is passed to both the RoBERTa and the
# XLNet Trainer below.
# NOTE(review): because it is shared, both runs write their checkpoints to the
# same output_dir, and save_total_limit=2 applies to that shared directory —
# confirm that checkpoints from one model cannot be confused with or pruned
# by the other.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/models',
    num_train_epochs=10,  # upper bound; the early-stopping callback may end training sooner
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.05,  # regularization
    logging_dir='/content/drive/MyDrive/logs',
    evaluation_strategy="epoch",  # evaluate once per epoch ...
    save_strategy="epoch",  # ... and checkpoint on the same schedule
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # "best" means lowest evaluation loss
    greater_is_better=False,
    gradient_accumulation_steps=2,  # combined with batch size 16 per device
    logging_steps=10,  # training loss is logged every 10 steps, not once per epoch
    save_total_limit=2,
    report_to="none",
)

In [22]:
# Early stopping callback
# Patience of 5 evaluations on the metric configured in training_args
# (eval_loss, lower is better). The same callback instance is passed to both
# Trainers below.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=5) 


In [23]:
# Train and evaluate using RoBERTa
# NOTE(review): eval_dataset is the test split, so the data that drives early
# stopping and best-checkpoint selection is also the data the final metrics
# are reported on. A separate validation split would keep the test set
# untouched.
trainer_roberta = Trainer(
    model=model_roberta,
    args=training_args,
    train_dataset=train_dataset_roberta,
    eval_dataset=test_dataset_roberta,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

In [24]:
# Fine-tune RoBERTa, then score the test split with the resulting model
# (training_args sets load_best_model_at_end=True).
trainer_roberta.train()
predictions_roberta = trainer_roberta.predict(test_dataset_roberta)
# Arg-max over axis 1 of the returned predictions gives the predicted class id.
y_pred_roberta = np.argmax(predictions_roberta.predictions, axis=1)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2778,0.45638,0.8547,0.833057,0.8547,0.834065
2,0.3209,0.365075,0.8572,0.860256,0.8572,0.858447
3,0.1693,0.39516,0.8611,0.853861,0.8611,0.856225
4,0.2101,0.453184,0.8615,0.859812,0.8615,0.860493
5,0.1729,0.520737,0.8552,0.863618,0.8552,0.857308
6,0.1632,0.609383,0.8589,0.861633,0.8589,0.860224
7,0.1379,0.7712,0.8598,0.862131,0.8598,0.860689


model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

In [25]:
# Train and evaluate using XLNet
# Mirrors the RoBERTa Trainer: same training_args object, same metrics
# function and the same early-stopping callback instance.
# NOTE(review): eval_dataset is the test split here as well, so model
# selection and final reporting use the same data.
trainer_xlnet = Trainer(
    model=model_xlnet,
    args=training_args,
    train_dataset=train_dataset_xlnet,
    eval_dataset=test_dataset_xlnet,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback]
)

In [26]:
# Fine-tune XLNet, then score the test split with the resulting model
# (training_args sets load_best_model_at_end=True).
trainer_xlnet.train()
predictions_xlnet = trainer_xlnet.predict(test_dataset_xlnet)
# Arg-max over axis 1 of the returned predictions gives the predicted class id.
y_pred_xlnet = np.argmax(predictions_xlnet.predictions, axis=1)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3118,0.411099,0.8537,0.827353,0.8537,0.827457
2,0.3221,0.375768,0.8568,0.854428,0.8568,0.855095
3,0.2245,0.451623,0.8546,0.840628,0.8546,0.84428
4,0.1166,0.492462,0.8565,0.858611,0.8565,0.856312
5,0.1614,0.532378,0.8583,0.848452,0.8583,0.851994
6,0.1549,0.763616,0.852,0.853786,0.852,0.85287
7,0.1198,0.890975,0.8496,0.853505,0.8496,0.851453


In [32]:
def _epoch_loss_pairs(log_history):
    """Pair every evaluation loss with the mean training loss logged before it.

    Training-loss entries (key ``"loss"``) are logged far more often than
    evaluation entries (key ``"eval_loss"``), so the two series cannot be
    aligned by position. Instead, the training losses seen since the previous
    evaluation are averaged and attached to the next evaluation entry.

    Returns:
        (training_losses, validation_losses): two lists of equal length, one
        element per evaluation entry. A training value is NaN when no
        training loss was logged in that interval.
    """
    training_losses, validation_losses = [], []
    pending = []
    for entry in log_history:
        if "loss" in entry:
            pending.append(entry["loss"])
        if "eval_loss" in entry:
            mean_loss = sum(pending) / len(pending) if pending else float("nan")
            training_losses.append(mean_loss)
            validation_losses.append(entry["eval_loss"])
            pending = []
    return training_losses, validation_losses


def plot_training_and_validation_loss(trainer, model_name):
    """Save a training-vs-validation loss curve for one trainer.

    Args:
        trainer: Object whose ``state.log_history`` is the list of logged
            metric dicts.
        model_name: Used in the plot title and, lower-cased, in the file name.

    The x axis counts evaluation passes starting at 1; with one evaluation per
    epoch this is the epoch number. The figure is written to
    /content/drive/MyDrive/models/<model_name.lower()>_loss_plot.png and then
    closed.
    """
    # Previously the first len(validation_losses) step-level training losses
    # were plotted against the epoch axis, i.e. only the first few logging
    # steps of epoch 1. Align the series per evaluation instead.
    training_losses, validation_losses = _epoch_loss_pairs(trainer.state.log_history)
    epochs = range(1, len(validation_losses) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(epochs, training_losses, label='Training Loss', color='blue')
    plt.plot(epochs, validation_losses, label='Validation Loss', color='orange')
    plt.title(f'{model_name} Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid()
    plt.savefig(f'/content/drive/MyDrive/models/{model_name.lower()}_loss_plot.png')
    plt.close()

In [33]:
def plot_evaluation_metrics(y_true, y_pred, model_name):
    """Save a bar chart of accuracy, precision, recall and F1 for one model.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        model_name: Used in the plot title and, lower-cased, in the file name.

    Precision, recall and F1 use weighted averaging. The figure is written to
    /content/drive/MyDrive/models/<model_name.lower()>_metrics_plot.png and
    then closed.
    """
    weighted = {'average': 'weighted'}
    rows = [
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("Precision", precision_score(y_true, y_pred, **weighted)),
        ("Recall", recall_score(y_true, y_pred, **weighted)),
        ("F1 Score", f1_score(y_true, y_pred, **weighted)),
    ]
    metric_df = pd.DataFrame(rows, columns=['Metric', 'Score'])
    target_path = f'/content/drive/MyDrive/models/{model_name.lower()}_metrics_plot.png'

    plt.figure(figsize=(10, 6))
    sns.barplot(data=metric_df, x='Metric', y='Score')
    plt.title(f'{model_name} Evaluation Metrics')
    # All four scores live in [0, 1]; fix the axis so charts are comparable.
    plt.ylim(0, 1)
    plt.savefig(target_path)
    plt.close()

In [34]:
def plot_confusion_matrix(y_true, y_pred, model_name):
    """Save a confusion-matrix heatmap for one model.

    Args:
        y_true: True class ids.
        y_pred: Predicted class ids.
        model_name: Used in the plot title and, lower-cased, in the file name.

    Rows are true classes and columns are predicted classes, labelled with the
    names from the module-level ``label_map``. The figure is written to
    /content/drive/MyDrive/models/<model_name.lower()>_confusion_matrix.png
    and then closed.
    """
    class_names = list(label_map.keys())
    class_ids = list(label_map.values())
    # Pin the label set explicitly. Without it, a class that appears in
    # neither y_true nor y_pred is dropped from the matrix, which then no
    # longer lines up with the tick labels below.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(10, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'/content/drive/MyDrive/models/{model_name.lower()}_confusion_matrix.png')
    plt.close()

In [35]:
# Plot losses and metrics
# Each call writes one PNG under /content/drive/MyDrive/models and closes the
# figure, so nothing is rendered inline by this cell.
plot_training_and_validation_loss(trainer_roberta, "RoBERTa")
plot_training_and_validation_loss(trainer_xlnet, "XLNet")
plot_evaluation_metrics(y_test_numeric, y_pred_roberta, "RoBERTa")
plot_evaluation_metrics(y_test_numeric, y_pred_xlnet, "XLNet")
plot_confusion_matrix(y_test_numeric, y_pred_roberta, "RoBERTa")
plot_confusion_matrix(y_test_numeric, y_pred_xlnet, "XLNet")

In [36]:
# Save the trained models
# Each model goes to its own sub-directory of the shared models folder; these
# are the paths quoted in the final report.
trainer_roberta.save_model('/content/drive/MyDrive/models/roberta_model')
trainer_xlnet.save_model('/content/drive/MyDrive/models/xlnet_model')


In [37]:
# Generate a final report
# The epoch line is filled in from the live training configuration instead of
# a hard-coded number, so the report cannot drift from what was actually
# configured (the previous text said 20 while training_args asked for 10 with
# early stopping).
# All metrics are recomputed here from the test labels and each model's
# predicted class ids; precision/recall/F1 use weighted averaging.
report = f"""
Training Report
================

Models:
- RoBERTa
- XLNet

Training Epochs: up to {training_args.num_train_epochs} (early stopping, patience {early_stopping_callback.early_stopping_patience})

Evaluation Metrics:

RoBERTa:
- Accuracy: {accuracy_score(y_test_numeric, y_pred_roberta)}
- Precision: {precision_score(y_test_numeric, y_pred_roberta, average='weighted')}
- Recall: {recall_score(y_test_numeric, y_pred_roberta, average='weighted')}
- F1 Score: {f1_score(y_test_numeric, y_pred_roberta, average='weighted')}

XLNet:
- Accuracy: {accuracy_score(y_test_numeric, y_pred_xlnet)}
- Precision: {precision_score(y_test_numeric, y_pred_xlnet, average='weighted')}
- Recall: {recall_score(y_test_numeric, y_pred_xlnet, average='weighted')}
- F1 Score: {f1_score(y_test_numeric, y_pred_xlnet, average='weighted')}

Confusion Matrices:
- RoBERTa: {confusion_matrix(y_test_numeric, y_pred_roberta)}
- XLNet: {confusion_matrix(y_test_numeric, y_pred_xlnet)}

Models saved at:
- RoBERTa: /content/drive/MyDrive/models/roberta_model
- XLNet: /content/drive/MyDrive/models/xlnet_model
"""

print(report)


Training Report

Models:
- RoBERTa
- XLNet

Training Epochs: 20

Evaluation Metrics:

RoBERTa:
- Accuracy: 0.8572
- Precision: 0.8602561736670985
- Recall: 0.8572
- F1 Score: 0.8584470266223454

XLNet:
- Accuracy: 0.8568
- Precision: 0.8544281172687748
- Recall: 0.8568
- F1 Score: 0.8550951313767481

Confusion Matrices:
- RoBERTa: [[1466  320   91]
 [ 188  552  390]
 [  92  347 6554]]
- XLNet: [[1425  339  113]
 [ 177  508  445]
 [  94  264 6635]]

Models saved at:
- RoBERTa: /content/drive/MyDrive/models/roberta_model
- XLNet: /content/drive/MyDrive/models/xlnet_model



In [38]:
# Save the report to a file
# Writes the exact text printed above; mode 'w' overwrites any report left by
# a previous run.
with open('/content/drive/MyDrive/models/training_report.txt', 'w') as f:
    f.write(report)