In [None]:
# Text Sentiment Analysis using DistilBERT Transformer

import torch
from torch.utils.data import DataLoader, random_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import os

# Pick the compute device: CUDA when PyTorch reports one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load IMDB, then keep the first 2000 training and 1000 test examples
# of each split after a shuffle with a fixed seed
dataset = load_dataset("imdb")
shuffled_train = dataset['train'].shuffle(seed=42)
small_train_dataset = shuffled_train.select(range(2000))
shuffled_test = dataset['test'].shuffle(seed=42)
small_test_dataset = shuffled_test.select(range(1000))

# Plot how many sampled training reviews carry each sentiment label
labels = [example['label'] for example in small_train_dataset]
label_names = {0: 'Negative', 1: 'Positive'}
label_df = pd.DataFrame({'label': labels})
# Replace the integer class ids with readable names for the x-axis
label_df['label'] = label_df['label'].map(label_names)
sns.countplot(x='label', data=label_df)
plt.title("Label Distribution in Sampled IMDB Training Set")
plt.show()

# Load the tokenizer and a 2-label sequence-classification model from the
# same pretrained checkpoint, then move the model to the selected device
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
model.to(device)

# Tokenization function
def tokenize(batch):
    """Tokenize the ``'text'`` entry of a batch with the module-level tokenizer.

    Truncation is enabled and padding is set to ``'max_length'`` with
    ``max_length=512``.

    Args:
        batch: Mapping whose ``'text'`` entry holds the text(s) to encode.

    Returns:
        The tokenizer's output for ``batch['text']``.
    """
    texts = batch['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    return encoded

# Run the tokenizer over both sampled splits in batched mode
tokenized_dataset = small_train_dataset.map(tokenize, batched=True)
tokenized_test = small_test_dataset.map(tokenize, batched=True)

# Return only the model inputs and the label, formatted as torch tensors
for split in (tokenized_dataset, tokenized_test):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Split train/validation (80% / 20% of the sampled training set).
# random_split draws from the global RNG unless given a generator, which would
# produce a different validation set on every run. A dedicated generator seeded
# with 42 (the seed used for the dataset shuffles) keeps the partition fixed,
# so per-epoch validation accuracy is comparable across runs.
train_size = int(0.8 * len(tokenized_dataset))
val_size = len(tokenized_dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
tokenized_train, tokenized_val = random_split(
    tokenized_dataset,
    [train_size, val_size],
    generator=split_generator,
)

# DataLoaders: one per split, all with the same batch size; only the
# training loader shuffles its data
batch_size = 16
train_loader = DataLoader(tokenized_train, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(tokenized_val, batch_size=batch_size)
test_loader = DataLoader(tokenized_test, batch_size=batch_size)

# Optimizer: AdamW over every model parameter with learning rate 5e-5
optimizer = AdamW(params=model.parameters(), lr=5e-5)

# Fine-tune for a fixed number of epochs. After each epoch the validation
# split is scored and one row of metrics is appended to `results`.
EPOCHS = 3
results = []
model.train()

for epoch in range(EPOCHS):
    epoch_number = epoch + 1
    total_loss = 0
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # The loss is read from the model output when labels are passed in
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Read the scalar once and reuse it for the running total and the bar
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_description(f"Epoch {epoch_number}")
        loop.set_postfix(loss=batch_loss)

    # Validation: eval mode, no gradient tracking
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit per example
            preds = outputs.logits.argmax(dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    report = classification_report(
        val_labels,
        val_preds,
        target_names=['Negative', 'Positive'],
        output_dict=True,
    )
    accuracy = report['accuracy']
    epoch_metrics = {
        "epoch": epoch_number,
        "loss": total_loss / len(train_loader),  # mean training loss per batch
        "val_accuracy": accuracy,
    }
    results.append(epoch_metrics)
    # Back to training mode for the next epoch
    model.train()

# Write the model's state dict to models/distilbert_imdb.pt,
# creating the models/ directory first if it does not exist
os.makedirs("models", exist_ok=True)
state_dict = model.state_dict()
torch.save(state_dict, "models/distilbert_imdb.pt")

# Write the tokenizer files to models/tokenizer/
tokenizer.save_pretrained("models/tokenizer/")


# Evaluation on test set: gather predictions and true labels batch by batch
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit per example
        preds = outputs.logits.argmax(dim=1)
        batch_preds = preds.cpu().numpy()
        batch_labels = labels.cpu().numpy()
        all_preds.extend(batch_preds)
        all_labels.extend(batch_labels)

# Per-class precision/recall/F1 report
test_report = classification_report(
    all_labels,
    all_preds,
    target_names=['Negative', 'Positive'],
)
print(test_report)

# Overall accuracy, printed to four decimal places
from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(all_labels, all_preds)
print(f"\nTest Set Accuracy: {test_accuracy:.4f}")


# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
class_names = ['Negative', 'Positive']
cm = confusion_matrix(all_labels, all_preds)
ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

# Build the table the plots read from: one row per epoch with the columns
# "epoch", "loss" and "val_accuracy" recorded by the training loop.
# `results_df` must be created here; the plots would otherwise fail with a
# NameError because only the `results` list exists.
results_df = pd.DataFrame(results)

# Plot training loss
plt.figure()
plt.plot(results_df["epoch"], results_df["loss"], marker='o', label='Training Loss')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training Loss per Epoch")
plt.legend()
plt.show()

# Plot validation accuracy
plt.figure()
plt.plot(results_df["epoch"], results_df["val_accuracy"], marker='o', color='green', label='Validation Accuracy')
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Validation Accuracy per Epoch")
plt.legend()
plt.show()


# Predict custom review sentiment
def predict_sentiment(text):
    """Classify a piece of text with the module-level model and tokenizer.

    The model is switched to eval mode, the text is tokenized (truncated to
    at most 512 tokens) and moved to ``device``, and the class with the
    largest logit is taken as the prediction.

    Args:
        text: Review text to classify.

    Returns:
        ``"Positive"`` if the predicted class index is 1, else ``"Negative"``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt",
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        predicted_class = logits.argmax(dim=1).item()
    if predicted_class == 1:
        return "Positive"
    return "Negative"

# Run the classifier on one sample review and print the predicted label
example_review = "This movie was absolutely fantastic, with great performances!"
predicted_sentiment = predict_sentiment(example_review)
print(f"\nTest Review Sentiment: {predicted_sentiment}")
