In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Check device and fix the random seed
# Seeding here, before the model is built and the training loader is created,
# makes the randomly initialised classification head and the shuffle order
# repeatable across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the IMDB reviews dataset (binary sentiment classification).
# The 'train' and 'test' splits of the result are used by the cells below.
dataset = load_dataset(path="imdb")

In [None]:
# Display dataset details
train_split = dataset['train']
print(f"Train Size: {len(train_split)}, Test Size: {len(dataset['test'])}")

# Pick the sample reviews by label rather than by position: positional picks
# such as [0] / [-1] only match their heading if the split happens to be
# ordered that way. Label ids follow the mapping used in this notebook:
# 0 = Negative, 1 = Positive.
train_label_ids = list(train_split['label'])
positive_example = train_split[train_label_ids.index(1)]
negative_example = train_split[train_label_ids.index(0)]

print("Example Positive Review:\n", positive_example['text'][:500])
print("\nExample Negative Review:\n", negative_example['text'][:500])

In [None]:
# Plot label distribution
# Read the label column directly instead of iterating over whole examples,
# which would materialise every review's text just to pick out one integer.
train_labels = list(dataset['train']['label'])
label_df = pd.DataFrame({'label': train_labels})
label_df['label'] = label_df['label'].map({0: 'Negative', 1: 'Positive'})

fig, ax = plt.subplots()
# Fix the category order so the bars do not depend on row order in the split.
sns.countplot(data=label_df, x='label', order=['Negative', 'Positive'], ax=ax)
ax.set_title("Label Distribution in IMDB Training Set")
plt.show()

Model

In [None]:
# Load tokenizer and model
# A single checkpoint name feeds both calls so the tokenizer and the model
# can never be loaded from different checkpoints by accident.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Assign the result so the cell does not echo the full module repr as output.
model = model.to(device)

In [None]:
# Tokenization function
def tokenize(batch):
    """Tokenize the ``text`` field of ``batch`` with the notebook's tokenizer.

    Every sequence is truncated to, and padded up to, 512 tokens.
    """
    encoding_options = dict(truncation=True, padding='max_length', max_length=512)
    return tokenizer(batch['text'], **encoding_options)


# Tokenize every split in batches.
tokenized_dataset = dataset.map(tokenize, batched=True)

# Expose only the model inputs and the label, formatted as torch tensors.
model_columns = ['input_ids', 'attention_mask', 'label']
tokenized_dataset.set_format(type='torch', columns=model_columns)

In [None]:
# Prepare DataLoaders
# Both loaders serve batches of 16; only the training loader shuffles.
train_loader, test_loader = (
    DataLoader(tokenized_dataset[split], batch_size=16, shuffle=(split == 'train'))
    for split in ('train', 'test')
)

In [None]:
# Optimizer
# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated and has been dropped from newer releases. `eps` and
# `weight_decay` are pinned to the values the transformers implementation
# used by default, so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

In [None]:
# Training loop (quick demo: a capped number of batches, not a full epoch)
# Raise MAX_TRAIN_BATCHES, or set it to None, to iterate over the whole
# training loader once.
MAX_TRAIN_BATCHES = 1

model.train()
for step, batch in enumerate(train_loader):
    # Stop once the configured number of batches has been processed.
    if MAX_TRAIN_BATCHES is not None and step >= MAX_TRAIN_BATCHES:
        break

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    # Passing `labels` makes the model return the loss alongside the logits.
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

In [None]:
# Evaluate model (quick demo: a capped number of test batches)
# Set MAX_EVAL_BATCHES to None to score the full test loader.
MAX_EVAL_BATCHES = 1

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for step, batch in enumerate(test_loader):
        # Stop once the configured number of batches has been scored.
        if MAX_EVAL_BATCHES is not None and step >= MAX_EVAL_BATCHES:
            break

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only compared on the host, so they never need to be
        # moved to the device.
        labels = batch['label']

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

# Print metrics
# Pin the label set: the test loader is not shuffled, so a small evaluation
# sample may contain a single class, and the report would otherwise fail
# because two target names are supplied. `zero_division=0` reports 0 instead
# of warning for a class with no samples or no predictions.
print(classification_report(
    all_labels,
    all_preds,
    labels=[0, 1],
    target_names=['Negative', 'Positive'],
    zero_division=0,
))


In [None]:
# Confusion matrix
class_names = ['Negative', 'Positive']

# Pin the label set so the matrix is always 2x2 and lines up with the tick
# labels, even if only one class appears in the evaluated sample.
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()