<a href="https://colab.research.google.com/github/uresha1995/Research-Methodology/blob/main/Assignment_2_22074260_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Sentiment Analysis on the IMDb dataset**

---



In [None]:
!pip install --no-cache-dir transformers==4.38.2 datasets==2.18.0 evaluate==0.4.1

In [None]:
#Import libraries

from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader
import evaluate
import torch
from tqdm import tqdm
import re
import random
import seaborn as sns

In [None]:
#Fetch the IMDb movie-review dataset via Hugging Face datasets
#(50,000 labelled reviews: 1 = positive, 0 = negative) and show its splits

dataset = load_dataset(path="imdb")
print(dataset)

In [None]:
#Print a few random reviews
#To understand the dataset content

NUM_SAMPLES = 5
train_split = dataset["train"]

#Size the index range from the split itself (instead of a hard-coded 25000) so the
#cell keeps working if the split size changes, and draw from a dedicated seeded RNG
#so the printed samples are reproducible without touching the global random state.
sample_indices = random.Random(42).sample(
    range(len(train_split)), min(NUM_SAMPLES, len(train_split))
)
sample_texts = [train_split[i]["text"] for i in sample_indices]

for i, text in enumerate(sample_texts):
    #Show only the first 500 characters of each review to keep the output short
    print(f"--- Sample {i+1} ---\n{text[:500]}\n")

### Preprocessing

In [None]:
#Remove empty and very short reviews
#(keeps only reviews longer than 20 characters after stripping whitespace)

def rem_empty(example, min_chars=20):
    """Tell whether a review is long enough to keep.

    Args:
        example: Mapping with a "text" entry holding the review string.
        min_chars: A review is kept only when its whitespace-stripped length
            is strictly greater than this value. Defaults to 20.

    Returns:
        bool: True to keep the review, False to drop it.
    """
    #A None text is treated as an empty review rather than raising AttributeError
    text = example["text"] or ""
    return len(text.strip()) > min_chars

In [None]:
#Remove HTML break tags and replace with space

def clean_text(example):
    """Replace HTML <br> tags in example["text"] with spaces and trim the result.

    The example mapping is updated in place and also returned.
    """
    without_breaks = re.sub(r"<br\s*/?>", " ", example["text"])
    example["text"] = without_breaks.strip()
    return example

In [None]:
#Drop short reviews from the train and test splits,
#then strip HTML break tags from every split

for split_name in ("train", "test"):
    dataset[split_name] = dataset[split_name].filter(rem_empty)

dataset = dataset.map(clean_text)

### Tokenization using BertTokenizerFast

In [None]:
#Build small subsets for training and testing
#Shuffle with a fixed seed, then keep the first rows so runs are fast

shuffled_train = dataset["train"].shuffle(seed=42)
shuffled_test = dataset["test"].shuffle(seed=42)

train_data = shuffled_train.select(range(2000))
test_data = shuffled_test.select(range(1000))

In [None]:
#Load the BERT tokenizer
#and define the tokenization function that uses it

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenize example["text"], truncating and padding to exactly 128 tokens."""
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["text"], **encode_options)

In [None]:
#Apply tokenization

#The subsets created earlier are named train_data / test_data; tokenize those
#(small_train / small_test are not defined anywhere and raise NameError).
tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_test = test_data.map(tokenize_function, batched=True)

#The formatting and DataLoader cells that follow operate on train_data / test_data,
#so rebind those names to the tokenized datasets; otherwise the input_ids and
#attention_mask columns they request would not exist.
train_data, test_data = tokenized_train, tokenized_test

In [None]:
#Expose the model inputs and the label as PyTorch tensors
#so the datasets work with DataLoader and the training loop

tensor_columns = ["input_ids", "attention_mask", "label"]
for split_dataset in (train_data, test_data):
    split_dataset.set_format(type="torch", columns=tensor_columns)

In [None]:
#Batch the data with PyTorch DataLoaders:
#shuffled batches for training, fixed order for testing

batch_size = 16
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size)

In [None]:
#Choose the compute device (GPU if available, otherwise CPU)
#and load pre-trained BERT with a 2-label classification head onto it

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = model.to(device)


In [None]:
#Optimizer for training the model

#transformers.AdamW is deprecated in favour of the PyTorch implementation, so use
#torch.optim.AdamW. eps and weight_decay are set explicitly to the defaults of the
#transformers implementation (1e-6 and 0.0) so the training hyperparameters stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

### Training Loop

In [None]:
#Fine-tune the BERT model for 2 epochs

epochs = 2
for epoch in range(epochs):
    model.train()  #training mode (the evaluation cell switches the model to eval mode)
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        batch['labels'] = batch.pop('label')  # Ensure label is correctly named
        #Clear gradients before the forward/backward pass so that an interrupted
        #and re-run cell never accumulates stale gradients into the next step
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    #Report the mean loss per batch; the raw sum grows with the number of batches
    #and is not comparable across dataset or batch sizes
    avg_loss = total_loss / max(len(train_loader), 1)
    print(f"Epoch {epoch + 1} - Average Training Loss: {avg_loss:.4f}")

### Evaluation

In [None]:
#Run the fine-tuned model over the test set
#Report test accuracy and the per-class classification report

from sklearn.metrics import accuracy_score, classification_report

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for raw_batch in tqdm(test_loader, desc="Evaluating", leave=False):
        batch = {name: tensor.to(device) for name, tensor in raw_batch.items()}
        #The model expects the target under "labels" rather than "label"
        if 'label' in batch:
            batch['labels'] = batch.pop('label')
        logits = model(**batch).logits
        predictions = logits.argmax(dim=-1)
        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

acc = accuracy_score(all_labels, all_preds)

print(f"Test Accuracy: {acc:.4f}")
print("Classification Report:")
print(classification_report(all_labels, all_preds, target_names=["Negative", "Positive"]))


In [None]:
#Confusion matrix

#f1_score is imported here because the F1 cell below relies on it
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
import matplotlib.pyplot as plt

#Compute the confusion matrix (rows = true labels, columns = predictions)
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1])

#Display the matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negative", "Positive"])

#Plot onto an explicitly created Axes: ConfusionMatrixDisplay.plot() creates its own
#figure when no ax is passed, which leaves a separate plt.figure(figsize=...) empty
#and the requested size unused
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, cmap="Blues", values_format='d')
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
#F1 score

#Compute the F1-score of the test predictions and print it to 4 decimals
f1 = f1_score(y_true=all_labels, y_pred=all_preds)
print(f"F1-score: {f1:.4f}")

### Sample prediction

In [None]:
#Predict sentiment with confidence
def predict_sentiment(text, return_confidence=False):
    """Classify a single piece of text as "Positive" or "Negative".

    Args:
        text: The review text (one string) to classify.
        return_confidence: When True, also return the softmax probability of
            the predicted class. Defaults to False, in which case only the
            label string is returned.

    Returns:
        The label string, or a (label, confidence) tuple when
        return_confidence is True.
    """

    #Tokenize input and move to same device as model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)

    #Inference only: switch off dropout and skip gradient tracking
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    #Get predicted class and confidence
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    label = "Positive" if predicted_class == 1 else "Negative"
    if return_confidence:
        probabilities = torch.softmax(outputs.logits, dim=1)
        return label, probabilities[0, predicted_class].item()
    return label

#Hand-written example sentences covering clearly positive,
#clearly negative and more ambiguous opinions
sample_texts = [
    "The movie was absolutely fantastic!",
    "It was a boring and predictable film.",
    "I don't know how to feel about it.",
    "This is the best performance I've ever seen.",
    "The film tried hard but didn’t deliver much.",
    "Honestly, I expected more from the director.",
    "A masterpiece. Every scene was beautifully crafted",
]

#Classify each sentence and print it next to the predicted label
for text in sample_texts:
    sentiment = predict_sentiment(text)
    print(f"Text: {text}\nPredicted Sentiment: {sentiment}\n")