In [None]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import BertTokenizer
import warnings
warnings.filterwarnings('ignore')

from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification
from transformers import AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the raw tweet export from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='twitter_training.csv')
print(df.head(n=5))

In [None]:
# Discard the first two columns and name the two that remain.
df = df.drop(columns=df.columns[[0, 1]])
df.columns = ['sentiment', 'text']

# Remove every row that has a missing value in either remaining column.
df = df.dropna()
print(df.head())

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_text(text):
    """Encode a single text with the module-level BERT tokenizer.

    The encoding is truncated or padded to a fixed length of 128 tokens and
    requested as PyTorch tensors.

    Args:
        text: The text to encode.

    Returns:
        The object returned by ``tokenizer.encode_plus`` for ``text``.
    """
    encode_options = dict(
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    return tokenizer.encode_plus(text, **encode_options)


# Tokenize every row up front; each cell of `tokens` holds one encoding.
df['tokens'] = df['text'].apply(tokenize_text)

# Sentiment name -> integer class id. Sentiments missing from this mapping
# (e.g. 'Irrelevant') map to NaN in `label`.
label_map = {'Positive': 1, 'Negative': 0, 'Neutral': 2}
df['label'] = df['sentiment'].map(label_map)

In [None]:
# Keep only rows whose sentiment is not 'Irrelevant'.
# .copy() gives an independent frame rather than a slice of the original, so
# later column assignments on `df` are unambiguous and do not raise
# SettingWithCopyWarning (which this notebook silences globally).
df = df[df['sentiment'] != 'Irrelevant'].copy()
print(df['sentiment'].unique())

In [None]:
# Labels must be integers before they can become a long tensor.
df['label'] = df['label'].astype(int)

# Each stored encoding carries a leading dimension that is squeezed away
# before the per-row tensors are stacked into one tensor per field.
input_ids = torch.stack(
    [encoding['input_ids'].squeeze(0) for encoding in df['tokens']]
)
attention_masks = torch.stack(
    [encoding['attention_mask'].squeeze(0) for encoding in df['tokens']]
)
labels = torch.tensor(df['label'].to_numpy(), dtype=torch.long)

In [None]:
class Sentiment_dataset(Dataset):
    """Map-style dataset that serves pre-tokenized examples by index.

    Args:
        input_ids: Indexable container of token ids, one entry per example.
        attention_masks: Indexable container of attention masks, parallel to
            ``input_ids``.
        labels: Indexable container of labels, parallel to ``input_ids``.
    """

    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids, self.attention_masks, self.labels = (
            input_ids,
            attention_masks,
            labels,
        )

    def __len__(self):
        # The dataset size is taken from the token-id container.
        n_examples = len(self.input_ids)
        return n_examples

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict keyed by field name."""
        sample = dict(
            input_ids=self.input_ids[idx],
            attention_masks=self.attention_masks[idx],
            labels=self.labels[idx],
        )
        return sample

In [None]:
# Hold out 20% of the examples for validation. All three tensors are split in
# one call so their rows stay aligned; the fixed seed makes the split repeatable.
(
    train_input_ids, val_input_ids,
    train_attention_masks, val_attention_masks,
    train_labels, val_labels,
) = train_test_split(
    input_ids,
    attention_masks,
    labels,
    test_size=0.2,
    random_state=42,
)
print(f"Train size: {len(train_input_ids)}, Validation size: {len(val_input_ids)}")

In [None]:
# Wrap each split in the dataset class defined above.
train_dataset = Sentiment_dataset(
    input_ids=train_input_ids,
    attention_masks=train_attention_masks,
    labels=train_labels,
)
val_dataset = Sentiment_dataset(
    input_ids=val_input_ids,
    attention_masks=val_attention_masks,
    labels=val_labels,
)

# Batches of 32; only the training loader shuffles.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [None]:
# Loss applied to the classifier logits in the training loop.
loss_fn = nn.CrossEntropyLoss()

# Pretrained checkpoint loaded with a 3-way sequence-classification head.
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device being used:", device)

# Move the model's parameters onto the selected device.
model.to(device)

In [None]:
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favor of `torch.optim.AdamW`. weight_decay=0.0 is passed explicitly because
# torch defaults to 0.01, whereas the transformers implementation defaulted to
# no weight decay, so the regularization stays as it was.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

In [None]:
num_epochs = 10
save_path = "bert_sentiment_model.pth"

# Fine-tune for `num_epochs` passes over the training loader. After each pass,
# score the model on the validation loader and write the current weights to
# `save_path` (each epoch overwrites the previous file).
for epoch in range(num_epochs):  # fixed: was the undefined name `num_epoch`
    # ---- Training pass ----
    model.train()
    total_train_loss = 0
    correct = 0
    total = 0
    train_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Training]")

    for batch in train_bar:
        # Batch-local names: the notebook-level `input_ids`, `attention_masks`
        # and `labels` tensors must not be overwritten by a single batch.
        batch_input_ids = batch['input_ids'].to(device)
        batch_attention_masks = batch['attention_masks'].to(device)
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
        loss = loss_fn(outputs.logits, batch_labels)
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == batch_labels).sum().item()
        total += batch_labels.size(0)
        train_bar.set_postfix(loss=loss.item(), acc=correct / total)

    avg_train_loss = total_train_loss / len(train_dataloader)
    train_accuracy = correct / total
    print(f"\nTraining Loss: {avg_train_loss:.4f}, \nTraining Accuracy: {train_accuracy:.4f}")

    # ---- Validation pass (no gradient tracking, no weight updates) ----
    model.eval()
    total_val_loss = 0
    correct = 0
    total = 0

    val_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Validation]")

    with torch.no_grad():
        for batch in val_bar:
            batch_input_ids = batch['input_ids'].to(device)
            batch_attention_masks = batch['attention_masks'].to(device)
            batch_labels = batch['labels'].to(device)

            # fixed: the model keyword is `attention_mask`, as in the training pass
            outputs = model(batch_input_ids, attention_mask=batch_attention_masks)
            loss = loss_fn(outputs.logits, batch_labels)

            # fixed: predictions are computed from this batch's logits; before,
            # the stale predictions of the last training batch were compared.
            predictions = torch.argmax(outputs.logits, dim=1)

            total_val_loss += loss.item()
            correct += (predictions == batch_labels).sum().item()
            total += batch_labels.size(0)

            val_bar.set_postfix(loss=loss.item(), acc=correct / total)

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = correct / total
    print(f"\nValidation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print("-" * 50)

    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

In [None]:
# Class id -> sentiment name. Note this rebinds `label_map`, which earlier in
# the notebook held the opposite direction (sentiment name -> class id).
label_map = dict(enumerate(['Negative', 'Positive', 'Neutral']))

# Sentiment name -> class id, derived from the mapping above.
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

def evaluate_model(model, dataloader, device):
    """Run the model over a dataloader and collect true and predicted labels.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` returning
            an object with a ``logits`` attribute; must provide ``eval()``.
        dataloader: Iterable of batches, each a dict with the keys produced by
            ``Sentiment_dataset``: ``'input_ids'``, ``'attention_masks'`` and
            ``'labels'``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A ``(all_labels, all_preds)`` tuple of flat lists with one entry per
        example, in dataloader order.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            # fixed: the dataset emits 'attention_masks' / 'labels'; the former
            # 'attention_mask' / 'label' lookups raised KeyError.
            attention_mask = batch['attention_masks'].to(device)
            labels = batch['labels'].to(device)  # True labels

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            # Move to CPU before converting so results can be used by sklearn.
            all_preds.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_labels, all_preds

# Get predictions on the validation split. `test_dataloader` is not defined in
# the visible notebook cells, so the held-out `val_dataloader` is evaluated.
true_labels, pred_labels = evaluate_model(model, val_dataloader, device)

# Fix the class order explicitly so matrix rows/columns and tick labels always
# line up, even if a class is absent from the predictions.
class_ids = sorted(label_map)
class_names = [label_map[class_id] for class_id in class_ids]

# Create confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)

# Plot confusion matrix with matplotlib only; seaborn (`sns`) is never imported
# in this notebook.
fig, ax = plt.subplots(figsize=(6, 5))
image = ax.imshow(cm, cmap="Blues")
fig.colorbar(image, ax=ax)

ax.set_xticks(range(len(class_names)))
ax.set_xticklabels(class_names)
ax.set_yticks(range(len(class_names)))
ax.set_yticklabels(class_names)

# Annotate every cell with its count; switch to white text on dark cells.
threshold = cm.max() / 2
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        ax.text(
            col,
            row,
            format(cm[row, col], "d"),
            ha="center",
            va="center",
            color="white" if cm[row, col] > threshold else "black",
        )

ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
plt.show()

# Print classification report for additional metrics
print(classification_report(true_labels, pred_labels, labels=class_ids, target_names=class_names))