<a href="https://colab.research.google.com/github/utsabsarkar12/Deep_Learning/blob/main/Bert_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pandas torch transformers scikit-learn tqdm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import pandas as pd
import torch
import os
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm.auto import tqdm

In [3]:
# Step 1: Load the dataset
try:
    df = pd.read_csv('/content/student_people_mass_uprising_public_sentiments_dataset.csv')
except FileNotFoundError:
    print("Error: The file 'student_people_mass_uprising_public_sentiments_dataset.csv' was not found.")
    exit()

In [4]:
# Step 2: Prepare the data
df = df[['comment', 'label']].dropna()
label_mapping = {'Positive': 2, 'Neutral': 1, 'Negative': 0}
df['label'] = df['label'].map(label_mapping)
df.dropna(inplace=True)

train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [10]:
# Step 3: Load the model and tokenizer
MODEL_NAME = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(102025, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [11]:
# Step 4: Custom dataset class and data loaders
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = SentimentDataset(train_df['comment'].tolist(), train_df['label'].tolist(), tokenizer)
val_dataset = SentimentDataset(val_df['comment'].tolist(), val_df['label'].tolist(), tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [12]:
# Step 5: Conditional logic to train or load the model
MODEL_FILE = "/content/banglabert_sentiment_model.pt"

if os.path.exists(MODEL_FILE):
    print(f"\nFound existing model file '{MODEL_FILE}'. Loading the model state...")
    try:
        model.load_state_dict(torch.load(MODEL_FILE, map_location=device))
        print("Model loaded successfully. Skipping training.")
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Re-training the model from scratch.")
        model_trained = False
    else:
        model_trained = True
else:
    print(f"\nNo model file '{MODEL_FILE}' found. Starting model training from scratch...")
    model_trained = False

if not model_trained:
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * 3
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    def train_epoch(model, data_loader, optimizer, device, scheduler):
        model.train()
        total_loss = 0
        progress_bar = tqdm(data_loader, desc="Training")
        for data in progress_bar:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})
        return total_loss / len(data_loader)

    for epoch in range(3):
        train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
        print(f"Epoch {epoch + 1} complete. Average training loss: {train_loss:.4f}")

    torch.save(model.state_dict(), MODEL_FILE)
    print(f"Model saved to '{MODEL_FILE}'.")


Found existing model file '/content/banglabert_sentiment_model.pt'. Loading the model state...
Model loaded successfully. Skipping training.


In [13]:
# Step 6: Evaluate the trained model
def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted', zero_division=0
    )

    return accuracy, precision, recall, f1

print("\nEvaluating the trained model on the validation set...")
accuracy, precision, recall, f1 = evaluate_model(model, val_loader, device)

print(f"\n--- Evaluation Metrics ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Evaluating the trained model on the validation set...

--- Evaluation Metrics ---
Accuracy: 0.8548
Precision: 0.8563
Recall: 0.8548
F1-Score: 0.8554


In [14]:
# Step 7: Define a function to predict sentiment on new input
def predict_sentiment(text, model, tokenizer, device):
    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_class_index = torch.argmax(probabilities, dim=1).item()
    label_map_inv = {v: k for k, v in label_mapping.items()}
    predicted_sentiment = label_map_inv[predicted_class_index]
    return predicted_sentiment, probabilities[0][predicted_class_index].item()

In [16]:
# Step 8: Get user input and predict sentiment
user_input_text = "রাস্তাই অনেক ভীর, কারণ স্টুডেন্টরা রাস্তা ব্লক করে দিয়েছে "
sentiment, probability = predict_sentiment(user_input_text, model, tokenizer, device)

print(f"\n--- Prediction ---")
print(f"User Input: '{user_input_text}'")
print(f"Predicted Sentiment: {sentiment} (Probability: {probability:.2f})")


--- Prediction ---
User Input: 'রাস্তাই অনেক ভীর, কারণ স্টুডেন্টরা রাস্তা ব্লক করে দিয়েছে '
Predicted Sentiment: Negative (Probability: 0.85)
