In [1]:
!pip install transformers datasets -q

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
import numpy as np


In [2]:
# Seed the random number generators before anything stochastic runs (DataLoader
# shuffling, the freshly initialised classifier head, dropout), so that a
# Restart & Run All starts from the same random state each time.
# 42 matches the random_state already used for the train/test split.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:

# Upper bound on tokens per comment; handed to the datasets as max_len.
MAX_LEN = 128
# Name of the pretrained checkpoint whose tokenizer files are loaded.
TOKENIZER_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:

class YouTubeBertDataset(Dataset):
    """Map-style dataset that tokenizes one comment per lookup.

    Args:
        texts: Indexable, sized collection of comments; each entry is passed
            through ``str`` before tokenization.
        labels: Indexable, sized collection of integer class ids, parallel to
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=True,
            truncation=True, padding="max_length", max_length=..., return_tensors="pt")``
            that returns a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors carrying a leading dimension of size 1.
        max_len: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Fail fast: a length mismatch would otherwise only show up as an
        # IndexError deep into an epoch, or silently ignore surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize comment ``idx`` and return it with its label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            outputs with their leading dimension removed) and ``"labels"``
            (a ``torch.long`` tensor built from the stored label).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        enc = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            # squeeze(0) drops the size-1 leading dimension so that the
            # DataLoader's default collation stacks items into (batch, ...).
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Read the raw export and keep it under its own name so later steps never
# overwrite it.
# NOTE(review): on_bad_lines='skip' drops malformed rows without reporting how
# many were lost - confirm that is acceptable for this dataset.
raw_df = pd.read_csv("youtube-comments-sentiment.csv", encoding='utf-8', on_bad_lines='skip', engine='python')

# Sentiment name -> integer class id used as the training target.
label_to_id = {"Negative": 0, "Neutral": 1, "Positive": 2}

# Keep the two columns of interest, drop incomplete rows, retain only the
# three known sentiment values, then attach the integer label column.
comments = raw_df[["CommentText", "Sentiment"]].dropna()
has_known_sentiment = comments["Sentiment"].isin(list(label_to_id))
df = comments[has_known_sentiment].assign(
    label=lambda frame: frame["Sentiment"].map(label_to_id)
)

# 80/20 split, stratified on the label, with a fixed random_state.
all_texts = df["CommentText"].astype(str).tolist()
all_labels = df["label"].tolist()
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

print("Train size:", len(X_train_texts))
print("Test size:", len(X_test_texts))
print("Labels distribution (train):")
print(pd.Series(y_train).value_counts())


Train size: 207516
Test size: 51879
Labels distribution (train):
0    69598
2    69142
1    68776
Name: count, dtype: int64


In [8]:

# Batch sizes for the two loaders.
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 16

# Wrap each split in the tokenizing dataset.
train_dataset = YouTubeBertDataset(
    texts=X_train_texts, labels=y_train, tokenizer=tokenizer, max_len=MAX_LEN
)
test_dataset = YouTubeBertDataset(
    texts=X_test_texts, labels=y_test, tokenizer=tokenizer, max_len=MAX_LEN
)

# Only the training loader shuffles; the test loader keeps the split's order.
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False)

print("Train batches:", len(train_loader))
print("Test batches :", len(test_loader))

Train batches: 6485
Test batches : 3243


In [9]:

# Load the pretrained encoder with a 3-way classification head, then move it
# to the selected device.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model = model.to(device)

# Optimise every model parameter with AdamW at a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:

def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module callable as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes ``.loss`` and ``.logits``.
        loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        device: Device every batch tensor is moved to.

    Returns:
        tuple ``(mean_loss, accuracy)``, both averaged over samples.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    loss_sum, correct, total = 0.0, 0, 0
    for batch in tqdm(loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out.loss
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Weight each batch loss by its size so a short final batch does not
        # count as much as a full one. Assumes out.loss is the mean over the
        # batch (the default for BertForSequenceClassification).
        loss_sum += loss.item() * batch_size
        preds = torch.argmax(out.logits, dim=1)
        correct += (preds == labels).sum().item()
        total += batch_size

    if total == 0:
        # Clearer than the ZeroDivisionError the averages would raise.
        raise ValueError("train_epoch received a loader that yielded no samples")

    return loss_sum / total, correct / total


def eval_epoch(model, loader, device):
    """Score ``model`` on ``loader`` without computing gradients.

    Args:
        model: Module callable as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes ``.loss`` and ``.logits``.
        loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        device: Device every batch tensor is moved to.

    Returns:
        tuple ``(mean_loss, accuracy, labels, predictions)``: the first two are
        averaged over samples; the last two are NumPy arrays in loader order.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    loss_sum, correct, total = 0.0, 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = out.loss
            preds = torch.argmax(out.logits, dim=1)

            batch_size = labels.size(0)
            # Weight each batch loss by its size so a short final batch does
            # not count as much as a full one. Assumes out.loss is the mean
            # over the batch (the default for BertForSequenceClassification).
            loss_sum += loss.item() * batch_size
            correct += (preds == labels).sum().item()
            total += batch_size

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if total == 0:
        # Clearer than the ZeroDivisionError the averages would raise.
        raise ValueError("eval_epoch received a loader that yielded no samples")

    return loss_sum / total, correct / total, np.array(all_labels), np.array(all_preds)

In [11]:

# Number of full passes over the training loader.
EPOCHS = 2
for epoch in range(EPOCHS):
    print(f"\n==== Epoch {epoch+1}/{EPOCHS} ====")
    # One optimisation pass over train_loader.
    train_loss, train_acc = train_epoch(model, train_loader, optimizer, device)
    # NOTE(review): the numbers printed as "Val" below are computed on
    # test_loader, i.e. the held-out test split doubles as the per-epoch
    # monitoring set - confirm this is intended or carve out a validation split.
    # y_true / y_pred are overwritten every epoch, so after the loop they hold
    # the labels and predictions from the final epoch only.
    val_loss, val_acc, y_true, y_pred = eval_epoch(model, test_loader, device)

    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")


==== Epoch 1/2 ====


Training: 100%|██████████| 6485/6485 [1:07:24<00:00,  1.60it/s]
Evaluating: 100%|██████████| 3243/3243 [06:09<00:00,  8.78it/s]


Train Loss: 0.6608, Train Acc: 0.7120
Val Loss: 0.6093, Val Acc: 0.7373

==== Epoch 2/2 ====


Training: 100%|██████████| 6485/6485 [1:07:30<00:00,  1.60it/s]
Evaluating: 100%|██████████| 3243/3243 [06:09<00:00,  8.79it/s]

Train Loss: 0.5182, Train Acc: 0.7820
Val Loss: 0.6206, Val Acc: 0.7434





In [12]:

# Compute every metric first, then print them in a fixed order.
# y_true / y_pred are whatever the last evaluation pass of the training loop
# left behind.
class_names = ["Negative", "Neutral", "Positive"]
final_accuracy = accuracy_score(y_true, y_pred)
final_report = classification_report(y_true, y_pred, target_names=class_names)
final_confusion = confusion_matrix(y_true, y_pred)

print("\n=== FINAL RESULTS (BERT) ===")
print("Test Accuracy:", final_accuracy)
print("\nClassification Report:")
print(final_report)
print("Confusion Matrix:")
print(final_confusion)


=== FINAL RESULTS (BERT) ===
Test Accuracy: 0.7434414695734305

Classification Report:
              precision    recall  f1-score   support

    Negative       0.77      0.74      0.75     17400
     Neutral       0.69      0.70      0.70     17194
    Positive       0.77      0.78      0.78     17285

    accuracy                           0.74     51879
   macro avg       0.74      0.74      0.74     51879
weighted avg       0.74      0.74      0.74     51879

Confusion Matrix:
[[12939  3121  1340]
 [ 2495 12084  2615]
 [ 1452  2287 13546]]
