In [None]:
# Cell 1 — Fix PyTorch Windows installation (shm.dll error)
# This fixes the OSError: [WinError 126] shm.dll issue on Windows
# Uninstall existing PyTorch
!pip uninstall -y torch torchvision torchaudio

# Reinstall PyTorch (CPU version for Windows compatibility)
# NOTE: If you have CUDA GPU and want GPU support, change 'cpu' to 'cu118' or 'cu121' below
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu




In [None]:
# Cell 1b — Install other required packages
!pip install -q transformers datasets evaluate accelerate sentencepiece


In [None]:
pip install torch --index-url https://download.pytorch.org/whl/cpu


In [None]:
# Print the path of the Python interpreter backing this kernel, so it can be
# compared with the environment the pip cells above installed into.
import sys
print(sys.executable)


In [None]:
# Cell 2 — imports
import re
import json
import numpy as np
import pandas as pd
import torch

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, #turns text into tokens.
    AutoModelForSequenceClassification, #loads a pretrained model for classification tasks.
    TrainingArguments, #holds training hyperparameters.
    Trainer
)

import evaluate


In [None]:
# Cell 3 — configuration
# Input CSV locations.
# NOTE(review): "/content/..." is a Colab-style path while Cell 1 targets Windows;
# adjust these when running locally.
TRAIN_CSV = "/content/train.csv"   # upload to Colab
TEST_CSV  = "/content/test.csv"    # NOTE(review): read unconditionally by the load-data cell

# Column names expected in both CSV files.
TEXT_COL = "tweets"   # raw text column
LABEL_COL = "class"   # target label column

MODEL_NAME = "microsoft/deberta-v3-base"   # checkpoint id passed to from_pretrained
MAX_LENGTH = 128  # max_length passed to the tokenizer (truncation and padding)
BATCH_SIZE = 16   # per-device batch size for both training and evaluation
NUM_EPOCHS = 3    # number of training epochs
LR = 2e-5         # learning rate
OUTPUT_DIR = "deberta_v3_output"  # directory for checkpoints and the saved model

# Use CUDA when torch reports it as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


In [None]:
# Cell 4 — load data
# Read both splits in one pass (train first, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Report the (rows, columns) shape of each split.
for caption, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(caption, frame.shape)

# Preview the first six training rows with the notebook's rich display.
print("Sample rows:")
display(train_df.head(6))


In [None]:
# Cell 5 — robust cleaning
def clean_text(t):
    """Normalize a raw tweet into lowercase ASCII text.

    Steps, in order: lowercase; drop URLs, @mentions and #hashtags; replace
    every run of characters outside ``a-z 0-9 . , ! ? '`` with one space;
    squeeze any non-digit character repeated three or more times down to two;
    collapse repeated ``!``, ``?`` or ``.`` to a single mark; collapse
    whitespace and strip the ends.

    Args:
        t: Any value; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string (may be empty).
    """
    t = str(t).lower()
    t = re.sub(r"http\S+|www\S+", " ", t)   # remove URLs
    t = re.sub(r"@\w+", " ", t)             # remove mentions
    t = re.sub(r"#\w+", " ", t)             # remove hashtags
    t = re.sub(r"[^\x00-\x7f]", " ", t)     # remove non-ascii (emojis)
    t = re.sub(r"[^a-z0-9.,!?']+", " ", t)  # keep letters, numbers, basic punctuation
    # Normalize elongation ("sooooo" -> "soo"). Digits are excluded on purpose:
    # squeezing them would silently change numbers ("1000" -> "100").
    t = re.sub(r"([^0-9])\1{2,}", r"\1\1", t)
    t = re.sub(r"([!?.])\1+", r"\1", t)     # repeated punctuation -> single
    t = re.sub(r"\s+", " ", t).strip()
    return t

# apply cleaning
# map(..., na_action="ignore") skips missing tweets and leaves them as NaN.
# Casting with astype(str) first would turn them into the literal text "nan",
# which the later dropna(subset=["clean", ...]) could never remove.
train_df["clean"] = train_df[TEXT_COL].map(clean_text, na_action="ignore")
test_df["clean"]  = test_df[TEXT_COL].map(clean_text, na_action="ignore")

# quick sanity
print("Example cleaned:", train_df["clean"].iloc[0])


In [None]:
# Cell 6 — labels
# Drop rows lacking either the cleaned text or the label, then renumber the index.
required_cols = ["clean", LABEL_COL]
train_df = train_df.dropna(subset=required_cols).reset_index(drop=True)
test_df = test_df.dropna(subset=required_cols).reset_index(drop=True)

# Sorted distinct training labels define the class order used everywhere below.
label_names = sorted(train_df[LABEL_COL].unique())
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

print("Label mapping:", label2id)
print("\nClass counts (train):")
# Number of training samples per class.
print(train_df[LABEL_COL].value_counts())


In [None]:
# Cell 7 — HF dataset
# Convert each label to its numeric id via label2id.
train_df["label"] = train_df[LABEL_COL].map(label2id)
test_df["label"]  = test_df[LABEL_COL].map(label2id)

# A label that is missing from label2id maps to NaN, which would turn the whole
# column into floats and surface later as an obscure training/eval error.
# Fail here with the offending values instead, then store clean integer ids.
for split_name, frame in (("train", train_df), ("test", test_df)):
    unmapped = frame.loc[frame["label"].isna(), LABEL_COL].unique().tolist()
    if unmapped:
        raise ValueError(
            f"{split_name} split contains labels missing from label2id: {unmapped}"
        )
    frame["label"] = frame["label"].astype("int64")

# Keep only the model inputs; preserve_index=False prevents the pandas index
# from being carried into the dataset as an extra column.
hf_train = Dataset.from_pandas(train_df[["clean", "label"]], preserve_index=False)
hf_test  = Dataset.from_pandas(test_df[["clean", "label"]], preserve_index=False)

# Bundle both splits under the keys "train" and "test".
dataset = DatasetDict({"train": hf_train, "test": hf_test})
dataset


In [None]:
# Cell 8 — tokenizer + tokenization
# Tokenizer matching the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_fn(batch):
    """Tokenize the "clean" texts of a batch, truncated/padded to MAX_LENGTH."""
    cleaned_texts = batch["clean"]
    return tokenizer(
        cleaned_texts,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
    )


# Tokenize every split in batches and drop the raw text column afterwards.
encoded = dataset.map(tokenize_fn, batched=True, remove_columns=["clean"])
# Have the dataset return torch tensors.
encoded.set_format(type="torch")
print(encoded)


In [None]:
# Cell 9 — model
# One output class per entry in label2id.
num_labels = len(label2id)
# Load the pretrained checkpoint named by MODEL_NAME for sequence classification.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels, # number of classes for the classification head
    id2label=id2label, # index -> label mapping handed to the model
    label2id=label2id  # label -> index mapping handed to the model
)
# Move the model to the device chosen in the configuration cell. As the last
# expression of the cell, the returned model is also echoed in the output.
model.to(device)


In [None]:
# Training hyperparameters and checkpoint policy handed to the Trainer.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR, # where model checkpoints are written
    eval_strategy="epoch",     # run evaluation after every epoch
    save_strategy="epoch",     # save a checkpoint after every epoch (same cadence as evaluation)
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE, # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE, # batch size per device during evaluation
    num_train_epochs=NUM_EPOCHS, # total number of training epochs
    weight_decay=0.01, # weight-decay regularization
    logging_steps=100, # log training loss every 100 steps
    load_best_model_at_end=True, # reload the best checkpoint (by the metric below) when training ends
    metric_for_best_model="f1_macro", # must match the "f1_macro" key returned by compute_metrics
    greater_is_better=True, 
    report_to="none", # no external experiment trackers
    push_to_hub=False, # do NOT upload the model to the HuggingFace Hub
)


In [None]:
# Cell 11 — metrics
# Metric objects from the `evaluate` library, loaded once and reused per evaluation.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of the logits along axis 1.

    Returns:
        dict with float entries ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)

    accuracy_result = accuracy.compute(predictions=predicted_ids, references=labels)
    f1_result = f1.compute(predictions=predicted_ids, references=labels, average="macro")

    scores = {}
    scores["accuracy"] = float(accuracy_result["accuracy"])
    scores["f1_macro"] = float(f1_result["f1"])
    return scores


In [None]:
# Cell 12 — trainer + train
import inspect

# Newer transformers releases deprecate Trainer's `tokenizer=` argument in favour
# of `processing_class=`, while older releases only know `tokenizer=`. The
# installed version is not pinned, so pick whichever keyword its signature accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# NOTE(review): the "test" split doubles as the per-epoch evaluation set, and
# the best checkpoint is selected on it; a separate validation split would keep
# the final test metrics unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded["train"],
    eval_dataset=encoded["test"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Run fine-tuning and show the summary object returned by train().
train_result = trainer.train()
print("Train result:", train_result)


In [None]:
# Cell 13 — evaluate and confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Aggregate metrics on the test split.
metrics = trainer.evaluate(encoded["test"])
print("Eval metrics:", metrics)

# get predictions for confusion matrix
pred_out = trainer.predict(encoded["test"])
preds = np.argmax(pred_out.predictions, axis=1)
labels = pred_out.label_ids

# Pin the full set of class ids. Without it, a class that is absent from both
# the test labels and the predictions shrinks the report/matrix, so the
# target names and tick labels no longer line up with the rows and columns.
class_ids = list(range(len(label_names)))
class_names = [str(name) for name in label_names]

print("\nClassification report:")
print(classification_report(labels, preds, labels=class_ids, target_names=class_names))

# Rows are true classes, columns are predicted classes, both in label_names order.
cm = confusion_matrix(labels, preds, labels=class_ids)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names, cmap='Blues')
plt.xlabel('Predicted'); plt.ylabel('True'); plt.show()


In [None]:
# Cell 14 — save everything
# Persist the model (via the trainer) and the tokenizer side by side in OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Store the label -> id mapping next to the model files.
label_map_path = f"{OUTPUT_DIR}/label2id.json"
with open(label_map_path, "w") as label_map_file:
    json.dump(label2id, label_map_file)

print("Saved model to", OUTPUT_DIR)


In [None]:
# Cell 15 — GPU-safe predict helper
def predict_text_deberta(texts, topk=1):
    """Classify one text or a list of texts with the fine-tuned model.

    Inputs go through ``clean_text`` first, so inference sees the same
    normalization as the "clean" column the model was trained on.

    Args:
        texts: A single string, or an iterable of strings.
        topk: Number of highest-probability labels to return per text.

    Returns:
        * str input, ``topk == 1``: one ``(label, probability)`` tuple.
        * str input, other ``topk``: list of ``(label, probability)`` tuples,
          highest probability first.
        * iterable input: one such list per text (``[]`` for empty input).
    """
    single = isinstance(texts, str)
    texts = [texts] if single else list(texts)
    if not texts:
        # Nothing to tokenize; avoid calling the tokenizer with an empty batch.
        return []

    cleaned = [clean_text(text) for text in texts]
    enc = tokenizer(cleaned, truncation=True, padding="max_length", max_length=MAX_LENGTH, return_tensors="pt")

    # Follow the model's actual device: the Trainer may have moved the model
    # somewhere other than the `device` picked in the configuration cell.
    model_device = next(model.parameters()).device
    enc = {key: tensor.to(model_device) for key, tensor in enc.items()}

    model.eval()
    with torch.no_grad():
        out = model(**enc)
        probs = torch.nn.functional.softmax(out.logits, dim=-1).cpu().numpy()

    # Sort class indices by descending probability and keep the first topk.
    top_preds = np.argsort(-probs, axis=1)[:, :topk]
    results = [
        [(label_names[idx], float(probs[i, idx])) for idx in top_preds[i]]
        for i in range(len(cleaned))
    ]

    if not single:
        return results
    # Keep the historical tuple shape for the default top-1 case, but return
    # every requested candidate when more than one was asked for.
    return results[0][0] if topk == 1 else results[0]

# Example: classify one sample sentence and print the top prediction.
sample_tweet = "Yeah sure, because failing the exam was the highlight of my week"
print(predict_text_deberta(sample_tweet))


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_ENDPOINT"] = "https://huggingface.co"
import os
# disable logging to external trackers (WandB, mlflow, etc.)
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
!pip uninstall -y wandb
report_to="none"
