In [None]:
# Mount Google Drive at /content/drive so that a later cell can write the
# fine-tuned model under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install git+https://github.com/csebuetnlp/normalizer

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from normalizer import normalize

# Seed torch before the model is built: the sequence-classification head is
# newly initialised on top of the pretrained encoder, so without a fixed seed
# every fresh run starts from different head weights and results drift.
SEED = 42
torch.manual_seed(SEED)

# Number of output classes for the classifier head.
# NOTE(review): presumably the distinct values of the dataset's label column - confirm.
NUM_LABELS = 3

model_name = "csebuetnlp/banglabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=NUM_LABELS)

In [None]:
def _load_split(csv_path):
    """Load one CSV split ready for tokenization.

    Reads ``csv_path``, renames the ``Data``/``Label`` columns to
    ``text``/``label`` and applies ``normalize`` to every entry of ``text``.

    Returns a new DataFrame; nothing is mutated in place, so re-running the
    cell always starts from the file contents.
    """
    frame = pd.read_csv(csv_path).rename(columns={"Data": "text", "Label": "label"})
    frame["text"] = frame["text"].apply(normalize)
    return frame


df_train = _load_split("Train.csv")
df_val = _load_split("Val.csv")
df_test = _load_split("Test.csv")

# Wrap each prepared frame in a datasets.Dataset for the tokenization step.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (df_train, df_val, df_test)
)

print("Data loaded and columns renamed successfully.")

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples share one length.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Apply the tokenizer to each split in batched mode (train, validation, test order).
train_tokenized, val_tokenized, test_tokenized = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

print("Tokenization complete.")

In [None]:
def compute_metrics(pred):
    """Compute classification metrics from a prediction object.

    ``pred.predictions`` is reduced with ``argmax`` over its last axis to get
    the predicted class ids, which are compared against ``pred.label_ids``.

    Returns a dict with ``accuracy`` plus weighted-average ``f1``,
    ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element (support) is not reported.
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics.update(f1=f_score, precision=prec, recall=rec)
    return metrics

# Training configuration: 3 epochs, batch size 8 for both training and
# evaluation, evaluate and checkpoint once per epoch, and reload the checkpoint
# with the best validation accuracy when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir='./logs',

    # Evaluation and saving must use the same schedule for
    # load_best_model_at_end to be able to pick a checkpoint.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",

    # fp16 mixed precision needs a CUDA device; enabling it unconditionally
    # makes this cell fail on a CPU-only runtime. Gate it the same way the
    # inference cell chooses its device.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

In [None]:
# Destination on the mounted Drive for the fine-tuned model and tokenizer files.
SAVE_PATH = "/content/drive/MyDrive/banglabert_sentiment"

# The validation split is used for the evaluations run during training and is
# scored with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    eval_dataset=val_tokenized,
    train_dataset=train_tokenized,
)

print("Starting training...")
trainer.train()

# Score the test split with the trained model and report its metrics.
print("\nEvaluating on Test Set...")
test_results = trainer.predict(test_tokenized)
print("Test Set Results:", test_results.metrics)

# Persist the model first, then the tokenizer, into the same directory so the
# pair can be reloaded together.
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_PATH)

print("Model saved to Google Drive successfully.")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7796,0.748444,0.706445,0.678621,0.698897,0.706445
2,0.6845,0.697592,0.742183,0.734911,0.732378,0.742183
3,0.4956,0.700047,0.742183,0.73774,0.735166,0.742183



Evaluating on Test Set...


Test Set Results: {'test_loss': 0.7463292479515076, 'test_accuracy': 0.7238335435056746, 'test_f1': 0.7173617724609864, 'test_precision': 0.7154085994435214, 'test_recall': 0.7238335435056746, 'test_runtime': 4.1175, 'test_samples_per_second': 385.186, 'test_steps_per_second': 48.33}
Model saved to Google Drive successfully.


In [None]:
from transformers import pipeline
from normalizer import normalize
import torch

# Directory holding the saved fine-tuned model and tokenizer.
MODEL_PATH = "/content/drive/MyDrive/banglabert_sentiment"

# Use GPU index 0 when CUDA is available, otherwise -1 (CPU).
inference_device = 0 if torch.cuda.is_available() else -1

# NOTE(review): newer transformers releases appear to deprecate
# `return_all_scores` in favour of `top_k=None`, which may also change the
# ordering of the returned scores - confirm against downstream use before switching.
pipe = pipeline(
    task="sentiment-analysis",
    model=MODEL_PATH,
    tokenizer=MODEL_PATH,
    device=inference_device,
    return_all_scores=True,
)