In [21]:
# Step 0: Install packages (Colab-safe) + pick model
# Run this cell on a fresh runtime first.
!pip install --quiet "pandas==2.2.2" "numpy==2.0.2"
!pip install --quiet -U transformers datasets accelerate evaluate scikit-learn matplotlib

# Choose model: change to "google/muril-base-cased" if you prefer MuRIL
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"   # or "google/muril-base-cased"
MAX_LENGTH = 128
print("Model selected:", MODEL_NAME)

Model selected: ai4bharat/IndicBERTv2-MLM-only


In [22]:
# Step 1: Upload train/dev/test CSVs
from google.colab import files
import os, shutil

REQUIRED = [
    "tamil_sentiment_full_train.csv",
    "tamil_sentiment_full_dev.csv",
    "tamil_sentiment_full_test.csv"
]

uploaded = files.upload()  # choose your 3 CSVs
uploaded = files.upload()
uploaded = files.upload()
def normalize_expected(basename):
    if os.path.exists(basename): return basename
    root, ext = os.path.splitext(basename)
    candidates = [f for f in os.listdir() if f.startswith(root) and f.endswith(ext)]
    if candidates:
        src = sorted(candidates, key=len, reverse=True)[0]
        if src != basename:
            shutil.move(src, basename)
    return basename

for p in REQUIRED: normalize_expected(p)
for p in REQUIRED: print(p, "exists?", os.path.exists(p))

Saving tamil_sentiment_full_train.csv to tamil_sentiment_full_train (2).csv


Saving tamil_sentiment_full_dev.csv to tamil_sentiment_full_dev (1).csv


Saving tamil_sentiment_full_test.csv to tamil_sentiment_full_test (1).csv
tamil_sentiment_full_train.csv exists? True
tamil_sentiment_full_dev.csv exists? True
tamil_sentiment_full_test.csv exists? True


In [23]:
# Step 2: Load train/dev/test with messy line handling
import pandas as pd

def read_codemix(path):
    rows=[]
    with open(path,"r",encoding="utf-8",errors="replace") as f:
        for raw in f:
            line = raw.rstrip("\n\r")
            if not line: continue
            low = line.lower()
            if low.startswith(("text\t","text,","tweet\t","tweet,")): continue
            pos = line.rfind("\t")
            if pos==-1: pos=line.rfind(",")
            if pos==-1:
                parts=line.split()
                if len(parts)>=2: text=" ".join(parts[:-1]); label=parts[-1]
                else: continue
            else:
                text=line[:pos]; label=line[pos+1:]
            text=text.strip().replace("\u200d","")
            label=label.strip()
            if text and label: rows.append((text,label))
    return pd.DataFrame(rows,columns=["text","label"])

df_train = read_codemix("tamil_sentiment_full_train.csv")
df_dev   = read_codemix("tamil_sentiment_full_dev.csv")
df_test  = read_codemix("tamil_sentiment_full_test.csv")

print("Shapes:", df_train.shape, df_dev.shape, df_test.shape)
display(df_train.head(3))

Shapes: (35219, 2) (4397, 2) (4400, 2)


Unnamed: 0,text,label
0,First like button vijay setupati fans,unknown_state
1,Vetri ne dhanusha pudiche thongitu iru....,Positive
2,Ithu romba naal ku munnadi Short film'a pathat...,Positive


In [24]:
# Step 3: Clean up noisy labels + encode to IDs
import re
from sklearn.preprocessing import LabelEncoder
import numpy as np

def normalize_label_raw(s):
    if not isinstance(s,str): s=str(s)
    s=s.strip().strip('"').strip("'")
    parts=re.split(r"[;|/\\,]",s)
    parts=[p.strip() for p in parts if p and p.strip()!=""]
    if not parts: return s
    for p in reversed(parts):
        if len(p)<=60: return p
    return parts[-1]

for df in (df_train,df_dev,df_test):
    df["label"]=df["label"].astype(str).map(normalize_label_raw)

train_labels=set(df_train["label"].unique())
fallback="unknown_state" if "unknown_state" in train_labels else df_train["label"].mode().iloc[0]
df_dev["label"]=df_dev["label"].apply(lambda x: x if x in train_labels else fallback)
df_test["label"]=df_test["label"].apply(lambda x: x if x in train_labels else fallback)

le=LabelEncoder()
le.fit(df_train["label"].astype(str))
df_train["label_id"]=le.transform(df_train["label"].astype(str))
df_dev["label_id"]=le.transform(df_dev["label"].astype(str))
df_test["label_id"]=le.transform(df_test["label"].astype(str))

id2label={i:cls for i,cls in enumerate(le.classes_)}
label2id={v:k for k,v in id2label.items()}
num_labels=len(id2label)

print("num_labels:",num_labels)
print("id2label:",id2label)

num_labels: 5
id2label: {0: 'Mixed_feelings', 1: 'Negative', 2: 'Positive', 3: 'not-Tamil', 4: 'unknown_state'}


In [25]:
# Step 4: Convert pandas -> HF DatasetDict
from datasets import Dataset, DatasetDict
cols_keep=["text","label_id"]
ds=DatasetDict({
    "train":Dataset.from_pandas(df_train[cols_keep],preserve_index=False),
    "validation":Dataset.from_pandas(df_dev[cols_keep],preserve_index=False),
    "test":Dataset.from_pandas(df_test[cols_keep],preserve_index=False),
})
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label_id'],
        num_rows: 35219
    })
    validation: Dataset({
        features: ['text', 'label_id'],
        num_rows: 4397
    })
    test: Dataset({
        features: ['text', 'label_id'],
        num_rows: 4400
    })
})


In [26]:
# Step 5: Tokenize datasets
from transformers import AutoTokenizer, DataCollatorWithPadding

MODEL_NAME="ai4bharat/IndicBERTv2-MLM-only"   # or "google/muril-base-cased"
MAX_LENGTH=96  # shorter to avoid OOM

tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME,use_fast=True)
def tokenize_fn(batch): return tokenizer(batch["text"],truncation=True,max_length=MAX_LENGTH)

tokenized=ds.map(tokenize_fn,batched=True,remove_columns=["text"])
data_collator=DataCollatorWithPadding(tokenizer=tokenizer,pad_to_multiple_of=8 if torch.cuda.is_available() else None)

print("Tokenized sizes:",{k:len(v) for k,v in tokenized.items()})

Map:   0%|          | 0/35219 [00:00<?, ? examples/s]

Map:   0%|          | 0/4397 [00:00<?, ? examples/s]

Map:   0%|          | 0/4400 [00:00<?, ? examples/s]

Tokenized sizes: {'train': 35219, 'validation': 4397, 'test': 4400}


In [30]:
# ===== Step 6: Training (MuRIL-style args, 3 epochs, eval/save each epoch) =====
import os, torch, numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Safety check
required = ["tokenized", "tokenizer", "data_collator", "id2label", "label2id", "num_labels"]
missing = [v for v in required if v not in globals()]
if missing:
    raise AssertionError(f"Missing: {missing}. Re-run preprocessing/tokenization steps first.")

# Choose model (IndicBERT v2 by default; set to "google/muril-base-cased" to train MuRIL instead)
MODEL_NAME = globals().get("MODEL_NAME", "ai4bharat/IndicBERTv2-MLM-only")

# fp16 toggle like your MuRIL run
use_fp16 = torch.cuda.is_available()

# Metrics: provide BOTH "f1" and "weighted_f1" so either key can be tracked
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    p_w, r_w, f_w, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    p_m, r_m, f_m, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f_m,                 # macro-F1 (classic "f1")
        "weighted_f1": f_w,        # weighted-F1 (your project’s primary)
        "weighted_precision": p_w,
        "weighted_recall": r_w,
        "macro_precision": p_m,
        "macro_recall": r_m,
    }

# Load model (classification head will be initialized)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# EXACT MuRIL-style TrainingArguments you shared (with the correct kw names)
args = TrainingArguments(
    output_dir="indicbert-cls",
    num_train_epochs=8,                  # 🔥 run longer, early stopping will cut off if not improving
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none",
    seed=42,
)


# Trainer with validation set (needed for best-model selection/early stopping)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # safe guard; remove if you don’t want it
)

print("Starting training… (3 epochs, eval/save each epoch)")
trainer.train()

# Save final artifacts
final_dir = os.path.join(args.output_dir, "final_model")
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)
print("Training complete. Saved to:", final_dir)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 3}.


Starting training… (3 epochs, eval/save each epoch)


ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

In [31]:
# Fix: rename label_id -> labels, update trainer datasets, and resume training safely
import os

# 1) Rename column in tokenized dataset (works if 'label_id' exists)
for split in ("train", "validation", "test"):
    if "label_id" in tokenized[split].column_names:
        print("Renaming label_id -> labels for", split)
        tokenized[split] = tokenized[split].rename_column("label_id", "labels")
    elif "labels" in tokenized[split].column_names:
        print("labels column already present for", split)
    else:
        raise RuntimeError(f"No label column found in tokenized[{split}] (expected 'label_id' or 'labels').")

# 2) Update trainer's datasets (trainer already exists from Step 6)
trainer.train_dataset = tokenized["train"]
trainer.eval_dataset  = tokenized["validation"]

# 3) Detect latest checkpoint (safe resume only if exists)
out_dir = trainer.args.output_dir if hasattr(trainer.args, "output_dir") else "indicbert-cls"
latest_ckpt = None
if os.path.isdir(out_dir):
    ckpts = sorted([d for d in os.listdir(out_dir) if d.startswith("checkpoint")])
    if ckpts:
        latest_ckpt = os.path.join(out_dir, ckpts[-1])
        print("Found checkpoint:", latest_ckpt)
    else:
        print("No checkpoints found in", out_dir, "- starting fresh training.")

# 4) Start / resume training
print("Starting trainer.train(resume_from_checkpoint=...) — this will now produce loss correctly.")
trainer.train(resume_from_checkpoint=latest_ckpt if latest_ckpt else None)

# 5) Save final model and tokenizer
final_dir = os.path.join(trainer.args.output_dir, "final_model")
trainer.save_model(final_dir)
trainer.tokenizer.save_pretrained(final_dir)
print("Training finished and saved to:", final_dir)

Renaming label_id -> labels for train
Renaming label_id -> labels for validation
Renaming label_id -> labels for test
No checkpoints found in indicbert-cls - starting fresh training.
Starting trainer.train(resume_from_checkpoint=...) — this will now produce loss correctly.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
1,0.9852,1.0162,0.616784,0.440279,0.576082,0.577692,0.616784,0.534074,0.413324
2,0.8465,1.014753,0.616102,0.460898,0.587586,0.592812,0.616102,0.5197,0.448289
3,0.7571,1.039839,0.614055,0.46742,0.595542,0.587059,0.614055,0.497124,0.453933
4,0.6562,1.157007,0.616329,0.466819,0.592564,0.586765,0.616329,0.513809,0.448609
5,0.3896,1.408131,0.59859,0.456041,0.584264,0.574736,0.59859,0.471239,0.447101


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Training finished and saved to: indicbert-cls/final_model


In [32]:
# Inspect per-class metrics & confusion matrix (run this now)
import numpy as np, json
from sklearn.metrics import classification_report, confusion_matrix

# Ensure trainer.predict will use the currently loaded model and tokenized datasets
preds = trainer.predict(tokenized["validation"])
y_true = preds.label_ids
y_pred = np.argmax(preds.predictions, axis=-1)

print("Classification report (validation):\n")
print(classification_report(y_true, y_pred, target_names=[str(id2label[i]) for i in range(num_labels)], digits=4, zero_division=0))

cm = confusion_matrix(y_true, y_pred, labels=list(range(num_labels)))
print("\nConfusion matrix (rows=true, cols=pred):\n", cm)

Classification report (validation):

                precision    recall  f1-score   support

Mixed_feelings     0.2857    0.1409    0.1887       511
      Negative     0.3989    0.4165    0.4075       521
      Positive     0.7237    0.8165    0.7673      2447
     not-Tamil     0.6087    0.4444    0.5138       189
 unknown_state     0.4687    0.4513    0.4598       729

      accuracy                         0.6141      4397
     macro avg     0.4971    0.4539    0.4674      4397
  weighted avg     0.5871    0.6141    0.5955      4397


Confusion matrix (rows=true, cols=pred):
 [[  72   88  268    4   79]
 [  55  217  171    7   71]
 [  77  154 1998   28  190]
 [   3    9   60   84   33]
 [  45   76  264   15  329]]


In [37]:
# Fix Trainer: ensure evaluation is ON (epoch) and save per epoch, then recreate Trainer and resume train
import os, torch, numpy as np
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification
from transformers.trainer_utils import IntervalStrategy, SaveStrategy
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# safety
required = ["tokenized","tokenizer","data_collator","id2label","label2id","num_labels","trainer"]
missing = [v for v in required if v not in globals()]
if missing:
    raise AssertionError(f"Missing required objects: {missing}. Re-run earlier steps as needed.")

# show current state
print("Before patch -- args summary:")
print(" evaluation_strategy:", getattr(trainer.args, "evaluation_strategy", None))
print(" eval_starategy (if present):", getattr(trainer.args, "eval_starategy", None))
print(" save_strategy:", getattr(trainer.args, "save_strategy", None))

# Patch args: ensure evaluation runs each epoch and save per epoch
args = trainer.args  # existing TrainingArguments
try:
    # canonical name
    args.evaluation_strategy = "epoch"
except Exception:
    try:
        setattr(args, "evaluation_strategy", "epoch")
    except Exception:
        pass

# also set any variant names you used
for attr in ("eval_starategy", "eval_strategy", "eval_strategy"):
    try:
        setattr(args, attr, "epoch")
    except Exception:
        pass

# ensure save strategy
try:
    args.save_strategy = "epoch"
except Exception:
    try:
        setattr(args, "save_strategy", "epoch")
    except Exception:
        pass

# Verify internal enums (optional)
print("\nAfter patch -- args summary:")
print(" evaluation_strategy:", getattr(args, "evaluation_strategy", None))
print(" eval_starategy (if present):", getattr(args, "eval_starategy", None))
print(" save_strategy:", getattr(args, "save_strategy", None))

# Recreate Trainer cleanly (keeps same model, datasets, metrics, callbacks)
ModelClass = type(trainer.model)
model = trainer.model  # reuse in-memory model (weights already loaded/trained)
# Rebuild Trainer (safe): re-use args, model, datasets, tokenizer, data_collator, metrics, callbacks
new_trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=trainer.compute_metrics,
    callbacks=trainer.callback_handler.callbacks,  # reuse callbacks
)

# Replace global trainer
trainer = new_trainer
print("\nRecreated Trainer with evaluation enabled. Trainer will evaluate each epoch and save per epoch.")

# Optionally show whether any checkpoints exist and then start/resume training safely
out_dir = args.output_dir if hasattr(args, "output_dir") else "outputs"
latest_ckpt = None
if os.path.isdir(out_dir):
    ckpts = sorted([d for d in os.listdir(out_dir) if d.startswith("checkpoint")])
    if ckpts:
        latest_ckpt = os.path.join(out_dir, ckpts[-1])
        print("Found checkpoint to resume:", latest_ckpt)
    else:
        print("No checkpoints found; will start fresh training.")

# Start/resume training (safe resume)
trainer.train(resume_from_checkpoint=latest_ckpt if latest_ckpt else None)

  new_trainer = Trainer(
You are adding a <class 'transformers.trainer_callback.DefaultFlowCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
You are adding a <class 'transformers.utils.notebook.NotebookProgressCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
DefaultFlowCallback
EarlyStoppingCallback
NotebookProgressCallback


Before patch -- args summary:
 evaluation_strategy: None
 eval_starategy (if present): None
 save_strategy: SaveStrategy.EPOCH

After patch -- args summary:
 evaluation_strategy: epoch
 eval_starategy (if present): epoch
 save_strategy: epoch

Recreated Trainer with evaluation enabled. Trainer will evaluate each epoch and save per epoch.
Found checkpoint to resume: indicbert-cls/checkpoint-8808


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
5,0.3896,1.408131,0.59859,0.456041,0.584264,0.574736,0.59859,0.471239,0.447101


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
5,0.3896,1.408131,0.59859,0.456041,0.584264,0.574736,0.59859,0.471239,0.447101


TrainOutput(global_step=11010, training_loss=0.0946864191344605, metrics={'train_runtime': 472.755, 'train_samples_per_second': 595.979, 'train_steps_per_second': 37.262, 'total_flos': 5181593801985504.0, 'train_loss': 0.0946864191344605, 'epoch': 5.0})

In [40]:
# ===== Fix: accept Trainer's extra kwargs in compute_loss, then resume weighted-loss training =====
import os, torch
from torch import nn
from collections import Counter
from transformers import TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Safety check: required objects
required = ["tokenized","tokenizer","data_collator","id2label","label2id","num_labels"]
missing = [v for v in required if v not in globals()]
if missing:
    raise AssertionError(f"Missing required objects: {missing}. Re-run earlier steps.")

# Recompute class weights (safe to do again)
from collections import Counter
train_counts = Counter(tokenized["train"]["labels"])
labels_sorted = [train_counts[i] for i in sorted(train_counts.keys())]
freqs = torch.tensor(labels_sorted, dtype=torch.float)
class_weights = (1.0 / freqs)
class_weights = class_weights / class_weights.sum() * len(class_weights)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class_weights = class_weights.to(device)
print("Class weights:", class_weights.cpu().numpy())

# Redefine WeightedTrainer with **kwargs in compute_loss
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Accepts extra kwargs (e.g. num_items_in_batch) forwarded by Trainer.
        """
        labels = inputs.get("labels")
        # Forward pass (exclude 'labels' so model doesn't compute loss internally)
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Recreate model if needed (reuse existing 'model' if present and same architecture)
if 'model' not in globals():
    model_name = globals().get("MODEL_NAME", "ai4bharat/IndicBERTv2-MLM-only")
    from transformers import AutoModelForSequenceClassification
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, id2label=id2label, label2id=label2id
    ).to(device)
else:
    # ensure model on right device
    model = model.to(device)

# Use existing new_args if present, otherwise fallback to trainer.args or create minimal args
if 'new_args' in globals():
    args = new_args
elif 'trainer' in globals() and trainer.args is not None:
    args = trainer.args
else:
    args = TrainingArguments(output_dir="indicbert-cls", num_train_epochs=3, per_device_train_batch_size=16)

# Deduplicate callbacks (if any exist)
callbacks = []
if 'trainer' in globals():
    seen = set()
    for cb in trainer.callback_handler.callbacks:
        name = cb.__class__.__name__
        if name not in seen:
            seen.add(name)
            callbacks.append(cb)
# ensure EarlyStoppingCallback present
from transformers import EarlyStoppingCallback
if not any(isinstance(cb, EarlyStoppingCallback) for cb in callbacks):
    callbacks.append(EarlyStoppingCallback(early_stopping_patience=2))

# Instantiate new WeightedTrainer
wt = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=trainer.compute_metrics if 'trainer' in globals() else None,
    callbacks=callbacks,
)

# Find latest checkpoint if any
out_dir = args.output_dir if hasattr(args, "output_dir") else "indicbert-cls"
latest_ckpt = None
if os.path.isdir(out_dir):
    ckpts = sorted([d for d in os.listdir(out_dir) if d.startswith("checkpoint")])
    if ckpts:
        latest_ckpt = os.path.join(out_dir, ckpts[-1])
        print("Found checkpoint:", latest_ckpt)
    else:
        print("No checkpoint found in", out_dir, "- starting fresh.")

# Resume training (safe)
print("Resuming weighted training. resume_from_checkpoint =", latest_ckpt if latest_ckpt else None)
wt.train(resume_from_checkpoint=latest_ckpt if latest_ckpt else None)

# Save and swap trainer
save_dir = os.path.join(out_dir, "final_weighted")
wt.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
trainer = wt
print("Weighted training complete and saved to:", save_dir)

  wt = WeightedTrainer(
You are adding a <class 'transformers.trainer_callback.DefaultFlowCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
You are adding a <class 'transformers.utils.notebook.NotebookProgressCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
DefaultFlowCallback
EarlyStoppingCallback
NotebookProgressCallback


Class weights: [0.96949154 0.91052973 0.19013168 2.2466214  0.68322575]
Found checkpoint: indicbert-cls/checkpoint-8808
Resuming weighted training. resume_from_checkpoint = indicbert-cls/checkpoint-8808


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
5,0.4953,1.802919,0.559472,0.453725,0.569426,0.583964,0.559472,0.439341,0.474622
6,0.3891,2.060485,0.555834,0.447869,0.564604,0.578349,0.555834,0.432868,0.470551
7,0.3069,2.452645,0.574483,0.454306,0.575959,0.577888,0.574483,0.451671,0.45781


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
5,0.4953,1.802919,0.559472,0.453725,0.569426,0.583964,0.559472,0.439341,0.474622
6,0.3891,2.060485,0.555834,0.447869,0.564604,0.578349,0.555834,0.432868,0.470551
7,0.3069,2.452645,0.574483,0.454306,0.575959,0.577888,0.574483,0.451671,0.45781


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
5,0.4953,1.802919,0.559472,0.453725,0.569426,0.583964,0.559472,0.439341,0.474622
6,0.3891,2.060485,0.555834,0.447869,0.564604,0.578349,0.555834,0.432868,0.470551
7,0.3069,2.452645,0.574483,0.454306,0.575959,0.577888,0.574483,0.451671,0.45781
8,0.3103,2.629865,0.573573,0.452918,0.573303,0.573604,0.573573,0.447675,0.459286


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
5,0.4953,1.802919,0.559472,0.453725,0.569426,0.583964,0.559472,0.439341,0.474622
6,0.3891,2.060485,0.555834,0.447869,0.564604,0.578349,0.555834,0.432868,0.470551
7,0.3069,2.452645,0.574483,0.454306,0.575959,0.577888,0.574483,0.451671,0.45781
8,0.3103,2.629865,0.573573,0.452918,0.573303,0.573604,0.573573,0.447675,0.459286


Weighted training complete and saved to: indicbert-cls/final_weighted


In [None]:
# ===== Oversample minority classes and resume training (run this cell only) =====
from collections import Counter
from datasets import concatenate_datasets
import os

# safety checks
required = ["tokenized", "trainer", "id2label", "num_labels"]
missing = [v for v in required if v not in globals()]
if missing:
    raise AssertionError(f"Missing required objects: {missing}. Run previous steps first.")

print("Original train size:", len(tokenized["train"]))

# compute class counts
ctr = Counter(tokenized["train"]["labels"])
print("Train label counts:", {k: int(ctr[k]) for k in sorted(ctr.keys())})

max_count = max(ctr.values())

parts = []
for label_id in sorted(ctr.keys()):
    # ✅ fix: only ex (no i) since with_indices=False
    ds_label = tokenized["train"].filter(
        lambda ex, lab=label_id: ex["labels"] == lab,
        with_indices=False
    )
    count = len(ds_label)
    if count == 0:
        continue
    repeat = max_count // count
    rem = max_count % count

    # build repeated dataset
    reps = [ds_label] * repeat
    if rem:
        reps.append(ds_label.select(range(rem)))
    ds_rep = concatenate_datasets(reps) if reps else ds_label
    parts.append(ds_rep)
    print(f"Label {label_id} ({id2label[label_id]}): original {count} -> oversampled {len(ds_rep)}")

# concatenate per-label parts and shuffle
if not parts:
    raise RuntimeError("No parts created for oversampling — check labels.")
new_train = concatenate_datasets(parts).shuffle(seed=42)
print("New oversampled train size:", len(new_train))

# Replace tokenized train (keep validation/test intact)
tokenized["train"] = new_train

# Update trainer's train_dataset
trainer.train_dataset = tokenized["train"]

# Detect latest checkpoint (resume if exists)
out_dir = trainer.args.output_dir if hasattr(trainer.args, "output_dir") else "indicbert-cls"
latest_ckpt = None
if os.path.isdir(out_dir):
    ckpts = sorted([d for d in os.listdir(out_dir) if d.startswith("checkpoint")])
    if ckpts:
        latest_ckpt = os.path.join(out_dir, ckpts[-1])
        print("Resuming from checkpoint:", latest_ckpt)
    else:
        print("No checkpoint found; starting training fresh.")

# Start/resume training (safe)
print("Starting training on oversampled data. This will take longer per epoch.")
trainer.train(resume_from_checkpoint=latest_ckpt if latest_ckpt else None)

# Save final model and tokenizer
save_dir = os.path.join(out_dir, "final_oversampled")
trainer.save_model(save_dir)
trainer.tokenizer.save_pretrained(save_dir)
print("Oversampled training finished. Model saved to:", save_dir)

Original train size: 35219
Train label counts: {0: 3907, 1: 4160, 2: 19922, 3: 1686, 4: 5544}


Filter:   0%|          | 0/35219 [00:00<?, ? examples/s]

Label 0 (Mixed_feelings): original 3907 -> oversampled 19922


Filter:   0%|          | 0/35219 [00:00<?, ? examples/s]

Label 1 (Negative): original 4160 -> oversampled 19922


Filter:   0%|          | 0/35219 [00:00<?, ? examples/s]

Label 2 (Positive): original 19922 -> oversampled 19922


Filter:   0%|          | 0/35219 [00:00<?, ? examples/s]

Label 3 (not-Tamil): original 1686 -> oversampled 19922


Filter:   0%|          | 0/35219 [00:00<?, ? examples/s]

Label 4 (unknown_state): original 5544 -> oversampled 19922
New oversampled train size: 99610
Resuming from checkpoint: indicbert-cls/checkpoint-8808
Starting training on oversampled data. This will take longer per epoch.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
2,0.2686,2.206684,0.459859,0.411069,0.485448,0.588081,0.459859,0.410198,0.474773
3,0.1106,3.322966,0.518535,0.443831,0.539655,0.581341,0.518535,0.44439,0.458195
4,0.0691,4.044867,0.585854,0.469148,0.583071,0.583341,0.585854,0.479382,0.464959
5,0.0272,4.710324,0.583352,0.449197,0.571118,0.564592,0.583352,0.458994,0.447676


Epoch,Training Loss,Validation Loss,Accuracy,F1,Weighted F1,Weighted Precision,Weighted Recall,Macro Precision,Macro Recall
2,0.2686,2.206684,0.459859,0.411069,0.485448,0.588081,0.459859,0.410198,0.474773
3,0.1106,3.322966,0.518535,0.443831,0.539655,0.581341,0.518535,0.44439,0.458195
4,0.0691,4.044867,0.585854,0.469148,0.583071,0.583341,0.585854,0.479382,0.464959
5,0.0272,4.710324,0.583352,0.449197,0.571118,0.564592,0.583352,0.458994,0.447676
