In [None]:
# Cell 1 — install
!pip install -q transformers datasets accelerate evaluate scikit-learn sentencepiece

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Cell 3 — upload local file via browser
from google.colab import files
uploaded = files.upload()   # choose your tamil_sentiment_full.csv from your machine
# After upload, file will be in current working directory, e.g. '/content/tamil_sentiment_full.csv'

Saving tamil_sentiment_full.csv to tamil_sentiment_full.csv


In [None]:
DATA_PATH = 'tamil_sentiment_full.csv'   # update if needed

# print first 120 lines and lines around line 51
with open(DATA_PATH, 'r', encoding='utf-8', errors='replace') as f:
    lines = f.readlines()

print("Total lines:", len(lines))
print("\n--- first 20 lines ---")
for i, line in enumerate(lines[:20], 1):
    print(f"{i:03d}: {line.rstrip()!r}")

print("\n--- around line 51 (±5) ---")
start = max(0, 50-5)   # zero-indexed
for i in range(start, min(len(lines), 50+5)):
    print(f"{i+1:03d}: {lines[i].rstrip()!r}")

Total lines: 44161

--- first 20 lines ---
001: 'Negative\tEnna da ellam avan seyal  Mari iruku'
002: 'Negative\tThis movei is just like  ellam avan seyal'
003: 'Positive\tPadam vanthathum 13k dislike pottavaga yellam yea da dislike  pannom nu feel pannanum'
004: 'Positive\tNeraya neraya neraya... ... V era level...thala'
005: 'Positive\twow thavala sema mass....padam oru pundaikum aagathu'
006: 'Negative\tAndha 19 k unlike panavangaluku kolandha porakathu'
007: 'Positive\tYaarellam frst like pottutu video paaka start paneenga....hit like'
008: 'Positive\tEthana padam vanthanu SALT AND PEPPER Mattum than..THÃLÃ🤩🤩'
009: 'Positive\tThala mass  Hvy sprt kerala Surya anna fans'
010: 'Negative\tElam avan jayal movie  remake pa'
011: "Positive\tDhayavasenju indha padathula mass ila mayiru ila nu yevanum saavadikadheenga.. let him do such roles.. it's healthy!"
012: 'Positive\tvera lvl.... Thala sammaaaaaaaaaaaaa......Bgm sammaya iruku'
013: 'Positive\tRomba nal aparam ajith ah normal ah paka

In [None]:
# Clean tab-quoted file and produce a tidy CSV for training
DATA_PATH = 'tamil_sentiment_full.csv'   # your uploaded file
CLEAN_PATH = 'tamil_sentiment_full.cleaned.csv'

rows = []
bad_lines = []
with open(DATA_PATH, 'r', encoding='utf-8', errors='replace') as f:
    for i, raw in enumerate(f, 1):
        line = raw.rstrip('\n').rstrip('\r')
        if not line.strip():
            continue
        # remove a single leading and trailing single-quote if present
        if line.startswith("'") and line.endswith("'"):
            line = line[1:-1]
        # split on the first tab into label and text
        if '\t' in line:
            label, text = line.split('\t', 1)
            rows.append((label.strip(), text.strip()))
        else:
            # keep track of malformed lines
            bad_lines.append((i, line))

# Build DataFrame
import pandas as pd
clean_df = pd.DataFrame(rows, columns=['label', 'text'])
print(f"Cleaned rows: {len(clean_df)}  |  Malformed (no tab) lines: {len(bad_lines)}")

# Show a few malformed lines (if any) so you can inspect
if bad_lines:
    print("\nSample malformed lines (line_no, content):")
    for ln, content in bad_lines[:10]:
        print(ln, repr(content))

# Basic sanity checks
print("\nSample rows:")
display(clean_df.head(8))
print("\nLabel distribution:")
display(clean_df['label'].value_counts())

# Save cleaned CSV (comma-separated). If you prefer TSV, change sep='\t'
clean_df.to_csv(CLEAN_PATH, index=False, encoding='utf-8')
print(f"\nSaved cleaned CSV -> {CLEAN_PATH}")

Cleaned rows: 44020  |  Malformed (no tab) lines: 0

Sample rows:


Unnamed: 0,label,text
0,Negative,Enna da ellam avan seyal Mari iruku
1,Negative,This movei is just like ellam avan seyal
2,Positive,Padam vanthathum 13k dislike pottavaga yellam ...
3,Positive,Neraya neraya neraya... ... V era level...thala
4,Positive,wow thavala sema mass....padam oru pundaikum a...
5,Negative,Andha 19 k unlike panavangaluku kolandha porak...
6,Positive,Yaarellam frst like pottutu video paaka start ...
7,Positive,Ethana padam vanthanu SALT AND PEPPER Mattum t...



Label distribution:


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Positive,24873
unknown_state,6904
Negative,5228
Mixed_feelings,4928
not-Tamil,2087



Saved cleaned CSV -> tamil_sentiment_full.cleaned.csv


In [None]:
# BERT-family training pipeline (mBERT, MuRIL, IndicBERT, XLM-R) — 5 epochs, class-weighted
# Requirements (run once): !pip install -q transformers datasets accelerate evaluate scikit-learn sentencepiece

import os
os.environ["WANDB_DISABLED"] = "true"   # disable W&B prompt

import random, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding
)

# ---------- CONFIG ----------
DATA_PATH = 'tamil_sentiment_full.cleaned.csv'   # ensure this file is uploaded in Colab
MODEL_IDS = {
    'mBERT': 'bert-base-multilingual-cased',
    'MuRIL': 'google/muril-base-cased',
    'IndicBERT': 'ai4bharat/indic-bert',
    'XLM-R': 'xlm-roberta-base'
}
OUTPUT_DIR = './bert_models'
os.makedirs(OUTPUT_DIR, exist_ok=True)

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

EPOCHS = 5                # <--- you requested at least 5 epochs
BATCH_SIZE = 16           # reduce if OOM (8 or 4)
LR = 2e-5
MAX_LENGTH = 256          # reduce to 128 if OOM
FP16 = True if torch.cuda.is_available() else False
GRAD_ACCUM = 1
# ----------------------------

# Load data
df = pd.read_csv(DATA_PATH, encoding='utf-8')
assert 'text' in df.columns and 'label' in df.columns, "CSV must contain 'text' and 'label' columns"
df['text'] = df['text'].astype(str)

# Label encode
le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['label'])
num_labels = len(le.classes_)
print("Classes (id->label):", list(enumerate(le.classes_)))

# Train/test split (stratified)
train_df, test_df = train_test_split(df[['text','label_enc']], test_size=0.2, random_state=SEED, stratify=df['label_enc'])
train_df = train_df.reset_index(drop=True); test_df = test_df.reset_index(drop=True)
print("Train / Test sizes:", len(train_df), len(test_df))

# Compute class weights (balanced)
class_weights_np = compute_class_weight(class_weight='balanced', classes=np.unique(train_df['label_enc']), y=train_df['label_enc'])
class_weights_tensor = torch.tensor(class_weights_np, dtype=torch.float)
print("Class weights:", class_weights_np)

# Prepare metrics
metric_precision = evaluate.load('precision')
metric_recall = evaluate.load('recall')
metric_f1 = evaluate.load('f1')
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, tuple): logits = logits[0]
    preds = np.argmax(logits, axis=1)
    p = metric_precision.compute(predictions=preds, references=labels, average='macro')['precision']
    r = metric_recall.compute(predictions=preds, references=labels, average='macro')['recall']
    f = metric_f1.compute(predictions=preds, references=labels, average='macro')['f1']
    return {'precision': p, 'recall': r, 'f1': f}

# Custom Trainer to inject class weights; accept arbitrary kwargs to be compatible
from transformers import Trainer as HfTrainer
class WeightedTrainer(HfTrainer):
    def __init__(self, class_weights_tensor=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights_tensor = class_weights_tensor

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        inputs_for_model = {k:v for k,v in inputs.items() if k != "labels"}
        outputs = model(**inputs_for_model)
        logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
        weight = self.class_weights_tensor.to(logits.device).to(logits.dtype) if self.class_weights_tensor is not None else None
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

results = []

for short_name, model_id in MODEL_IDS.items():
    print(f"\n=== TRAINING {short_name} ({model_id}) ===")
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)

    # Prepare datasets
    ds_train = Dataset.from_pandas(train_df.rename(columns={'label_enc':'label'}))
    ds_test = Dataset.from_pandas(test_df.rename(columns={'label_enc':'label'}))

    def preprocess(examples):
        return tokenizer(examples['text'], truncation=True, padding=False, max_length=MAX_LENGTH)
    ds_train = ds_train.map(preprocess, batched=True)
    ds_test = ds_test.map(preprocess, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model_output = os.path.join(OUTPUT_DIR, short_name)
    training_args = TrainingArguments(
        output_dir=model_output,
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=LR,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='f1',
        seed=SEED,
        logging_steps=200,
        fp16=FP16,
        gradient_accumulation_steps=GRAD_ACCUM,
        save_total_limit=2,
        report_to=[]      # disables W&B / trackers
    )

    trainer = WeightedTrainer(
        class_weights_tensor=class_weights_tensor,
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_test,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train
    trainer.train()

    # Evaluate
    eval_res = trainer.evaluate(eval_dataset=ds_test)
    print('Eval (macro metrics):', {k:v for k,v in eval_res.items() if k.startswith('eval_')})

    # Predict for classification report / confusion matrix
    preds_output = trainer.predict(ds_test)
    logits = preds_output.predictions
    if isinstance(logits, tuple): logits = logits[0]
    preds = np.argmax(logits, axis=1)
    labels = preds_output.label_ids

    report = classification_report(labels, preds, target_names=le.classes_, digits=4, zero_division=0)
    print(f"\nClassification report for {short_name}:\n{report}")
    cm = confusion_matrix(labels, preds, normalize='true')
    np.savetxt(os.path.join(model_output, 'confusion_matrix_norm.csv'), cm, delimiter=',')
    with open(os.path.join(model_output, 'classification_report.txt'), 'w', encoding='utf-8') as fh:
        fh.write(report)

    results.append({
        'model': short_name,
        'model_id': model_id,
        'precision': float(eval_res.get('eval_precision', np.nan)),
        'recall': float(eval_res.get('eval_recall', np.nan)),
        'f1': float(eval_res.get('eval_f1', np.nan))
    })

    print(f"Saved artifacts to {model_output}")

# Summarize
results_df = pd.DataFrame(results).sort_values('f1', ascending=False).reset_index(drop=True)
results_df.to_csv('model_comparison_results_weighted.csv', index=False)
print("\nModel comparison (macro metrics):")
print(results_df)
print("\nSaved model_comparison_results_weighted.csv and model outputs under", OUTPUT_DIR)


Classes (id->label): [(0, 'Mixed_feelings'), (1, 'Negative'), (2, 'Positive'), (3, 'not-Tamil'), (4, 'unknown_state')]
Train / Test sizes: 35216 8804
Class weights: [1.78670726 1.68417025 0.35394743 4.21748503 1.27524896]

=== TRAINING mBERT (bert-base-multilingual-cased) ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/35216 [00:00<?, ? examples/s]

Map:   0%|          | 0/8804 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.2372,1.188092,0.440282,0.51926,0.458536
2,1.1264,1.162655,0.45495,0.529694,0.470627
3,1.0093,1.20358,0.475564,0.533136,0.490592
4,0.8619,1.292437,0.477112,0.527379,0.482648
5,0.7103,1.394825,0.473898,0.524445,0.489802


Eval (macro metrics): {'eval_loss': 1.2035801410675049, 'eval_precision': 0.4755641503192688, 'eval_recall': 0.533136497486879, 'eval_f1': 0.4905923777718984, 'eval_runtime': 13.1546, 'eval_samples_per_second': 669.269, 'eval_steps_per_second': 41.886}

Classification report for mBERT:
                precision    recall  f1-score   support

Mixed_feelings     0.2490    0.4331    0.3162       986
      Negative     0.3885    0.4465    0.4155      1046
      Positive     0.8441    0.5965    0.6990      4974
     not-Tamil     0.5054    0.6763    0.5785       417
 unknown_state     0.3908    0.5134    0.4438      1381

      accuracy                         0.5511      8804
     macro avg     0.4756    0.5331    0.4906      8804
  weighted avg     0.6362    0.5511    0.5767      8804

Saved artifacts to ./bert_models/mBERT

=== TRAINING MuRIL (google/muril-base-cased) ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/35216 [00:00<?, ? examples/s]

Map:   0%|          | 0/8804 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.2819,1.251731,0.439326,0.489737,0.452495
2,1.1787,1.167046,0.449414,0.528049,0.466251
3,1.0731,1.165188,0.479841,0.547958,0.501697
4,0.9429,1.21889,0.48225,0.548923,0.499156
5,0.8082,1.270329,0.499176,0.548663,0.515357


Eval (macro metrics): {'eval_loss': 1.2703287601470947, 'eval_precision': 0.49917624428446816, 'eval_recall': 0.548662615270271, 'eval_f1': 0.5153571331190253, 'eval_runtime': 10.3214, 'eval_samples_per_second': 852.985, 'eval_steps_per_second': 53.384}

Classification report for MuRIL:
                precision    recall  f1-score   support

Mixed_feelings     0.2612    0.3905    0.3130       986
      Negative     0.4395    0.4723    0.4553      1046
      Positive     0.8312    0.6425    0.7248      4974
     not-Tamil     0.5585    0.6978    0.6205       417
 unknown_state     0.4054    0.5402    0.4632      1381

      accuracy                         0.5806      8804
     macro avg     0.4992    0.5487    0.5154      8804
  weighted avg     0.6411    0.5806    0.6007      8804

Saved artifacts to ./bert_models/MuRIL

=== TRAINING IndicBERT (ai4bharat/indic-bert) ===


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/35216 [00:00<?, ? examples/s]

Map:   0%|          | 0/8804 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.2675,1.251445,0.433247,0.484975,0.427018
2,1.1926,1.207057,0.446312,0.511822,0.442486
3,1.0874,1.216555,0.452779,0.51304,0.46913
4,0.9325,1.279472,0.45093,0.509117,0.46273
5,0.7437,1.378282,0.448772,0.500579,0.464173


Eval (macro metrics): {'eval_loss': 1.216555118560791, 'eval_precision': 0.45277938004513996, 'eval_recall': 0.5130401133454611, 'eval_f1': 0.46913039312760957, 'eval_runtime': 10.7191, 'eval_samples_per_second': 821.34, 'eval_steps_per_second': 51.404}

Classification report for IndicBERT:
                precision    recall  f1-score   support

Mixed_feelings     0.2443    0.4118    0.3066       986
      Negative     0.3498    0.3920    0.3697      1046
      Positive     0.8239    0.5899    0.6875      4974
     not-Tamil     0.4612    0.6691    0.5460       417
 unknown_state     0.3847    0.5025    0.4358      1381

      accuracy                         0.5365      8804
     macro avg     0.4528    0.5130    0.4691      8804
  weighted avg     0.6166    0.5365    0.5609      8804

Saved artifacts to ./bert_models/IndicBERT

=== TRAINING XLM-R (xlm-roberta-base) ===


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/35216 [00:00<?, ? examples/s]

Map:   0%|          | 0/8804 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,1.2482,1.240818,0.476665,0.49581,0.451732
2,1.1526,1.198082,0.448077,0.535034,0.450798
3,1.1035,1.188108,0.483497,0.535248,0.496976
4,0.9907,1.234576,0.480192,0.535457,0.487824
5,0.8899,1.263428,0.486389,0.545547,0.50233


Eval (macro metrics): {'eval_loss': 1.2634284496307373, 'eval_precision': 0.48638940237402545, 'eval_recall': 0.5455467549822048, 'eval_f1': 0.5023300190021518, 'eval_runtime': 10.4661, 'eval_samples_per_second': 841.189, 'eval_steps_per_second': 52.646}

Classification report for XLM-R:
                precision    recall  f1-score   support

Mixed_feelings     0.2555    0.4097    0.3148       986
      Negative     0.3756    0.5210    0.4365      1046
      Positive     0.8511    0.5919    0.6982      4974
     not-Tamil     0.5442    0.6787    0.6041       417
 unknown_state     0.4055    0.5264    0.4581      1381

      accuracy                         0.5569      8804
     macro avg     0.4864    0.5455    0.5023      8804
  weighted avg     0.6435    0.5569    0.5821      8804

Saved artifacts to ./bert_models/XLM-R

Model comparison (macro metrics):
       model                      model_id  precision    recall        f1
0      MuRIL       google/muril-base-cased   0.499176  0