In [1]:
from google.colab import files
uploaded = files.upload()  # pick the three files
uploaded = files.upload()
uploaded = files.upload()

Saving tamil_sentiment_full_train.csv to tamil_sentiment_full_train.csv


Saving tamil_sentiment_full_test.csv to tamil_sentiment_full_test.csv


Saving tamil_sentiment_full_dev.csv to tamil_sentiment_full_dev.csv


In [2]:
# Cell 1 (shell + python imports)
!pip install -q -U transformers datasets accelerate evaluate safetensors

import os, random, time
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
print("PyTorch available:", torch.cuda.is_available(),
      "GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hPyTorch available: True GPU: Tesla T4


In [3]:
# Cell 2: seeds + robust parsing if train_df/dev_df/test_df not already in session
import torch
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

TRAIN_F = "tamil_sentiment_full_train.csv"
DEV_F   = "tamil_sentiment_full_dev.csv"
TEST_F  = "tamil_sentiment_full_test.csv"

def detect_best_delim(fname, sample_lines=4000, candidates=None):
    if candidates is None:
        candidates = ["\t", "|||", "|", ",", ";", "/"]
    counts = {}
    lines = []
    with open(fname, "r", encoding="utf-8", errors="replace") as f:
        for i, L in enumerate(f):
            lines.append(L.rstrip("\n"))
            if i+1 >= sample_lines: break
    n = max(1, len(lines))
    for d in candidates:
        ok = sum(1 for ln in lines if len(ln.rsplit(d,1))==2 and ln.rsplit(d,1)[1].strip()!="")
        counts[d] = ok / n
    best = max(counts, key=counts.get)
    return best, counts

def parse_file(fname):
    best, counts = detect_best_delim(fname)
    print(f"{os.path.basename(fname)} delim scores:", counts, "-> chosen:", repr(best))
    texts, labels = [], []
    skipped = 0
    with open(fname, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            ln = line.rstrip("\n")
            if not ln: continue
            parts = ln.rsplit(best,1)
            if len(parts)==2 and parts[1].strip()!="":
                t = parts[0].strip().strip('"').strip("'")
                l = parts[1].strip().strip('"').strip("'")
                texts.append(t); labels.append(l)
            else:
                # fallbacks
                done=False
                for d in ["\t","|",",",";"]:
                    p2 = ln.rsplit(d,1)
                    if len(p2)==2 and p2[1].strip()!="":
                        texts.append(p2[0].strip().strip('"').strip("'"))
                        labels.append(p2[1].strip().strip('"').strip("'"))
                        done=True; break
                if not done: skipped+=1
    df = pd.DataFrame({"text":texts,"label":labels})
    df["text"] = df["text"].astype(str).str.strip()
    df["label"] = df["label"].astype(str).str.strip().str.replace(r'^[\s\;\|"]+|[\s\;\|"]+$','',regex=True)
    df = df[(df["text"]!="") & (df["label"]!="")].reset_index(drop=True)
    print(f"Parsed {len(df)} rows from {os.path.basename(fname)} (skipped {skipped}).")
    return df

# Use existing dataframes if present and valid
use_existing = all(name in globals() and isinstance(globals()[name], pd.DataFrame) and set(["text","label"]).issubset(globals()[name].columns) for name in ["train_df","dev_df","test_df"])
if use_existing:
    print("Using existing train_df/dev_df/test_df in session.")
else:
    print("Parsing CSV files from working dir...")
    train_df = parse_file(TRAIN_F)
    dev_df   = parse_file(DEV_F)
    test_df  = parse_file(TEST_F)

# quick label overlap check and basic mapping if dev/test labels not in train
train_labels = sorted(list(set(train_df["label"].unique())))
print("Train labels:", train_labels)
def map_label_safe(x, known):
    s=str(x).strip()
    if s in known: return s
    # last token heuristics
    parts = re.split(r'[;|\t,/:]+', s)
    cand = parts[-1].strip()
    for kl in known:
        if cand.lower()==kl.lower(): return kl
    for kl in known:
        if kl.lower() in s.lower(): return kl
    return None

import re
# map dev/test labels to train labels where possible, drop unmapped
dev_mapped = []
unmapped_dev=[]
for l in dev_df["label"]:
    m = map_label_safe(l, train_labels)
    if m is None:
        unmapped_dev.append(l)
        dev_mapped.append(None)
    else:
        dev_mapped.append(m)
dev_df["label_mapped"]=dev_mapped

test_mapped=[]
unmapped_test=[]
for l in test_df["label"]:
    m = map_label_safe(l, train_labels)
    if m is None:
        unmapped_test.append(l); test_mapped.append(None)
    else:
        test_mapped.append(m)
test_df["label_mapped"]=test_mapped

print("Unmapped dev examples (sample):", unmapped_dev[:8])
print("Unmapped test examples (sample):", unmapped_test[:8])

# drop unmapped to be safe
dev_df = dev_df[dev_df["label_mapped"].notna()].copy().reset_index(drop=True)
test_df = test_df[test_df["label_mapped"].notna()].copy().reset_index(drop=True)
dev_df["label"] = dev_df["label_mapped"]; dev_df.drop(columns=["label_mapped"],inplace=True)
test_df["label"] = test_df["label_mapped"]; test_df.drop(columns=["label_mapped"],inplace=True)

print("Final sizes (train/dev/test):", len(train_df), len(dev_df), len(test_df))

Parsing CSV files from working dir...
tamil_sentiment_full_train.csv delim scores: {'\t': 1.0, '|||': 0.00025, '|': 0.001, ',': 0.084, ';': 0.0, '/': 0.0035} -> chosen: '\t'
Parsed 35219 rows from tamil_sentiment_full_train.csv (skipped 1).
tamil_sentiment_full_dev.csv delim scores: {'\t': 0.0, '|||': 0.0, '|': 0.0, ',': 0.08625, ';': 0.0, '/': 0.00225} -> chosen: ','
Parsed 378 rows from tamil_sentiment_full_dev.csv (skipped 4019).
tamil_sentiment_full_test.csv delim scores: {'\t': 0.0, '|||': 0.0, '|': 0.00025, ',': 0.07725, ';': 0.0, '/': 0.0025} -> chosen: ','
Parsed 340 rows from tamil_sentiment_full_test.csv (skipped 4062).
Train labels: ['Mixed_feelings', 'Negative', 'Positive', 'not-Tamil', 'unknown_state']
Unmapped dev examples (sample): []
Unmapped test examples (sample): []
Final sizes (train/dev/test): 35219 378 340


In [4]:
# Cell 3: tokenizer + HF datasets
MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

le = LabelEncoder()
y_train = le.fit_transform(train_df["label"].astype(str).tolist())
y_dev   = le.transform(dev_df["label"].astype(str).tolist())
y_test  = le.transform(test_df["label"].astype(str).tolist())
label_list = list(le.classes_)
num_labels = len(label_list)
print("Labels:", label_list, "num_labels:", num_labels)

MAX_LEN = 128
def tokenize_batch(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

train_ds_hf = Dataset.from_pandas(train_df[["text"]].assign(label=y_train))
dev_ds_hf   = Dataset.from_pandas(dev_df[["text"]].assign(label=y_dev))
test_ds_hf  = Dataset.from_pandas(test_df[["text"]].assign(label=y_test))

# remove pandas index column that datasets preserves
for d in (train_ds_hf, dev_ds_hf, test_ds_hf):
    if "_index" in d.column_names:
        d = d.remove_columns("_index")

train_ds_hf = train_ds_hf.map(tokenize_batch, batched=True)
dev_ds_hf   = dev_ds_hf.map(tokenize_batch, batched=True)
test_ds_hf  = test_ds_hf.map(tokenize_batch, batched=True)

# set format to PyTorch
train_ds_hf = train_ds_hf.remove_columns([c for c in train_ds_hf.column_names if c not in ["input_ids","attention_mask","label"]]).with_format("torch")
dev_ds_hf   = dev_ds_hf.remove_columns([c for c in dev_ds_hf.column_names if c not in ["input_ids","attention_mask","label"]]).with_format("torch")
test_ds_hf  = test_ds_hf.remove_columns([c for c in test_ds_hf.column_names if c not in ["input_ids","attention_mask","label"]]).with_format("torch")

print("Prepared HF datasets; sample shapes:")
print(train_ds_hf[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Labels: [np.str_('Mixed_feelings'), np.str_('Negative'), np.str_('Positive'), np.str_('not-Tamil'), np.str_('unknown_state')] num_labels: 5


Map:   0%|          | 0/35219 [00:00<?, ? examples/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Prepared HF datasets; sample shapes:
{'label': tensor(4), 'input_ids': tensor([  101, 12128, 11850, 10473, 11183, 13956, 78761, 11847, 95509, 10116,
        20950,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0, 

In [11]:
# Cell 4: model and trainer setup
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)
from transformers import EarlyStoppingCallback,Trainer, TrainingArguments
# TrainingArguments
output_dir = "mbert_finetune_output"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_weighted_f1",
    greater_is_better=True,
    fp16=True if torch.cuda.is_available() else False,
    report_to = "none"
)

# compute_metrics: returns weighted f1 + accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    wf1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "weighted_f1": wf1}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_hf,
    eval_dataset=dev_ds_hf,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
print("Trainer ready. GPU:", torch.cuda.is_available())

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Trainer ready. GPU: True


In [12]:
# Cell 5: train
train_result = trainer.train()
trainer.save_model(output_dir)
metrics = train_result.metrics
print("Training metrics:", metrics)

Epoch,Training Loss,Validation Loss,Accuracy,Weighted F1
1,1.2642,1.375979,0.465608,0.295838
2,1.2746,1.413922,0.465608,0.295838
3,1.2481,1.406858,0.465608,0.295838


Training metrics: {'train_runtime': 1202.1141, 'train_samples_per_second': 175.785, 'train_steps_per_second': 10.991, 'total_flos': 6950068394579712.0, 'train_loss': 1.2599793115674167, 'epoch': 3.0}


In [13]:
# Cell 6: evaluation and reports
# Evaluate on dev
dev_eval = trainer.predict(dev_ds_hf)
dev_logits, dev_labels, _ = dev_eval
dev_preds = np.argmax(dev_logits, axis=1)
dev_acc = accuracy_score(dev_labels, dev_preds)
dev_wf1 = f1_score(dev_labels, dev_preds, average="weighted")
print(f"Dev accuracy: {dev_acc:.4f}, Dev weighted F1: {dev_wf1:.4f}")
print("\nDev classification report:")
print(classification_report(dev_labels, dev_preds, target_names=label_list, digits=4))
print("\nDev confusion matrix:")
print(confusion_matrix(dev_labels, dev_preds))

# Evaluate on test
test_eval = trainer.predict(test_ds_hf)
test_logits, test_labels, _ = test_eval
test_preds = np.argmax(test_logits, axis=1)
test_acc = accuracy_score(test_labels, test_preds)
test_wf1 = f1_score(test_labels, test_preds, average="weighted")
print(f"\nTest accuracy: {test_acc:.4f}, Test weighted F1: {test_wf1:.4f}")
print("\nTest classification report:")
print(classification_report(test_labels, test_preds, target_names=label_list, digits=4))
print("\nTest confusion matrix:")
print(confusion_matrix(test_labels, test_preds))

# Save metrics
metrics_df = pd.DataFrame([{
    "model": "mBERT_pytorch_trainer",
    "dev_accuracy": float(dev_acc), "dev_weighted_f1": float(dev_wf1),
    "test_accuracy": float(test_acc), "test_weighted_f1": float(test_wf1)
}])
metrics_df.to_csv("mbert_pytorch_metrics.csv", index=False)
print("Saved mbert_pytorch_metrics.csv")

Dev accuracy: 0.4656, Dev weighted F1: 0.2958

Dev classification report:
                precision    recall  f1-score   support

Mixed_feelings     0.0000    0.0000    0.0000        65
      Negative     0.0000    0.0000    0.0000        55
      Positive     0.4656    1.0000    0.6354       176
     not-Tamil     0.0000    0.0000    0.0000         9
 unknown_state     0.0000    0.0000    0.0000        73

      accuracy                         0.4656       378
     macro avg     0.0931    0.2000    0.1271       378
  weighted avg     0.2168    0.4656    0.2958       378


Dev confusion matrix:
[[  0   0  65   0   0]
 [  0   0  55   0   0]
 [  0   0 176   0   0]
 [  0   0   9   0   0]
 [  0   0  73   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Test accuracy: 0.5029, Test weighted F1: 0.3366

Test classification report:
                precision    recall  f1-score   support

Mixed_feelings     0.0000    0.0000    0.0000        58
      Negative     0.0000    0.0000    0.0000        49
      Positive     0.5029    1.0000    0.6693       171
     not-Tamil     0.0000    0.0000    0.0000        10
 unknown_state     0.0000    0.0000    0.0000        52

      accuracy                         0.5029       340
     macro avg     0.1006    0.2000    0.1339       340
  weighted avg     0.2529    0.5029    0.3366       340


Test confusion matrix:
[[  0   0  58   0   0]
 [  0   0  49   0   0]
 [  0   0 171   0   0]
 [  0   0  10   0   0]
 [  0   0  52   0   0]]
Saved mbert_pytorch_metrics.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Multi-model finetune loop: runs mBERT and MuRIL sequentially and saves dev/test metrics
# Paste & run after you've prepared train_df/dev_df/test_df and label encoder (le or label_list)

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch, numpy as np, pandas as pd
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import gc, os

# change/add models here
MODEL_LIST = [
    "bert-base-multilingual-cased",   # mBERT (already tested)
    "google/muril-base-cased"         # MuRIL (Google)
]

# GPU-aware defaults
use_gpu = torch.cuda.is_available()
print("GPU available:", use_gpu, "device:", torch.cuda.get_device_name(0) if use_gpu else "CPU")

# training defaults (tweak per-model if OOM)
DEFAULT_EPOCHS = 3
DEFAULT_BATCH = 16 if use_gpu else 4
MAX_LEN = 128
OUTPUT_DIR_BASE = "finetune_results"

# If you have a LabelEncoder 'le' or label_list from earlier, get labels
if 'le' in globals():
    label_list = list(le.classes_)
else:
    # infer from train_df
    label_list = sorted(train_df['label'].unique().tolist())
num_labels = len(label_list)
print("Labels:", label_list, "num_labels:", num_labels)

# helper metric for Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, preds), "weighted_f1": f1_score(labels, preds, average="weighted")}

all_metrics = []

for MODEL_NAME in MODEL_LIST:
    print("\n" + "="*80)
    print("RUNNING MODEL:", MODEL_NAME)
    model_output_dir = os.path.join(OUTPUT_DIR_BASE, MODEL_NAME.replace("/", "_"))
    os.makedirs(model_output_dir, exist_ok=True)

    # 1) tokenizer & tokenization (re-tokenize for each model)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

    def tokenize_texts(texts):
        return tokenizer(texts, truncation=True, padding="max_length", max_length=MAX_LEN)

    # Convert pandas -> datasets.Dataset and tokenize
    def make_hf_dataset(df, labels):
        ds = Dataset.from_pandas(pd.DataFrame({"text": df["text"].astype(str).tolist(), "label": labels}))
        ds = ds.map(lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LEN), batched=True)
        # keep only columns needed
        keep_cols = ["input_ids", "attention_mask", "label"]
        if "token_type_ids" in ds.column_names:
            keep_cols.append("token_type_ids")
        ds = ds.remove_columns([c for c in ds.column_names if c not in keep_cols]).with_format("torch")
        return ds

    # map labels using same label encoder (le) or recompute to integers in same order as label_list
    # ensure labels are ints matching label_list order
    label_to_id = {lab: i for i, lab in enumerate(label_list)}
    y_train = [label_to_id[x] for x in train_df['label'].astype(str).tolist()]
    y_dev   = [label_to_id[x] for x in dev_df['label'].astype(str).tolist()]
    y_test  = [label_to_id[x] for x in test_df['label'].astype(str).tolist()]

    train_hf = make_hf_dataset(train_df, y_train)
    dev_hf   = make_hf_dataset(dev_df, y_dev)
    test_hf  = make_hf_dataset(test_df, y_test)

    # 2) model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

    # 3) training arguments (GPU-aware)
    per_device_train_batch_size = DEFAULT_BATCH
    per_device_eval_batch_size = DEFAULT_BATCH * 2
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=DEFAULT_EPOCHS,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=200,
        load_best_model_at_end=True,
        metric_for_best_model="weighted_f1",
        greater_is_better=True,
        fp16=True if use_gpu else False,
        report_to="none"   # disable wandb prompt
    )

    # 4) Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_hf,
        eval_dataset=dev_hf,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    # 5) train
    print("Starting training for", MODEL_NAME)
    train_result = trainer.train()
    trainer.save_model(model_output_dir)

    # 6) eval: dev & test
    print("Evaluating on dev...")
    dev_eval = trainer.predict(dev_hf)
    dev_logits, dev_labels, _ = dev_eval
    dev_preds = np.argmax(dev_logits, axis=1)
    dev_acc = accuracy_score(dev_labels, dev_preds)
    dev_wf1 = f1_score(dev_labels, dev_preds, average="weighted")
    print(f"Dev -> acc: {dev_acc:.4f}, weighted_f1: {dev_wf1:.4f}")
    print(classification_report(dev_labels, dev_preds, target_names=label_list, digits=4))

    print("Evaluating on test...")
    test_eval = trainer.predict(test_hf)
    test_logits, test_labels, _ = test_eval
    test_preds = np.argmax(test_logits, axis=1)
    test_acc = accuracy_score(test_labels, test_preds)
    test_wf1 = f1_score(test_labels, test_preds, average="weighted")
    print(f"Test -> acc: {test_acc:.4f}, weighted_f1: {test_wf1:.4f}")
    print(classification_report(test_labels, test_preds, target_names=label_list, digits=4))

    # 7) collect & save metrics
    metrics_row = {
        "model": MODEL_NAME,
        "dev_accuracy": float(dev_acc), "dev_weighted_f1": float(dev_wf1),
        "test_accuracy": float(test_acc), "test_weighted_f1": float(test_wf1)
    }
    all_metrics.append(metrics_row)
    pd.DataFrame(all_metrics).to_csv("all_finetune_metrics.csv", index=False)
    print("Saved metrics to all_finetune_metrics.csv")

    # cleanup to reduce GPU memory pressure between runs
    del model, trainer, train_hf, dev_hf, test_hf
    torch.cuda.empty_cache()
    gc.collect()

print("\nAll done. Summary:")
display(pd.DataFrame(all_metrics))

GPU available: True device: Tesla T4
Labels: [np.str_('Mixed_feelings'), np.str_('Negative'), np.str_('Positive'), np.str_('not-Tamil'), np.str_('unknown_state')] num_labels: 5

RUNNING MODEL: bert-base-multilingual-cased


Map:   0%|          | 0/35219 [00:00<?, ? examples/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training for bert-base-multilingual-cased


Epoch,Training Loss,Validation Loss,Accuracy,Weighted F1
1,1.0705,1.237322,0.492063,0.425684
2,0.9546,1.254484,0.494709,0.421344
3,0.8659,1.259369,0.502646,0.46603


Evaluating on dev...


Dev -> acc: 0.5026, weighted_f1: 0.4660
                precision    recall  f1-score   support

Mixed_feelings     0.3462    0.1385    0.1978        65
      Negative     0.3846    0.3636    0.3738        55
      Positive     0.5949    0.8011    0.6828       176
     not-Tamil     0.3750    0.3333    0.3529         9
 unknown_state     0.3091    0.2329    0.2656        73

      accuracy                         0.5026       378
     macro avg     0.4020    0.3739    0.3746       378
  weighted avg     0.4611    0.5026    0.4660       378

Evaluating on test...


Test -> acc: 0.5500, weighted_f1: 0.5153
                precision    recall  f1-score   support

Mixed_feelings     0.3214    0.1552    0.2093        58
      Negative     0.4500    0.3673    0.4045        49
      Positive     0.6311    0.8304    0.7172       171
     not-Tamil     0.6000    0.3000    0.4000        10
 unknown_state     0.3571    0.2885    0.3191        52

      accuracy                         0.5500       340
     macro avg     0.4719    0.3883    0.4100       340
  weighted avg     0.5094    0.5500    0.5153       340

Saved metrics to all_finetune_metrics.csv

RUNNING MODEL: google/muril-base-cased


tokenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

Map:   0%|          | 0/35219 [00:00<?, ? examples/s]

Map:   0%|          | 0/378 [00:00<?, ? examples/s]

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/953M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/953M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training for google/muril-base-cased


Epoch,Training Loss,Validation Loss,Accuracy,Weighted F1
1,1.2661,1.373186,0.465608,0.295838
2,1.2646,1.396736,0.465608,0.295838
3,1.2484,1.404447,0.465608,0.295838


Evaluating on dev...


Dev -> acc: 0.4656, weighted_f1: 0.2958
                precision    recall  f1-score   support

Mixed_feelings     0.0000    0.0000    0.0000        65
      Negative     0.0000    0.0000    0.0000        55
      Positive     0.4656    1.0000    0.6354       176
     not-Tamil     0.0000    0.0000    0.0000         9
 unknown_state     0.0000    0.0000    0.0000        73

      accuracy                         0.4656       378
     macro avg     0.0931    0.2000    0.1271       378
  weighted avg     0.2168    0.4656    0.2958       378

Evaluating on test...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test -> acc: 0.5029, weighted_f1: 0.3366
                precision    recall  f1-score   support

Mixed_feelings     0.0000    0.0000    0.0000        58
      Negative     0.0000    0.0000    0.0000        49
      Positive     0.5029    1.0000    0.6693       171
     not-Tamil     0.0000    0.0000    0.0000        10
 unknown_state     0.0000    0.0000    0.0000        52

      accuracy                         0.5029       340
     macro avg     0.1006    0.2000    0.1339       340
  weighted avg     0.2529    0.5029    0.3366       340

Saved metrics to all_finetune_metrics.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



All done. Summary:


Unnamed: 0,model,dev_accuracy,dev_weighted_f1,test_accuracy,test_weighted_f1
0,bert-base-multilingual-cased,0.502646,0.46603,0.55,0.51527
1,google/muril-base-cased,0.465608,0.295838,0.502941,0.336606
