In [1]:
from google.colab import files
# Prompt once per dataset split and merge the results. Reassigning `uploaded`
# on every call would keep only the file(s) from the final prompt.
# NOTE(review): assumes files.upload() returns a {filename: bytes} mapping, as
# described in the Colab docs - confirm.
uploaded = {}
for split_name in ("train", "test", "dev"):
    print(f"Select the {split_name} CSV:")
    uploaded.update(files.upload())

Saving tamil_sentiment_full_train.csv to tamil_sentiment_full_train.csv


Saving tamil_sentiment_full_test.csv to tamil_sentiment_full_test.csv


Saving tamil_sentiment_full_dev.csv to tamil_sentiment_full_dev.csv


In [2]:
!pip install -q -U transformers datasets accelerate evaluate safetensors

import os, random, time, gc
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset

# Report whether a CUDA device is visible to torch and, if so, its name.
print(
    "PyTorch available:",
    torch.cuda.is_available(),
    "GPU:",
    "None" if not torch.cuda.is_available() else torch.cuda.get_device_name(0),
)


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m117.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hPyTorch available: True GPU: Tesla T4


In [13]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Turn raw model outputs into accuracy and weighted F1.

    Args:
        eval_pred: A two-item iterable ``(logits, labels)``. The arg-max is
            taken over axis 1 of ``logits``, so it is presumably shaped
            (n_examples, n_classes) - confirm against the caller.

    Returns:
        dict: ``{"accuracy": ..., "weighted_f1": ...}``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(gold, predicted)
    metrics["weighted_f1"] = f1_score(gold, predicted, average="weighted")
    return metrics

In [14]:
# Robust loader for the Tamil sentiment files ("text<delimiter>label" per line)
def load_split(path, delimiters=("\t", ";")):
    """Read one dataset split into a two-column DataFrame.

    Each non-blank line is split once, from the right, on the first delimiter
    in ``delimiters`` that occurs in the line; the left part becomes ``text``
    and the right part ``label`` (both stripped of surrounding whitespace).

    Args:
        path: Path of the file to read.
        delimiters: Candidate separators, tried in order. The default keeps
            the original behaviour: tab first, then semicolon as a fallback.

    Returns:
        pd.DataFrame with columns ``["text", "label"]``. It is empty when the
        file does not exist or contains no parsable line.

    Side effects:
        Prints a warning when the file is missing, and another one when lines
        were dropped because none of the delimiters occurred in them.
    """
    rows = []
    skipped = 0
    try:
        # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
        # str.strip() would otherwise leave glued to the first text.
        with open(path, encoding="utf-8-sig") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                for delim in delimiters:
                    if delim in line:
                        # Split on the LAST occurrence: the text itself may
                        # contain the delimiter, the label is the final field.
                        text, label = line.rsplit(delim, 1)
                        rows.append((text.strip(), label.strip()))
                        break
                else:
                    # No delimiter matched: count the line instead of
                    # discarding it without a trace.
                    skipped += 1
    except FileNotFoundError:
        print(f"Warning: File not found at {path}. Returning empty DataFrame.")
        return pd.DataFrame(columns=["text", "label"])
    if skipped:
        print(f"Warning: skipped {skipped} line(s) in {path} with none of the delimiters {list(delimiters)!r}.")
    return pd.DataFrame(rows, columns=["text","label"])

def _drop_unlabeled(df):
    """Return ``df`` without rows whose label is blank, re-indexed from 0."""
    return df[df["label"].str.strip() != ""].reset_index(drop=True)


# 1. Load Data
train_df = load_split("tamil_sentiment_full_train.csv")
dev_df   = load_split("tamil_sentiment_full_dev.csv")
test_df  = load_split("tamil_sentiment_full_test.csv")

# 2. Clean Data (remove rows with empty labels)
train_df = _drop_unlabeled(train_df)
dev_df   = _drop_unlabeled(dev_df)
test_df  = _drop_unlabeled(test_df)

print("Data shapes after cleaning:")
print(f"Train: {train_df.shape}")
print(f"Dev:   {dev_df.shape}")
print(f"Test:  {test_df.shape}")

# Fail fast: nothing can be trained without training rows, and the label
# encoder below would otherwise be fitted on zero classes.
if train_df.empty:
    raise ValueError("Training split is empty - check the file name and its delimiter.")

# An empty dev/test split is allowed, but make it loud: later steps only run
# evaluation for splits that actually contain rows.
for split_name, split_df in (("Dev", dev_df), ("Test", test_df)):
    if split_df.empty:
        print(f"Warning: {split_name} split is empty - it will not be evaluated. "
              "Check the file name and its delimiter.")

# 3. Label Encoding
# The encoder is fitted on the training labels only; a dev/test label that
# never occurs in train makes le.transform raise.
le = LabelEncoder()
train_df["label_enc"] = le.fit_transform(train_df["label"])

# Only transform dev/test if they are not empty
if not dev_df.empty:
    dev_df["label_enc"] = le.transform(dev_df["label"])
if not test_df.empty:
    test_df["label_enc"] = le.transform(test_df["label"])

label_list = list(le.classes_)
num_labels = len(label_list)
print(f"\nLabels: {label_list}, num_labels: {num_labels}")
print("\nSample processed train data:\n", train_df.head())

Data shapes after cleaning:
Train: (35220, 2)
Dev:   (0, 2)
Test:  (0, 2)

Labels: ['Mixed_feelings', 'Negative', 'Positive', 'not-Tamil', 'unknown_state'], num_labels: 5

Sample processed train data:
                                                 text          label  label_enc
0              First like button vijay setupati fans  unknown_state          4
1         Vetri ne dhanusha pudiche thongitu iru....       Positive          2
2  Ithu romba naal ku munnadi Short film'a pathat...       Positive          2
3               Trending no1 in srilanka.... june 16       Positive          2
4                      Maja thala marana  mass thala       Positive          2


In [15]:
# Checkpoint identifier used for both the tokenizer here and the model later on.
MODEL_NAME = "xlm-roberta-base"
# Token length that make_hf_dataset truncates and pads every example to.
MAX_LEN = 128

# Tokenizer matching MODEL_NAME; use_fast=True requests the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def make_hf_dataset(df):
    """Convert a labelled DataFrame into a tokenized, torch-formatted dataset.

    Args:
        df: DataFrame expected to carry a ``text`` column and an integer
            ``label_enc`` column.

    Returns:
        A ``Dataset`` holding only ``input_ids``, ``attention_mask`` and
        ``label``, or ``None`` when ``df`` is empty or lacks a required column.
    """
    required = ("text", "label_enc")
    if df.empty or any(col not in df.columns for col in required):
        return None

    def _encode(batch):
        # Every example is truncated/padded to exactly MAX_LEN tokens.
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=MAX_LEN)

    frame = df[["text", "label_enc"]].rename(columns={"label_enc": "label"})
    ds = Dataset.from_pandas(frame).map(_encode, batched=True)

    # Drop everything except the three columns kept for training
    # (RoBERTa models don't use token_type_ids).
    wanted = {"input_ids", "attention_mask", "label"}
    extras = [name for name in ds.column_names if name not in wanted]
    ds = ds.remove_columns(extras)
    ds.set_format("torch")
    return ds

# Tokenize the three splits in train -> dev -> test order; a split that
# make_hf_dataset rejects comes back as None.
train_ds, dev_ds, test_ds = (
    make_hf_dataset(split_df) for split_df in (train_df, dev_df, test_df)
)

# A missing (None) or empty dataset is reported as size 0.
print("Tokenized train dataset size: {}".format(len(train_ds) if train_ds else 0))
print("Tokenized dev dataset size:   {}".format(len(dev_ds) if dev_ds else 0))
print("Tokenized test dataset size:  {}".format(len(test_ds) if test_ds else 0))

Map:   0%|          | 0/35220 [00:00<?, ? examples/s]

Tokenized train dataset size: 35220
Tokenized dev dataset size:   0
Tokenized test dataset size:  0


In [16]:
def compute_metrics(eval_pred):
    """Metric callback handed to the ``Trainer``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max over axis 1 of the logits. Returns a dict with the plain accuracy
    and the support-weighted F1 under the keys ``accuracy`` and ``weighted_f1``.
    """
    raw_outputs, references = eval_pred
    hard_preds = np.argmax(raw_outputs, axis=1)
    acc = accuracy_score(references, hard_preds)
    f1_weighted = f1_score(references, hard_preds, average="weighted")
    return dict(accuracy=acc, weighted_f1=f1_weighted)

# Check if an evaluation set is available
use_evaluation = dev_ds is not None

training_args = TrainingArguments(
    output_dir="./xlmr_results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=1e-5,
    warmup_ratio=0.1,
    # Dynamically set evaluation-dependent arguments
    eval_strategy="epoch" if use_evaluation else "no",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=200,
    load_best_model_at_end=use_evaluation,
    # Corrected metric name (Trainer adds 'eval_')
    metric_for_best_model="weighted_f1" if use_evaluation else None,
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Seed every RNG BEFORE the model is built: the classification head is newly
# initialised by from_pretrained, so without this its starting weights (and
# therefore the whole run) differ between executions.
random.seed(training_args.seed)
np.random.seed(training_args.seed)
torch.manual_seed(training_args.seed)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds, # Trainer handles None for eval_dataset
    # `tokenizer=` is deprecated for Trainer (it triggers a FutureWarning and
    # is scheduled for removal); `processing_class=` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Only use early stopping if there is a validation set
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)] if use_evaluation else []
)

print("Trainer ready.")
if not use_evaluation:
    print("Warning: No evaluation dataset found. `load_best_model_at_end` and `eval_strategy` are disabled.")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Trainer ready.


In [17]:
def _report_split(split_name, dataset):
    """Predict on ``dataset`` and print a per-class classification report.

    Args:
        split_name: Human-readable split name used in the printed headers.
        dataset: Tokenized dataset to run ``trainer.predict`` on.

    Returns:
        tuple: ``(prediction_output, predicted_class_ids)``.
    """
    print(f"\n--- Evaluating on {split_name} Set ---")
    output = trainer.predict(dataset)
    preds = np.argmax(output.predictions, axis=1)

    print(f"\n{split_name} Classification Report:")
    print(classification_report(
        output.label_ids,
        preds,
        # Pin the label ids explicitly: with target_names alone the report
        # raises when a split happens to lack one of the classes.
        labels=list(range(len(label_list))),
        target_names=label_list,
        digits=4,
        # Classes that are never predicted score 0 without a warning.
        zero_division=0,
    ))
    return output, preds


# Train the model
train_result = trainer.train()
trainer.save_model("./xlmr_final_model")

# Evaluate on the Dev set, if available
if dev_ds:
    dev_preds_output, dev_preds = _report_split("Dev", dev_ds)
else:
    print("\n--- No Dev Set to Evaluate ---")

# Evaluate on the Test set, if available
if test_ds:
    test_preds_output, test_preds = _report_split("Test", test_ds)
else:
    print("\n--- No Test Set to Evaluate ---")

Step,Training Loss
200,1.5398
400,1.2859
600,1.2387
800,1.2096
1000,1.1411
1200,1.1169
1400,1.0973
1600,1.0617
1800,1.0511
2000,1.0375



--- No Dev Set to Evaluate ---

--- No Test Set to Evaluate ---
