In [None]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Upgrade transformers to the newest release. The version is not pinned, so the
# installed version can differ from one run to the next.
# NOTE(review): this cell sits after the imports cell; an upgrade performed after
# transformers has been imported may need a kernel restart to take effect — confirm.
!pip install --upgrade transformers




## Load dataset

In [None]:
# 1. Load dataset from the CSV file at the path below.
#    The loaded columns are 'Sentences' and 'class' (see the dataset summary in the next cell);
#    split='train' yields a single Dataset object.
dataset = load_dataset('csv', data_files='/content/drive/MyDrive/kaushi/Sinhala_mithuru/Function1/dataset/sentences/Grade 3 & 4 Dataset  - Grade 3 & 4 (1).csv', split='train')

In [None]:
# Display the dataset summary (column names and number of rows).
dataset

Dataset({
    features: ['Sentences', 'class'],
    num_rows: 2064
})

In [None]:
# 2. Hold out 10% of the rows for validation; the fixed seed makes the split repeatable.
split = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, val_dataset = split['train'], split['test']

## Load model

In [None]:
# Authenticate with the Hugging Face Hub without embedding the token in the notebook.
# The token is taken from the HF_TOKEN environment variable; when that is not set it is
# requested through a hidden prompt, so it never ends up in the saved file or its outputs.
import os
from getpass import getpass

from huggingface_hub import login

login(os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [None]:
# Interactive login: renders a Hugging Face login widget in the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# 3. Initialize tokenizer and model
model_name = "xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Size the classification head by the number of distinct values in the 'class' column.
# NOTE(review): the label-conversion cell maps 'class' to binary labels
# ("incorrect" -> 0, anything else -> 1), so this count only matches the labels when
# 'class' holds exactly two distinct values — confirm against the data.
num_labels = len(set(train_dataset['class']))
# The classification head is newly initialized (see the warning printed below), so the
# model has to be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# (Optionally adjust dropout: model.config.hidden_dropout_prob = 0.1)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize

In [None]:
def tokenize_fn(batch, max_length=20):
    """Tokenize the "Sentences" column of a batch for sequence classification.

    Args:
        batch: Mapping with a "Sentences" key holding a list of values. Each value
            is passed through ``str()`` first, so non-string cells (numbers,
            missing values) do not break the tokenizer call.
        max_length: Number of token positions each example is padded or truncated
            to, special tokens included. The default of 20 matches the limit used
            by the inference cell. The previous hard-coded value of 5 left room for
            only three sub-word tokens between the start and end markers, so
            sentences were cut off before the model could see them in full.

    Returns:
        The output of the notebook-level ``tokenizer`` for the whole batch.
    """
    texts = [str(t) for t in batch["Sentences"]]   # convert everything to string
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)


In [None]:
# Tokenize both splits batch-wise; the training split is processed first, then validation.
train_dataset, val_dataset = (
    part.map(tokenize_fn, batched=True) for part in (train_dataset, val_dataset)
)


Map:   0%|          | 0/207 [00:00<?, ? examples/s]

In [None]:
def convert_labels(batch):
    """Add an integer "labels" list derived from the string "class" list.

    The value "incorrect" becomes 0; every other value becomes 1. The batch is
    modified in place and the same object is returned.
    """
    encoded = []
    for value in batch["class"]:
        if value == "incorrect":
            encoded.append(0)
        else:
            encoded.append(1)
    batch["labels"] = encoded
    return batch

# Attach the integer "labels" column to both splits (training first, then validation).
train_dataset, val_dataset = (
    part.map(convert_labels, batched=True) for part in (train_dataset, val_dataset)
)

Map:   0%|          | 0/207 [00:00<?, ? examples/s]

In [None]:
# Return the three listed columns as PyTorch tensors when the datasets are accessed.
# Both splits get the same format so training and evaluation receive the same fields.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [None]:
# Display the validation split summary to confirm the added columns and the row count.
val_dataset

Dataset({
    features: ['Sentences', 'class', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 207
})

In [None]:
# Inspect the tokenized input ids of the validation split.
val_dataset['input_ids']

Column([tensor([     0, 126754,  45561,  64994,      2]), tensor([     0, 173800, 151007, 215708,      2]), tensor([     0, 169727,   2148, 242752,      2]), tensor([     0,  14451, 225682, 122404,      2]), tensor([     0, 182886,   2148, 119990,      2])])

## Training model

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: A pair ``(logits, labels)``; it is unpacked directly. ``logits`` is
            reduced with an argmax over its last axis to obtain predicted class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)
    # zero_division=0 scores a class that is never predicted as 0 explicitly, instead
    # of relying on the default that yields the same value but emits a warning on
    # every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Report which transformers installation is active (module path, then version).
import transformers
print(transformers.__file__, transformers.__version__, sep="\n")



/usr/local/lib/python3.12/dist-packages/transformers/__init__.py
4.57.1


In [None]:
# 6. Set up training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # NOTE(review): the checkpoint name used in the inference section (checkpoint-1165)
    # suggests about 1165 optimizer steps in total, so 500 warmup steps is a large
    # share of training — confirm this is intended.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    do_eval=True,
    # Periodic evaluation only runs when an evaluation strategy is selected;
    # `do_eval=True` and `eval_steps` on their own leave it switched off.
    eval_strategy="steps",
    eval_steps=200,
    metric_for_best_model="accuracy",
    save_total_limit=2
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Per-epoch configuration: evaluate and checkpoint once per epoch and reload the
# checkpoint with the best validation accuracy when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",         # current name of the option; `evaluation_strategy` is rejected with a TypeError
    save_strategy="epoch",         # saves every epoch (matches evaluation)
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    logging_steps=50,
    save_total_limit=2
)

TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# 7. Initialize the Trainer.
# NOTE: no EarlyStoppingCallback is attached here, so training always runs for the
# number of epochs configured in `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` is the current name for the deprecated `tokenizer` argument;
    # passing the tokenizer here keeps it saved alongside each checkpoint.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
import warnings
# Suppress SyntaxWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=SyntaxWarning)

import os
# Switch off the Weights & Biases integration through an environment variable.
# NOTE(review): transformers reports this variable as deprecated (removal in v5) and
# points to `report_to` instead. This cell also sits after the TrainingArguments and
# Trainer cells — verify it runs early enough to take effect.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# 8. Train and evaluate
# Fine-tune the model, then run one evaluation pass on the Trainer's evaluation
# dataset and print the metric dictionary it returns.
trainer.train()
results = trainer.evaluate()
print("Validation metrics:", results)

Step,Training Loss
10,0.9727
20,0.9053
30,0.9173
40,0.8698
50,0.8417
60,0.7911
70,0.7439
80,0.7683
90,0.7301
100,0.7471


Validation metrics: {'eval_loss': 0.6980394721031189, 'eval_accuracy': 0.4492753623188406, 'eval_precision': 0.20184835118672548, 'eval_recall': 0.4492753623188406, 'eval_f1': 0.27855072463768116, 'eval_runtime': 0.602, 'eval_samples_per_second': 343.843, 'eval_steps_per_second': 43.188, 'epoch': 5.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# 1. Load fine-tuned model and tokenizer
# NOTE(review): the checkpoint directory is hard-coded; its step suffix depends on the
# training run, so the path has to be updated whenever the training setup changes.
# This cell rebinds the notebook-level `tokenizer` and `model` names to the loaded objects.
model_name_or_path = "/content/results/checkpoint-1165"  # path to your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
# Switch to evaluation mode; as the last expression of the cell this also displays the module.
model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, ou

In [None]:
# Uses the `model` and `tokenizer` already loaded in the notebook; evaluation mode is
# set in the cell that loads the model, not here.

# 2. Prepare test sentences (new Sinhala sentences)
test_sentences = [
    "කියනවා බොරු එයා",
    "ගැටලුවක් නැහැ",
    "කරන්න ගැටලුවක් සද්ද"
]

# 3. Tokenize the sentences
# Sentences are padded to a common length within this list and capped at 20 tokens;
# the result is returned as PyTorch tensors.
inputs = tokenizer(
    test_sentences,
    padding=True,
    truncation=True,
    max_length=20,
    return_tensors="pt"
)

# 4. Get predictions
# Gradients are not needed for inference, so they are disabled for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # The predicted class is the index of the largest logit in each row.
    predicted_classes = torch.argmax(logits, dim=1).cpu().numpy()

# 5. Print predictions
# The printed value is the class index, not a label name.
for sentence, pred in zip(test_sentences, predicted_classes):
    print(f"Sentence: {sentence} -> Predicted class: {pred}")


Sentence: කියනවා බොරු එයා -> Predicted class: 1
Sentence: ගැටලුවක් නැහැ -> Predicted class: 1
Sentence: කරන්න ගැටලුවක් සද්ද -> Predicted class: 1
