In [None]:
!pip install -U accelerate
!pip install --upgrade accelerate


In [1]:
import transformers, accelerate
print("Transformers:", transformers.__version__)
print("Accelerate:", accelerate.__version__)


Transformers: 4.40.1
Accelerate: 0.27.2


In [2]:
import pandas as pd
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments)
from accelerate import Accelerator
help(Accelerator.__init__)

import torch




Help on function __init__ in module accelerate.accelerator:

__init__(self, device_placement: 'bool' = True, split_batches: 'bool' = False, mixed_precision: 'PrecisionType | str | None' = None, gradient_accumulation_steps: 'int' = 1, cpu: 'bool' = False, deepspeed_plugin: 'DeepSpeedPlugin | None' = None, fsdp_plugin: 'FullyShardedDataParallelPlugin | None' = None, megatron_lm_plugin: 'MegatronLMPlugin | None' = None, rng_types: 'list[str | RNGType] | None' = None, log_with: 'str | LoggerType | GeneralTracker | list[str | LoggerType | GeneralTracker] | None' = None, project_dir: 'str | os.PathLike | None' = None, project_config: 'ProjectConfiguration | None' = None, gradient_accumulation_plugin: 'GradientAccumulationPlugin | None' = None, dispatch_batches: 'bool | None' = None, even_batches: 'bool' = True, use_seedable_sampler: 'bool' = False, step_scheduler_with_optimizer: 'bool' = True, kwargs_handlers: 'list[KwargsHandler] | None' = None, dynamo_backend: 'DynamoBackend | str | None' 

In [3]:
df = pd.read_excel("urdu_annotation.xlsx")  


In [4]:
df.dropna(subset=["Sentences", "Emotion"], inplace=True)
df = df[df["Emotion"].str.lower() != "discard"] 
df["Emotion"] = df["Emotion"].str.lower().str.strip()


In [5]:
label_encoder = LabelEncoder()
df["EmotionLabel"] = label_encoder.fit_transform(df["Emotion"])


In [6]:
try:
    from emotion_dictionary import emotion_trigger_dict

    def has_trigger_word(row):
        emotion = row["Emotion"]
        sentence = row["Sentences"]
        if emotion in emotion_trigger_dict:
            return int(any(word in sentence for word in emotion_trigger_dict[emotion]))
        return 0

    df["Has_Trigger"] = df.apply(has_trigger_word, axis=1)
except:
    print("Trigger word dictionary not found or not used.")
    df["Has_Trigger"] = 0  # fallback


In [7]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=df["EmotionLabel"].nunique()
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
dataset = Dataset.from_pandas(df[["Sentences", "EmotionLabel", "Has_Trigger"]])

def tokenize(batch):
    encoding = tokenizer(
        batch["Sentences"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    encoding["labels"] = batch["EmotionLabel"]
    return encoding

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/932 [00:00<?, ? examples/s]

In [9]:
training_args = TrainingArguments(
    output_dir="./urdu-emotion-results",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=20,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)


In [10]:
from sklearn.metrics import accuracy_score, classification_report

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}



In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [12]:
print("Starting training...")
trainer.train()
print("Training completed.")

print("Saving model...")
trainer.save_model("urdu-emotion-model")
tokenizer.save_pretrained("urdu-emotion-model")
print("Model saved.")



Starting training...




Epoch,Training Loss,Validation Loss,Accuracy
1,1.7492,1.733969,0.247854
2,1.7874,1.733346,0.254292
3,1.7219,1.737916,0.247854
4,1.734,1.731156,0.247854




Training completed.
Saving model...
Model saved.


In [13]:
from sklearn.metrics import classification_report

true_labels = df["EmotionLabel"]

pred_output = trainer.predict(tokenized_dataset)
pred_labels = pred_output.predictions.argmax(axis=-1)

print(classification_report(true_labels, pred_labels, target_names=label_encoder.classes_))




              precision    recall  f1-score   support

       anger       0.40      0.03      0.05       226
        fear       0.00      0.00      0.00       125
         joy       0.00      0.00      0.00       106
        love       0.00      0.00      0.00       150
     neutral       0.25      1.00      0.40       231
         sad       0.00      0.00      0.00        94

    accuracy                           0.25       932
   macro avg       0.11      0.17      0.08       932
weighted avg       0.16      0.25      0.11       932



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch


model = AutoModelForSequenceClassification.from_pretrained("urdu-emotion-model")
tokenizer = AutoTokenizer.from_pretrained("urdu-emotion-model")

def predict_emotion(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]


predict_emotion("مجھے بہت خوشی محسوس ہو رہی ہے۔")




'neutral'