In [None]:
import torch
# Report whether PyTorch can see a CUDA GPU, how many devices it sees, and
# the name of device 0 (or "No GPU" when CUDA is unavailable).
cuda_ok = torch.cuda.is_available()
print("CUDA Available:", cuda_ok)
print("CUDA Device Count:", torch.cuda.device_count())
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "No GPU"
print("Current Device:", gpu_name)

CUDA Available: True
CUDA Device Count: 1
Current Device: Tesla T4


In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Use a pipeline as a high-level helper
# NOTE(review): `pipe` is not referenced again in the visible part of this
# notebook — confirm whether this cell is still needed.
from transformers import pipeline

pipe = pipeline(
    task="fill-mask",
    model="FacebookAI/xlm-roberta-base",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForS

In [None]:
# Load model directly
# Loads the tokenizer and the masked-language-model head from the same
# checkpoint. Both names are reassigned by later cells (`tokenizer` in the
# tokenization cell, `model` in the training cell).
from transformers import AutoTokenizer, AutoModelForMaskedLM

checkpoint = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

Some weights of the model checkpoint at FacebookAI/xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import numpy as np
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight
import torch

In [None]:
# Load dataset
df = pd.read_csv('/content/final_dataset.csv')  # Update path if needed

# Fail fast with a clear message if the columns the following cells rely on
# ('tweet' for the text, 'category' for the label) are absent, instead of
# surfacing a bare KeyError several cells later.
missing_columns = {'tweet', 'category'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s): {sorted(missing_columns)}"
    )

In [None]:
# Text cleaning function
def clean_text(text):
    """Reduce a tweet to whitespace-separated Arabic-script tokens.

    Steps:
      1. Coerce the input to ``str`` (so NaN/None do not raise).
      2. Replace every character outside the Unicode Arabic block
         (U+0600–U+06FF) and outside whitespace with a single space.
         A space is used rather than deletion so that words separated
         only by punctuation, Latin text, or emoji are not glued together.
         Note that the Arabic block also contains Arabic punctuation,
         digits and diacritics, which are therefore kept.
      3. Collapse runs of whitespace into one space.
      4. Normalize the hamza/madda forms of alef (أ, إ, آ) to bare alef (ا).
      5. Strip leading/trailing whitespace.

    Args:
        text: Raw tweet; any object accepted by ``str()``.

    Returns:
        The cleaned string; empty if no Arabic-block characters remain.
    """
    text = re.sub(r'[^\u0600-\u06FF\s]', ' ', str(text))  # Non-Arabic-block chars become separators
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.replace('أ', 'ا').replace('إ', 'ا').replace('آ', 'ا')  # Normalize Arabic
    return text.strip()

In [None]:
# Run every tweet through clean_text.
df['tweet'] = df['tweet'].apply(clean_text)

# Clean category column: trim surrounding whitespace, then lowercase, so the
# values can match the lowercase keys of the label map defined below.
df['category'] = df['category'].str.strip().str.lower()

In [None]:
# Define label map (lowercase for consistency)
# Class ids are assigned from 0 in the order the names are listed here.
label_map = {
    name: class_id
    for class_id, name in enumerate((
        'diminished ability to think or concentrate',
        'feelings of worthlessness',
        'psychomotor agitation or retardation',
        'sleep disorder',
        'suicidality',
        'weight disorder',
    ))
}

In [None]:
# Map categories to labels
# Categories that are not keys of label_map (or are missing) map to NaN.
category_ids = df['category'].map(label_map)
df['label'] = category_ids

In [None]:
# Check for NaN labels and drop them
# A NaN label means the row's category was missing or is not a key of
# label_map; the recorded output of this cell shows, for example,
# 'losing interest or pleasure in activities' among the dropped rows.
unlabeled = df['label'].isna()
n_unlabeled = unlabeled.sum()
if n_unlabeled > 0:
    print(f"Found {n_unlabeled} rows with NaN labels. Dropping them.")
    print("Rows with NaN labels:")
    print(df.loc[unlabeled, ['tweet', 'category']])
    df = df.dropna(subset=['label'])

Found 297 rows with NaN labels. Dropping them.
Rows with NaN labels:
                                              tweet  \
369                                                   
370                                         حيل ملل   
371                                             ملل   
372                              ملل حيل نبي اي شيء   
373                           نعرف انه ملل ومستمرين   
...                                             ...   
661   اف ضايق صدري صدق ابغي ابكي لين شي يطلع وارتاح   
662                                                   
783                                                   
901                                                   
1002                                                  

                                       category  
369                                         NaN  
370   losing interest or pleasure in activities  
371   losing interest or pleasure in activities  
372   losing interest or pleasure in activities  
373   losing interes

In [None]:
# Convert labels to integers
# (mapping left the column as float because it contained NaN before the drop)
df = df.astype({'label': int})

In [None]:
# Split dataset (stratified)
# First hold out 30% of the rows, then halve that hold-out into validation
# and test, i.e. roughly 70/15/15 with class proportions preserved.
split_seed = 42
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['label'],
    random_state=split_seed,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=split_seed,
)

In [None]:
# Tokenization
tokenizer = AutoTokenizer.from_pretrained('FacebookAI/xlm-roberta-base')


def tokenize_function(examples):
    """Tokenize the 'tweet' field of a batch with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.

    Args:
        examples: Mapping with a 'tweet' entry holding the text(s) to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encode_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    return tokenizer(examples['tweet'], **encode_options)

In [None]:
# Convert to Hugging Face Dataset
# Only the text and label columns are carried over from each split.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[['tweet', 'label']])
    for frame in (train_df, val_df, test_df)
)

In [None]:
# Apply tokenization
# Each split is tokenized in batches, in the order train, validation, test.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/652 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

In [None]:
# Set format for PyTorch
# Expose only the model inputs and the label as torch tensors on every split.
torch_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format('torch', columns=torch_columns)


In [None]:
# Compute class weights
# The weights are derived from the training split only, so validation and
# test rows do not influence the training loss. The resulting tensor is
# placed on `device` (chosen earlier with a CPU fallback) rather than on a
# hard-coded 'cuda', so this cell also runs on a CPU-only runtime.
train_labels = train_df['label'].to_numpy()
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Custom Trainer
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run; called as ``model(**inputs)``.
            inputs: Batch mapping; its 'labels' entry supplies the targets.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted for signature compatibility; not
                used by this implementation.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        targets = inputs.get('labels')
        outputs = model(**inputs)
        # The weight tensor is moved to the labels' device on every call.
        weighted_loss = torch.nn.functional.cross_entropy(
            outputs.get('logits'),
            targets,
            weight=class_weights.to(targets.device),
        )
        if return_outputs:
            return weighted_loss, outputs
        return weighted_loss


In [None]:
# Load and train model
# The classifier size and the label names stored in the model config are
# derived from label_map, so they cannot drift from the labels used to build
# the dataset, and saved checkpoints report readable class names instead of
# LABEL_0..LABEL_5.
model = AutoModelForSequenceClassification.from_pretrained(
    'FacebookAI/xlm-roberta-base',
    num_labels=len(label_map),
    id2label={class_id: name for name, class_id in label_map.items()},
    label2id=dict(label_map),
)
model.to(device)
training_args = TrainingArguments(
    output_dir='/content/fine_tuned_xlmroberta',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy='epoch',  # evaluate on the validation set after every epoch
    save_strategy='epoch',  # checkpoint on the same schedule as evaluation
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir='/content/logs',
    logging_steps=10,
    load_best_model_at_end=True,  # restore the checkpoint selected by the metric below
    metric_for_best_model='eval_loss',
    report_to="none",  # Disable W&B logging
)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.6248,1.563784
2,0.9144,0.612629
3,0.6161,0.337139


TrainOutput(global_step=246, training_loss=1.1968995923918437, metrics={'train_runtime': 171.944, 'train_samples_per_second': 11.376, 'train_steps_per_second': 1.431, 'total_flos': 128665926862848.0, 'train_loss': 1.1968995923918437, 'epoch': 3.0})

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate on test set
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)  # highest-scoring class per example
true_labels = predictions.label_ids

# Pin the class order explicitly: names are sorted by their integer id and the
# same id list is passed to both metrics. This keeps report rows and matrix
# rows/columns aligned with label_map even if a class is absent from the test
# split or label_map is later reordered.
class_names = sorted(label_map, key=label_map.get)
class_ids = [label_map[name] for name in class_names]

# Classification report
print(classification_report(true_labels, preds, labels=class_ids, target_names=class_names))

# Confusion matrix (rows = true class, columns = predicted class, in class_ids order)
cm = confusion_matrix(true_labels, preds, labels=class_ids)
print("Confusion Matrix:\n", cm)

                                            precision    recall  f1-score   support

Diminished ability to think or concentrate       1.00      0.90      0.95        40
                 Feelings of worthlessness       0.93      0.87      0.90        15
      Psychomotor agitation or retardation       0.90      1.00      0.95        18
                            Sleep disorder       0.84      0.89      0.86        18
                               Suicidality       0.94      1.00      0.97        15
                           Weight disorder       0.97      1.00      0.99        34

                                  accuracy                           0.94       140
                                 macro avg       0.93      0.94      0.93       140
                              weighted avg       0.95      0.94      0.94       140

Confusion Matrix:
 [[36  0  1  2  0  1]
 [ 0 13  0  1  1  0]
 [ 0  0 18  0  0  0]
 [ 0  1  1 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  0  0  0  0 34]]
