In [1]:
!pip install transformers datasets torch




In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import torch
from transformers import AutoConfig
import torch.nn as nn

In [3]:
# Define a compute_metrics function
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions for the binary task.

    Args:
        eval_pred: An object that unpacks into ``(predictions, labels)``.
            ``predictions`` must support ``.argmax(axis=1)`` (one row of
            class scores per example); ``labels`` holds the reference classes.

    Returns:
        dict: ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``,
        with the last three computed using ``average='binary'``.
    """
    scores_per_class, references = eval_pred

    # The predicted class is the index of the highest score in each row.
    predicted = scores_per_class.argmax(axis=1)

    metrics = {'accuracy': accuracy_score(references, predicted)}

    # The helper returns (precision, recall, f1, support); zip stops after the
    # three named entries, so the trailing support value is dropped.
    prf_and_support = precision_recall_fscore_support(references, predicted, average='binary')
    metrics.update(zip(('precision', 'recall', 'f1'), prf_and_support))

    return metrics

In [4]:
# Load your dataset
# The file is read as tab-separated values into a pandas DataFrame (`df`).
data_path = "/kaggle/input/orientation-tr-train/orientation-tr-train.tsv"  # Replace with your dataset file path
df = pd.read_csv(data_path, sep='\t')

# Check the distribution of labels
# `label_counts` maps each distinct value of the 'label' column to its row count.
label_counts = df['label'].value_counts()

In [5]:
# Report how many rows carry each label, both as a count and as a share of the frame
print("Class Distribution:")
for label_value, n_rows in label_counts.items():
    share_pct = (n_rows / len(df)) * 100
    print(f"Label {label_value}: {n_rows} ({share_pct:.2f}%)")


Class Distribution:
Label 1: 9390 (58.19%)
Label 0: 6748 (41.81%)


In [6]:
# Hold out 10% of the rows; stratifying on 'label' keeps the class ratio the
# same in both parts, and the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['label'],
)

# Show both parts, training part first
print(train_df, test_df, sep='\n')

# Wrap each DataFrame in a Hugging Face Dataset
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

            id                           speaker sex  \
11334  tr11334  0e89ab0a8d7fd030dcde1ac4c9bc835b   M   
15381  tr15381  8bd5bf78e67110b62f2422d286833b9f   M   
5431   tr05431  be82a4ade406ec6774a0a2e38f6957e3   M   
9486   tr09486  853677cdc5a5b426ca8a79036210baaf   F   
3549   tr03549  98fe6edd8a618c1c26a0dad468666275   M   
...        ...                               ...  ..   
7019   tr07019  9ce564673a5182dd246cd60e2fbc2aec   M   
2247   tr02247  3a4409d1f226bd84d3a26813966de314   F   
8358   tr08358  66460cda11160d4044ff006d3d6bdf49   M   
5748   tr05748  e3b89cafbddc575e6cd369a0c34b8be4   F   
9264   tr09264  64288895a2343df765f7a30b39e17070   M   

                                                    text  \
11334  Teşekkür ederim Sayın Başkan. <p> Tarım Bakanı...   
15381  Sayın Başkan, değerli milletvekilleri; sizleri...   
5431   Değerli arkadaşlar, bir örnek daha vereceğim: ...   
9486   Ama sonuçta halkımız bunun kararını verecek. G...   
3549   Sayın Komisyon Başka

In [7]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
# so the later `.to(device)` calls do not fail on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Calculate class weights
# 'balanced' mode yields one weight per class, larger for the rarer class, to
# counter the imbalance between the two labels in the 'label' column.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(df['label']),
    y=df['label']
)

# Float tensor on the same device as the model, ready to be passed as the
# `weight` argument of the loss function.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

In [8]:


# Checkpoint name shared by the classifier and its tokenizer
model_name = "xlm-roberta-base"

# Pre-trained encoder with a two-class classification head, moved to `device`
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)

# Matching tokenizer for the same checkpoint
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



In [9]:
# Replace the model's loss function with a class-weighted cross-entropy
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

# Capture the model's own forward exactly once. Without this guard, re-running
# the cell would rebind `original_forward` to the wrapper that is already
# installed, and the wrapper would then call itself until RecursionError.
if not hasattr(model, "_original_forward"):
    model._original_forward = model.forward
original_forward = model._original_forward  # Save the original forward method


def custom_forward(input_ids=None, attention_mask=None, labels=None, **kwargs):
    """Run the wrapped forward pass and apply the class-weighted loss.

    Args:
        input_ids: Forwarded unchanged to the original forward method.
        attention_mask: Forwarded unchanged to the original forward method.
        labels: Target classes. When given, the weighted loss is computed
            from the logits; when ``None``, no loss is computed.
        **kwargs: Any further keyword arguments, forwarded unchanged.

    Returns:
        ``(loss, logits)`` when ``labels`` is provided, otherwise the output
        object of the original forward method.
    """
    # Labels are withheld from the wrapped forward so it does not also compute
    # its own unweighted loss, which would be discarded anyway.
    outputs = original_forward(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=None,
        **kwargs,
    )
    logits = outputs.logits

    if labels is not None:
        loss = loss_fn(logits, labels)
        return (loss, logits)

    return outputs


model.forward = custom_forward

In [10]:
# Tokenize the datasets
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to the maximum length; no explicit
    ``max_length`` is passed, so the tokenizer's own limit applies.

    Args:
        examples: A batch mapping that contains a 'text' entry.

    Returns:
        The tokenizer's output for the batch.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply the tokenizer to both splits in batches (training split first)
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)


Map:   0%|          | 0/14524 [00:00<?, ? examples/s]

Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

In [11]:
# For each split: expose the target column as "labels" for the trainer, then
# drop the unnecessary "id" column and the English text column ("text_en").
train_dataset = (
    train_dataset
    .rename_column("label", "labels")
    .remove_columns(["id", "text_en"])
)
test_dataset = (
    test_dataset
    .rename_column("label", "labels")
    .remove_columns(["id", "text_en"])
)

# Show the resulting schema and row count of both splits
print(train_dataset, test_dataset, sep='\n')


Dataset({
    features: ['speaker', 'sex', 'text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 14524
})
Dataset({
    features: ['speaker', 'sex', 'text', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 1614
})


In [12]:
# Define training arguments
# Hyper-parameters and bookkeeping options for the fine-tuning run.
# NOTE(review): `load_best_model_at_end=True` is set without an explicit
# `metric_for_best_model`; the final evaluation in this notebook matches the
# epoch with the lowest validation loss, so "best" presumably means lowest
# eval loss rather than highest F1 — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./orientation-tr-results-2",  # Where checkpoints and the final model are written
    evaluation_strategy="epoch",  # Evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",  # Save model after each epoch
    logging_dir='./logs',  # Logging directory
    logging_steps=50,  # Log every 50 steps
    load_best_model_at_end=True,  # Reload the best checkpoint once training finishes
    report_to="none"  # Do not report to any external tracking integration
)



In [13]:

# Assemble the Trainer from the patched model, the training arguments, the
# tokenized splits and the metric function defined earlier in the notebook.
# The held-out split (`test_dataset`) doubles as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Include metric computation
)

In [14]:
# Train the model
# The run is governed by the `training_args` the trainer was built with.
trainer.train()

# Save the model after training
# No path is passed here — presumably the trainer's configured output
# directory is used; confirm.
trainer.save_model()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3903,0.373967,0.847584,0.842053,0.908413,0.873975
2,0.2523,0.32348,0.878563,0.914158,0.873269,0.893246
3,0.1854,0.361787,0.885378,0.922646,0.876464,0.898962


In [15]:
# Evaluate the model
# Called without arguments — presumably this scores the `eval_dataset` the
# trainer was constructed with; confirm. The returned metrics dict is printed.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.32347989082336426, 'eval_accuracy': 0.8785625774473358, 'eval_precision': 0.9141583054626533, 'eval_recall': 0.873269435569755, 'eval_f1': 0.8932461873638344, 'eval_runtime': 29.1175, 'eval_samples_per_second': 55.431, 'eval_steps_per_second': 0.893, 'epoch': 3.0}
