In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, EarlyStoppingCallback
from transformers import TrainingArguments, Trainer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report


In [None]:
# Labelled dev-test files, one per language (it, en, de, ar, bg).
dev_test_list = [
    "dev_test_it.tsv",
    "dev_test_en.tsv",
    "dev_test_de.tsv",
    "dev_test_ar.tsv",
    "dev_test_bg.tsv",
]

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# Tokenizer sequence length and per-device batch size used when training.
MAX_LENGTH = 128
BATCH_SIZE = 16

# Default schedule values; the visible per-language training calls pass
# their own learning rate and epoch count explicitly.
EPOCHS = 7
LEARNING_RATE = 1e-5

In [None]:
def train_monolingual_subjectivity_classifier(model_name, train_data, val_data, learning_rate, epoch):
    """Fine-tune a pretrained checkpoint as a binary OBJ/SUBJ sentence classifier.

    Args:
        model_name: Hugging Face checkpoint name or path, loaded through
            ``AutoTokenizer`` and ``AutoModelForSequenceClassification``.
        train_data: DataFrame wrapped in ``SubjectivityDataset`` for training
            (that class reads the ``sentence`` and ``label_id`` columns).
        val_data: DataFrame with the same columns; evaluated after every epoch.
        learning_rate: Learning rate handed to ``TrainingArguments``.
        epoch: Maximum number of training epochs (early stopping may end
            training sooner).

    Returns:
        Tuple ``(model, tokenizer)``. Because ``load_best_model_at_end`` is
        enabled, the Trainer restores the checkpoint with the best validation
        ``f1`` before this function returns.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = SubjectivityDataset(train_data, tokenizer, MAX_LENGTH)
    val_dataset = SubjectivityDataset(val_data, tokenizer, MAX_LENGTH)

    # ignore_mismatched_sizes lets checkpoints that ship a classification head
    # of a different size (e.g. 3-class sentiment models) load anyway; the
    # head is then newly initialised with two outputs.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        ignore_mismatched_sizes=True,
    )

    # NOTE: a class-weighted loss was sketched here earlier but was never wired
    # into the Trainer, so training uses the model's default loss.

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=epoch,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=100,
        logging_strategy="epoch",
        report_to='none'
    )

    # Stop when validation f1 has not improved by at least 0.001 for 3 evaluations.
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.001,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback],
    )

    trainer.train()

    # Report validation metrics for the model that will be returned.
    eval_result = trainer.evaluate()
    print(f"Evaluation results: {eval_result}")

    return model, tokenizer

In [None]:
def _language_from_path(file_path):
    """Return the language tag encoded in a split file name such as ``train_en.tsv``."""
    tokens = [part.split('.')[0] for part in os.path.basename(file_path).split('_')]
    # Prefer the first two-letter alphabetic token after the split prefix, so
    # names like ``dev_test_en.tsv`` yield ``en`` rather than ``test``.
    for token in tokens[1:]:
        if len(token) == 2 and token.isalpha():
            return token
    # No recognisable code: keep the historical "second token" behaviour, or
    # use a placeholder when the name has no underscore at all.
    return tokens[1] if len(tokens) > 1 else "unknown"


def load_data(file_paths):
    """Load one or more TSV files and concatenate them into a single DataFrame.

    Args:
        file_paths: Iterable of paths to tab-separated files with a header
            row. A single path given as a string is also accepted.

    Returns:
        DataFrame containing the rows of every file, re-indexed from zero,
        with an added ``language`` column derived from each file name.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``file_paths`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    frames = []
    for file_path in file_paths:
        frame = pd.read_csv(file_path, sep='\t', header=0)
        frame['language'] = _language_from_path(file_path)
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

In [None]:
# 3. Create a Dataset class
class SubjectivityDataset(Dataset):
    """Map-style dataset that tokenizes labelled sentences on access.

    Each item is a dict holding the tokenizer's tensors (with the leading
    batch dimension removed) plus a ``labels`` tensor of dtype ``torch.long``.
    The wrapped DataFrame must provide ``sentence`` and ``label_id`` columns.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it.
        row = self.data.iloc[idx]
        text, target = row['sentence'], row['label_id']

        # Pad/truncate every sentence to the same fixed length.
        tokenized = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer returns tensors shaped (1, ...); drop that batch axis.
        item = {}
        for name, tensor in tokenized.items():
            item[name] = tensor.squeeze(0)

        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item


In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the
            last axis.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    gold = pred.label_ids
    guessed = pred.predictions.argmax(-1)

    # Class-frequency-weighted averages; undefined ratios count as 0.
    weighted_precision, weighted_recall, weighted_f1 = precision_recall_fscore_support(
        gold, guessed, average='weighted', zero_division=0
    )[:3]

    return dict(
        accuracy=accuracy_score(gold, guessed),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )


In [None]:
# Create a Dataset class for test data
class TestSubjectivityDataset(Dataset):
    """Labelled evaluation dataset that also reports each row's position.

    Items contain the tokenizer's tensors (batch dimension removed), a
    ``labels`` tensor of dtype ``torch.long`` and ``sentence_idx``, the
    positional index of the row, so predictions can be matched back to the
    source DataFrame. The DataFrame must provide ``sentence`` and ``label_id``.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Fetch the row once and read both fields from it.
        row = self.data.iloc[idx]
        text, target = row['sentence'], row['label_id']

        # Pad/truncate every sentence to the same fixed length.
        tokenized = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer returns tensors shaped (1, ...); drop that batch axis.
        item = {}
        for name, tensor in tokenized.items():
            item[name] = tensor.squeeze(0)

        item['labels'] = torch.tensor(target, dtype=torch.long)
        # Positional index (not the sentence_id column) of this row.
        item['sentence_idx'] = idx
        return item

In [None]:
def evaluate_on_dev_test_set(test_file_path, model, tokenizer, filename, max_length=128, batch_size=16):
    """Evaluate ``model`` on a labelled TSV file and write prediction reports.

    The file must provide ``sentence_id``, ``sentence`` and ``label`` columns,
    where ``label`` is ``OBJ`` or ``SUBJ``. Three files are written:

    * ``filename``: the submission file (``sentence_id``, ``label``);
    * ``predictions_<input name>``: one row per sentence with scores;
    * ``errors_<input name>``: the misclassified rows (only if there are any).

    Args:
        test_file_path: Path to the labelled tab-separated file.
        model: Sequence-classification model whose output exposes ``logits``.
        tokenizer: Tokenizer matching ``model``.
        filename: Output path of the submission file.
        max_length: Token length sentences are padded/truncated to.
        batch_size: Number of sentences per inference batch.

    Returns:
        Tuple ``(results_df, metrics)`` where ``metrics`` holds ``accuracy``,
        macro-averaged ``f1``/``precision``/``recall`` and the per-class
        ``class_report`` dict.

    Raises:
        ValueError: If the ``label`` column holds anything other than
            ``OBJ``/``SUBJ`` (including missing values).
    """
    label_to_id = {'OBJ': 0, 'SUBJ': 1}
    class_ids = [0, 1]

    test_data = pd.read_csv(test_file_path, sep='\t')
    print(f"Loaded test data with {len(test_data)} examples")
    print(f"Columns: {test_data.columns.tolist()}")

    # Map labels to IDs; fail loudly instead of feeding NaN labels to torch.
    test_data['label_id'] = test_data['label'].map(label_to_id)
    unmapped = test_data['label_id'].isna()
    if unmapped.any():
        unexpected = sorted(set(test_data.loc[unmapped, 'label'].astype(str)))
        raise ValueError(f"Unexpected labels in {test_file_path}: {unexpected}")

    test_dataset = TestSubjectivityDataset(test_data, tokenizer, max_length)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    model.eval()

    # Send inputs to wherever the model's weights live rather than relying on
    # a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    all_predictions = []
    all_pred_labels = []
    all_true_labels = []
    all_indices = []

    with torch.no_grad():
        for batch in test_dataloader:
            # Row positions are bookkeeping only; they are not a model input.
            indices = batch.pop('sentence_idx')
            batch = {k: v.to(model_device) for k, v in batch.items()}

            outputs = model(**batch)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_classes = torch.argmax(probabilities, dim=1)

            # .tolist() yields plain Python numbers, so the rounding below
            # produces clean 4-decimal values instead of float32 artefacts.
            all_predictions.extend(probabilities.cpu().tolist())
            all_pred_labels.extend(pred_classes.cpu().tolist())
            all_true_labels.extend(batch['labels'].cpu().tolist())
            all_indices.extend(indices.tolist())

    pred_text_labels = ["OBJ" if p == 0 else "SUBJ" for p in all_pred_labels]
    true_text_labels = ["OBJ" if t == 0 else "SUBJ" for t in all_true_labels]

    # Select the evaluated rows in one positional lookup.
    evaluated_rows = test_data.iloc[all_indices]
    results_df = pd.DataFrame({
        'sentence_id': evaluated_rows['sentence_id'].tolist(),
        'sentence': evaluated_rows['sentence'].tolist(),
        'true_label': true_text_labels,
        'predicted_label': pred_text_labels,
        'obj_score': [round(p[0], 4) for p in all_predictions],
        'subj_score': [round(p[1], 4) for p in all_predictions],
    })

    # Macro averages over both task classes; passing the label set explicitly
    # keeps the metrics defined even when one class is absent from the data.
    accuracy = accuracy_score(all_true_labels, all_pred_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_pred_labels, average='macro', labels=class_ids, zero_division=0
    )

    # Submission file: sentence_id plus the predicted label.
    submission_df = results_df[['sentence_id', 'predicted_label']].copy()
    submission_df.columns = ['sentence_id', 'label']

    submission_output_path = filename
    submission_df.to_csv(submission_output_path, sep='\t', index=False)
    print(f"\nSubmission file saved to {submission_output_path}")

    class_report = classification_report(
        all_true_labels, all_pred_labels,
        labels=class_ids, target_names=['OBJ', 'SUBJ'],
        output_dict=True, zero_division=0,
    )

    # The aggregate scores are macro averages (see above), so label them so.
    print(f"\n===== Model Performance on {test_file_path} =====")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score (macro): {f1:.4f}")
    print(f"Precision (macro): {precision:.4f}")
    print(f"Recall (macro): {recall:.4f}\n")

    print("Class-wise Performance:")
    print(f"OBJ - Precision: {class_report['OBJ']['precision']:.4f}, "
          f"Recall: {class_report['OBJ']['recall']:.4f}, "
          f"F1: {class_report['OBJ']['f1-score']:.4f}")
    print(f"SUBJ - Precision: {class_report['SUBJ']['precision']:.4f}, "
          f"Recall: {class_report['SUBJ']['recall']:.4f}, "
          f"F1: {class_report['SUBJ']['f1-score']:.4f}")
    print(f"-----macro avg F1-------- {(class_report['OBJ']['f1-score']+class_report['SUBJ']['f1-score'])/2}" )

    # Fixed 2x2 layout regardless of which classes actually occur.
    cm = confusion_matrix(all_true_labels, all_pred_labels, labels=class_ids)
    print("\nConfusion Matrix:")
    print("              Predicted")
    print("             OBJ    SUBJ")
    print(f"Actual OBJ  {cm[0,0]:4d}   {cm[0,1]:4d}")
    print(f"      SUBJ  {cm[1,0]:4d}   {cm[1,1]:4d}")

    input_name = os.path.basename(test_file_path)
    output_path = f"predictions_{input_name}"
    results_df.to_csv(output_path, sep='\t', index=False)
    print(f"\nDetailed predictions saved to {output_path}")

    # Error analysis: keep the rows where the prediction disagrees with gold.
    errors_df = results_df[results_df['true_label'] != results_df['predicted_label']]
    if not errors_df.empty:
        error_output_path = f"errors_{input_name}"
        errors_df.to_csv(error_output_path, sep='\t', index=False)
        print(f"Examples of misclassifications saved to {error_output_path}")

        # Show up to five randomly sampled mistakes.
        print("\nExamples of misclassifications:")
        sample_errors = errors_df.sample(min(5, len(errors_df)))
        for _, row in sample_errors.iterrows():
            print(f"Sentence ID: {row['sentence_id']}")
            print(f"Sentence: {row['sentence']}")
            print(f"True: {row['true_label']}, Predicted: {row['predicted_label']}")
            print(f"Confidence scores - OBJ: {row['obj_score']}, SUBJ: {row['subj_score']}")
            print("")

    return results_df, {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'class_report': class_report
    }

In [None]:
class _UnlabeledSubjectivityDataset(torch.utils.data.Dataset):
    """Tokenized view of an unlabelled DataFrame that also yields each row's position."""

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.data.iloc[idx]['sentence'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns tensors shaped (1, ...); drop that batch axis.
        item = {k: v.squeeze(0) for k, v in encoding.items()}

        # Positional index, used to match predictions back to the source rows.
        item['sentence_idx'] = torch.tensor(idx)
        return item


def evaluate_on_test_set(test_file_path, model, tokenizer, filename, max_length=128, batch_size=16):
    """Predict OBJ/SUBJ labels for an unlabelled TSV file and write the results.

    The file must provide ``sentence_id`` and ``sentence`` columns. Two files
    are written: ``filename`` (the submission file with ``sentence_id`` and
    ``label``) and ``predictions_details_<input name>`` (predictions plus
    per-class scores).

    Args:
        test_file_path: Path to the unlabelled tab-separated file.
        model: Sequence-classification model whose output exposes ``logits``.
        tokenizer: Tokenizer matching ``model``.
        filename: Output path of the submission file.
        max_length: Token length sentences are padded/truncated to.
        batch_size: Number of sentences per inference batch.

    Returns:
        DataFrame with ``sentence_id``, ``sentence``, ``predicted_label``,
        ``obj_score`` and ``subj_score`` columns.
    """
    test_data = pd.read_csv(test_file_path, sep='\t')
    print(f"Loaded test data with {len(test_data)} examples")
    print(f"Columns: {test_data.columns.tolist()}")

    test_dataset = _UnlabeledSubjectivityDataset(test_data, tokenizer, max_length)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    model.eval()

    # Send inputs to wherever the model's weights live rather than relying on
    # a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    all_predictions = []
    all_pred_labels = []
    all_indices = []

    with torch.no_grad():
        for batch in test_dataloader:
            # Row positions are bookkeeping only; they are not a model input.
            indices = batch.pop('sentence_idx')
            batch = {k: v.to(model_device) for k, v in batch.items()}

            outputs = model(**batch)
            probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_classes = torch.argmax(probabilities, dim=1)

            # .tolist() yields plain Python numbers, so the rounding below
            # produces clean 4-decimal values instead of float32 artefacts.
            all_predictions.extend(probabilities.cpu().tolist())
            all_pred_labels.extend(pred_classes.cpu().tolist())
            all_indices.extend(indices.tolist())

    pred_text_labels = ["OBJ" if p == 0 else "SUBJ" for p in all_pred_labels]

    # Select the predicted rows in one positional lookup.
    predicted_rows = test_data.iloc[np.asarray(all_indices, dtype=int)]
    results_df = pd.DataFrame({
        'sentence_id': predicted_rows['sentence_id'].tolist(),
        'sentence': predicted_rows['sentence'].tolist(),
        'predicted_label': pred_text_labels,
        'obj_score': [round(p[0], 4) for p in all_predictions],
        'subj_score': [round(p[1], 4) for p in all_predictions],
    })

    # Submission file: sentence_id plus the predicted label.
    submission_df = results_df[['sentence_id', 'predicted_label']].copy()
    submission_df.columns = ['sentence_id', 'label']

    submission_output_path = filename
    submission_df.to_csv(submission_output_path, sep='\t', index=False)
    print(f"\nSubmission file saved to {submission_output_path}")

    # Prediction statistics; guard the percentages against an empty input file.
    total = len(pred_text_labels)
    obj_count = pred_text_labels.count("OBJ")
    subj_count = pred_text_labels.count("SUBJ")
    obj_pct = obj_count / total * 100 if total else 0.0
    subj_pct = subj_count / total * 100 if total else 0.0
    print(f"\n===== Prediction Statistics =====")
    print(f"Total predictions: {total}")
    print(f"OBJ predictions: {obj_count} ({obj_pct:.2f}%)")
    print(f"SUBJ predictions: {subj_count} ({subj_pct:.2f}%)")

    output_path = f"predictions_details_{os.path.basename(test_file_path)}"
    results_df.to_csv(output_path, sep='\t', index=False)
    print(f"\nDetailed predictions saved to {output_path}")

    return results_df

## English

In [None]:
# 1. Set up constants
# Checkpoint fine-tuned for the English run.
# NOTE(review): the saved training output in this section reports
# microsoft/deberta-v3-base, so that output may predate this setting.
# Other candidate checkpoints kept for reference:
#   "microsoft/deberta-v3-base", "distilroberta-base",
#   "lighteternal/fact-or-opinion-xlmr-el", "meta-llama/Llama-Prompt-Guard-2-22M",
#   "Elron/bleurt-tiny-512", "cardiffnlp/twitter-xlm-roberta-base-sentiment",
#   "FacebookAI/roberta-base", "textattack/albert-base-v2-imdb",
#   "philschmid/tiny-bert-sst2-distilled",
#   "MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c",
#   "oeg/BERT-Repository-Proposal"
ENGLISH_MODEL =  "cardiffnlp/twitter-roberta-base-sentiment"


In [None]:
# Label encoding shared by both English splits.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each split, attach the integer label column and renumber rows from zero.
train_data = (
    load_data(['train_en.tsv'])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data = (
    load_data(['dev_en.tsv'])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data)} examples")
print(f"Validating with {len(val_data)} examples")


Training with 1492 examples
Validating with 462 examples


In [None]:
# Fine-tune the English checkpoint (learning rate 2e-5, up to 5 epochs).
model, tokenizer = train_monolingual_subjectivity_classifier(
    ENGLISH_MODEL,
    train_data,
    val_data,
    learning_rate=2e-5,
    epoch=5,
)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4314,0.45065,0.824675,0.82466,0.824652,0.824675
2,0.1892,0.196052,0.9329,0.932845,0.93322,0.9329
3,0.0989,0.129034,0.965368,0.965345,0.965648,0.965368
4,0.0516,0.076601,0.982684,0.982687,0.982726,0.982684
5,0.0246,0.056461,0.984848,0.984852,0.984938,0.984848


Evaluation results: {'eval_loss': 0.05646136775612831, 'eval_accuracy': 0.9848484848484849, 'eval_f1': 0.984851682584999, 'eval_precision': 0.984937987469633, 'eval_recall': 0.9848484848484849, 'eval_runtime': 4.1489, 'eval_samples_per_second': 111.355, 'eval_steps_per_second': 6.99, 'epoch': 5.0}


In [None]:
# Score the English model on its labelled dev-test split.
# `dev_test_list` is the list defined in the configuration cell; index 1 is
# "dev_test_en.tsv". The submission filename is a required argument.
test_file = dev_test_list[1]
results, metrics = evaluate_on_dev_test_set(test_file, model, tokenizer, "task1_english.tsv")
print("\nEvaluation complete!")

Loaded test data with 484 examples
Columns: ['sentence_id', 'sentence', 'label']

===== Model Performance on dev_test_en.tsv =====
Accuracy: 0.7624
F1 Score (weighted): 0.7668
Precision (weighted): 0.7726
Recall (weighted): 0.7624

Class-wise Performance:
OBJ - Precision: 0.8559, Recall: 0.8204, F1: 0.8378
SUBJ - Precision: 0.5255, Recall: 0.5902, F1: 0.5560
-----macro avg F1-------- 0.6968921369485545

Confusion Matrix:
              Predicted
             OBJ    SUBJ
Actual OBJ   297     65
      SUBJ    50     72

Detailed predictions saved to predictions_dev_test_en.tsv
Examples of misclassifications saved to errors_dev_test_en.tsv

Examples of misclassifications:
Sentence ID: 418ad22f-1081-4699-8ce8-2bb315223bd9
Sentence: RULE 1: NO ENHANCEMENTS  Gina admitted that she won't "mess" with her face.
True: OBJ, Predicted: SUBJ
Confidence scores - OBJ: 0.00989999994635582, SUBJ: 0.9901000261306763

Sentence ID: 7b155107-2626-47e6-8240-4fd23aae1dee
Sentence: A WOMAN who has been dubbed 

## Italian

In [None]:
# Checkpoint fine-tuned for the Italian run.
# Other candidate checkpoints kept for reference:
#   "dbmdz/bert-base-italian-cased", "Musixmatch/umberto-commoncrawl-cased-v1"
ITALIAN_MODEL =  "neuraly/bert-base-italian-cased-sentiment"

In [None]:
# Label encoding shared by both Italian splits.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each split, attach the integer label column and renumber rows from zero.
train_data_it = (
    load_data(["train_it.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data_it = (
    load_data(["dev_it.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data_it)} examples")
print(f"Validating with {len(val_data_it)} examples")


Training with 1613 examples
Validating with 667 examples


In [None]:
# Fine-tune the Italian checkpoint (learning rate 2e-5, up to 5 epochs).
model, tokenizer = train_monolingual_subjectivity_classifier(
    ITALIAN_MODEL,
    train_data_it,
    val_data_it,
    learning_rate=2e-5,
    epoch=5,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuraly/bert-base-italian-cased-sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4397,0.421491,0.802099,0.803139,0.80432,0.802099
2,0.261,0.470124,0.803598,0.80269,0.801882,0.803598
3,0.1269,0.641167,0.797601,0.794649,0.792612,0.797601
4,0.0638,0.796361,0.809595,0.806006,0.803945,0.809595
5,0.0396,0.807014,0.802099,0.801738,0.801394,0.802099


Evaluation results: {'eval_loss': 0.7963613867759705, 'eval_accuracy': 0.8095952023988006, 'eval_f1': 0.8060063195957936, 'eval_precision': 0.8039449475232541, 'eval_recall': 0.8095952023988006, 'eval_runtime': 4.938, 'eval_samples_per_second': 135.074, 'eval_steps_per_second': 8.505, 'epoch': 5.0}


In [None]:
# Score the Italian model on its labelled dev-test file.
results, metrics = evaluate_on_dev_test_set(
    "dev_test_it2.tsv",
    model,
    tokenizer,
    filename="task1_italian.tsv",
)
print("\nEvaluation complete!")

Loaded test data with 462 examples
Columns: ['sentence_id', 'sentence', 'label']

Submission file saved to task1_italian.tsv

===== Model Performance on dev_test_it2.tsv =====
Accuracy: 0.8268
F1 Score (weighted): 0.7614
Precision (weighted): 0.8021
Recall (weighted): 0.7405

Class-wise Performance:
OBJ - Precision: 0.8432, Recall: 0.9341, F1: 0.8864
SUBJ - Precision: 0.7609, Recall: 0.5469, F1: 0.6364
-----macro avg F1-------- 0.7613636363636364

Confusion Matrix:
              Predicted
             OBJ    SUBJ
Actual OBJ   312     22
      SUBJ    58     70

Detailed predictions saved to predictions_dev_test_it2.tsv
Examples of misclassifications saved to errors_dev_test_it2.tsv

Examples of misclassifications:
Sentence ID: af989775-8ebf-4f44-9da6-9f4261b84ca3
Sentence: Valutazioni arrivate nelle stesse ore in cui il ministro degli Esteri e vicepremier, Antonio Tajani, pronunciava parole inequivocabili durante una commemorazione allo Yad Vashem, il memoriale dell'Olocausto, a Gerusa

In [None]:
# Predict labels for the unlabelled Italian test file.
results = evaluate_on_test_set(
    "test_it_unlabeled.tsv",
    model,
    tokenizer,
    filename="subtask_italian.tsv",
)

Loaded test data with 299 examples
Columns: ['sentence_id', 'sentence']

Submission file saved to subtask_italian.tsv

===== Prediction Statistics =====
Total predictions: 299
OBJ predictions: 214 (71.57%)
SUBJ predictions: 85 (28.43%)

Detailed predictions saved to predictions_details_test_it_unlabeled.tsv


## Arabic

In [None]:
# Checkpoint fine-tuned for the Arabic run.
# Other candidate checkpoint kept for reference:
#   "CAMeL-Lab/bert-base-arabic-camelbert-mix"
ARABIC_MODEL = "omarelshehy/Arabic-Retrieval-v1.0"

In [None]:
# Label encoding shared by both Arabic splits.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each split, attach the integer label column and renumber rows from zero.
train_data_ar = (
    load_data(["train_ar.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data_ar = (
    load_data(["dev_ar.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data_ar)} examples")
print(f"Validating with {len(val_data_ar)} examples")


Training with 2446 examples
Validating with 467 examples


In [None]:
# Fine-tune the Arabic checkpoint (learning rate 1e-5, up to 3 epochs).
model, tokenizer = train_monolingual_subjectivity_classifier(
    ARABIC_MODEL,
    train_data_ar,
    val_data_ar,
    learning_rate=1e-5,
    epoch=3,
)

tokenizer_config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/761k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/45.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at omarelshehy/Arabic-Retrieval-v1.0 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7,0.689171,0.56531,0.525694,0.544232,0.56531
2,0.6412,0.703846,0.563169,0.519793,0.54027,0.563169
3,0.5922,0.711345,0.563169,0.531768,0.543783,0.563169


Evaluation results: {'eval_loss': 0.711344838142395, 'eval_accuracy': 0.563169164882227, 'eval_f1': 0.5317682345195994, 'eval_precision': 0.5437830945012815, 'eval_recall': 0.563169164882227, 'eval_runtime': 3.2795, 'eval_samples_per_second': 142.399, 'eval_steps_per_second': 9.148, 'epoch': 3.0}


In [None]:
# Score the Arabic model on its labelled dev-test file. The dev-test
# predictions get their own file name so they cannot be confused with (or
# overwrite) the "subtask_arabic.tsv" submission written for the unlabelled
# test set.
results, metrics = evaluate_on_dev_test_set("dev_test_ar.tsv", model, tokenizer, "task1_arabic.tsv")
print("\nEvaluation complete!")

Loaded test data with 748 examples
Columns: ['sentence_id', 'sentence', 'label']

Submission file saved to subtask_arabic.tsv

===== Model Performance on dev_test_ar.tsv =====
Accuracy: 0.5749
F1 Score (weighted): 0.5354
Precision (weighted): 0.5555
Recall (weighted): 0.5453

Class-wise Performance:
OBJ - Precision: 0.5989, Recall: 0.7624, F1: 0.6708
SUBJ - Precision: 0.5121, Recall: 0.3282, F1: 0.4000
-----macro avg F1-------- 0.5354037267080746

Confusion Matrix:
              Predicted
             OBJ    SUBJ
Actual OBJ   324    101
      SUBJ   217    106

Detailed predictions saved to predictions_dev_test_ar.tsv
Examples of misclassifications saved to errors_dev_test_ar.tsv

Examples of misclassifications:
Sentence ID: Almayadeen_5_21
Sentence: ويعرف القاموس الحديث لعلم الاجتماع المثقفين بأنهم أولئك الأفراد من أعضاء المجتمع الذين يكرسون أنفسهم لتطوير الأفكار الجديدة، والمشغولون بالممارسات الثقافية الخالقة. في الحقيقة، مثل هذا الوصف يوضح أن المثقفين هم جماعة من النخبة في المجتمع م

In [None]:
# Predict labels for the unlabelled Arabic test file.
results = evaluate_on_test_set(
    "test_ar_unlabeled.tsv",
    model,
    tokenizer,
    filename="subtask_arabic.tsv",
)
print("\nEvaluation complete!")

Loaded test data with 1036 examples
Columns: ['sentence_id', 'sentence']

Submission file saved to subtask_arabic.tsv

===== Prediction Statistics =====
Total predictions: 1036
OBJ predictions: 734 (70.85%)
SUBJ predictions: 302 (29.15%)

Detailed predictions saved to predictions_details_test_ar_unlabeled.tsv

Evaluation complete!


## German


In [None]:
# Checkpoint fine-tuned for the German run.
GERMAN_MODEL = "ssary/XLM-RoBERTa-German-sentiment"

In [None]:
# Label encoding shared by both German splits.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each split, attach the integer label column and renumber rows from zero.
train_data_de = (
    load_data(["train_de.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data_de = (
    load_data(["dev_de.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data_de)} examples")
print(f"Validating with {len(val_data_de)} examples")


Training with 800 examples
Validating with 491 examples


In [None]:
# Fine-tune the German checkpoint (learning rate 2e-5, up to 3 epochs).
model, tokenizer = train_monolingual_subjectivity_classifier(
    GERMAN_MODEL,
    train_data_de,
    val_data_de,
    learning_rate=2e-5,
    epoch=3,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ssary/XLM-RoBERTa-German-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6099,0.533432,0.733198,0.732666,0.73219,0.733198
2,0.4398,0.533084,0.757637,0.756116,0.755126,0.757637
3,0.3355,0.552114,0.767821,0.767515,0.767235,0.767821


Evaluation results: {'eval_loss': 0.5521141886711121, 'eval_accuracy': 0.7678207739307535, 'eval_f1': 0.7675147973813294, 'eval_precision': 0.7672351803929505, 'eval_recall': 0.7678207739307535, 'eval_runtime': 3.256, 'eval_samples_per_second': 150.797, 'eval_steps_per_second': 9.521, 'epoch': 3.0}


In [None]:
# Score the German model on its labelled dev-test file. The dev-test
# predictions get their own file name so they cannot be confused with (or
# overwrite) the "subtask_german.tsv" submission written for the unlabelled
# test set.
results, metrics = evaluate_on_dev_test_set("dev_test_de.tsv", model, tokenizer, "task1_german.tsv")
print("\nEvaluation complete!")

Loaded test data with 224 examples
Columns: ['sentence_id', 'sentence', 'label']

Submission file saved to subtask_german.tsv

===== Model Performance on dev_test_de.tsv =====
Accuracy: 0.8170
F1 Score (weighted): 0.7894
Precision (weighted): 0.7884
Recall (weighted): 0.7905

Class-wise Performance:
OBJ - Precision: 0.8684, Recall: 0.8627, F1: 0.8656
SUBJ - Precision: 0.7083, Recall: 0.7183, F1: 0.7133
-----macro avg F1-------- 0.7894302418892583

Confusion Matrix:
              Predicted
             OBJ    SUBJ
Actual OBJ   132     21
      SUBJ    20     51

Detailed predictions saved to predictions_dev_test_de.tsv
Examples of misclassifications saved to errors_dev_test_de.tsv

Examples of misclassifications:
Sentence ID: fa2f2ca43542e8f4fb05816eb0548c845d3da34e
Sentence: Die nächste logische Frage ist natürlich die nach der Impfquote in den afrikanischen Ländern.
True: SUBJ, Predicted: OBJ
Confidence scores - OBJ: 0.7856000065803528, SUBJ: 0.21439999341964722

Sentence ID: c66882ff

In [None]:
# Predict labels for the unlabelled German test file.
results = evaluate_on_test_set(
    "test_de_unlabeled.tsv",
    model,
    tokenizer,
    filename="subtask_german.tsv",
)
print("\nEvaluation complete!")

Loaded test data with 347 examples
Columns: ['Unnamed: 0', 'sentence_id', 'sentence']

Submission file saved to subtask_german.tsv

===== Prediction Statistics =====
Total predictions: 347
OBJ predictions: 242 (69.74%)
SUBJ predictions: 105 (30.26%)

Detailed predictions saved to predictions_details_test_de_unlabeled.tsv

Evaluation complete!


## Bulgarian


In [None]:
# Checkpoint fine-tuned for the Bulgarian run.
BULGARIAN_MODEL = "ankitkupadhyay/xnli3.0_bulgarian_model"

In [None]:
# Label encoding shared by both Bulgarian splits.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each split, attach the integer label column and renumber rows from zero.
train_data_bg = (
    load_data(["train_bg.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data_bg = (
    load_data(["dev_bg.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data_bg)} examples")
print(f"Validating with {len(val_data_bg)} examples")


Training with 691 examples
Validating with 306 examples


In [None]:
# Fine-tune the Bulgarian checkpoint (learning rate 2e-5, up to 3 epochs).
model, tokenizer = train_monolingual_subjectivity_classifier(
    BULGARIAN_MODEL,
    train_data_bg,
    val_data_bg,
    learning_rate=2e-5,
    epoch=3,
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ankitkupadhyay/xnli3.0_bulgarian_model and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5941,0.611986,0.754902,0.740284,0.792782,0.754902


In [None]:
# Score the Bulgarian model on its labelled dev-test file.
results, metrics = evaluate_on_dev_test_set(
    "dev_test_bg.tsv",
    model,
    tokenizer,
    filename="subtask_bulgarian.tsv",
)
print("\nEvaluation complete!")

Loaded test data with 250 examples
Columns: ['sentence_id', 'sentence', 'label']

Submission file saved to subtask_bulgarian.tsv

===== Model Performance on dev_test_bg.tsv =====
Accuracy: 0.7560
F1 Score (weighted): 0.7559
Precision (weighted): 0.7664
Recall (weighted): 0.7691

Class-wise Performance:
OBJ - Precision: 0.8661, Recall: 0.6783, F1: 0.7608
SUBJ - Precision: 0.6667, Recall: 0.8598, F1: 0.7510
-----macro avg F1-------- 0.7559023609443778

Confusion Matrix:
              Predicted
             OBJ    SUBJ
Actual OBJ    97     46
      SUBJ    15     92

Detailed predictions saved to predictions_dev_test_bg.tsv
Examples of misclassifications saved to errors_dev_test_bg.tsv

Examples of misclassifications:
Sentence ID: df0cb8bc-c21b-416d-b930-6549f72c5494
Sentence: Факт е обаче, че няколко различни човека свидетелстват за едно и също – тя е предрекла жестоките порои у нас.
True: OBJ, Predicted: SUBJ
Confidence scores - OBJ: 0.14749999344348907, SUBJ: 0.8525000214576721

Senten