In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, EarlyStoppingCallback
from transformers import TrainingArguments, Trainer
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from sklearn.model_selection import train_test_split
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report


In [None]:
# Held-out dev-test files, one per language, in the order: it, en, de, ar, bg.
dev_test_list = [f"dev_test_{lang}.tsv" for lang in ("it", "en", "de", "ar", "bg")]

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
def train_monolingual_subjectivity_classifier(model_name, train_data, val_data, learning_rate, epoch, weight_decay=0.05):
    """Fine-tune a pretrained checkpoint as a binary OBJ/SUBJ sentence classifier.

    Relies on the notebook-level constants ``MAX_LENGTH`` and ``BATCH_SIZE`` and
    on ``SubjectivityDataset`` / ``compute_metrics`` defined in other cells.

    Args:
        model_name: Hugging Face model id or local path given to ``from_pretrained``.
        train_data: DataFrame with ``sentence`` and ``label_id`` columns used for training.
        val_data: DataFrame with the same columns, evaluated after every epoch.
        learning_rate: Learning rate forwarded to ``TrainingArguments``.
        epoch: Maximum number of training epochs (early stopping may end sooner).
        weight_decay: Weight decay forwarded to ``TrainingArguments``.

    Returns:
        ``(model, tokenizer)``. Because ``load_best_model_at_end=True``, the
        Trainer reloads the checkpoint with the best validation ``f1`` into the
        model when training finishes.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = SubjectivityDataset(train_data, tokenizer, MAX_LENGTH)
    val_dataset = SubjectivityDataset(val_data, tokenizer, MAX_LENGTH)

    # num_labels=2 sets up the binary (objective/subjective) head.
    # ignore_mismatched_sizes=True lets checkpoints that ship a classification
    # head with a different number of classes load with a fresh head instead of
    # failing on the shape mismatch.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        ignore_mismatched_sizes=True,
    )

    # Evaluate, checkpoint and log once per epoch. Saving and evaluating on the
    # same schedule is what allows the best checkpoint to be restored at the end.
    # `eval_strategy` is the current name of the former `evaluation_strategy`.
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=epoch,
        weight_decay=weight_decay,
        load_best_model_at_end=True,
        metric_for_best_model="f1",  # the 'f1' key returned by compute_metrics
        save_strategy="epoch",
        logging_dir="./logs",
        logging_strategy="epoch",
        report_to='none',
    )

    # Stop once validation f1 has failed to improve by more than 0.001 for
    # 3 consecutive evaluations.
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.001,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback],
    )

    trainer.train()

    # Report the validation metrics of the model that is returned.
    eval_result = trainer.evaluate()
    print(f"Evaluation results: {eval_result}")

    return model, tokenizer

In [None]:
def load_data(file_paths):
    """Load one or more TSV files and stack them into a single DataFrame.

    Every file is read with its first row as the header and receives a
    ``language`` column derived from the file name: the first
    underscore-separated token after the leading one that is not a split marker
    (``train``/``dev``/``test``). For example ``train_en.tsv`` -> ``en`` and
    ``dev_test_en.tsv`` -> ``en``. If no such token exists, the token right
    after the first underscore is used; a name without any underscore is
    tagged ``unknown``.

    Args:
        file_paths: A single path or an iterable of paths to tab-separated files.

    Returns:
        The row-wise concatenation of all files with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    split_markers = {'train', 'dev', 'test'}
    dfs = []
    for file_path in file_paths:
        df = pd.read_csv(file_path, sep='\t', header=0)

        # Tokens after the first "_", each with any ".ext" suffix stripped.
        candidates = [tok.split('.')[0] for tok in os.path.basename(file_path).split('_')[1:]]
        fallback = candidates[0] if candidates else 'unknown'
        language = next(
            (tok for tok in candidates if tok and tok not in split_markers),
            fallback,
        )

        df['language'] = language
        dfs.append(df)

    if not dfs:
        raise ValueError("load_data() needs at least one file path")

    return pd.concat(dfs, ignore_index=True)

In [None]:
# 3. Create a Dataset class
class SubjectivityDataset(Dataset):
    """Map-style dataset yielding one tokenized sentence plus its class id.

    ``data`` must support ``len()`` and ``.iloc[idx]`` row access exposing the
    ``sentence`` and ``label_id`` fields.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data, self.tokenizer, self.max_length = data, tokenizer, max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]

        # Pad or truncate every sentence to exactly `max_length` positions.
        tokenized = self.tokenizer(
            row['sentence'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis of size 1; drop it so
        # a DataLoader can stack the items itself.
        item = {}
        for key, tensor in tokenized.items():
            item[key] = tensor.squeeze(0)

        item['labels'] = torch.tensor(row['label_id'], dtype=torch.long)
        return item


In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (gold class ids) and ``predictions``
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        Dict with ``accuracy`` plus support-weighted ``f1``, ``precision`` and
        ``recall``.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)

    # zero_division=0: a class that is never predicted scores 0 without a warning.
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        gold, predicted, average='weighted', zero_division=0
    )

    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )


In [None]:
# Create a Dataset class for test data
class TestSubjectivityDataset(Dataset):
    """Dataset for inference: tokenized sentence, class id and row position.

    ``data`` must support ``len()`` and ``.iloc[idx]`` row access exposing the
    ``sentence`` and ``label_id`` fields. Each item also carries
    ``sentence_idx``, the positional index it was fetched with, so predictions
    can be matched back to their source rows.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data.iloc[idx]
        tokenizer_kwargs = dict(
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        raw = self.tokenizer(record['sentence'], **tokenizer_kwargs)

        # Strip the size-1 batch axis the tokenizer adds to every tensor.
        sample = {name: values.squeeze(0) for name, values in raw.items()}
        sample.update(
            labels=torch.tensor(record['label_id'], dtype=torch.long),
            sentence_idx=idx,
        )
        return sample

In [None]:
def evaluate_on_test_set(test_file_path, model, tokenizer, filename=None, max_length=128, batch_size=16):
    """Run the classifier over a TSV test file, save predictions and report metrics.

    Args:
        test_file_path: Tab-separated file with ``sentence_id`` and ``sentence``
            columns and, optionally, a gold ``label`` column (``OBJ``/``SUBJ``).
        model: Sequence-classification model whose forward output exposes ``.logits``.
        tokenizer: Tokenizer handed to ``TestSubjectivityDataset``.
        filename: Path of the submission TSV (columns ``sentence_id``, ``label``).
            Defaults to ``submission_<basename of test_file_path>``.
        max_length: Token length every sentence is padded/truncated to.
        batch_size: Number of sentences per forward pass.

    Returns:
        ``(results_df, metrics)``. ``results_df`` holds one row per sentence with
        the predicted label and both class probabilities (plus ``true_label``
        when gold labels exist). ``metrics`` contains ``accuracy``, macro-averaged
        ``f1``/``precision``/``recall`` and the per-class ``class_report``; it is
        an empty dict when the file has no ``label`` column.

    Raises:
        ValueError: If the ``label`` column holds anything other than ``OBJ``/``SUBJ``.
    """
    label_to_id = {'OBJ': 0, 'SUBJ': 1}
    class_ids = [0, 1]
    base_name = os.path.basename(test_file_path)
    if filename is None:
        filename = f"submission_{base_name}"

    # Load test data
    test_data = pd.read_csv(test_file_path, sep='\t')
    print(f"Loaded test data with {len(test_data)} examples")
    print(f"Columns: {test_data.columns.tolist()}")

    has_labels = 'label' in test_data.columns
    if has_labels:
        label_ids = test_data['label'].map(label_to_id)
        unmapped = label_ids.isna()
        if unmapped.any():
            # Fail early: an unmapped label would become NaN and corrupt the metrics.
            bad_values = sorted(test_data.loc[unmapped, 'label'].astype(str).unique())
            raise ValueError(f"Unexpected values in 'label' column of {test_file_path}: {bad_values}")
        test_data['label_id'] = label_ids.astype('int64')
    else:
        # TestSubjectivityDataset always reads 'label_id'; the placeholder is
        # never shown to the model or used for scoring.
        test_data['label_id'] = 0

    test_dataset = TestSubjectivityDataset(test_data, tokenizer, max_length)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    model.eval()
    # Send inputs to wherever the model's weights live rather than relying on a
    # notebook-level device variable that may not match.
    model_device = next(model.parameters()).device

    all_predictions = []
    all_pred_labels = []
    all_true_labels = []
    all_indices = []

    with torch.no_grad():
        for batch in test_dataloader:
            # Bookkeeping fields are not model inputs; only logits are needed,
            # so the labels are kept aside instead of being passed for a loss.
            indices = batch.pop('sentence_idx')
            true_ids = batch.pop('labels')

            batch = {k: v.to(model_device) for k, v in batch.items()}

            outputs = model(**batch)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            pred_classes = torch.argmax(predictions, dim=1)

            all_predictions.extend(predictions.cpu().numpy())
            all_pred_labels.extend(pred_classes.cpu().tolist())
            all_true_labels.extend(true_ids.tolist())
            all_indices.extend(indices.tolist())

    # Assemble the per-sentence results in the order predictions were produced.
    result_columns = {
        'sentence_id': test_data['sentence_id'].iloc[all_indices].to_numpy(),
        'sentence': test_data['sentence'].iloc[all_indices].to_numpy(),
    }
    if has_labels:
        result_columns['true_label'] = ["OBJ" if t == 0 else "SUBJ" for t in all_true_labels]
    result_columns['predicted_label'] = ["OBJ" if p == 0 else "SUBJ" for p in all_pred_labels]
    result_columns['obj_score'] = [round(p[0], 4) for p in all_predictions]
    result_columns['subj_score'] = [round(p[1], 4) for p in all_predictions]
    results_df = pd.DataFrame(result_columns)

    # Submission file: sentence id plus predicted label only.
    submission_df = results_df[['sentence_id', 'predicted_label']].copy()
    submission_df.columns = ['sentence_id', 'label']
    submission_df.to_csv(filename, sep='\t', index=False)
    print(f"\nSubmission file saved to {filename}")

    output_path = f"predictions_{base_name}"

    if not has_labels:
        # Nothing to score against: keep the predictions and stop here.
        results_df.to_csv(output_path, sep='\t', index=False)
        print(f"\nDetailed predictions saved to {output_path}")
        print("No 'label' column found; skipping metrics and error analysis.")
        return results_df, {}

    # Passing labels=class_ids keeps both classes in every report even when one
    # of them never occurs, so the fixed 2x2 indexing below is always valid.
    accuracy = accuracy_score(all_true_labels, all_pred_labels)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_pred_labels, average='macro', labels=class_ids, zero_division=0
    )
    class_report = classification_report(
        all_true_labels, all_pred_labels,
        labels=class_ids, target_names=['OBJ', 'SUBJ'],
        output_dict=True, zero_division=0,
    )

    print(f"\n===== Model Performance on {test_file_path} =====")
    print(f"Accuracy: {accuracy:.4f}")
    # These three values are macro averages (see average='macro' above).
    print(f"F1 Score (macro): {f1:.4f}")
    print(f"Precision (macro): {precision:.4f}")
    print(f"Recall (macro): {recall:.4f}\n")

    print("Class-wise Performance:")
    print(f"OBJ - Precision: {class_report['OBJ']['precision']:.4f}, "
          f"Recall: {class_report['OBJ']['recall']:.4f}, "
          f"F1: {class_report['OBJ']['f1-score']:.4f}")
    print(f"SUBJ - Precision: {class_report['SUBJ']['precision']:.4f}, "
          f"Recall: {class_report['SUBJ']['recall']:.4f}, "
          f"F1: {class_report['SUBJ']['f1-score']:.4f}")
    print(f"-----macro avg F1-------- {(class_report['OBJ']['f1-score']+class_report['SUBJ']['f1-score'])/2}" )

    cm = confusion_matrix(all_true_labels, all_pred_labels, labels=class_ids)
    print("\nConfusion Matrix:")
    print("              Predicted")
    print("             OBJ    SUBJ")
    print(f"Actual OBJ  {cm[0,0]:4d}   {cm[0,1]:4d}")
    print(f"      SUBJ  {cm[1,0]:4d}   {cm[1,1]:4d}")

    results_df.to_csv(output_path, sep='\t', index=False)
    print(f"\nDetailed predictions saved to {output_path}")

    # Error analysis: rows where the prediction disagrees with the gold label.
    errors_df = results_df[results_df['true_label'] != results_df['predicted_label']]
    if not errors_df.empty:
        error_output_path = f"errors_{base_name}"
        errors_df.to_csv(error_output_path, sep='\t', index=False)
        print(f"Examples of misclassifications saved to {error_output_path}")

        print("\nExamples of misclassifications:")
        sample_errors = errors_df.sample(min(5, len(errors_df)))
        for _, row in sample_errors.iterrows():
            print(f"Sentence ID: {row['sentence_id']}")
            print(f"Sentence: {row['sentence']}")
            print(f"True: {row['true_label']}, Predicted: {row['predicted_label']}")
            print(f"Confidence scores - OBJ: {row['obj_score']}, SUBJ: {row['subj_score']}")
            print("")

    return results_df, {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'class_report': class_report
    }

## English

In [None]:
# 1. Set up constants
# English checkpoint to fine-tune. Other candidates listed here previously:
#   "microsoft/deberta-v3-base", "distilroberta-base",
#   "lighteternal/fact-or-opinion-xlmr-el", "meta-llama/Llama-Prompt-Guard-2-22M",
#   "Elron/bleurt-tiny-512", "cardiffnlp/twitter-xlm-roberta-base-sentiment",
#   "FacebookAI/roberta-base", "textattack/albert-base-v2-imdb",
#   "philschmid/tiny-bert-sst2-distilled",
#   "MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c",
#   "oeg/BERT-Repository-Proposal"
ENGLISH_MODEL =  "cardiffnlp/twitter-roberta-base-sentiment"
# MAX_LENGTH and BATCH_SIZE are read as globals by
# train_monolingual_subjectivity_classifier, so they apply to every language.
MAX_LENGTH = 128
BATCH_SIZE = 16
# NOTE(review): the training calls visible in this notebook pass literal epoch
# counts and learning rates instead of these two constants - confirm which
# values are intended.
EPOCHS = 7
LEARNING_RATE = 1e-5

In [None]:
# Label strings -> integer class ids expected by the classifier.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each English split, attach the integer label column and renumber rows from 0.
train_data = (
    load_data(['train_en.tsv'])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data = (
    load_data(['dev_en.tsv'])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data)} examples")
print(f"Validating with {len(val_data)} examples")


In [None]:
# Fine-tune the English classifier with the training helper defined in this notebook.
model, tokenizer = train_monolingual_subjectivity_classifier(
    ENGLISH_MODEL, train_data, val_data, learning_rate=2e-5, epoch=5
)

In [None]:
# Score the English model on its dev-test split and write the submission file.
test_file = dev_test_list[1]  # "dev_test_en.tsv"
results, metrics = evaluate_on_test_set(test_file, model, tokenizer, "subtask_english.tsv")
print("\nEvaluation complete!")

## Italian

In [None]:
# Italian checkpoint to fine-tune. Other candidates listed here previously:
#   "dbmdz/bert-base-italian-cased", "Musixmatch/umberto-commoncrawl-cased-v1"
ITALIAN_MODEL =  "neuraly/bert-base-italian-cased-sentiment"

In [None]:
# Label strings -> integer class ids expected by the classifier.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each Italian split, attach the integer label column and renumber rows from 0.
train_data_it = (
    load_data(["train_it.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data_it = (
    load_data(["dev_it.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data_it)} examples")
print(f"Validating with {len(val_data_it)} examples")


In [None]:
# Fine-tune the Italian classifier with the training helper defined in this notebook.
model, tokenizer = train_monolingual_subjectivity_classifier(
    ITALIAN_MODEL, train_data_it, val_data_it, learning_rate=2e-5, epoch=5
)

In [None]:
# Score the Italian model on its dev-test split and write the submission file.
test_file = dev_test_list[0]  # "dev_test_it.tsv"
results, metrics = evaluate_on_test_set(test_file, model, tokenizer, "subtask_italian.tsv")
print("\nEvaluation complete!")

## Arabic

In [None]:
# Arabic checkpoint to fine-tune. Other candidate listed here previously:
#   "CAMeL-Lab/bert-base-arabic-camelbert-mix"
ARABIC_MODEL = "omarelshehy/Arabic-Retrieval-v1.0"

In [None]:
# Label strings -> integer class ids expected by the classifier.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each Arabic split, attach the integer label column and renumber rows from 0.
train_data_ar = (
    load_data(["train_ar.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data_ar = (
    load_data(["dev_ar.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data_ar)} examples")
print(f"Validating with {len(val_data_ar)} examples")


In [None]:
# Fine-tune the Arabic classifier with the training helper defined in this notebook.
model, tokenizer = train_monolingual_subjectivity_classifier(
    ARABIC_MODEL, train_data_ar, val_data_ar, learning_rate=1e-5, epoch=3
)

In [None]:
# Score the Arabic model on its dev-test split and write the submission file.
test_file = dev_test_list[3]  # "dev_test_ar.tsv"
results, metrics = evaluate_on_test_set(test_file, model, tokenizer, "subtask_arabic.tsv")
print("\nEvaluation complete!")

## German


In [None]:
# Checkpoint passed to the training helper for the German classifier.
GERMAN_MODEL = "ssary/XLM-RoBERTa-German-sentiment"

In [None]:
# Load the German training and validation splits. The training cell below
# reads `train_data_de` / `val_data_de`, so these must be the German files,
# named after the train_<lang>.tsv / dev_<lang>.tsv pattern of the other sections.
train_data_de = load_data(["train_de.tsv"])
val_data_de = load_data(["dev_de.tsv"])

# Convert labels to integers
label_map = {'OBJ': 0, 'SUBJ': 1}
train_data_de['label_id'] = train_data_de['label'].map(label_map)
val_data_de['label_id'] = val_data_de['label'].map(label_map)

# Reset indices
train_data_de = train_data_de.reset_index(drop=True)
val_data_de = val_data_de.reset_index(drop=True)

print(f"Training with {len(train_data_de)} examples")
print(f"Validating with {len(val_data_de)} examples")


In [None]:
# Fine-tune the German classifier.
model, tokenizer = train_monolingual_subjectivity_classifier(
    GERMAN_MODEL,
    train_data_de,
    val_data_de,
    learning_rate=2e-5,
    epoch=5,
)

In [None]:
# Score the German model on its dev-test split and write the submission file.
results, metrics = evaluate_on_test_set(
    "dev_test_de.tsv",
    model,
    tokenizer,
    filename="subtask_german.tsv",
)
print("\nEvaluation complete!")

In [None]:
# Predict on the unlabeled German test file.
# NOTE(review): metrics need gold labels - confirm how evaluate_on_test_set should
# treat a file without a 'label' column. This call also writes to the same
# "subtask_german.tsv" path as the dev-test run, replacing that file - confirm
# this is intended.
results, metrics = evaluate_on_test_set("test_de_unlabeled.tsv", model, tokenizer, "subtask_german.tsv")
print("\nEvaluation complete!")

## Bulgarian


In [None]:
# Checkpoint passed to the training helper for the Bulgarian classifier.
BULGARIAN_MODEL = "ankitkupadhyay/xnli3.0_bulgarian_model"

In [None]:
# Label strings -> integer class ids expected by the classifier.
label_map = {'OBJ': 0, 'SUBJ': 1}

# Read each Bulgarian split, attach the integer label column and renumber rows from 0.
train_data_bg = (
    load_data(["train_bg.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)
val_data_bg = (
    load_data(["dev_bg.tsv"])
    .assign(label_id=lambda frame: frame['label'].map(label_map))
    .reset_index(drop=True)
)

print(f"Training with {len(train_data_bg)} examples")
print(f"Validating with {len(val_data_bg)} examples")


In [None]:
# Fine-tune the Bulgarian classifier (lower weight decay than the helper's default).
model, tokenizer = train_monolingual_subjectivity_classifier(
    BULGARIAN_MODEL,
    train_data_bg,
    val_data_bg,
    learning_rate=2e-5,
    epoch=3,
    weight_decay=0.01,
)

In [None]:
# Score the Bulgarian model on its dev-test split and write the submission file.
results, metrics = evaluate_on_test_set(
    "dev_test_bg.tsv",
    model,
    tokenizer,
    filename="subtask_bulgarian.tsv",
)
print("\nEvaluation complete!")