## **1. Subtask 1 - Polarization Detection**


### **1.1 Imports**

In [1]:
#!unzip dev_phase.zip

In [2]:
# Mount Google Drive at /content/drive (Colab-only API); the notebook reads its CSVs from
# and saves its models to paths under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install iterative-stratification

Collecting iterative-stratification
  Downloading iterative_stratification-0.1.9-py3-none-any.whl.metadata (1.3 kB)
Downloading iterative_stratification-0.1.9-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.9


In [4]:
!pip install -qq optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import torch
import random
from sklearn.metrics import f1_score

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from torch.utils.data import Dataset

import os
import zipfile
from typing import Dict

In [6]:
import wandb

# Disable wandb logging for this script by starting the run in "disabled" mode
wandb.init(mode="disabled")

  | |_| | '_ \/ _` / _` |  _/ -_)


In [7]:
from transformers import set_seed
def set_global_seed(seed: int = 42):
    """
    Seed every random number generator the notebook relies on so runs are repeatable.

    The same value is applied, in order, to Python's `random`, NumPy, PyTorch
    (CPU, current CUDA device, all CUDA devices) and Hugging Face Transformers.
    cuDNN is additionally switched to deterministic mode with benchmarking off.

    Args:
        seed (int): Seed value to use.
    """
    seeders = (
        random.seed,                 # Python built-in random
        np.random.seed,              # NumPy
        torch.manual_seed,           # PyTorch (CPU)
        torch.cuda.manual_seed,      # PyTorch (current GPU)
        torch.cuda.manual_seed_all,  # PyTorch (every GPU, for multi-GPU runs)
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # CUDNN settings for deterministic behavior
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Hugging Face Transformers
    set_seed(seed)

    print(f"[INFO] Global seed set to {seed}")


In [8]:
# Seed all RNGs once, before any data splitting or model initialisation further down.
set_global_seed(42)

[INFO] Global seed set to 42


### 1.2 Data Import

In [9]:
# Subtask 1 data: read the English and Hausa training CSVs, hold out a stratified 10%
# of each language for validation, then pool both languages into one train / val frame.
eng_all = pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask1/train/eng.csv')
hau_all = pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask1/train/hau.csv')

# Split per language so each one keeps its own class balance in both parts.
train_eng, val_eng = train_test_split(
    eng_all,
    test_size=0.1,
    random_state=40,
    shuffle=True,
    stratify=eng_all['polarization'],
)
train_hau, val_hau = train_test_split(
    hau_all,
    test_size=0.1,
    random_state=40,
    shuffle=True,
    stratify=hau_all['polarization'],
)

train = pd.concat([train_eng, train_hau], ignore_index=True)
val = pd.concat([val_eng, val_hau], ignore_index=True)

# Dev-phase files used later for the submission predictions
dev_set_eng = pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask1/dev/eng.csv')
dev_set_hau = pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask1/dev/hau.csv')

train.head()

Unnamed: 0,id,text,polarization
0,eng_54dde3b05be9510ecb5a32e59c3f5654,NATO has taken over coordination of Western mi...,0
1,eng_ca99e417ba56c186664ad50b13aee3bf,"TRUMP LIE 718 ""Not statistically possible. Rig...",1
2,eng_2e927c0378772abd03b9d9dfe77cdd3d,Russians didnt kill anyone in Ukraine invasion,0
3,eng_928a32380ac569d1dc82cb13c477e07f,Kamala Harris is the funniest person Kamala Ha...,0
4,eng_702a151aa2145b678a2379320dbd604b,"Mike Pence R, IN Norm Coleman R, MN",0


#### **1.2.1 Cleaning the data**

In [10]:
# Find what needs to be cleaned: count missing values per column
missing=train.isna().sum()
print('Missing values:')
print(missing)
# Show the summary statistics. Only the last expression of a cell is rendered
# automatically, so describe() has to go through display() to be visible.
display(train.describe(include='all'))
train.head(5)


Missing values:
id              0
text            0
polarization    0
dtype: int64


Unnamed: 0,id,text,polarization
0,eng_54dde3b05be9510ecb5a32e59c3f5654,NATO has taken over coordination of Western mi...,0
1,eng_ca99e417ba56c186664ad50b13aee3bf,"TRUMP LIE 718 ""Not statistically possible. Rig...",1
2,eng_2e927c0378772abd03b9d9dfe77cdd3d,Russians didnt kill anyone in Ukraine invasion,0
3,eng_928a32380ac569d1dc82cb13c477e07f,Kamala Harris is the funniest person Kamala Ha...,0
4,eng_702a151aa2145b678a2379320dbd604b,"Mike Pence R, IN Norm Coleman R, MN",0


In [11]:
# Same check on the validation split: count missing values per column
missing=val.isna().sum()
print('Missing values:')
print(missing)
# Show the summary statistics. Only the last expression of a cell is rendered
# automatically, so describe() has to go through display() to be visible.
display(val.describe(include='all'))
val.head(5)

Missing values:
id              0
text            0
polarization    0
dtype: int64


Unnamed: 0,id,text,polarization
0,eng_6815d71d128500b9c1fad80f07866288,Can Canada foster the blue states for four yea...,1
1,eng_eb2b137c58b9092d98e7aec6ce711d44,Attorney Gurfinkel Answers Immigration Questio...,0
2,eng_7f55d9f8da0a4238293f02b469f27b5b,We need more radical left tweeps following thi...,1
3,eng_fd4c3b9b6c6d5fb679fa759bd7ceed18,"Colombia Integration of Migrants, Refugees, an...",0
4,eng_154e8f522918708a2c86c85969578ef9,Putin says Zelensky is a Nazi Orban says Zelen...,1


In [12]:
# Keep only the rows whose label is a valid binary value (0 or 1); anything else is dropped
train = train.loc[train['polarization'].isin([0, 1])]
train.head()

Unnamed: 0,id,text,polarization
0,eng_54dde3b05be9510ecb5a32e59c3f5654,NATO has taken over coordination of Western mi...,0
1,eng_ca99e417ba56c186664ad50b13aee3bf,"TRUMP LIE 718 ""Not statistically possible. Rig...",1
2,eng_2e927c0378772abd03b9d9dfe77cdd3d,Russians didnt kill anyone in Ukraine invasion,0
3,eng_928a32380ac569d1dc82cb13c477e07f,Kamala Harris is the funniest person Kamala Ha...,0
4,eng_702a151aa2145b678a2379320dbd604b,"Mike Pence R, IN Norm Coleman R, MN",0


In [13]:
# Keep only the validation rows whose label is a valid binary value (0 or 1)
val = val.loc[val['polarization'].isin([0, 1])]
val.head()

Unnamed: 0,id,text,polarization
0,eng_6815d71d128500b9c1fad80f07866288,Can Canada foster the blue states for four yea...,1
1,eng_eb2b137c58b9092d98e7aec6ce711d44,Attorney Gurfinkel Answers Immigration Questio...,0
2,eng_7f55d9f8da0a4238293f02b469f27b5b,We need more radical left tweeps following thi...,1
3,eng_fd4c3b9b6c6d5fb679fa759bd7ceed18,"Colombia Integration of Migrants, Refugees, an...",0
4,eng_154e8f522918708a2c86c85969578ef9,Putin says Zelensky is a Nazi Orban says Zelen...,1


### **1.3 Dataset**


##### ***1.3.1 Labelled Dataset***

In [14]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset_T1(torch.utils.data.Dataset):
  """Map-style dataset pairing each text with its integer class label (subtask 1).

  Texts are tokenized lazily in `__getitem__`, truncated to `max_length` and left
  unpadded, so padding has to be applied when batches are assembled.

  Args:
    texts: indexable sequence of input strings.
    labels: indexable sequence of integer class labels, aligned with `texts`.
    tokenizer: callable tokenizer; called with `return_tensors='pt'`.
    max_length: maximum number of tokens kept per text.
  """

  def __init__(self,texts,labels,tokenizer,max_length =128):
    self.texts=texts
    self.labels=labels
    self.tokenizer= tokenizer
    self.max_length = max_length # Store max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self,idx):
    """Return the tokenized features of sample `idx` plus its label as a long tensor."""
    encoding=self.tokenizer(self.texts[idx],truncation=True,padding=False,max_length=self.max_length,return_tensors='pt')

    # return_tensors='pt' adds a leading batch dimension of size 1. Remove exactly that
    # dimension: a bare squeeze() would also collapse a one-token sequence to a 0-d tensor.
    item = {key: value.squeeze(0) for key, value in encoding.items()}

    item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
    return item

##### ***1.3.2 Unlabelled Dataset***

In [15]:
class Unlabelled_PolarizationDataset_T1(torch.utils.data.Dataset):
  """Dataset for texts without gold labels (subtask 1); every sample gets the placeholder label 0.

  Args:
    texts: indexable sequence of input strings.
    labels: ignored apart from keeping the signature parallel to the labelled
      dataset; may be None. One placeholder is created per text.
    tokenizer: callable tokenizer; called with `return_tensors='pt'`.
    max_length: maximum number of tokens kept per text.
  """

  def __init__(self,texts,labels,tokenizer,max_length =128):
    self.texts=texts
    # Size the placeholders from the texts so the dataset works even when no label column exists.
    self.labels=[0] * len(texts)
    self.tokenizer= tokenizer
    self.max_length = max_length # Store max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self,idx):
    """Return the tokenized features of sample `idx` plus the placeholder label as a long tensor."""
    encoding=self.tokenizer(self.texts[idx],truncation=True,padding=False,max_length=self.max_length,return_tensors='pt')

    # return_tensors='pt' adds a leading batch dimension of size 1. Remove exactly that
    # dimension: a bare squeeze() would also collapse a one-token sequence to a 0-d tensor.
    item = {key: value.squeeze(0) for key, value in encoding.items()}

    item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
    return item

#### **1.3.3 Tokenization**

In [16]:
# Tokenizer matching the backbone that is fine-tuned for subtask 1
tokenizer_task1 = AutoTokenizer.from_pretrained('Davlan/afro-xlmr-base')

# Labelled datasets (train / validation)
train_dataset_t1 = PolarizationDataset_T1(
    texts=train['text'].tolist(),
    labels=train['polarization'].tolist(),
    tokenizer=tokenizer_task1,
)
val_dataset_t1 = PolarizationDataset_T1(
    texts=val['text'].tolist(),
    labels=val['polarization'].tolist(),
    tokenizer=tokenizer_task1,
)

# Dev-phase datasets, one per language; their labels are replaced by placeholders inside the class
dev_dataset_eng_t1 = Unlabelled_PolarizationDataset_T1(
    texts=dev_set_eng['text'].tolist(),
    labels=dev_set_eng['polarization'].tolist(),
    tokenizer=tokenizer_task1,
)
dev_dataset_hau_t1 = Unlabelled_PolarizationDataset_T1(
    texts=dev_set_hau['text'].tolist(),
    labels=dev_set_hau['polarization'].tolist(),
    tokenizer=tokenizer_task1,
)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

### **1.4 Model Training**

#### **1.4.1 Model Initialization**

In [17]:
# Binary sequence classifier on top of the afro-xlmr-base encoder
model_task1 = AutoModelForSequenceClassification.from_pretrained(
    'Davlan/afro-xlmr-base',
    num_labels=2,
)

config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### **1.4.2 Metrics Function**

In [18]:
# function to compute metrics
def compute_custom_metrics_task1(p):
    """Compute accuracy plus macro precision/recall/F1 and micro F1 for subtask 1.

    Args:
        p: prediction object exposing `predictions` (per-class scores, argmax is
            taken over axis 1) and `label_ids` (reference labels).

    Returns:
        dict with keys 'accuracy', 'precision_macro', 'recall_macro',
        'f1_macro' and 'f1_micro'.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)

    # Scores that are undefined (no predicted / no true samples) count as 0
    macro = {'average': 'macro', 'zero_division': 0}

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['precision_macro'] = precision_score(y_true, y_pred, **macro)
    metrics['recall_macro'] = recall_score(y_true, y_pred, **macro)
    metrics['f1_macro'] = f1_score(y_true, y_pred, **macro)
    metrics['f1_micro'] = f1_score(y_true, y_pred, average='micro', zero_division=0)
    return metrics


#### **1.4.3  Model Parameters**

In [19]:
# Define training arguments for subtask 1: evaluate after every epoch, keep no checkpoints
training_args_t1 = TrainingArguments(
        output_dir="./",
        num_train_epochs=4,
        learning_rate=3e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        seed=42,
        weight_decay=0.01,
        warmup_ratio=0.1,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=100,
        disable_tqdm=False,
        # Switch off every logging integration (W&B included) at the Trainer level,
        # so no reporting callback is attached to this run.
        report_to="none",
    )


#### **1.4.4 Model Trainer**

In [20]:
# Build the Trainer for subtask 1; batches are padded dynamically by the collator
trainer_task1 = Trainer(
    model=model_task1,
    args=training_args_t1,
    data_collator=DataCollatorWithPadding(tokenizer_task1),
    train_dataset=train_dataset_t1,
    eval_dataset=val_dataset_t1,
    compute_metrics=compute_custom_metrics_task1,
)

# Fine-tune the model
trainer_task1.train()

# Score the fine-tuned model on the validation split
eval_results = trainer_task1.evaluate()
print(f"Macro F1 score on validation set: {eval_results['eval_f1_macro']}")

Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,F1 Micro
1,0.3889,0.353628,0.843251,0.776341,0.804212,0.788314,0.843251
2,0.362,0.386433,0.869376,0.821413,0.794191,0.806383,0.869376
3,0.3134,0.537049,0.86357,0.807232,0.801656,0.80438,0.86357
4,0.1875,0.668866,0.867925,0.817751,0.795496,0.805657,0.867925


Macro F1 score on validation set: 0.8056574122577265


#### **1.4.6 Test on dev set**

In [21]:
# Per language: the dev dataset to predict on and the row ids for the submission file
pred_data_t1 = {
    lang_code: {'data': dataset, 'ids': frame['id'].tolist()}
    for lang_code, dataset, frame in (
        ("eng", dev_dataset_eng_t1, dev_set_eng),
        ("hau", dev_dataset_hau_t1, dev_set_hau),
    )
}

for lang in pred_data_t1:
    pred_dev = trainer_task1.predict(pred_data_t1[lang]['data'])
    # Predicted class = index of the highest score
    y_pred = pred_dev.predictions.argmax(axis=1)
    # Labels carried by the dev dataset (placeholders, not gold labels)
    y_true = pred_dev.label_ids
    pred_data_t1[lang]['results'] = y_pred



#### **1.4.7 Making Submission Package**

In [22]:
# --- Helper: Save a single prediction CSV ---
def save_prediction_csv(df: pd.DataFrame, lang_code: str, folder: str):
    """Write `df` to `<folder>/pred_<lang_code>.csv` without the index column.

    Returns:
        The path of the file that was written.
    """
    target = os.path.join(folder, "pred_{}.csv".format(lang_code))
    df.to_csv(target, index=False)
    return target

# --- Helper: Zip a folder ---
def zip_folder(folder_path: str, zip_path: str):
    """Zip every file under `folder_path` into the archive at `zip_path`.

    Entries are stored relative to the parent of `folder_path`, so the folder's
    own name is the top-level directory inside the archive.

    Args:
        folder_path: directory to archive; a trailing path separator is tolerated.
        zip_path: destination of the zip file (overwritten if it exists).
    """
    # normpath drops a trailing separator; without it dirname() would return the folder
    # itself and the archive entries would lose their top-level folder name.
    base_dir = os.path.dirname(os.path.normpath(folder_path)) or os.curdir
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            # Sorted traversal gives the archive a stable entry order
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, base_dir))

# --- Main function: Create submission zip ---
def create_submission_zip(task,predictions: Dict[str, pd.DataFrame], output_dir=".",):
    """
    Write one prediction CSV per language into `<output_dir>/<task>/` and zip that folder.

    Args:
        task: name used for both the submission folder and the zip file (`<task>.zip`).
        predictions: mapping from language code to the DataFrame to save for it.
        output_dir: directory in which the folder and the zip are created.
    """
    # Folder that collects the per-language CSVs (created on demand)
    submission_folder = os.path.join(output_dir, task)
    os.makedirs(submission_folder, exist_ok=True)

    for lang_code in predictions:
        save_prediction_csv(predictions[lang_code], lang_code, submission_folder)

    # Archive the folder next to it
    zip_path = os.path.join(output_dir, f"{task}.zip")
    zip_folder(submission_folder, zip_path)
    print(f"Submission zip created at: {zip_path}")


In [23]:
# One submission frame per language: row id plus the predicted polarization label
submission_dict = {}

for lang in pred_data_t1:
    lang_entry = pred_data_t1[lang]
    df = pd.DataFrame({"id": lang_entry['ids'], "polarization": lang_entry['results']})
    submission_dict[lang] = df

create_submission_zip('subtask_1', submission_dict)

Submission zip created at: ./subtask_1.zip


#### **1.4.8 Saving the Model**

In [24]:
from datetime import datetime

In [25]:
# Save the fine-tuned subtask 1 model to Drive. The directory name records the backbone,
# the validation macro-F1 and a timestamp so that runs do not overwrite each other.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# The backbone fine-tuned in this notebook is Davlan/afro-xlmr-base (not mDeBERTa)
naming='today-afro-xlmr-base'
save_path = f"saved_model_t1_{naming}_f1_{eval_results['eval_f1_macro']:.4f}_{timestamp}"
trainer_task1.save_model(os.path.join('drive/MyDrive/NLP/models', save_path))

# **2. Subtask 2: Polarization Type Classification**

### **2.1 Data Import**

In [26]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

def multilabel_split(df, labels, test_size=0.1, seed=40):
    """Split `df` into train / validation parts, stratified over several label columns.

    Args:
        df: DataFrame to split.
        labels: names of the label columns used for the stratification.
        test_size: fraction of rows assigned to the validation part.
        seed: random state passed to the splitter.

    Returns:
        (train_df, val_df), both selected from `df` by position.
    """
    splitter = MultilabelStratifiedShuffleSplit(
        n_splits=1,
        random_state=seed,
        test_size=test_size,
    )
    # n_splits=1: the generator yields a single (train, validation) pair of positions
    train_idx, val_idx = next(iter(splitter.split(df.index.values, df[labels].values)))
    return df.iloc[train_idx], df.iloc[val_idx]

In [28]:

# Subtask 2 data: split each language separately (stratified over all label columns),
# then pool the languages into one train frame and one validation frame.
labels = ['gender/sexual','political','religious','racial/ethnic','other']

train_eng, val_eng = multilabel_split(
    pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask2/train/eng.csv'), labels
)
train_hau, val_hau = multilabel_split(
    pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask2/train/hau.csv'), labels
)

train_t2 = pd.concat([train_eng, train_hau], ignore_index=True)
val_t2 = pd.concat([val_eng, val_hau], ignore_index=True)

# Dev-phase files used later for the submission predictions
dev_set_eng_t2 = pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask2/dev/eng.csv')
dev_set_hau_t2 = pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask2/dev/hau.csv')

train_t2.head()

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
0,eng_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0,0,0,0
1,eng_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0,0,0,0
2,eng_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0,0,0,0
3,eng_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0,0,0,0
4,eng_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0,0,0,0


### **2.2 Dataset**

#### **2.2.1 Labelled Dataset**

In [29]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset_t2(torch.utils.data.Dataset):
    """Map-style dataset pairing each text with its multi-label target vector (subtask 2).

    Texts are tokenized lazily in `__getitem__`, truncated to `max_length` and left
    unpadded, so padding has to be applied when batches are assembled.

    Args:
        texts: indexable sequence of input strings.
        labels: indexable sequence of per-sample label lists, aligned with `texts`.
        tokenizer: callable tokenizer; called with `return_tensors='pt'`.
        max_length: maximum number of tokens kept per text.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels

        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized features of sample `idx` plus its labels as a float tensor."""
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # return_tensors='pt' adds a leading batch dimension of size 1. Remove exactly that
        # dimension: a bare squeeze() would also collapse a one-token sequence to a 0-d tensor.
        item = {key: value.squeeze(0) for key, value in encoding.items()}

        # Float targets, as used with the multi-label classification head
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


#### **2.2.2 Unlabelled Dataset**

In [30]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class Unlabelled_PolarizationDataset_t2(torch.utils.data.Dataset):
    """Dataset for texts without gold labels (subtask 2); every sample gets five zero placeholders.

    Args:
        texts: indexable sequence of input strings.
        labels: ignored apart from keeping the signature parallel to the labelled
            dataset; may be None. One placeholder vector is created per text.
        tokenizer: callable tokenizer; called with `return_tensors='pt'`.
        max_length: maximum number of tokens kept per text.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        # Size the placeholders from the texts so the dataset works even when no label columns exist.
        self.labels = [[0, 0, 0, 0, 0] for _ in range(len(texts))]

        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized features of sample `idx` plus the placeholder labels as a float tensor."""
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # return_tensors='pt' adds a leading batch dimension of size 1. Remove exactly that
        # dimension: a bare squeeze() would also collapse a one-token sequence to a 0-d tensor.
        item = {key: value.squeeze(0) for key, value in encoding.items()}

        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


#### **2.2.3 Tokenization**

In [31]:
# Tokenizer matching the backbone that is fine-tuned for subtask 2
tokenizer_task2 = AutoTokenizer.from_pretrained('Davlan/afro-xlmr-base')

# Column order of the label vector; the submission cell reads predictions back in this order
label_columns_t2 = ['gender/sexual','political','religious','racial/ethnic','other']

# Labelled datasets (train / validation)
train_dataset_t2 = PolarizationDataset_t2(
    texts=train_t2['text'].tolist(),
    labels=train_t2[label_columns_t2].values.tolist(),
    tokenizer=tokenizer_task2,
)
val_dataset_t2 = PolarizationDataset_t2(
    texts=val_t2['text'].tolist(),
    labels=val_t2[label_columns_t2].values.tolist(),
    tokenizer=tokenizer_task2,
)

# Dev-phase datasets, one per language; their labels are replaced by placeholders inside the class
dev_dataset_eng_t2 = Unlabelled_PolarizationDataset_t2(
    texts=dev_set_eng_t2['text'].tolist(),
    labels=dev_set_eng_t2[label_columns_t2].values.tolist(),
    tokenizer=tokenizer_task2,
)
dev_dataset_hau_t2 = Unlabelled_PolarizationDataset_t2(
    texts=dev_set_hau_t2['text'].tolist(),
    labels=dev_set_hau_t2[label_columns_t2].values.tolist(),
    tokenizer=tokenizer_task2,
)

### **2.3 Model Training**

#### **2.3.1 Model Initialization**

In [32]:
# Load the model with a 5-output multi-label classification head
model_task2 = AutoModelForSequenceClassification.from_pretrained(
    'Davlan/afro-xlmr-base',
    num_labels=5,
    problem_type="multi_label_classification",
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### **2.3.2 Metrics Function**

In [33]:
# Define metrics function for multi-label classification
def compute_metrics_multilabel_task2(p):
    """Macro-averaged F1 for the multi-label predictions of subtask 2.

    Args:
        p: prediction object exposing `predictions` (raw scores as a NumPy array)
            and `label_ids` (reference label matrix).

    Returns:
        dict with the single key 'f1_macro'.
    """
    # Sigmoid the predictions to get probabilities
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # Convert probabilities to predicted labels (0 or 1)
    preds = (probs > 0.5).int().numpy()
    # Labels with no true and no predicted positives score 0 without raising a warning,
    # matching the zero_division=0 setting of the subtask 1 metrics.
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro', zero_division=0)}



#### **2.3.3  Model Parameters**

In [34]:
# Define training arguments for subtask 2: evaluate after every epoch, keep no checkpoints
training_args_task2 = TrainingArguments(
        output_dir="./",
        num_train_epochs=4,
        learning_rate=4e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        seed=42,
        weight_decay=0.01,
        warmup_ratio=0.1,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=100,
        disable_tqdm=False,
        # Switch off every logging integration (W&B included) at the Trainer level,
        # so no reporting callback is attached to this run.
        report_to="none",
    )

#### **2.3.4 Model Trainer**

In [35]:
# Build the Trainer for subtask 2; batches are padded dynamically by the collator
trainer_task2 = Trainer(
    model=model_task2,
    args=training_args_task2,
    data_collator=DataCollatorWithPadding(tokenizer_task2),
    train_dataset=train_dataset_t2,
    eval_dataset=val_dataset_t2,
    compute_metrics=compute_metrics_multilabel_task2,
)

# Fine-tune the model
trainer_task2.train()

# Score the fine-tuned model on the validation split
eval_results = trainer_task2.evaluate()
print(f"Macro F1 score on validation set for Subtask 2: {eval_results['eval_f1_macro']}")

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.157,0.173785,0.050633
2,0.1289,0.137606,0.269954
3,0.1028,0.14133,0.307624
4,0.0763,0.150621,0.378051


Macro F1 score on validation set for Subtask 2: 0.37805125355927494


#### **2.3.5 Test on dev set**

In [36]:
from scipy.special import expit

In [37]:
# Per language: the dev dataset to predict on and the row ids for the submission file
pred_data_t2 = {
    lang_code: {'data': dataset, 'ids': frame['id'].tolist()}
    for lang_code, dataset, frame in (
        ("eng", dev_dataset_eng_t2, dev_set_eng_t2),
        ("hau", dev_dataset_hau_t2, dev_set_hau_t2),
    )
}

for lang in pred_data_t2:
    pred_dev = trainer_task2.predict(pred_data_t2[lang]['data'])

    # Logits -> probabilities (sigmoid), then a 0.5 cut-off per label
    probs = expit(pred_dev.predictions)
    y_pred = (probs > 0.5).astype(int)

    # Labels carried by the dev dataset (placeholders, not gold labels)
    y_true = pred_dev.label_ids
    pred_data_t2[lang]['results'] = y_pred



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### **2.3.6 Making Submission Package**

In [38]:
# One submission frame per language: row id followed by one 0/1 column per label.
# The column order mirrors the order of the label vector the model was trained on.
submission_dict = {}
output_columns = ['gender/sexual', 'political', 'religious', 'racial/ethnic', 'other']

for lang in pred_data_t2:
    lang_results = pred_data_t2[lang]['results']
    frame_data = {"id": pred_data_t2[lang]['ids']}
    for position, column in enumerate(output_columns):
        frame_data[column] = lang_results[:, position]
    df = pd.DataFrame(frame_data)
    submission_dict[lang] = df

create_submission_zip('subtask_2', submission_dict)

Submission zip created at: ./subtask_2.zip


#### **2.3.7 Saving the Model**

In [39]:
from datetime import datetime

In [40]:
# Save the fine-tuned subtask 2 model to Drive. The directory name records the backbone,
# the validation macro-F1 and a timestamp so that runs do not overwrite each other.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# The backbone fine-tuned in this notebook is Davlan/afro-xlmr-base (not mDeBERTa)
naming='today-afro-xlmr-base'
save_path = f"saved_model_t2_{naming}_f1_{eval_results['eval_f1_macro']:.4f}_{timestamp}"
trainer_task2.save_model(os.path.join('drive/MyDrive/NLP/models', save_path))

# **3. Subtask 3: Manifestation Identification**
Multi-label classification to classify how polarization is expressed, with multiple possible labels including Vilification, Extreme Language, Stereotype, Invalidation, Lack of Empathy, and Dehumanization.



### **3.1 Data Import**

In [41]:

# Subtask 3 data: split each language separately (stratified over all label columns),
# then pool the languages into one train frame and one validation frame.
labels = ['stereotype', 'vilification', 'dehumanization',
          'extreme_language', 'lack_of_empathy', 'invalidation']

train_eng, val_eng = multilabel_split(
    pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask3/train/eng.csv'), labels
)
train_hau, val_hau = multilabel_split(
    pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask3/train/hau.csv'), labels
)

train_t3 = pd.concat([train_eng, train_hau], ignore_index=True)
val_t3 = pd.concat([val_eng, val_hau], ignore_index=True)

# Dev-phase files used later for the submission predictions
dev_set_eng_t3 = pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask3/dev/eng.csv')
dev_set_hau_t3 = pd.read_csv('drive/MyDrive/NLP/dev_phase/subtask3/dev/hau.csv')

train_t3.head()

Unnamed: 0,id,text,stereotype,vilification,dehumanization,extreme_language,lack_of_empathy,invalidation
0,eng_973938b90b0ff5d87d35a582f83f5c89,is defending imperialism in the dnd chat,0,0,0,0,0,0
1,eng_07dfd4600426caca6e2c5883fcbea9ea,Still playing with this. I am now following Ra...,0,0,0,0,0,0
2,eng_f14519ff2302b6cd47712073f13bc461,.senate.gov Theres 3 groups out there Republic...,0,0,0,0,0,0
3,eng_e48b7e7542faafa544ac57b64bc80daf,"""ABC MD, David Anderson, said the additional f...",0,0,0,0,0,0
4,eng_7c581fb77bce8033aeba3d6dbd6273eb,"""bad people"" I have some conservative values s...",0,0,0,0,0,0


### **3.2 Dataset**

#### **3.2.1 Labelled Dataset**

In [42]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class PolarizationDataset_t3(torch.utils.data.Dataset):
    """Map-style dataset pairing each text with its multi-label target vector (subtask 3).

    Texts are tokenized lazily in `__getitem__`, truncated to `max_length` and left
    unpadded, so padding has to be applied when batches are assembled.

    Args:
        texts: indexable sequence of input strings.
        labels: indexable sequence of per-sample label lists, aligned with `texts`.
        tokenizer: callable tokenizer; called with `return_tensors='pt'`.
        max_length: maximum number of tokens kept per text.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized features of sample `idx` plus its labels as a float tensor."""
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # return_tensors='pt' adds a leading batch dimension of size 1. Remove exactly that
        # dimension: a bare squeeze() would also collapse a one-token sequence to a 0-d tensor.
        item = {key: value.squeeze(0) for key, value in encoding.items()}
        # Multi-label targets are floats (torch.float), not class indices (torch.long)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

#### **3.2.1 Unlabelled Dataset**

In [43]:
# Fix the dataset class by inheriting from torch.utils.data.Dataset
class UnlabelledPolarizationDataset_t3(torch.utils.data.Dataset):
    """Dataset for texts without gold labels (subtask 3); every sample gets six zero placeholders.

    Args:
        texts: indexable sequence of input strings.
        labels: ignored apart from keeping the signature parallel to the labelled
            dataset; may be None. One placeholder vector is created per text.
        tokenizer: callable tokenizer; called with `return_tensors='pt'`.
        max_length: maximum number of tokens kept per text.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        # Size the placeholders from the texts so the dataset works even when no label columns exist.
        self.labels = [[0,0,0,0,0,0] for _ in range(len(texts))]
        self.tokenizer = tokenizer
        self.max_length = max_length # Store max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized features of sample `idx` plus the placeholder labels as a float tensor."""
        encoding = self.tokenizer(self.texts[idx], truncation=True, padding=False, max_length=self.max_length, return_tensors='pt')

        # return_tensors='pt' adds a leading batch dimension of size 1. Remove exactly that
        # dimension: a bare squeeze() would also collapse a one-token sequence to a 0-d tensor.
        item = {key: value.squeeze(0) for key, value in encoding.items()}
        # Multi-label targets are floats (torch.float), not class indices (torch.long)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

#### **3.2.2 Tokenization**

In [44]:
# Tokenizer matching the backbone that is fine-tuned for subtask 3
tokenizer_task3 = AutoTokenizer.from_pretrained('Davlan/afro-xlmr-base')

# Column order of the label vector; the submission cell reads predictions back in this order
label_columns_t3 = ['vilification','extreme_language','stereotype','invalidation','lack_of_empathy','dehumanization']

# Labelled datasets (train / validation)
train_dataset_t3 = PolarizationDataset_t3(
    texts=train_t3['text'].tolist(),
    labels=train_t3[label_columns_t3].values.tolist(),
    tokenizer=tokenizer_task3,
)
val_dataset_t3 = PolarizationDataset_t3(
    texts=val_t3['text'].tolist(),
    labels=val_t3[label_columns_t3].values.tolist(),
    tokenizer=tokenizer_task3,
)

# Dev-phase datasets, one per language; their labels are replaced by placeholders inside the class
dev_dataset_eng_t3 = UnlabelledPolarizationDataset_t3(
    texts=dev_set_eng_t3['text'].tolist(),
    labels=dev_set_eng_t3[label_columns_t3].values.tolist(),
    tokenizer=tokenizer_task3,
)
dev_dataset_hau_t3 = UnlabelledPolarizationDataset_t3(
    texts=dev_set_hau_t3['text'].tolist(),
    labels=dev_set_hau_t3[label_columns_t3].values.tolist(),
    tokenizer=tokenizer_task3,
)

### **3.3 Model Training**

#### **3.3.1 Model Initialization**

In [45]:
# Load the model with a 6-output multi-label classification head
model_task3 = AutoModelForSequenceClassification.from_pretrained(
    'Davlan/afro-xlmr-base',
    num_labels=6,  # use 6 labels
    problem_type="multi_label_classification",
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### **3.3.2 Metrics Function**

In [46]:
# Define metrics function for multi-label classification
def compute_metrics_multilabel_task3(p):
    """Macro-averaged F1 for the multi-label predictions of subtask 3.

    Args:
        p: prediction object exposing `predictions` (raw scores as a NumPy array)
            and `label_ids` (reference label matrix).

    Returns:
        dict with the single key 'f1_macro'.
    """
    # Sigmoid the predictions to get probabilities
    probs = torch.sigmoid(torch.from_numpy(p.predictions))
    # Convert probabilities to predicted labels (0 or 1)
    preds = (probs > 0.5).int().numpy()
    # Labels with no true and no predicted positives score 0 without raising a warning,
    # matching the zero_division=0 setting of the subtask 1 metrics.
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro', zero_division=0)}

#### **3.3.3  Model Parameters**

In [47]:
# Define training arguments for subtask 3: evaluate after every epoch, keep no checkpoints
training_args_t3 = TrainingArguments(
    output_dir="./",
    num_train_epochs=6,
    learning_rate=4e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=100,
    disable_tqdm=False,
    # Switch off every logging integration (W&B included) at the Trainer level,
    # so no reporting callback is attached to this run.
    report_to="none",
)



#### **3.3.4 Model Trainer**

In [48]:
# Build the Trainer for subtask 3; batches are padded dynamically by the collator
trainer_task3 = Trainer(
    model=model_task3,
    args=training_args_t3,
    data_collator=DataCollatorWithPadding(tokenizer_task3),
    train_dataset=train_dataset_t3,
    eval_dataset=val_dataset_t3,
    compute_metrics=compute_metrics_multilabel_task3,
)

# Fine-tune the model
trainer_task3.train()

# Score the fine-tuned model on the validation split
eval_results = trainer_task3.evaluate()
print(f"Macro F1 score on validation set for Subtask 3: {eval_results['eval_f1_macro']}")

A ConfigError was raised whilst setting the number of model parameters in Weights & Biases config.


Epoch,Training Loss,Validation Loss,F1 Macro
1,0.2421,0.241052,0.126068
2,0.2278,0.243279,0.190858
3,0.2071,0.301725,0.157918
4,0.149,0.262554,0.243598
5,0.1456,0.298568,0.236469
6,0.1351,0.323367,0.251391


Macro F1 score on validation set for Subtask 3: 0.2513907614435154


#### **3.3.5 Test on dev set**

In [49]:
from scipy.special import expit

In [50]:
# Per language: the dev dataset to predict on and the row ids for the submission file
pred_data_t3 = {
    lang_code: {'data': dataset, 'ids': frame['id'].tolist()}
    for lang_code, dataset, frame in (
        ("eng", dev_dataset_eng_t3, dev_set_eng_t3),
        ("hau", dev_dataset_hau_t3, dev_set_hau_t3),
    )
}

for lang in pred_data_t3:
    pred_dev = trainer_task3.predict(pred_data_t3[lang]['data'])

    # Logits -> probabilities (sigmoid), then a 0.5 cut-off per label
    probs = expit(pred_dev.predictions)
    y_pred = (probs > 0.5).astype(int)

    # Labels carried by the dev dataset (placeholders, not gold labels)
    y_true = pred_dev.label_ids
    pred_data_t3[lang]['results'] = y_pred



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### **3.3.5 Making Submission Package**

In [51]:
# One submission frame per language: row id followed by one 0/1 column per label.
# The column order mirrors the order of the label vector the model was trained on.
submission_dict = {}
output_columns = ['vilification', 'extreme_language', 'stereotype',
                  'invalidation', 'lack_of_empathy', 'dehumanization']

for lang in pred_data_t3:
    lang_results = pred_data_t3[lang]['results']
    frame_data = {"id": pred_data_t3[lang]['ids']}
    for position, column in enumerate(output_columns):
        frame_data[column] = lang_results[:, position]
    df = pd.DataFrame(frame_data)
    submission_dict[lang] = df

create_submission_zip('subtask_3', submission_dict)

Submission zip created at: ./subtask_3.zip


#### **3.3.6 Saving the Model**

In [52]:
from datetime import datetime

In [53]:
# Save the fine-tuned subtask 3 model to Drive. The directory name records the backbone,
# the validation macro-F1 and a timestamp so that runs do not overwrite each other.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# The backbone fine-tuned in this notebook is Davlan/afro-xlmr-base (not mDeBERTa)
naming='micro-afro-xlmr-base'
save_path = f"saved_model_t3_{naming}_f1_{eval_results['eval_f1_macro']:.4f}_{timestamp}"
trainer_task3.save_model(os.path.join('drive/MyDrive/NLP/models', save_path))