## Extracting Narratives and Sub Narratives

In [1]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.6-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250327-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pdfplumber
import re
from collections import defaultdict

def extract_narratives(pdf_path):
    # Separate the dictionaries for narratives and sub-narratives
    ukraine_narratives = {}
    climate_narratives = {}

    ukraine_subnarratives = defaultdict(list)
    climate_subnarratives = defaultdict(list)

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue

            # Identify the taxonomy based on the page number
            if page_num == 1:
                narrative_dict = ukraine_narratives
                subnarrative_dict = ukraine_subnarratives
            elif page_num == 2:
                narrative_dict = climate_narratives
                subnarrative_dict = climate_subnarratives
            else:
                continue  # Ignore other pages if any have 

            lines = text.split("\n")
            current_narrative = None

            for line in lines:
                line = line.strip()
                if not line or line.startswith("Figure") or line.isdigit():
                    continue  # Skip the unwanted lines

                # If line are does not start with '-', it's a narrative
                if not line.startswith("-"):
                    current_narrative = line
                    if current_narrative != "Other":  # Exclude "Other"
                        narrative_dict[current_narrative] = len(narrative_dict)  # Assign the sequential number
                # If line are starts with '-', it's a sub-narrative
                elif current_narrative and current_narrative != "Other":
                    subnarrative_dict[current_narrative].append(line.lstrip("-").strip())

    return ukraine_narratives, climate_narratives, dict(ukraine_subnarratives), dict(climate_subnarratives)

# Example of usage
pdf_path = "NARRATIVE-TAXONOMIES.pdf" 
ukraine_narratives, climate_narratives, ukraine_subnarratives, climate_subnarratives = extract_narratives(pdf_path)

print("Ukraine War Narratives:", ukraine_narratives)
print("Climate Change Narratives:", climate_narratives)
print("Ukraine War Sub-Narratives:", ukraine_subnarratives)
print("Climate Change Sub-Narratives:", climate_subnarratives)



Ukraine War Narratives: {'Blaming the war on others rather than the invader': 0, 'Discrediting Ukraine': 1, 'Russia is the Victim': 2, 'Praise of Russia': 3, 'Overpraising the West': 4, 'Speculating war outcomes': 5, 'Discrediting the West, Diplomacy': 6, 'Negative Consequences for the West': 7, 'Distrust towards Media': 8, 'Amplifying war-related fears': 9, 'Hidden plots by secret schemes of powerful groups': 10}
Climate Change Narratives: {'Criticism of climate policies': 0, 'Criticism of institutions and authorities': 1, 'Climate change is beneficial': 2, 'Downplaying climate change': 3, 'Questioning the measurements and science': 4, 'Criticism of climate movement': 5, 'Controversy about green technologies': 6, 'Hidden plots by secret schemes of powerful groups': 7, 'Amplifying Climate Fears': 8, 'Green policies are geopolitical instruments': 9}
Ukraine War Sub-Narratives: {'Blaming the war on others rather than the invader': ['Ukraine is the aggressor', 'The West are the aggressors

## Data Loading

In [3]:
!unzip /content/train.zip -d /content/Train

Archive:  /content/train.zip
   creating: /content/Train/target_4_December_release/
   creating: /content/Train/target_4_December_release/BG/
   creating: /content/Train/target_4_December_release/BG/raw-documents/
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10015.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10345.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10380.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10468.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10525.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10556.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10565.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10575.txt  
  inflating: /content/Train/target_4_December_rele

In [4]:
import os
import torch
from transformers import XLMRobertaTokenizer
from torch.utils.data import Dataset

In [None]:
# Paths
DATA_DIR = "/content/Train" 
LANGUAGES = ["BG", "EN", "HI", "PT", "RU"]

ukraine_narratives, climate_narratives, ukraine_subnarratives, climate_subnarratives = extract_narratives(pdf_path)

# Combine the Ukraine and Climate Change narratives into a single dictionary
combined_narratives = {**ukraine_narratives, **climate_narratives}

# Reassign the values to ensure they are unique and continuously increasing
narratives = {k: i for i, k in enumerate(combined_narratives.keys())}

# Combine the Ukraine and Climate Change sub-narratives into a single dictionary
subnarratives = {**ukraine_subnarratives, **climate_subnarratives}

from collections import OrderedDict

# Flatten the subnarratives while preserving order and uniqueness
seen = set()
ordered_sub_narratives = []
for subs in subnarratives.values():
    for sub in subs:
        if sub not in seen:
            seen.add(sub)
            ordered_sub_narratives.append(sub)

# Assign indices into the original order
sub_narrative_indices = {sub: i for i, sub in enumerate(ordered_sub_narratives)}

# Optional: check count
num_sub_narratives = len(ordered_sub_narratives)
print(f"Total sub-narratives: {num_sub_narratives}")
print(f"Sub-narrative indices: {sub_narrative_indices}")



Total sub-narratives: 74
Sub-narrative indices: {'Ukraine is the aggressor': 0, 'The West are the aggressors': 1, 'Rewriting Ukraine’s history': 2, 'Discrediting Ukrainian nation and society': 3, 'Discrediting Ukrainian military': 4, 'Discrediting Ukrainian government and officials and policies': 5, 'Ukraine is a puppet of the West': 6, 'Ukraine is a hub for criminal activities': 7, 'Ukraine is associated with nazism': 8, 'Situation in Ukraine is hopeless': 9, 'The West is russophobic': 10, 'Russia actions in Ukraine are only self-defence': 11, 'UA is anti-RU extremists': 12, 'Praise of Russian military might': 13, 'Praise of Russian President Vladimir Putin': 14, 'Russia is a guarantor of peace and prosperity': 15, 'Russia has international support from a number of countries and people': 16, 'Russian invasion has strong national support': 17, 'NATO will destroy Russia': 18, 'The West belongs in the right side of history': 19, 'The West has the strongest international support': 20, 'Ru

In [6]:
narratives

{'Blaming the war on others rather than the invader': 0,
 'Discrediting Ukraine': 1,
 'Russia is the Victim': 2,
 'Praise of Russia': 3,
 'Overpraising the West': 4,
 'Speculating war outcomes': 5,
 'Discrediting the West, Diplomacy': 6,
 'Negative Consequences for the West': 7,
 'Distrust towards Media': 8,
 'Amplifying war-related fears': 9,
 'Hidden plots by secret schemes of powerful groups': 10,
 'Criticism of climate policies': 11,
 'Criticism of institutions and authorities': 12,
 'Climate change is beneficial': 13,
 'Downplaying climate change': 14,
 'Questioning the measurements and science': 15,
 'Criticism of climate movement': 16,
 'Controversy about green technologies': 17,
 'Amplifying Climate Fears': 18,
 'Green policies are geopolitical instruments': 19}

In [7]:
subnarratives

{'Blaming the war on others rather than the invader': ['Ukraine is the aggressor',
  'The West are the aggressors'],
 'Discrediting Ukraine': ['Rewriting Ukraine’s history',
  'Discrediting Ukrainian nation and society',
  'Discrediting Ukrainian military',
  'Discrediting Ukrainian government and officials and policies',
  'Ukraine is a puppet of the West',
  'Ukraine is a hub for criminal activities',
  'Ukraine is associated with nazism',
  'Situation in Ukraine is hopeless'],
 'Russia is the Victim': ['The West is russophobic',
  'Russia actions in Ukraine are only self-defence',
  'UA is anti-RU extremists'],
 'Praise of Russia': ['Praise of Russian military might',
  'Praise of Russian President Vladimir Putin',
  'Russia is a guarantor of peace and prosperity',
  'Russia has international support from a number of countries and people',
  'Russian invasion has strong national support'],
 'Overpraising the West': ['NATO will destroy Russia',
  'The West belongs in the right side o

In [None]:
# Load Tokenizer
TOKENIZER = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Dataset Class
class NarrativeDataset(Dataset):
    def __init__(self, texts, narrative_labels, sub_narrative_labels, tokenizer, max_len=512):
        self.texts = texts
        self.narrative_labels = narrative_labels
        self.sub_narrative_labels = sub_narrative_labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        narrative_label = self.narrative_labels[idx]
        sub_narrative_label = self.sub_narrative_labels[idx]

        # Tokenized text
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        input_ids = inputs["input_ids"].squeeze(0)
        attention_mask = inputs["attention_mask"].squeeze(0)

        # Convertthe labels to tensors
        narrative_label = torch.tensor(narrative_label, dtype=torch.float)
        sub_narrative_label = torch.tensor(sub_narrative_label, dtype=torch.float)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "narrative_labels": narrative_label,
            "sub_narrative_labels": sub_narrative_label

        }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [None]:
# Load and Clean the Data
def load_and_clean_data():
    texts, narrative_labels, sub_narrative_labels = [], [], []
    lang_counts = {lang: 0 for lang in LANGUAGES}
    lang_annotations = {lang: [] for lang in LANGUAGES}  # To store annotations per language

    for lang in LANGUAGES:
        lang_path = os.path.join(DATA_DIR, "target_4_December_release", lang)
        annotation_file = os.path.join(lang_path, "subtask-2-annotations.txt")

        # Debug: Print the annotation of file path
        #print(f"Checking annotation file for {lang}: {annotation_file}")

        if not os.path.exists(annotation_file):
            print(f"Warning: No annotations found for {lang}")
            continue

        with open(annotation_file, "r", encoding="utf-8") as file:
            for line in file:
                parts = line.strip().split("\t")
                if len(parts) < 3:
                    print(f"Skipping malformed line: {line}")
                    continue

                article_id, narratives_str, subnarratives_str = parts[0], parts[1], parts[2]

                # Skip the samples where both narratives and sub-narratives are "Other"
                if narratives_str == "Other" and subnarratives_str == "Other":
                    continue

                # Read thecorresponding text file
                text_file = os.path.join(lang_path, "raw-documents", article_id)

                # Debug: Print the text file path
                #print(f"Checking text file for {article_id}: {text_file}")

                if not os.path.exists(text_file):
                    print(f"Warning: Missing text file {article_id} in {lang}")
                    continue

                with open(text_file, "r", encoding="utf-8") as f:
                    text = f.read()

                # Split the narratives and sub-narratives
                narrative_list = [n.split(": ")[1] if ": " in n else n for n in narratives_str.split(";")]
                sub_narrative_list = []
                for s in subnarratives_str.split(";"):
                  s = (((s.split(";"))[0].split(": ")))[2]
                  sub_narrative_list.append(s)


                # sub_narrative_list = [s.split(": ")[1] if ": " in s else s for s in subnarratives_str.split(";")]

                # Encode the Narratives (Multi-Label Classification)
                narrative_label = [1 if narrative in narrative_list else 0 for narrative in narratives.keys()]

                # Encode the Sub-Narratives (Multi-Label Classification)
                sub_narrative_label = [0] * num_sub_narratives  # Initialize the zero vector
                for sub in sub_narrative_list:
                    if sub in sub_narrative_indices:
                        sub_narrative_label[sub_narrative_indices[sub]] = 1  # Set index to 1

                # Store annotation
                annotation = {
                    "text": text,
                    "article_id": article_id,
                    "narratives_str": narratives_str,
                    "sub_narratives_str": subnarratives_str,
                    "narrative_label": narrative_label,
                    "sub_narrative_label": sub_narrative_label,
                    "narrative_list": narrative_list,
                    "sub_narrative_list": sub_narrative_list,
                    "sub_narrative_indices": sub_narrative_indices
                }

                texts.append(text)
                narrative_labels.append(narrative_label)
                sub_narrative_labels.append(sub_narrative_label)
                lang_annotations[lang].append(annotation)
                lang_counts[lang] += 1

    return texts, narrative_labels, sub_narrative_labels, lang_counts, lang_annotations

In [None]:
# Execution 
if __name__ == "__main__":
    print("Loading data...")
    texts, narrative_labels, sub_narrative_labels, lang_counts, lang_annotations = load_and_clean_data()

    # Print the statistics
    print("\n### Annotations per Language ###")
    for lang, count in lang_counts.items():
        print(f"{lang}: {count} annotations")

    # lang = "EN"
    # value_380 = lang_annotations[lang][200]

    # for key, val in value_380.items():
    #     print(f"{key}: {val}")
    print("\n### Sample Preprocessed Output per Language ###")
    for lang, lang_anns in lang_annotations.items():
        print(f"\nLanguage: {lang}")
        lang_dataset = NarrativeDataset(
            [ann["text"] for ann in lang_anns],
            [ann["narrative_label"] for ann in lang_anns],
            [ann["sub_narrative_label"] for ann in lang_anns],
            TOKENIZER
        )

        # Display 2 examples per language
        for i in range(min(2, len(lang_dataset))):
            sample = lang_dataset[i]
            print(f"\nExample {i+1}:")
            print("Article ID:", lang_anns[i]["article_id"])
            print("Tokenized Input IDs:", sample["input_ids"][:20])  # Print 1st 20 tokens
            print("Decoded Text:", TOKENIZER.decode(sample["input_ids"][:100]))  # Decode 1st 100 tokens
            print("Narrative Labels:", sample["narrative_labels"].numpy())  # Print the narrative labels
            print("Sub-Narrative Labels:", sample["sub_narrative_labels"].numpy())  # Print the sub-narrative labels

Loading data...

### Annotations per Language ###
BG: 371 annotations
EN: 230 annotations
HI: 268 annotations
PT: 373 annotations
RU: 133 annotations

### Sample Preprocessed Output per Language ###

Language: BG

Example 1:
Article ID: BG_670.txt
Tokenized Input IDs: tensor([     0,   1089,  22617,   1669,     29,  47829,   2097,  32275,     69,
           137,    197,  35359,  53335,   2827,  40053,    155,    135, 128601,
            29,  12747])
Decoded Text: <s> Опитът на колективния Запад да „обезкърви Русия“ с ръцете на властите в Киев „се провали с гръм и трясък“ и скоро от Украйна ... Опитът на колективния Запад да „обезкърви Русия“ с ръцете на властите в Киев „се провали с гръм и трясък“ и скоро от Украйна няма да остане почти нищо, ако не започне процесът на разрешаване на този въоръжен конфликт
Narrative Labels: [1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Sub-Narrative Labels: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 

## Training the model

In [None]:
import torch.nn as nn
from transformers import XLMRobertaModel, XLMRobertaConfig
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
import torch
import numpy as np
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

class EnhancedNarrativeModel(nn.Module):
    def __init__(self, num_narratives, num_sub_narratives):
        super(EnhancedNarrativeModel, self).__init__()

        config = XLMRobertaConfig.from_pretrained(
            "xlm-roberta-base",
            output_hidden_states=True,
            hidden_dropout_prob=0.2,
            attention_probs_dropout_prob=0.2
        )
        self.xlm_roberta = XLMRobertaModel.from_pretrained("xlm-roberta-base", config=config)

        self.dropout = nn.Dropout(0.3)
        self.hidden = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        self.gelu = nn.GELU()

        self.narrative_classifier = nn.Linear(config.hidden_size, num_narratives)

        # Hierarchical sub-narrative classifier are takes both pooled_output and narrative_logits
        self.sub_narrative_hidden = nn.Linear(config.hidden_size + num_narratives, config.hidden_size)
        self.sub_narrative_classifier = nn.Linear(config.hidden_size, num_sub_narratives)

        # Weightof the Initialization
        nn.init.xavier_uniform_(self.narrative_classifier.weight)
        nn.init.xavier_uniform_(self.sub_narrative_classifier.weight)
        nn.init.zeros_(self.narrative_classifier.bias)
        nn.init.zeros_(self.sub_narrative_classifier.bias)

    def forward(self, input_ids, attention_mask):
        outputs = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)

        last_hidden_states = outputs.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
        sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        pooled_output = sum_embeddings / sum_mask

        pooled_output = self.dropout(pooled_output)
        pooled_output = self.hidden(pooled_output)
        pooled_output = self.layer_norm(pooled_output)
        pooled_output = self.gelu(pooled_output)
        pooled_output = self.dropout(pooled_output)

        # Step 1: Predict the narrative logits
        narrative_logits = self.narrative_classifier(pooled_output)

        # Step 2: Concatenate of pooled_output with narrative_logits for sub-narrative prediction
        combined = torch.cat((pooled_output, narrative_logits), dim=1)
        sub_hidden = self.gelu(self.sub_narrative_hidden(combined))
        sub_narrative_logits = self.sub_narrative_classifier(self.dropout(sub_hidden))

        return narrative_logits, sub_narrative_logits

def calculate_focal_weights(labels, gamma=2):
    """Calculate focal weights for imbalanced datasets"""
    pos_counts = np.sum(labels, axis=0)
    neg_counts = len(labels) - pos_counts
    class_weights = neg_counts / (pos_counts + 1e-6)
    focal_weights = (1 / (1 + np.exp(-class_weights))) ** gamma
    return torch.tensor(focal_weights, dtype=torch.float32)

In [12]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        BCE_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = (1 - pt) ** self.gamma * BCE_loss

        if self.alpha is not None:
            F_loss = self.alpha * F_loss

        if self.reduction == 'mean':
            return torch.mean(F_loss)
        elif self.reduction == 'sum':
            return torch.sum(F_loss)
        else:
            return F_loss

In [None]:
# Enhanced the training setup
def train_enhanced_model(narrative_labels, sub_narrative_labels):
    # Initialized model with the correct number of classes
    num_narratives = len(narratives)
    num_sub_narratives = len(sub_narrative_indices)
    model = EnhancedNarrativeModel(num_narratives, num_sub_narratives)

    # Calculate the focal weights
    narrative_labels_all = np.array(narrative_labels)
    sub_narrative_labels_all = np.array(sub_narrative_labels)

    # Move model to device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    narrative_weights = calculate_focal_weights(narrative_labels_all, gamma=2).to(device)
    sub_narrative_weights = calculate_focal_weights(sub_narrative_labels_all, gamma=2).to(device)

    # Loss functions with focal loss
    narrative_criterion = FocalLoss(alpha=narrative_weights, gamma=2)
    sub_narrative_criterion = FocalLoss(alpha=sub_narrative_weights, gamma=2)


    # Enhanced optimizer
    optimizer = optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

    # Learning rate scheduler
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

    # Initialize dataset
    dataset = NarrativeDataset(texts, narrative_labels, sub_narrative_labels, TOKENIZER)

    # Stratified split (pseudo-stratification for multi-label)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    indices = list(range(len(dataset)))

    # Sort indices by label sum to approximate stratification
    label_sums = [sum(narrative_labels[i]) + sum(sub_narrative_labels[i]) for i in range(len(dataset))]
    sorted_indices = [i for _, i in sorted(zip(label_sums, indices))]

    train_indices = sorted_indices[:train_size]
    val_indices = sorted_indices[train_size:]

    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    val_dataset = torch.utils.data.Subset(dataset, val_indices)

    # Data loaders with larger batch size
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True)

    # Training loop with enhancements
    best_val_f1 = 0.0
    patience = 4
    epochs_without_improvement = 0

    for epoch in range(20):  # Increased max epochs
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            narrative_labels = batch['narrative_labels'].to(device)
            sub_narrative_labels = batch['sub_narrative_labels'].to(device)

            # Forward pass
            narrative_logits, sub_narrative_logits = model(input_ids, attention_mask)

            # Calculate losses
            narrative_loss = narrative_criterion(narrative_logits, narrative_labels)
            sub_narrative_loss = sub_narrative_criterion(sub_narrative_logits, sub_narrative_labels)
            total_loss = narrative_loss + sub_narrative_loss

            # Backward pass with gradient clipping
            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            train_loss += total_loss.item()

        # Validation phase
        model.eval()
        val_loss = 0.0
        all_narrative_preds = []
        all_narrative_labels = []
        all_sub_narrative_preds = []
        all_sub_narrative_labels = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                narrative_labels = batch['narrative_labels'].to(device)
                sub_narrative_labels = batch['sub_narrative_labels'].to(device)

                # Forward pass
                narrative_logits, sub_narrative_logits = model(input_ids, attention_mask)

                # Calculate losses
                narrative_loss = narrative_criterion(narrative_logits, narrative_labels)
                sub_narrative_loss = sub_narrative_criterion(sub_narrative_logits, sub_narrative_labels)
                total_loss = narrative_loss + sub_narrative_loss
                val_loss += total_loss.item()

                # Store predictions and labels
                narrative_preds = torch.sigmoid(narrative_logits) > 0.3  # Lower threshold
                sub_narrative_preds = torch.sigmoid(sub_narrative_logits) > 0.3

                all_narrative_preds.append(narrative_preds.cpu())
                all_narrative_labels.append(narrative_labels.cpu())
                all_sub_narrative_preds.append(sub_narrative_preds.cpu())
                all_sub_narrative_labels.append(sub_narrative_labels.cpu())

        # Calculate metrics
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        all_narrative_preds = torch.cat(all_narrative_preds).numpy()
        all_narrative_labels = torch.cat(all_narrative_labels).numpy()
        all_sub_narrative_preds = torch.cat(all_sub_narrative_preds).numpy()
        all_sub_narrative_labels = torch.cat(all_sub_narrative_labels).numpy()

        # Calculate F1 scores (using macro average for better class balance)
        narrative_f1 = f1_score(all_narrative_labels, all_narrative_preds, average='macro')
        sub_narrative_f1 = f1_score(all_sub_narrative_labels, all_sub_narrative_preds, average='macro')
        combined_f1 = (narrative_f1 + sub_narrative_f1) / 2

        # Update learning rate
        scheduler.step(val_loss)

        print(f"\nEpoch {epoch+1}:")
        print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
        print(f"Narrative F1: {narrative_f1:.4f} | Sub-narrative F1: {sub_narrative_f1:.4f}")

        # Early stopping based on combined F1
        if combined_f1 > best_val_f1:
            best_val_f1 = combined_f1
            epochs_without_improvement = 0
            torch.save(model.state_dict(), "best_enhanced_model.pt")
            print("Saved new best model!")
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"\nEarly stopping after {patience} epochs without improvement")
                break

    return model

# Run enhanced training
print("Starting enhanced training...")
enhanced_model = train_enhanced_model(narrative_labels, sub_narrative_labels)

Starting enhanced training...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]


Epoch 1:
Train Loss: 0.2626 | Val Loss: 0.2542
Narrative F1: 0.1179 | Sub-narrative F1: 0.0786
Saved new best model!

Epoch 2:
Train Loss: 0.1402 | Val Loss: 0.2145
Narrative F1: 0.2359 | Sub-narrative F1: 0.0269
Saved new best model!

Epoch 3:
Train Loss: 0.1124 | Val Loss: 0.2031
Narrative F1: 0.2649 | Sub-narrative F1: 0.0105
Saved new best model!

Epoch 4:
Train Loss: 0.0998 | Val Loss: 0.1941
Narrative F1: 0.2816 | Sub-narrative F1: 0.0057
Saved new best model!

Epoch 5:
Train Loss: 0.0930 | Val Loss: 0.2068
Narrative F1: 0.3145 | Sub-narrative F1: 0.0075
Saved new best model!

Epoch 6:
Train Loss: 0.0878 | Val Loss: 0.2047
Narrative F1: 0.3432 | Sub-narrative F1: 0.0113
Saved new best model!

Epoch 7:
Train Loss: 0.0839 | Val Loss: 0.2002
Narrative F1: 0.3633 | Sub-narrative F1: 0.0190
Saved new best model!

Epoch 8:
Train Loss: 0.0809 | Val Loss: 0.1950
Narrative F1: 0.3835 | Sub-narrative F1: 0.0221
Saved new best model!

Epoch 9:
Train Loss: 0.0778 | Val Loss: 0.1955
Narrativ

In [None]:
def predict_validation_data(model, val_loader, narratives_dict, sub_narrative_indices, tokenizer):
    """
    Enhanced prediction function that evaluates narrative-subnarrative pairs

    Args:
        model: Trained model
        val_loader: Validation DataLoader
        narratives_dict: Dictionary of narrative names to indices
        sub_narrative_indices: Dictionary of sub-narrative names to indices
        tokenizer: Tokenizer used for the model

    Returns:
        tuple: (texts, true_pairs, pred_pairs, narrative_probs, sub_narrative_probs)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    texts = []
    true_pairs = []  # List of the sets of (narrative, subnarrative) tuples
    pred_pairs = []   # List of the sets of (narrative, subnarrative) tuples
    narrative_probs = []
    sub_narrative_probs = []

    # Create the reverse mappings
    idx_to_narrative = {v: k for k, v in narratives_dict.items()}
    idx_to_subnarrative = {v: k for k, v in sub_narrative_indices.items()}

    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Get the true labels
            batch_true_narrative = batch['narrative_labels'].cpu().numpy()
            batch_true_subnarrative = batch['sub_narrative_labels'].cpu().numpy()

            # Get the original texts
            batch_texts = []
            for i in range(len(input_ids)):
                original_idx = val_dataset.indices[batch_idx * val_loader.batch_size + i]
                batch_texts.append(val_dataset.dataset.texts[original_idx])

            # Get the predictions
            narrative_logits, sub_narrative_logits = model(input_ids, attention_mask)

            # Convert to the probabilities and predictions (using threshold=0.3)
            batch_narrative_probs = torch.sigmoid(narrative_logits).cpu().numpy()
            batch_sub_narrative_probs = torch.sigmoid(sub_narrative_logits).cpu().numpy()

            batch_narrative_preds = (batch_narrative_probs > 0.3).astype(int)
            batch_sub_narrative_preds = (batch_sub_narrative_probs > 0.3).astype(int)

            # Store the probabilities
            narrative_probs.extend(batch_narrative_probs)
            sub_narrative_probs.extend(batch_sub_narrative_probs)

            # Process each sample in the batch
            for i in range(len(input_ids)):
                texts.append(batch_texts[i])

                # Get the true narrative-subnarrative pairs
                true_narratives = [idx_to_narrative[idx] for idx, val in enumerate(batch_true_narrative[i]) if val == 1]
                true_subnarratives = [idx_to_subnarrative[idx] for idx, val in enumerate(batch_true_subnarrative[i]) if val == 1]

                # Create all the possible true pairs (cartesian product)
                true_pair_set = set()
                for n in true_narratives:
                    for s in true_subnarratives:
                        true_pair_set.add((n, s))
                true_pairs.append(true_pair_set)

                # Get the predicted narrative-subnarrative pairs
                pred_narratives = [idx_to_narrative[idx] for idx, val in enumerate(batch_narrative_preds[i]) if val == 1]
                pred_subnarratives = [idx_to_subnarrative[idx] for idx, val in enumerate(batch_sub_narrative_preds[i]) if val == 1]

                # Create all the possible predicted pairs
                pred_pair_set = set()
                for n in pred_narratives:
                    for s in pred_subnarratives:
                        pred_pair_set.add((n, s))
                pred_pairs.append(pred_pair_set)

    return texts, true_pairs, pred_pairs, narrative_probs, sub_narrative_probs

def evaluate_pair_f1(true_pairs, pred_pairs):
    """
    Calculate document-level F1 scores for narrative-subnarrative pairs
    Returns micro and macro averaged F1 scores
    """
    f1_scores = []

    for true_set, pred_set in zip(true_pairs, pred_pairs):
        if not true_set and not pred_set:
            # Both are empty - perfect match
            f1_scores.append(1.0)
        elif not true_set or not pred_set:
            # One empty - zero score
            f1_scores.append(0.0)
        else:
            tp = len(true_set & pred_set)
            fp = len(pred_set - true_set)
            fn = len(true_set - pred_set)

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
            f1_scores.append(f1)

    micro_f1 = sum(f1_scores) / len(f1_scores)

    # Calculate macro F1 by first averaging precision and recall
    precisions = []
    recalls = []

    for true_set, pred_set in zip(true_pairs, pred_pairs):
        if not true_set and not pred_set:
            precisions.append(1.0)
            recalls.append(1.0)
        elif not true_set:
            precisions.append(0.0)
            recalls.append(1.0)  # We predicted something when shouldn't have
        elif not pred_set:
            precisions.append(1.0)  # We didnot predict anything (no false positives)
            recalls.append(0.0)
        else:
            tp = len(true_set & pred_set)
            fp = len(pred_set - true_set)
            fn = len(true_set - pred_set)

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            precisions.append(precision)
            recalls.append(recall)

    macro_precision = sum(precisions) / len(precisions)
    macro_recall = sum(recalls) / len(recalls)
    macro_f1 = 2 * macro_precision * macro_recall / (macro_precision + macro_recall) if (macro_precision + macro_recall) > 0 else 0

    return micro_f1, macro_f1

def analyze_pair_predictions(texts, true_pairs, pred_pairs, num_samples=5):
    """
    Analyze and display pair prediction results
    """
    # Calculate overall metrics
    micro_f1, macro_f1 = evaluate_pair_f1(true_pairs, pred_pairs)

    # Calculate exact match accuracy
    exact_matches = sum(1 for t, p in zip(true_pairs, pred_pairs) if t == p) / len(true_pairs)

    print("\n" + "="*80)
    print("NARRATIVE-SUBNARRATIVE PAIR EVALUATION")
    print("="*80)
    print(f"\nMicro-averaged F1: {micro_f1:.4f}")
    print(f"Macro-averaged F1: {macro_f1:.4f}")
    #print(f"Exact Match Accuracy: {exact_matches:.2%}")

    # Print sample predictions
    print(f"\nSample Predictions (showing {num_samples} samples):")
    print("-"*80)
    for i in range(min(num_samples, len(texts))):
        print(f"\nSample {i+1}:")
        print(f"Text: {texts[i][:100]}...")
        print(f"True Pairs: {true_pairs[i] if true_pairs[i] else 'None'}")
        print(f"Pred Pairs: {pred_pairs[i] if pred_pairs[i] else 'None'}")
        print(f"Correct: {len(true_pairs[i] & pred_pairs[i])}/{len(true_pairs[i])}")
        print(f"F1: {evaluate_pair_f1([true_pairs[i]], [pred_pairs[i]])[0]:.4f}")

# Load the best model
enhanced_model.load_state_dict(torch.load("best_enhanced_model.pt"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enhanced_model = enhanced_model.to(device)

# Initialized dataset
dataset = NarrativeDataset(texts, narrative_labels, sub_narrative_labels, TOKENIZER)

# Stratified split (pseudo-stratification for multi-label)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
indices = list(range(len(dataset)))

# Sort indices by label sum to approximate stratification
label_sums = [sum(narrative_labels[i]) + sum(sub_narrative_labels[i]) for i in range(len(dataset))]
sorted_indices = [i for _, i in sorted(zip(label_sums, indices))]

train_indices = sorted_indices[:train_size]
val_indices = sorted_indices[train_size:]

train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)

# Data loaders with larger batch size
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, pin_memory=True) # Define val_loader in global scope

# Get predictions for validation data
val_texts, val_true_pairs, val_pred_pairs, val_narrative_probs, val_sub_narrative_probs = predict_validation_data(
    enhanced_model, val_loader, narratives, sub_narrative_indices, TOKENIZER
)

# Analyze the predictions
analyze_pair_predictions(val_texts, val_true_pairs, val_pred_pairs)


NARRATIVE-SUBNARRATIVE PAIR EVALUATION

Micro-averaged F1: 0.1073
Macro-averaged F1: 0.1360

Sample Predictions (showing 5 samples):
--------------------------------------------------------------------------------

Sample 1:
Text: Най-добрите климатолози очакват съвсем скоро затоплянето да прехвърли 1.5 С

Очакванията са до 2100 ...
True Pairs: {('Amplifying Climate Fears', 'Doomsday scenarios for humans'), ('Criticism of institutions and authorities', 'Criticism of national governments'), ('Criticism of institutions and authorities', 'Criticism of political organizations and figures'), ('Amplifying Climate Fears', 'Criticism of national governments'), ('Criticism of institutions and authorities', 'Amplifying existing fears of global warming'), ('Amplifying Climate Fears', 'Criticism of political organizations and figures'), ('Criticism of institutions and authorities', 'Doomsday scenarios for humans'), ('Amplifying Climate Fears', 'Amplifying existing fears of global warming')}
Pred 