### Import

In [None]:
import os
import pandas as pd
import re
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import spacy
from collections import defaultdict
from sklearn.model_selection import train_test_split
import ast
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical
import json

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Preprocessing

In [None]:
def preprocess_articles(article_dir):
    """
    Read every `.txt` article in a directory into a DataFrame.

    For each file the first line is taken as the title, the second line is
    skipped (it is expected to be the blank separator), and the remaining
    lines are stripped and joined with single spaces. The stored text is the
    title, a newline, then the joined body.

    Args:
    - article_dir (str): Directory path containing article `.txt` files.

    Returns:
    - pandas.DataFrame: DataFrame with columns ['article_id', 'text'], one row
      per `.txt` file, ordered by filename. The columns are present even when
      the directory contains no `.txt` files.
    """
    articles = []

    # os.listdir returns entries in arbitrary order; sort so that the row
    # order is reproducible across machines and runs.
    for filename in sorted(os.listdir(article_dir)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(article_dir, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        # The title (first line) is kept; only the second line is dropped.
        title = lines[0].strip() if lines else ""
        content = " ".join(line.strip() for line in lines[2:])

        articles.append({'article_id': filename, 'text': title + "\n" + content})

    # Passing the columns explicitly keeps the schema stable for empty input,
    # so downstream `df['text']` access does not raise KeyError.
    return pd.DataFrame(articles, columns=['article_id', 'text'])

# Preprocessing the annotations and merging with the article.
def preprocess_annotations(annotation_file, article_df):
    """
    Attach narrative annotations to the articles they belong to.

    The annotation file is read as a header-less, tab-separated table whose
    three columns are named 'article_id', 'narratives' and 'subnarratives'.
    It is left-joined onto `article_df`, so every article is kept; articles
    without an annotation row receive the label 'Other' in both label columns.

    Args:
    - annotation_file (str): Path to the annotation file (tab-separated).
    - article_df (pandas.DataFrame): DataFrame with article IDs and texts.

    Returns:
    - pandas.DataFrame: Merged DataFrame with columns ['article_id', 'text', 'narratives', 'subnarratives'].
    """
    label_columns = ['narratives', 'subnarratives']

    annotation_table = pd.read_csv(
        annotation_file,
        sep='\t',
        header=None,
        names=['article_id'] + label_columns,
    )

    # Left join: keep every article, annotated or not.
    merged_df = article_df.merge(annotation_table, on='article_id', how='left')

    # Missing labels (unannotated articles or empty fields) become 'Other'.
    for label_column in label_columns:
        merged_df[label_column] = merged_df[label_column].fillna('Other')

    return merged_df

# Taxonomies Processing
def parse_taxonomy(taxonomy_str):
    """
    Parse a taxonomy string into a hierarchical dictionary.

    Entries are separated by ';' and the levels inside an entry by ':'.
    Surrounding whitespace is removed from every level, so 'x:y; x:z' groups
    both entries under 'x' and ' Other' is the same label as 'Other'.
    Empty entries (for example after a trailing ';') are ignored.

    Args:
    - taxonomy_str (str): Taxonomy string, e.g., 'x:y; x:z; x:y:a'

    Returns:
    - dict: Maps each top-level narrative to
      {"level_1": [first-level labels], "level_2": {first-level label: [second-level labels]}}.
      Labels keep their order of first appearance and are not repeated.
      For 'Other', an empty/blank string, a non-string value (None, NaN) or a
      string with no usable entry, the result is
      {"Other": {"level_1": [], "level_2": {}}}.
    """
    # Non-string input (None, float NaN from pandas) carries no taxonomy.
    if not isinstance(taxonomy_str, str):
        return {"Other": {"level_1": [], "level_2": {}}}

    cleaned = taxonomy_str.strip()
    if not cleaned or cleaned == "Other":
        return {"Other": {"level_1": [], "level_2": {}}}

    taxonomy_dict = {}
    for entry in cleaned.split(";"):
        # Strip each level: annotation strings put spaces after ':' and ';'.
        levels = [part.strip() for part in entry.split(":")]

        narrative = levels[0]
        if not narrative:
            # Blank entry, e.g. produced by a trailing ';'.
            continue
        first_level = levels[1] if len(levels) > 1 and levels[1] else "Other"
        second_level = levels[2] if len(levels) > 2 and levels[2] else None

        node = taxonomy_dict.setdefault(narrative, {"level_1": [], "level_2": {}})

        # 'Other' at the first level means "no first-level label recorded".
        if first_level == "Other":
            continue

        if first_level not in node["level_1"]:
            node["level_1"].append(first_level)

        if second_level:
            children = node["level_2"].setdefault(first_level, [])
            if second_level not in children:
                children.append(second_level)

    if not taxonomy_dict:
        return {"Other": {"level_1": [], "level_2": {}}}
    return taxonomy_dict


def expand_taxonomies(df, narrative_col, subnarrative_col):
    """
    Expand per-article taxonomy strings into one row per label path.

    For every article, each (narrative, level_1) pair found in the narrative
    column is combined with the level_2 labels that the sub-narrative column
    records under the same narrative and level_1. When no level_2 label is
    available the row gets level_2 = "Other"; a narrative without any level_1
    label produces a single row with level_1 = level_2 = "Other".

    Args:
    - df (pd.DataFrame): DataFrame with an 'article_id' column and the taxonomy columns.
    - narrative_col (str): Column name for narrative-level taxonomies.
    - subnarrative_col (str): Column name for sub-narrative-level taxonomies.

    Returns:
    - pd.DataFrame: DataFrame with columns
      ['article_id', 'narrative', 'level_1', 'level_2'] (present even when
      `df` has no rows).
    """
    output_columns = ["article_id", "narrative", "level_1", "level_2"]
    expanded_data = []

    for _, row in df.iterrows():
        article_id = row['article_id']
        narratives = parse_taxonomy(row[narrative_col])
        subnarratives = parse_taxonomy(row[subnarrative_col])

        for narrative, details in narratives.items():
            # The sub-narrative column may not mention this narrative at all
            # (e.g. it is 'Other'); treat that as "no level_2 labels" instead
            # of failing with KeyError.
            level_2_by_level_1 = subnarratives.get(narrative, {}).get("level_2", {})

            if not details['level_1']:  # If no level 1 for a narrative
                expanded_data.append({
                    "article_id": article_id,
                    "narrative": narrative,
                    "level_1": "Other",
                    "level_2": "Other"
                })
                continue

            for level_1 in details['level_1']:
                # Fall back to a single "Other" when nothing is recorded.
                for level_2 in level_2_by_level_1.get(level_1) or ["Other"]:
                    expanded_data.append({
                        "article_id": article_id,
                        "narrative": narrative,
                        "level_1": level_1,
                        "level_2": level_2
                    })

    # Explicit columns keep the schema stable for empty input.
    return pd.DataFrame(expanded_data, columns=output_columns)


def load_annotations(file_path):
    """
    Read an annotation file into a DataFrame and default missing labels.

    The file is parsed as a header-less, tab-separated table; its three
    columns are named 'article_id', 'narratives' and 'subnarratives'. Missing
    values in the two label columns are replaced by the string 'Other'.

    Args:
    - file_path (str): Path to the annotations file.

    Returns:
    - pandas.DataFrame: DataFrame with columns ['article_id', 'narratives', 'subnarratives'].
    """
    column_names = ['article_id', 'narratives', 'subnarratives']
    annotations = pd.read_csv(file_path, sep='\t', header=None, names=column_names)

    # Only the label columns are defaulted; 'article_id' is left untouched.
    for label_column in column_names[1:]:
        annotations[label_column] = annotations[label_column].fillna('Other')

    return annotations


# Load the raw articles: `train_articles` holds the training set and
# `dev_articles` the development set (one row per .txt file).
train_articles_path = '/content/drive/MyDrive/NLP_Proj/target_4_December_release/RU/raw-documents'
train_articles = preprocess_articles(train_articles_path)
dev_articles_path = '/content/drive/MyDrive/NLP_Proj/cleaned_dev_10_january_2025/RU/subtask-2-documents'
dev_articles = preprocess_articles(dev_articles_path)
# print(train_articles.head())


# Load the annotations for both splits and expand each article's taxonomy
# strings into one row per (narrative, level_1, level_2) combination.
annotations_train_path = '/content/drive/MyDrive/NLP_Proj/target_4_December_release/RU/subtask-2-annotations.txt'
annotations_train_df = load_annotations(annotations_train_path)
expanded_train_annotations = expand_taxonomies(annotations_train_df, 'narratives', 'subnarratives')

annotations_dev_path = '/content/drive/MyDrive/NLP_Proj/cleaned_dev_10_january_2025/RU/subtask-2-annotations.txt'
annotations_dev_df = load_annotations(annotations_dev_path)
expanded_dev_annotations = expand_taxonomies(annotations_dev_df, 'narratives', 'subnarratives')


In [None]:
np.unique(expanded_train_annotations['narrative'])


array(['URW'], dtype=object)

In [None]:
train_articles.head()

Unnamed: 0,article_id,text
0,RU-URW-1017.txt,Дмитрий Стешин: Россия тихо глушит GPS европей...
1,RU-URW-1021.txt,23:55 Лукашенко обвинил Запад в стремлении к е...
2,RU-URW-1031.txt,Лихачев: РФ ждет от ООН оценку действий Киева ...
3,RU-URW-1025.txt,Финский наемник Сирен раскритиковал подготовку...
4,RU-URW-1011.txt,"Если ВС РФ расширят фронт, США позволят Украин..."


### Statistics on data

In [None]:
# Number of articles per split.
print("Len of train articles:", len(train_articles))
print("Len of dev articles:", len(dev_articles))

# Number of expanded annotation rows per split (an article can contribute
# several rows, one per label path).
print("Len of train annotation:", len(expanded_train_annotations))
print("Len of dev annotation:", len(expanded_dev_annotations))

Len of train articles: 133
Len of dev articles: 32
Len of dev annotation: 272
Len of dev annotation: 79


In [None]:
# Distinct top-level narratives in each split, sorted alphabetically.
narrative_unique = sorted(set(expanded_train_annotations['narrative']))
narrative_dev_unique = sorted(set(expanded_dev_annotations['narrative']))

# Print the number of distinct narratives followed by the labels themselves.
print(len(narrative_unique), narrative_unique)
print(len(narrative_dev_unique), narrative_dev_unique)

1 ['URW']
2 ['Other', 'URW']


In [None]:
# Distinct level_1 and level_2 labels in the training annotations, sorted.
Level_1_unique = sorted(set(expanded_train_annotations['level_1']))
Level_2_unique = sorted(set(expanded_train_annotations['level_2']))

# Print the number of distinct labels followed by the labels themselves.
print(len(Level_1_unique), Level_1_unique)
print(len(Level_2_unique), Level_2_unique)

11 [' Amplifying war-related fears', ' Blaming the war on others rather than the invader', ' Discrediting Ukraine', ' Discrediting the West, Diplomacy', ' Distrust towards Media', ' Hidden plots by secret schemes of powerful groups', ' Negative Consequences for the West', ' Overpraising the West', ' Praise of Russia', ' Russia is the Victim', ' Speculating war outcomes']
30 [' By continuing the war we risk WWIII', ' Diplomacy does/will not work', ' Discrediting Ukrainian government and officials and policies', ' Discrediting Ukrainian military', ' Discrediting Ukrainian nation and society', ' NATO should/will directly intervene', ' Other', ' Praise of Russian President Vladimir Putin', ' Praise of Russian military might', ' Rewriting Ukraine’s history', ' Russia actions in Ukraine are only self-defence', ' Russia has international support from a number of countries and people', ' Russia is a guarantor of peace and prosperity', ' Russia will also attack other countries', ' Situation in 

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim (training
# annotations only); it was possibly meant to summarise
# expanded_dev_annotations instead — confirm intent.
Level_1_unique = sorted(set(expanded_train_annotations['level_1']))
Level_2_unique = sorted(set(expanded_train_annotations['level_2']))

# Print the number of distinct labels followed by the labels themselves.
print(len(Level_1_unique), Level_1_unique)
print(len(Level_2_unique), Level_2_unique)

11 [' Amplifying war-related fears', ' Blaming the war on others rather than the invader', ' Discrediting Ukraine', ' Discrediting the West, Diplomacy', ' Distrust towards Media', ' Hidden plots by secret schemes of powerful groups', ' Negative Consequences for the West', ' Overpraising the West', ' Praise of Russia', ' Russia is the Victim', ' Speculating war outcomes']
30 [' By continuing the war we risk WWIII', ' Diplomacy does/will not work', ' Discrediting Ukrainian government and officials and policies', ' Discrediting Ukrainian military', ' Discrediting Ukrainian nation and society', ' NATO should/will directly intervene', ' Other', ' Praise of Russian President Vladimir Putin', ' Praise of Russian military might', ' Rewriting Ukraine’s history', ' Russia actions in Ukraine are only self-defence', ' Russia has international support from a number of countries and people', ' Russia is a guarantor of peace and prosperity', ' Russia will also attack other countries', ' Situation in 

### Articles Embedding

In [None]:
## Download the Russian language package
# Install spaCy and download its Russian model
!pip install spacy
!python -m spacy download ru_core_news_sm


Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-sm==3.7.0)
  Downloading pymorphy3-2.0.2-py3-none-any.whl.metadata (1.8 kB)
Collecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-sm==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.2-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy3_d

In [None]:
# Load spaCy's small Russian pipeline; it is used for named-entity extraction.
nlp_ru = spacy.load("ru_core_news_sm")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load the pre-trained XLM-R tokenizer and encoder used to embed the texts.
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)  # Ensure model is on the correct device

# Function to extract entities from Russian text
def extract_entities_russian(text):
    """
    Run the module-level spaCy pipeline `nlp_ru` on a text and list its named entities.

    Args:
        text (str): Input text in Russian.

    Returns:
        list: (entity text, entity label) tuples in the order they appear in
        the pipeline's `doc.ents`.
    """
    parsed = nlp_ru(text)

    entity_pairs = []
    for entity in parsed.ents:
        entity_pairs.append((entity.text, entity.label_))
    return entity_pairs

# Function to map extracted entities to actants
def map_entities_to_actants_russian(entities):
    """
    Group extracted entities into actant roles according to their label.

    Entities labelled "PER" become subjects; every other label (including
    "LOC" and "ORG") becomes an object. No other roles are produced.

    Args:
        entities (list): List of tuples (entity, label).

    Returns:
        dict: Maps "subject" and/or "object" to the list of entity texts in
        input order. A role with no entities is absent from the dict, and the
        keys appear in order of first occurrence.
    """
    grouped = defaultdict(list)
    for name, tag in entities:
        role = "subject" if tag == "PER" else "object"
        grouped[role].append(name)
    return dict(grouped)

# Function to convert actants to text format
def actants_to_text(actants):
    """
    Render an actants dictionary as a short descriptive sentence.

    Only the "subject" and "object" roles are rendered, always in that order;
    a role that is missing or empty is left out.

    Args:
        actants (dict): Dictionary mapping role names to lists of entity strings.

    Returns:
        str: For example "Subject: A, B. Object: C." — or an empty string when
        neither role has entries.
    """
    subjects = actants.get("subject")
    objects = actants.get("object")

    subject_part = f"Subject: {', '.join(subjects)}." if subjects else None
    object_part = f"Object: {', '.join(objects)}." if objects else None

    return " ".join(part for part in (subject_part, object_part) if part is not None)

# Function to compute embeddings using XLM-R
def compute_xlm_r_embedding(text):
    """
    Embed a text with the module-level XLM-R `model` using masked mean pooling.

    The text is tokenized with the module-level `tokenizer`, truncated and
    padded to 512 tokens, and moved to `device`. The encoder's last hidden
    state is averaged over the token positions whose attention mask is 1, so
    padding positions do not contribute to the mean.

    Args:
        text (str): Input text.

    Returns:
        numpy.ndarray: Pooled embedding on the CPU, with size-1 dimensions squeezed out.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding="max_length",
    ).to(device)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        model_output = model(**encoded)

    hidden_states = model_output.last_hidden_state

    # Broadcast the attention mask over the hidden dimension so that padded
    # positions are zeroed before summing.
    mask = encoded['attention_mask'].unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, dim=1)
    # Clamp guards against division by zero if the mask were all zeros.
    token_counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    pooled = masked_sum / token_counts

    return pooled.squeeze().cpu().numpy()

# Function to process Russian articles with embeddings
def process_russian_articles_with_embeddings(article_df):
    """
    Add entity, actant and embedding columns to an article DataFrame.

    The columns 'entities', 'actants', 'actants_text', 'text_embedding',
    'actants_embedding' and 'combined_embedding' are added to `article_df`
    itself (the input is modified in place) and the same object is returned.
    'combined_embedding' is the text embedding followed by the actants
    embedding.

    Args:
        article_df (pd.DataFrame): DataFrame with article_id and text columns.

    Returns:
        pd.DataFrame: The input DataFrame, now carrying the derived columns.
    """
    def _join_embeddings(record):
        # Concatenate the text vector and the actants vector of one row.
        return np.hstack([record['text_embedding'], record['actants_embedding']])

    # Each step derives one new column from an existing one; the order
    # matters because later steps read columns created by earlier ones.
    derivation_steps = (
        ('entities', 'text', extract_entities_russian),
        ('actants', 'entities', map_entities_to_actants_russian),
        ('actants_text', 'actants', actants_to_text),
        ('text_embedding', 'text', compute_xlm_r_embedding),
        ('actants_embedding', 'actants_text', compute_xlm_r_embedding),
    )
    for target_column, source_column, transform in derivation_steps:
        article_df[target_column] = article_df[source_column].apply(transform)

    article_df['combined_embedding'] = article_df.apply(_join_embeddings, axis=1)

    return article_df

## Train and dev articles: extract actants and compute embeddings.
# The function adds its columns to the DataFrame it receives, so
# processed_train_df / processed_dev_df are the same objects as
# train_articles / dev_articles.
processed_train_df = process_russian_articles_with_embeddings(train_articles)
processed_dev_df = process_russian_articles_with_embeddings(dev_articles)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [None]:
processed_train_df.head()

Unnamed: 0,article_id,text,entities,actants,actants_text,text_embedding,actants_embedding,combined_embedding
0,RU-URW-1017.txt,Дмитрий Стешин: Россия тихо глушит GPS европей...,"[(Дмитрий Стешин, PER), (Россия, LOC), (Россия...","{'subject': ['Дмитрий Стешин'], 'object': ['Ро...","Subject: Дмитрий Стешин. Object: Россия, Росси...","[0.007704131, 0.022058254, -0.007781817, 0.005...","[0.014929229, 0.054797117, 0.018662047, 0.0155...","[0.007704131, 0.022058254, -0.007781817, 0.005..."
1,RU-URW-1021.txt,23:55 Лукашенко обвинил Запад в стремлении к е...,"[(Лукашенко, PER), (Запад, LOC), (Украине, LOC...","{'subject': ['Лукашенко', 'Александр Лукашенко...","Subject: Лукашенко, Александр Лукашенко, Лукаш...","[0.0068586157, 0.038234632, -0.019731209, 0.02...","[0.022818657, 0.058647245, 0.016941959, 0.0069...","[0.0068586157, 0.038234632, -0.019731209, 0.02..."
2,RU-URW-1031.txt,Лихачев: РФ ждет от ООН оценку действий Киева ...,"[(РФ, LOC), (ООН, ORG), (Киева, LOC), (Энергод...","{'object': ['РФ', 'ООН', 'Киева', 'Энергодаре'...","Subject: Лихачев, Алексей Лихачев, Лихачев, Ра...","[-0.0023059421, 0.027885303, -0.0032359113, 0....","[0.019282598, 0.04593163, 0.01878873, 0.002279...","[-0.0023059421, 0.027885303, -0.0032359113, 0...."
3,RU-URW-1025.txt,Финский наемник Сирен раскритиковал подготовку...,"[(Сирен, PER), (ВСУ, ORG), (ВСУ, ORG), (Ральф ...","{'subject': ['Сирен', 'Ральф Сирен', 'Сирен'],...","Subject: Сирен, Ральф Сирен, Сирен. Object: ВС...","[-0.008707507, 0.04932406, -0.021118887, 0.021...","[0.02719694, 0.0851744, 0.0016649642, 0.037561...","[-0.008707507, 0.04932406, -0.021118887, 0.021..."
4,RU-URW-1011.txt,"Если ВС РФ расширят фронт, США позволят Украин...","[(ВС, ORG), (РФ, LOC), (США, LOC), (Украине, L...","{'object': ['ВС', 'РФ', 'США', 'Украине', 'Рос...","Subject: Джеймс О’Брайен, Джейк Салливан, Джей...","[-0.0027351298, 0.022900289, -0.010704674, 0.0...","[0.01587944, 0.04571804, 0.036064718, -0.00617...","[-0.0027351298, 0.022900289, -0.010704674, 0.0..."


In [None]:
processed_dev_df.head()

Unnamed: 0,article_id,text,entities,actants,actants_text,text_embedding,actants_embedding,combined_embedding
0,RU-URW-1120.txt,"Военная операция на Украине, день 869-й\n""Служ...","[(Украине, LOC), (Служба внешней разведки (СВР...","{'object': ['Украине', 'Служба внешней разведк...","Subject: Владимиру Зеленскому, Зеленского, Вал...","[0.024559716, 0.04935156, -0.009897733, 0.0196...","[0.02767332, 0.046520017, 0.019670695, 0.00079...","[0.024559716, 0.04935156, -0.009897733, 0.0196..."
1,RU-URW-1043.txt,08:15Главное за ночь 3 июля\nВладимир Путин пр...,"[(Владимир Путин, PER), (Астану, LOC), (ШОС, O...","{'subject': ['Владимир Путин', 'Эрдоганом', 'С...","Subject: Владимир Путин, Эрдоганом, Си Цзиньпи...","[0.00015725837, 0.038317386, -0.011712183, 0.0...","[0.0065622046, 0.0566163, 0.022719786, 0.01691...","[0.00015725837, 0.038317386, -0.011712183, 0.0..."
2,RU-URW-1073.txt,"""Украина не готова к компромиссу с Россией рад...","[(Украина, LOC), (Россией, LOC), (Офиса презид...","{'object': ['Украина', 'Россией', 'Офиса прези...","Subject: Трампа, Орбану, Ермак, Андрюша. Objec...","[0.0060347854, 0.03328179, 0.010227945, 0.0223...","[0.023116622, 0.07449146, 0.029456686, 0.03267...","[0.0060347854, 0.03328179, 0.010227945, 0.0223..."
3,RU-URW-1014.txt,Алаудинов: российские силы растянули и размыли...,"[(Алаудинов, PER), (ВСУ, ORG), (Харьковской об...","{'subject': ['Алаудинов', 'Апти Алаудинов'], '...","Subject: Алаудинов, Апти Алаудинов. Object: ВС...","[0.012648377, 0.017662816, -0.013254319, 0.029...","[0.018034656, 0.071815215, 0.025381723, 0.0336...","[0.012648377, 0.017662816, -0.013254319, 0.029..."
4,RU-URW-1023.txt,Для чего Украина провоцирует Белоруссию на вое...,"[(Украина, LOC), (Белоруссию, LOC), (Киев, LOC...","{'object': ['Украина', 'Белоруссию', 'Киев', '...","Subject: Константин Сивков. Object: Украина, Б...","[-0.0013787227, 0.026964093, -0.0036002013, 0....","[-0.0070581306, 0.04410064, 0.024095109, 0.012...","[-0.0013787227, 0.026964093, -0.0036002013, 0...."


In [None]:
# Save processed DataFrames to Drive as CSV (without the index column).
# The array-valued embedding columns are stored as text; the loading cell
# later parses 'text_embedding' and 'actants_embedding' back from strings.
# NOTE(review): numpy abbreviates the printed form of long arrays with
# '...'; confirm that 'combined_embedding' survives this round trip, or
# persist embeddings in a binary format instead.
processed_train_df.to_csv('/content/drive/MyDrive/NLP_Proj/embedded_article/processed_train_rus_articles.csv', index=False)
processed_dev_df.to_csv('/content/drive/MyDrive/NLP_Proj/embedded_article/processed_dev_rus_articles.csv', index=False)


### One hot Labels

In [None]:
def one_hot_encode_hierarchical(df, columns):
    """
    Encode label columns as integer class indices and return the label maps.

    Despite the name, no one-hot matrix is returned: for every requested
    column the distinct labels are sorted, each label receives its position in
    that sorted order, and the indices are stored in a new
    '<column>_encoded' column. The index is looked up directly rather than by
    building a dense one-hot matrix and taking its argmax.

    Args:
        df (pd.DataFrame): Input DataFrame with hierarchical columns. It is
            modified in place (the '<column>_encoded' columns are added).
        columns (list): List of column names to encode.

    Returns:
        pd.DataFrame: The same DataFrame object, with the encoded columns added.
        dict: For each column, a {label: index} mapping. Missing values are
            not part of the mapping and are encoded as -1.
    """
    mappings = {}

    for column in columns:
        # sort=True orders the distinct labels, so indices are deterministic.
        codes, class_labels = pd.factorize(df[column], sort=True)

        # enumerate yields plain Python ints, keeping the mapping JSON-serialisable.
        mappings[column] = {label: idx for idx, label in enumerate(class_labels)}

        df[f"{column}_encoded"] = codes

    return df, mappings

def encode_dev_with_train_mapping(dev_df, train_mappings, columns):
    """
    Encode the development data using mappings from the training data.

    Labels already known from training keep their training index. Labels that
    only occur in the development data are appended after the existing
    indices, in order of first appearance in `dev_df`, so the result is
    reproducible. Missing values are not registered as classes and are
    encoded as -1.

    Args:
        dev_df (pd.DataFrame): Development DataFrame. It is modified in place
            (the '<column>_encoded' columns are added).
        train_mappings (dict): Mappings generated from training data. This
            argument is not modified.
        columns (list): List of column names to encode hierarchically.

    Returns:
        pd.DataFrame: Updated development DataFrame with encoded values.
        dict: New mappings including any new classes from dev data.

    Raises:
        ValueError: If a requested column has no entry in `train_mappings`.
    """
    # Copy the per-column dicts too: a shallow copy of the outer dict would
    # let new dev classes leak into the caller's training mappings.
    updated_mappings = {
        name: dict(column_mapping) for name, column_mapping in train_mappings.items()
    }

    for column in columns:
        if column not in updated_mappings:
            raise ValueError(f"No mapping found for column {column} in train_mappings.")

        mapping = updated_mappings[column]

        # Continue numbering after the largest existing index so new classes
        # can never collide with a training class.
        next_index = max(mapping.values(), default=-1) + 1
        for cls in dev_df[column].dropna().unique():
            if cls not in mapping:
                mapping[cls] = next_index
                next_index += 1

        # Values without a mapping entry (i.e. missing values) become -1.
        dev_df[f"{column}_encoded"] = dev_df[column].map(mapping).fillna(-1).astype(int)

    return dev_df, updated_mappings

def save_mappings(mappings, filepath):
    """
    Write the label mappings to `filepath` as JSON and print a confirmation.

    Args:
        mappings (dict): Dictionary of mappings for hierarchical columns.
        filepath (str): Path to save the mappings file.
    """
    with open(filepath, 'w') as handle:
        json.dump(mappings, handle)

    print(f"Mappings saved to {filepath}")

def load_mappings(filepath):
    """
    Read label mappings back from a JSON file.

    Args:
        filepath (str): Path to the mappings file.

    Returns:
        dict: The parsed JSON content.
    """
    with open(filepath, 'r') as handle:
        return json.load(handle)



# Label columns to turn into integer class indices
columns_to_encode = ['narrative', 'level_1', 'level_2']

# Encode the training annotations; this also yields the {label: index} mappings
train_encoded, train_mappings = one_hot_encode_hierarchical(expanded_train_annotations, columns_to_encode)

# Encode the development annotations with the training mappings; labels seen
# only in the development data receive additional indices
dev_encoded, updated_mappings = encode_dev_with_train_mapping(expanded_dev_annotations, train_mappings, columns_to_encode)

# Save mappings for future use (relative path: written to the current working directory)
save_mappings(updated_mappings, "mappings.json")




Mappings saved to mappings.json


In [None]:
train_encoded.head()

Unnamed: 0,article_id,narrative,level_1,level_2,narrative_encoded,level_1_encoded,level_2_encoded
0,RU-URW-1080.txt,URW,Discrediting Ukraine,Discrediting Ukrainian government and officia...,0,2,2
1,RU-URW-1013.txt,URW,"Discrediting the West, Diplomacy","The West does not care about Ukraine, only ab...",0,3,17
2,RU-URW-1145.txt,URW,Praise of Russia,Praise of Russian military might,0,8,8
3,RU-URW-1048.txt,URW,Discrediting Ukraine,Discrediting Ukrainian military,0,2,3
4,RU-URW-1001.txt,URW,Praise of Russia,Russia is a guarantor of peace and prosperity,0,8,12


In [None]:
dev_encoded.head()

Unnamed: 0,article_id,narrative,level_1,level_2,narrative_encoded,level_1_encoded,level_2_encoded
0,RU-URW-1135.txt,URW,Russia is the Victim,The West is russophobic,0,9,19
1,RU-URW-1135.txt,URW,"Discrediting the West, Diplomacy",The EU is divided,0,3,15
2,RU-URW-1135.txt,URW,"Discrediting the West, Diplomacy",Other,0,3,6
3,RU-URW-1089.txt,Other,Other,Other,1,11,31
4,RU-URW-1093.txt,URW,Discrediting Ukraine,Discrediting Ukrainian military,0,2,3


In [None]:
print(updated_mappings)

{'narrative': {'URW': 0, 'Other': 1}, 'level_1': {' Amplifying war-related fears': 0, ' Blaming the war on others rather than the invader': 1, ' Discrediting Ukraine': 2, ' Discrediting the West, Diplomacy': 3, ' Distrust towards Media': 4, ' Hidden plots by secret schemes of powerful groups': 5, ' Negative Consequences for the West': 6, ' Overpraising the West': 7, ' Praise of Russia': 8, ' Russia is the Victim': 9, ' Speculating war outcomes': 10, 'Other': 11}, 'level_2': {' By continuing the war we risk WWIII': 0, ' Diplomacy does/will not work': 1, ' Discrediting Ukrainian government and officials and policies': 2, ' Discrediting Ukrainian military': 3, ' Discrediting Ukrainian nation and society': 4, ' NATO should/will directly intervene': 5, ' Other': 6, ' Praise of Russian President Vladimir Putin': 7, ' Praise of Russian military might': 8, ' Rewriting Ukraine’s history': 9, ' Russia actions in Ukraine are only self-defence': 10, ' Russia has international support from a number

In [None]:
# Save processed DataFrame
def save_processed_data(train_df, dev_df, train_path, dev_path):
    """
    Write the train and dev DataFrames to CSV (index omitted) and report the paths.

    Args:
        train_df (pd.DataFrame): Training data to save.
        dev_df (pd.DataFrame): Development data to save.
        train_path (str): Destination CSV path for the training data.
        dev_path (str): Destination CSV path for the development data.
    """
    # Both files are written first, then both confirmations are printed.
    for frame, destination in ((train_df, train_path), (dev_df, dev_path)):
        frame.to_csv(destination, index=False)

    print(f"Train saved to {train_path}")
    print(f"Dev saved to {dev_path}")

save_processed_data(train_encoded, dev_encoded, "/content/drive/MyDrive/NLP_Proj/embedded_article/train_encoded_rus.csv", "/content/drive/MyDrive/NLP_Proj/embedded_article/dev_encoded_rus.csv")




Train saved to /content/drive/MyDrive/NLP_Proj/embedded_article/train_encoded_rus.csv
Dev saved to /content/drive/MyDrive/NLP_Proj/embedded_article/dev_encoded_rus.csv


### Train test split

In [None]:
# Load processed datasets back from Drive: per-article features
# (processed_*_df) and per-annotation encoded labels (*_processed).
processed_train_df = pd.read_csv('/content/drive/MyDrive/NLP_Proj/embedded_article/processed_train_rus_articles.csv')
train_processed = pd.read_csv('/content/drive/MyDrive/NLP_Proj/embedded_article/train_encoded_rus.csv')

processed_dev_df = pd.read_csv('/content/drive/MyDrive/NLP_Proj/embedded_article/processed_dev_rus_articles.csv')
dev_processed = pd.read_csv('/content/drive/MyDrive/NLP_Proj/embedded_article/dev_encoded_rus.csv')

# Join the training article embeddings with their encoded labels.
# pd.merge defaults to an inner join, so only article_ids present in both
# tables are kept; an article with several annotation rows appears once per row.
merged_train = pd.merge(
    processed_train_df[['article_id', 'text_embedding', 'actants_embedding']],
    train_processed[['article_id', 'narrative_encoded', 'level_1_encoded', 'level_2_encoded']],
    on='article_id'
)

# Same join for the development split.
merged_dev = pd.merge(
    processed_dev_df[['article_id', 'text_embedding', 'actants_embedding']],
    dev_processed[['article_id', 'narrative_encoded', 'level_1_encoded', 'level_2_encoded']],
    on='article_id'
)
# Drop dev rows whose narrative index is 1.
# NOTE(review): 1 is the index that 'Other' received in the mapping printed
# earlier in this notebook; the value is hard-coded here — confirm it still
# matches mappings.json whenever the encodings are regenerated.
merged_dev = merged_dev[merged_dev['narrative_encoded'] != 1]

def string_to_array_embeds(embed_str):
    """
    Parse a whitespace-separated, bracketed vector string into a NumPy array.

    This handles text such as "[0.1 0.2\\n 0.3]" (numbers separated by spaces
    and/or newlines, optionally wrapped in square brackets).

    Args:
        embed_str (str): Text form of the vector.

    Returns:
        numpy.ndarray: 1-D float array. An empty array is returned when the
        input is not a string (e.g. None or NaN) or when any token is not a
        valid float (e.g. an abbreviated "..." marker).
    """
    # Non-string cells (None, float NaN) have nothing to parse.
    if not isinstance(embed_str, str):
        return np.array([])

    try:
        # split() with no argument splits on any whitespace, newlines
        # included, so numbers on adjacent lines are never fused together.
        return np.array([float(token) for token in embed_str.strip("[]").split()])
    except ValueError:
        # Best-effort contract: unparseable text yields an empty array.
        return np.array([])


def string_to_array(embed_str):
    """
    Parse a Python-literal list string (e.g. "[1, 2, 3]") into a NumPy array.

    The text is evaluated with `ast.literal_eval`, which only accepts
    literals. If evaluation or array construction raises ValueError,
    SyntaxError or TypeError, an empty array is returned instead.
    """
    try:
        parsed = np.array(ast.literal_eval(embed_str))
    except (ValueError, SyntaxError, TypeError):
        # Parsing failed: fall back to an empty array.
        parsed = np.array([])
    return parsed


# The embedding columns were read from CSV as strings; convert them back to
# NumPy arrays. Rows that cannot be parsed become empty arrays.
merged_train['text_embedding'] = merged_train['text_embedding'].apply(string_to_array_embeds)
merged_train['actants_embedding'] = merged_train['actants_embedding'].apply(string_to_array_embeds)

# Same conversion for the development split.
merged_dev['text_embedding'] = merged_dev['text_embedding'].apply(string_to_array_embeds)
merged_dev['actants_embedding'] = merged_dev['actants_embedding'].apply(string_to_array_embeds)

#merged_train.head()

def pad_or_truncate(array, target_length):
    """
    Bring a sequence to exactly `target_length` elements.

    Args:
    - array: Sequence to resize (anything supporting len(), slicing and np.pad).
    - target_length (int): Desired number of elements.

    Returns:
    - The input zero-padded at the end when it is too short, its leading
      `target_length` elements when it is too long, or the input object itself
      when the length already matches.
    """
    shortfall = target_length - len(array)
    if shortfall > 0:
        # mode='constant' pads with zeros by default
        return np.pad(array, (0, shortfall), mode='constant')
    if shortfall < 0:
        return array[:target_length]
    return array

# Longest embedding seen across both splits, per column (0 when a split is empty)
max_sentence_length = max(
    max(split['text_embedding'].apply(len), default=0)
    for split in (merged_train, merged_dev)
)

max_actants_length = max(
    max(split['actants_embedding'].apply(len), default=0)
    for split in (merged_train, merged_dev)
)

# Resize every embedding to the shared length so rows can be stacked into a matrix
for df in (merged_train, merged_dev):
    df['text_embedding'] = df['text_embedding'].apply(
        pad_or_truncate, args=(max_sentence_length,)
    )
    df['actants_embedding'] = df['actants_embedding'].apply(
        pad_or_truncate, args=(max_actants_length,)
    )


# Count the number of dev samples per `level_1_encoded` class
class_counts = merged_dev['level_1_encoded'].value_counts()

# A stratified 50/50 split needs at least two samples per class (one for each
# side), so classes with a single sample are dropped first.
valid_classes = class_counts[class_counts >= 2].index
filtered_dev = merged_dev[merged_dev['level_1_encoded'].isin(valid_classes)]

# Report the frame that was actually filtered here (the dev split)
print(f"Filtered dataset size: {filtered_dev.shape}")
print(f"Remaining classes: {len(valid_classes)}")

# Split the filtered dev data evenly into validation and test sets, keeping the
# level-1 class proportions the same in both halves. The fixed random_state
# makes the split reproducible.
validation_df, test_df = train_test_split(
    filtered_dev,
    test_size=0.5,
    stratify=filtered_dev['level_1_encoded'],
    random_state=42
)


def _stack_embeddings(frame):
    """Build a 2-D feature matrix: each row is text embedding followed by actants embedding."""
    return np.hstack([
        np.vstack(frame['text_embedding']),
        np.vstack(frame['actants_embedding']),
    ])


def _label_lists(frame):
    """Return the two encoded label columns as a dict of plain lists keyed by column name."""
    return frame[['level_1_encoded', 'level_2_encoded']].to_dict(orient='list')


# Features and labels for the training, validation and test splits
X_train, y_train = _stack_embeddings(merged_train), _label_lists(merged_train)
X_val, y_val = _stack_embeddings(validation_df), _label_lists(validation_df)
X_test, y_test = _stack_embeddings(test_df), _label_lists(test_df)

# Summarise the shape of every split
print(f"Train: X_train: {X_train.shape}, y_train: {list(map(len, y_train.values()))}")
print(f"Validation: X_val: {X_val.shape}, y_val: {list(map(len, y_val.values()))}")
print(f"Test: X_test: {X_test.shape}, y_test: {list(map(len, y_test.values()))}")


Filtered dataset size: (73, 6)
Remaining classes: 8
Train: X_train: (272, 1536), y_train: [272, 272]
Validation: X_val: (36, 1536), y_val: [36, 36]
Test: X_test: (37, 1536), y_test: [37, 37]


In [None]:
# Number of distinct labels in the training split, one line per hierarchy level
for label_column in ('level_1_encoded', 'level_2_encoded'):
    print(len(np.unique(merged_train[label_column])))

# Largest and smallest level_2_encoded label present in the dev split
level_2_dev_labels = merged_dev['level_2_encoded']
max_value = level_2_dev_labels.max()
min_value = level_2_dev_labels.min()

# Print the results
print(f"Maximum value in level_2_encoded: {max_value}")
print(f"Minimum value in level_2_encoded: {min_value}")

11
30
Maximum value in level_2_encoded: 33
Minimum value in level_2_encoded: 1


### Modelling

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Dense, Dropout, Concatenate, Add, BatchNormalization, Activation
)
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
def _residual_dense_block(inputs, units, l2_strength, dropout_rate):
    """Dense -> BatchNorm -> Dropout, with a second Dense layer added back onto that output."""
    hidden = Dense(
        units, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(l2_strength)
    )(inputs)
    hidden = BatchNormalization()(hidden)
    hidden = Dropout(dropout_rate)(hidden)
    # Skip connection: the unregularised Dense branch is summed with its own input
    branch = Dense(units, activation="relu")(hidden)
    return Add()([hidden, branch])


def create_hierarchical_model(input_dim, level_1_classes, level_2_classes,
                              dropout_rate=0.5, l2_strength=0.01):
    """
    Build a two-output dense network for hierarchical (parent/child) classification.

    The input passes through two shared residual dense blocks (512 and 256
    units). A softmax head on the shared features predicts the level-1 class.
    Those level-1 probabilities are concatenated with the shared features and
    fed through another residual block (256 units) and a 128-unit dense layer
    before a second softmax head predicts the level-2 class.

    Args:
    - input_dim (int): Length of each input feature vector.
    - level_1_classes (int): Number of units in the level-1 softmax output.
    - level_2_classes (int): Number of units in the level-2 softmax output.
    - dropout_rate (float): Rate used by every Dropout layer. Defaults to 0.5.
    - l2_strength (float): L2 kernel-regularisation factor applied to the
      regularised Dense layers. Defaults to 0.01.

    Returns:
    - tensorflow.keras.Model: Uncompiled model with outputs named
      "level_1_output" and "level_2_output", in that order.
    """
    # Input Layer
    input_layer = Input(shape=(input_dim,), name="input_layer")

    # Shared feature extractor
    shared = _residual_dense_block(input_layer, 512, l2_strength, dropout_rate)
    shared = _residual_dense_block(shared, 256, l2_strength, dropout_rate)

    # Level 1 Classifier (Parent)
    level_1_output = Dense(
        level_1_classes, activation="softmax", name="level_1_output"
    )(shared)

    # Condition the child classifier on the parent prediction
    concatenated_features = Concatenate()([shared, level_1_output])
    level_2_features = _residual_dense_block(
        concatenated_features, 256, l2_strength, dropout_rate
    )

    # Final Dense Layer for Level 2 (no batch normalisation on this one)
    level_2_features = Dense(
        128, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(l2_strength)
    )(level_2_features)
    level_2_features = Dropout(dropout_rate)(level_2_features)

    # Level 2 Classifier (Child)
    level_2_output = Dense(
        level_2_classes, activation="softmax", name="level_2_output"
    )(level_2_features)

    return Model(inputs=input_layer, outputs=[level_1_output, level_2_output])


# Output sizes are fixed by hand.
# NOTE(review): to_categorical requires every encoded label to be smaller than
# these counts — confirm 11 / 34 still cover the label encoders' ranges.
level_1_classes, level_2_classes = 11, 34

# One input unit per column of the stacked feature matrix
input_dim = X_train.shape[1]

# Instantiate the network
model = create_hierarchical_model(
    input_dim=input_dim,
    level_1_classes=level_1_classes,
    level_2_classes=level_2_classes,
)
def dynamic_loss_weights(epoch):
    """
    Return the per-output loss weights to use at a given epoch.

    The level-2 weight grows and the level-1 weight shrinks in three stages:
    epochs below 10, epochs below 20, and everything else.

    Args:
    - epoch (int): Zero-based epoch index.

    Returns:
    - dict: Weights keyed by "level_1_output" and "level_2_output".
    """
    # (exclusive upper epoch bound, level-1 weight, level-2 weight)
    early_stages = ((10, 0.7, 1.3), (20, 0.5, 1.5))

    # Final stage applies when no earlier bound matches
    level_1_weight, level_2_weight = 0.3, 1.7
    for upper_bound, stage_level_1, stage_level_2 in early_stages:
        if epoch < upper_bound:
            level_1_weight, level_2_weight = stage_level_1, stage_level_2
            break

    return {"level_1_output": level_1_weight, "level_2_output": level_2_weight}



# One-hot encode the integer labels of every split, one hierarchy level at a time
y_train_level_1_onehot, y_val_level_1_onehot, y_test_level_1_onehot = (
    to_categorical(labels['level_1_encoded'], num_classes=level_1_classes)
    for labels in (y_train, y_val, y_test)
)
y_train_level_2_onehot, y_val_level_2_onehot, y_test_level_2_onehot = (
    to_categorical(labels['level_2_encoded'], num_classes=level_2_classes)
    for labels in (y_train, y_val, y_test)
)


# Both callbacks watch the combined validation loss
monitored_metric = "val_loss"

# Stop after 8 epochs without improvement and roll back to the best weights
early_stopping = EarlyStopping(
    monitor=monitored_metric, restore_best_weights=True, patience=8, verbose=1
)

# Halve the learning rate after 3 stagnant epochs, never going below 1e-6
reduce_lr = ReduceLROnPlateau(
    monitor=monitored_metric, min_lr=1e-6, factor=0.5, patience=3, verbose=1
)

# Training
# The loss-weight schedule is keyed on the real epoch index, so training runs in
# consecutive phases: each phase covers the epochs for which
# dynamic_loss_weights() returns the same weights. All phases together span
# total_epochs epochs.
total_epochs = 50
phase_histories = []  # one History object per phase, in training order

phase_start = 0
while phase_start < total_epochs:
    weights = dynamic_loss_weights(phase_start)

    # Extend the phase for as long as the schedule keeps returning these weights
    phase_end = phase_start + 1
    while phase_end < total_epochs and dynamic_loss_weights(phase_end) == weights:
        phase_end += 1

    # loss_weights are fixed at compile time, so every phase needs a recompile.
    # A new AdamW instance is created each time, so optimizer state and any
    # learning-rate reductions from the previous phase start over.
    model.compile(
        optimizer=tf.keras.optimizers.AdamW(learning_rate=1e-4),
        loss_weights=weights,
        loss={"level_1_output": "categorical_crossentropy", "level_2_output": "categorical_crossentropy"},
        metrics=["accuracy", "accuracy"],
    )

    # With initial_epoch set, `epochs` is the index of the final epoch, so this
    # call trains epochs phase_start .. phase_end - 1.
    history = model.fit(
        X_train,
        {"level_1_output": y_train_level_1_onehot, "level_2_output": y_train_level_2_onehot},
        validation_data=(
            X_val,
            {"level_1_output": y_val_level_1_onehot, "level_2_output": y_val_level_2_onehot},
        ),
        batch_size=32,
        initial_epoch=phase_start,
        epochs=phase_end,
        callbacks=[early_stopping, reduce_lr],
        verbose=1,
    )
    phase_histories.append(history)
    phase_start = phase_end


Epoch 1/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 652ms/step - level_1_output_accuracy: 0.0824 - level_1_output_loss: 4.5550 - level_2_output_accuracy: 0.0158 - level_2_output_loss: 6.0776 - loss: 26.4724 - val_level_1_output_accuracy: 0.0556 - val_level_1_output_loss: 3.4683 - val_level_2_output_accuracy: 0.0000e+00 - val_level_2_output_loss: 3.9133 - val_loss: 22.8598 - learning_rate: 1.0000e-04
Epoch 2/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - level_1_output_accuracy: 0.0603 - level_1_output_loss: 4.5524 - level_2_output_accuracy: 0.0474 - level_2_output_loss: 5.6111 - loss: 25.6810 - val_level_1_output_accuracy: 0.0556 - val_level_1_output_loss: 3.2727 - val_level_2_output_accuracy: 0.0000e+00 - val_level_2_output_loss: 3.8187 - val_loss: 22.4211 - learning_rate: 1.0000e-04
Epoch 3/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - level_1_output_accuracy: 0.0629 - level_1_output_loss: 4.1927 - lev

In [None]:
# Evaluation: loss and accuracy as reported by Keras on the held-out test split
test_targets = {
    "level_1_output": y_test_level_1_onehot,
    "level_2_output": y_test_level_2_onehot,
}
evaluation = model.evaluate(X_test, test_targets, verbose=1)

print("Test Loss and Accuracy:", evaluation)

from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# The model returns one probability matrix per output head
predictions = model.predict(X_test)
level_1_probs, level_2_probs = predictions[0], predictions[1]

# Most probable class per sample
level_1_preds = np.argmax(level_1_probs, axis=1)
level_2_preds = np.argmax(level_2_probs, axis=1)

# Recover the integer labels from the one-hot targets
level_1_true = np.argmax(y_test_level_1_onehot, axis=1)
level_2_true = np.argmax(y_test_level_2_onehot, axis=1)

# Level 1 metrics (F1 is averaged over classes weighted by support)
level_1_accuracy = accuracy_score(level_1_true, level_1_preds)
level_1_f1 = f1_score(level_1_true, level_1_preds, average='weighted')

# Level 2 metrics
level_2_accuracy = accuracy_score(level_2_true, level_2_preds)
level_2_f1 = f1_score(level_2_true, level_2_preds, average='weighted')

# Print Evaluation Metrics
print(f"Level 1 Accuracy: {level_1_accuracy:.4f}")
print(f"Level 1 F1-Score: {level_1_f1:.4f}")
print(f"Level 2 Accuracy: {level_2_accuracy:.4f}")
print(f"Level 2 F1-Score: {level_2_f1:.4f}")


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 646ms/step - level_1_output_accuracy: 0.3773 - level_1_output_loss: 1.7705 - level_2_output_accuracy: 0.1498 - level_2_output_loss: 3.1568 - loss: 16.6227
Test Loss and Accuracy: [16.599348068237305, 1.8375935554504395, 3.0541582107543945, 0.37837839126586914, 0.1621621549129486]
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 382ms/step
Level 1 Accuracy: 0.3784
Level 1 F1-Score: 0.3522
Level 2 Accuracy: 0.1622
Level 2 F1-Score: 0.1077


In [None]:
import random

# Recover the integer labels from the one-hot test targets
level_1_true = tf.argmax(y_test_level_1_onehot, axis=1).numpy()
level_2_true = tf.argmax(y_test_level_2_onehot, axis=1).numpy()

# Pick up to 10 distinct test indices; random.sample raises ValueError when
# asked for more items than exist, so clamp to the size of the test split.
sample_size = min(10, len(X_test))
random_indices = random.sample(range(len(X_test)), sample_size)


# Show true vs. predicted labels side by side for the sampled rows
print("Random Sample of Predictions:")
for idx in random_indices:
    print(f"Index: {idx}")
    print(f"Level 1 True: {level_1_true[idx]} | Level 1 Pred: {level_1_preds[idx]}")
    print(f"Level 2 True: {level_2_true[idx]} | Level 2 Pred: {level_2_preds[idx]}")
    print("-" * 30)


Random Sample of Predictions:
Index: 13
Level 1 True: 3 | Level 1 Pred: 2
Level 2 True: 17 | Level 2 Pred: 2
------------------------------
Index: 9
Level 1 True: 3 | Level 1 Pred: 3
Level 2 True: 20 | Level 2 Pred: 6
------------------------------
Index: 12
Level 1 True: 3 | Level 1 Pred: 2
Level 2 True: 20 | Level 2 Pred: 2
------------------------------
Index: 0
Level 1 True: 8 | Level 1 Pred: 3
Level 2 True: 11 | Level 2 Pred: 6
------------------------------
Index: 27
Level 1 True: 8 | Level 1 Pred: 8
Level 2 True: 8 | Level 2 Pred: 8
------------------------------
Index: 33
Level 1 True: 8 | Level 1 Pred: 2
Level 2 True: 12 | Level 2 Pred: 2
------------------------------
Index: 32
Level 1 True: 2 | Level 1 Pred: 2
Level 2 True: 3 | Level 2 Pred: 3
------------------------------
Index: 35
Level 1 True: 2 | Level 1 Pred: 2
Level 2 True: 2 | Level 2 Pred: 2
------------------------------
Index: 25
Level 1 True: 8 | Level 1 Pred: 8
Level 2 True: 6 | Level 2 Pred: 2
-----------------