## Data Loading

In [1]:
!unzip /content/train.zip -d /content/Train

Archive:  /content/train.zip
   creating: /content/Train/target_4_December_release/
   creating: /content/Train/target_4_December_release/BG/
   creating: /content/Train/target_4_December_release/BG/raw-documents/
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10015.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10345.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10380.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10468.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10525.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10556.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10565.txt  
  inflating: /content/Train/target_4_December_release/BG/raw-documents/A6_CC_BG_10575.txt  
  inflating: /content/Train/target_4_December_rele

## Data Preprocessing

In [2]:
import os
import re
import torch
from transformers import MT5Tokenizer
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict

# all paths
DATA_DIR = "/content/Train"
TARGET_DIR = "target_4_December_release"
RAW_DOCS_FOLDER = "raw-documents"
LANGUAGES = ["BG", "EN", "HI", "PT", "RU"]

# Loaded T5 tokenizer
TOKENIZER = MT5Tokenizer.from_pretrained("google/mt5-small")

def clean_text(text):

    """Enhanced text cleaning function."""
    text = text.strip().replace("\n", " ")
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

def load_data(data_dir):
    """Loads and processes all annotation data before tokenization."""
    all_data = []
    for lang in LANGUAGES:
        lang_path = os.path.join(data_dir, TARGET_DIR, lang)
        annotation_file = os.path.join(lang_path, "subtask-3-annotations.txt")

        if not os.path.exists(annotation_file):
            print(f"Warning: No annotations found for {lang}")
            continue

        with open(annotation_file, "r", encoding="utf-8") as file:
            for line in file:
                parts = line.strip().split("\t")
                if len(parts) < 4:
                    continue

                article_id, dominant_narrative, dominant_subnarrative, explanation = parts
                text_file = os.path.join(lang_path, RAW_DOCS_FOLDER, article_id)

                if not os.path.exists(text_file):
                    print(f"Warning: Missing text file {article_id} in {lang}")
                    continue

                with open(text_file, "r", encoding="utf-8") as f:
                    article_text = clean_text(f.read())

                # Handle "none" subnarratives
                if dominant_subnarrative.lower() == "none":
                    if lang == "PT":  # Replace 'none' with Narrative for PT language
                        dominant_subnarrative = dominant_narrative
                    else:
                        continue  # Skip all other samples languages

                input_text = f"Narrative: {dominant_narrative} | Subnarrative: {dominant_subnarrative} | Context: {article_text}"
                output_text = explanation

                all_data.append({"input_text": input_text, "output_text": output_text, "language": lang})

    return all_data

# Preprocess the data
cleaned_samples = load_data(DATA_DIR)

# Create a Dataset class
class NarrativeDataset(Dataset):
    def __init__(self, samples, tokenizer, max_length=512):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        data = self.samples[idx]

        inputs = self.tokenizer(
            data["input_text"], truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )
        labels = self.tokenizer(
            data["output_text"], truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0),
            "language": data["language"]
        }

# Create a dataset and dataloader
dataset = NarrativeDataset(cleaned_samples, TOKENIZER)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Display the statistics
language_counts = defaultdict(int)
for sample in cleaned_samples:
    language_counts[sample["language"]] += 1

print("\nUpdated Number of Samples per Language (After Handling 'none' Values):")
for lang, count in language_counts.items():
    print(f"{lang}: {count} samples")

# Print some of the processed samples for verification
def print_samples_for_language(dataset, language, num_samples=50):
    print(f"\nPreprocessed samples for language: {language}")
    samples = [sample for sample in dataset.samples if sample["language"] == language]
    for i in range(min(num_samples, len(samples))):
        sample = samples[i]
        print(f"Sample {i+1}:")
        print(f"  Input: {sample['input_text']}")
        print(f"  Output: {sample['output_text']}")
        print("="*80)

# Print some of the samples for each language
for lang in LANGUAGES:
    print_samples_for_language(dataset, lang)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



Updated Number of Samples per Language (After Handling 'none' Values):
BG: 303 samples
EN: 145 samples
HI: 147 samples
PT: 252 samples
RU: 118 samples

Preprocessed samples for language: BG
Sample 1:
  Input: Narrative: URW: Blaming the war on others rather than the invader | Subnarrative: URW: Blaming the war on others rather than the invader: The West are the aggressors | Context: опитът на колективния запад да обезкърви русия с ръцете на властите в киев се провали с гръм и трясък и скоро от украйна   опитът на колективния запад да обезкърви русия с ръцете на властите в киев се провали с гръм и трясък и скоро от украйна няма да остане почти нищо ако не започне процесът на разрешаване на този въоръжен конфликт тази гледна точка изрази пред тасс бившият началник на кабинета на държавния секретар на сащ колин пауъл пенсионирания полковник от армията на сащ лорънс уилкерсън подкрепата на сащ нато и други западни съюзници за войната в украйна срещу русия е безумна това води до смъртта на

In [3]:
# Function to print some of the samples for a specific language
def print_samples_for_language(dataset, language, num_samples=4):
    print(f"\nPreprocessed samples for language: {language}")
    samples = [sample for sample in dataset.samples if sample["language"] == language]
    for i in range(min(num_samples, len(samples))):
        sample = samples[i]
        print(f"Sample {i+1}:")
        print(f"  Input: {sample['input_text']}")
        print(f"  Output: {sample['output_text']}")
        print("="*80)
    # Print tokenized samples
    print("\nTokenized samples:")
    for i in range(3):
        sample = dataset[i]
        print(f"Sample {i+1}:")
        print(f"  Input IDs: {sample['input_ids']}")
        print(f"  Attention Mask: {sample['attention_mask']}")
        print(f"  Labels: {sample['labels']}")
        print("="*80)

# Print some of the samples for each language
for lang in LANGUAGES:
    print_samples_for_language(dataset, lang)


Preprocessed samples for language: BG
Sample 1:
  Input: Narrative: URW: Blaming the war on others rather than the invader | Subnarrative: URW: Blaming the war on others rather than the invader: The West are the aggressors | Context: опитът на колективния запад да обезкърви русия с ръцете на властите в киев се провали с гръм и трясък и скоро от украйна   опитът на колективния запад да обезкърви русия с ръцете на властите в киев се провали с гръм и трясък и скоро от украйна няма да остане почти нищо ако не започне процесът на разрешаване на този въоръжен конфликт тази гледна точка изрази пред тасс бившият началник на кабинета на държавния секретар на сащ колин пауъл пенсионирания полковник от армията на сащ лорънс уилкерсън подкрепата на сащ нато и други западни съюзници за войната в украйна срещу русия е безумна това води до смъртта на украински войници в името на загубена кауза ако не вземете предвид печеленето на пари от американски и европейски военни изпълнители както и бруталния 

## Building and Training the model

In [4]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [6]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, Dataset
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import StepLR
from bert_score import score as bert_score

# Load the model and tokenizer
MODEL_NAME = "google/mt5-base"
TOKENIZER = MT5Tokenizer.from_pretrained(MODEL_NAME)

class MT5NarrativeModel(nn.Module):
    def __init__(self, model_name=MODEL_NAME):
        super(MT5NarrativeModel, self).__init__()
        self.model = MT5ForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,  # use bfloat16
            device_map="auto",            #load smartly on GPU
            low_cpu_mem_usage=True
        )

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MT5NarrativeModel().to(device)

# Constants
MAX_LEN = 128
BATCH_SIZE = 1
GRAD_ACCUM_STEPS = 4

# Dataset class
class NarrativeDataset(Dataset):
    def __init__(self, samples, tokenizer, max_length=MAX_LEN):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        data = self.samples[idx]

        inputs = self.tokenizer(
            data["input_text"], truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )
        labels = self.tokenizer(
            data["output_text"], truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt"
        )
        labels_input_ids = labels["input_ids"].squeeze(0)
        labels_input_ids[labels_input_ids == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0),
            "output_text": data["output_text"],
            "input_text": data["input_text"],
            "language": data["language"]
        }

# Assuming that the cleaned_samples is available
dataset = NarrativeDataset(cleaned_samples, TOKENIZER)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = optim.AdamW(model.parameters(), lr=3e-4)
scaler = GradScaler()
scheduler = StepLR(optimizer, step_size=1, gamma=0.95)

# Evaluating the function
def evaluate_model(model, dataloader):
    model.eval()
    references, predictions, languages = [], [], []
    total_val_loss = 0.0
    steps = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with autocast(dtype=torch.bfloat16):
                outputs = model(input_ids, attention_mask, labels)
                loss = outputs.loss
                total_val_loss += loss.item()

            generated_ids = model.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=MAX_LEN,
                num_beams=4
            )
            generated_texts = TOKENIZER.batch_decode(generated_ids, skip_special_tokens=True)

            references.extend(batch["output_text"])
            predictions.extend(generated_texts)
            languages.extend(batch["language"])

            steps += 1

    avg_val_loss = total_val_loss / steps

    all_P, all_R, all_F1 = [], [], []
    lang_to_samples = {}

    for ref, pred, lang in zip(references, predictions, languages):
        if lang not in lang_to_samples:
            lang_to_samples[lang] = {"predictions": [], "references": []}
        lang_to_samples[lang]["predictions"].append(pred)
        lang_to_samples[lang]["references"].append(ref)

    for lang, samples in lang_to_samples.items():
        bert_lang = {
            "EN": "en",
            "BG": "bg",
            "HI": "hi",
            "PT": "pt",
            "RU": "ru"
        }.get(lang, "en")

        P, R, F1 = bert_score(samples["predictions"], samples["references"], lang=bert_lang, verbose=False)
        all_P.append(P.mean().item())
        all_R.append(R.mean().item())
        all_F1.append(F1.mean().item())

    macro_precision = sum(all_P) / len(all_P)
    macro_recall = sum(all_R) / len(all_R)
    macro_f1 = sum(all_F1) / len(all_F1)

    return macro_precision, macro_recall, macro_f1, predictions, avg_val_loss

# Training the function
def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, epochs=10):
    best_f1 = 0

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        total_loss = 0.0
        step = 0

        for i, batch in enumerate(train_dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with autocast(dtype=torch.bfloat16):
                outputs = model(input_ids, attention_mask, labels)
                loss = outputs.loss
                loss = loss / GRAD_ACCUM_STEPS

            loss.backward()
            total_loss += loss.item()

            if (i + 1) % GRAD_ACCUM_STEPS == 0 or (i + 1) == len(train_dataloader):
                optimizer.step()
                optimizer.zero_grad()

            step += 1

        avg_train_loss = total_loss / step

        # Evaluating after each epoch
        precision, recall, f1, predictions, val_loss = evaluate_model(model, val_dataloader)

        print(f"\nEpoch {epoch+1} Summary:")
        print(f"Training Loss: {avg_train_loss:.4f}")
        print(f"Validation Loss: {val_loss:.4f}")
        print(f"Validation Precision = {precision:.4f}, Recall = {recall:.4f}, F1 = {f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), "best_model.pt")
            print(f"✅ Best model saved at epoch {epoch+1} with F1 = {f1:.4f}")

        scheduler.step()
        torch.cuda.empty_cache()


#  Start training
print("Starting training...")
train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, epochs=10)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
  scaler = GradScaler()
  with autocast(dtype=torch.bfloat16):


Starting training...


  with autocast(dtype=torch.bfloat16):
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1 Summary:
Training Loss: 1.6706
Validation Loss: 3.8706
Validation Precision = 0.5750, Recall = 0.5804, F1 = 0.5770
✅ Best model saved at epoch 1 with F1 = 0.5770


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 2 Summary:
Training Loss: 1.2831
Validation Loss: 3.5988
Validation Precision = 0.6349, Recall = 0.6355, F1 = 0.6344
✅ Best model saved at epoch 2 with F1 = 0.6344


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 3 Summary:
Training Loss: 1.1690
Validation Loss: 3.5243
Validation Precision = 0.6359, Recall = 0.6304, F1 = 0.6321


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 4 Summary:
Training Loss: 1.1648
Validation Loss: 3.8005
Validation Precision = 0.6070, Recall = 0.6121, F1 = 0.6081


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 5 Summary:
Training Loss: 1.1116
Validation Loss: 3.4671
Validation Precision = 0.6361, Recall = 0.6275, F1 = 0.6305


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 6 Summary:
Training Loss: 1.0957
Validation Loss: 3.6224
Validation Precision = 0.6020, Recall = 0.6182, F1 = 0.6084


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 7 Summary:
Training Loss: 1.0817
Validation Loss: 3.5728
Validation Precision = 0.6097, Recall = 0.6234, F1 = 0.6148


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 8 Summary:
Training Loss: 1.0727
Validation Loss: 3.5543
Validation Precision = 0.5995, Recall = 0.6227, F1 = 0.6092


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 9 Summary:
Training Loss: 1.0608
Validation Loss: 3.5345
Validation Precision = 0.6162, Recall = 0.6283, F1 = 0.6206


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 10 Summary:
Training Loss: 1.0539
Validation Loss: 3.4980
Validation Precision = 0.6126, Recall = 0.6271, F1 = 0.6180


## Prediction for Explanation generation

In [26]:
import torch
from torch import nn
from transformers import MT5ForConditionalGeneration, AutoTokenizer
from bert_score import score as bert_score
import random

# Configuration
MODEL_NAME = "google/mt5-base"
MODEL_PATH = "best_model.pt"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class MT5NarrativeModel(nn.Module):
    def __init__(self, model_name=MODEL_NAME):
        super(MT5NarrativeModel, self).__init__()
        self.model = MT5ForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            low_cpu_mem_usage=True
        )

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

# Load Tokenizer and Model
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME, legacy=False)
model = MT5NarrativeModel().to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

print(" Model and Tokenizer Loaded Successfully!")

# Select 2 samples per language
lang_to_samples = {}
for sample in cleaned_samples:
    lang = sample["language"]
    lang_to_samples.setdefault(lang, []).append(sample)

selected_samples = []
for lang, samples in lang_to_samples.items():
    selected_samples.extend(random.sample(samples, min(2, len(samples))))

print(f" Selected {len(selected_samples)} total samples for prediction.")

# Make Predictions
all_predictions = []
all_references = []
all_languages = []

for sample in selected_samples:
    input_text = sample["input_text"]
    true_output_text = sample["output_text"]
    language = sample["language"]

    inputs = TOKENIZER(
        input_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        generated_ids = model.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=512,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

    pred_text = TOKENIZER.decode(generated_ids[0], skip_special_tokens=True)
    pred_text = pred_text.replace("<extra_id_0>", "").strip()  # Clean special token if appears

    all_predictions.append(pred_text)
    all_references.append(true_output_text)
    all_languages.append(language)

    print(f"\n Language: {language}")
    print(f" Input: {input_text}")
    print(f" Actual: {true_output_text}")
    print(f" Predicted: {pred_text}")

# Calculate BERTScore
lang_code_mapping = {
    "EN": "en",
    "BG": "bg",
    "HI": "hi",
    "PT": "pt",
    "RU": "ru"
}

bert_results = {}

for lang in set(all_languages):
    pred_texts = [pred for pred, l in zip(all_predictions, all_languages) if l == lang]
    ref_texts = [ref for ref, l in zip(all_references, all_languages) if l == lang]

    bert_lang_code = lang_code_mapping.get(lang, "en")

    P, R, F1 = bert_score(pred_texts, ref_texts, lang=bert_lang_code, verbose=False)

    bert_results[lang] = {
        "Precision": P.mean().item(),
        "Recall": R.mean().item(),
        "F1": F1.mean().item()
    }

# Print BERT Scores
print("\n BERT Scores per language:")
for lang, scores in bert_results.items():
    print(f"Language {lang}: Precision={scores['Precision']:.4f}, Recall={scores['Recall']:.4f}, F1={scores['F1']:.4f}")


 Model and Tokenizer Loaded Successfully!
 Selected 10 total samples for prediction.

 Language: BG
 Input: Narrative: URW: Amplifying war-related fears | Subnarrative: URW: Amplifying war-related fears: Russia will also attack other countries | Context: бившият директор на цру путин ще влезе във война с нато и първата му цел ще е молдова  заплахите на владимир путин да нахлуе в прибалтика и да предизвика нова война в европа трябва да се приемат сериозно предупреди в интервю за the sun бившият директор на цру дейвид петреъс генерал петреъс е категоричен че руският президент владимир путин няма да се задоволи да спре с украйна пенсионираният четиризвезден американски генерал който командваше съюзническите сили в ирак е заявил че съюзниците на украйна на запад трябва да направят повече за да помогнат за спечелването на войната срещу путин експертите неведнъж са предупреждавали че руският президент е насочил погледа си към повече цели в безумната си мечта да си върне изгубената империя а 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 BERT Scores per language:
Language PT: Precision=0.7476, Recall=0.6071, F1=0.6699
Language RU: Precision=0.6566, Recall=0.5664, F1=0.6079
Language EN: Precision=0.8731, Recall=0.8357, F1=0.8540
Language BG: Precision=0.7606, Recall=0.6870, F1=0.7217
Language HI: Precision=0.6487, Recall=0.5598, F1=0.6009
