In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
!pip install transformers
!pip install datasets
!pip install peft
!pip install evaluate
!pip install tqdm
!pip install torch
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install bert_score
!pip install sacrebleu

Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is

In [20]:
import os
import torch
import shutil
from datasets import load_dataset, Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from evaluate import load
from tqdm import tqdm

# # BART + LORA code

# remove existing adapter directory if it exists
if os.path.exists("./bart-optimized"):
    shutil.rmtree("./bart-optimized")

# set cuda memory configuration
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()

# load evaluation metrics
bleu = load("sacrebleu")
bertscore = load("bertscore")

# load tokenizer and model
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
base_model = BartForConditionalGeneration.from_pretrained(model_name)

# configure lora
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(base_model, peft_config)

# load dataset
dataset = load_dataset("json", data_files={
    "train": "/kaggle/input/nlp-project/train_project.json",
    "val": "/kaggle/input/nlp-project/valid_project.json",
    "test": "/kaggle/input/nlp-project/test_project.json"
})

# define target summary labels
perspectives = ["INFORMATION", "SUGGESTION", "EXPERIENCE", "QUESTION", "CAUSE"]

# clean invalid or placeholder summary values
def clean_summary(text):
    if not text or not isinstance(text, str):
        return ""
    stripped = text.strip()
    if stripped.lower() in ["false", "true", "not_duplicate", "n/a", "duplicate", ""]:
        return ""
    return stripped

# format examples into model input-output pairs
def format_example(example):
    input_text = (
        f"Context: {example.get('context', '').strip()}\n"
        f"Question: {example.get('question', '').strip()}\n"
        f"Answers: {' '.join(example.get('answers', [])).strip()}"
    )
    labelled_summaries = example.get("labelled_summaries", {})
    output_lines = []
    for label in perspectives:
        summary = clean_summary(labelled_summaries.get(f"{label}_SUMMARY", ""))
        if not summary:
            summary = "No summary available."
        output_lines.append(f"{label} SUMMARY: {summary}")
    return {"input": input_text.strip(), "output": "\n".join(output_lines).strip()}

# format datasets
train_data = [format_example(ex) for ex in tqdm(dataset["train"], desc="formatting train")]
val_data   = [format_example(ex) for ex in tqdm(dataset["val"], desc="formatting val")]
test_data  = [format_example(ex) for ex in tqdm(dataset["test"], desc="formatting test")]

train_dataset = Dataset.from_list(train_data)
val_dataset   = Dataset.from_list(val_data)
test_dataset  = Dataset.from_list(test_data)

# tokenize and encode datasets
def preprocess(examples):
    inputs = tokenizer(examples["input"], padding="max_length", max_length=512, truncation=True)
    targets = tokenizer(examples["output"], padding="max_length", max_length=256, truncation=True)
    inputs["labels"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in targets["input_ids"]
    ]
    return inputs

train_tok = train_dataset.map(preprocess, batched=True)
val_tok = val_dataset.map(preprocess, batched=True)
test_tok = test_dataset.map(preprocess, batched=True)


Formatting train: 100%|██████████| 2236/2236 [00:00<00:00, 5752.69it/s]
Formatting val: 100%|██████████| 959/959 [00:00<00:00, 5937.09it/s]
Formatting test: 100%|██████████| 640/640 [00:00<00:00, 5798.37it/s]


Map:   0%|          | 0/2236 [00:00<?, ? examples/s]

Map:   0%|          | 0/959 [00:00<?, ? examples/s]

Map:   0%|          | 0/640 [00:00<?, ? examples/s]

In [61]:
print(train_tok[0])
# print(dataset["train"][0])

{'input': 'Context: \nQuestion: what is parkinesonism?\nAnswers: u spelt it wrong !!\nParkinson\'s disease is one of the most common neurologic disorders of the elderly. The term "parkinsonism" refers to any condition that causes any combination of the types of movement abnormalities seen in Parkinson\'s disease by damaging or destroying dopamine neurons in a certain area of the brain. Parkinsonism describes the common symptoms of Parkinson\'s disease - tremor, rigidity, akinesia or bradykinesia and postural instability. Those patients who respond to drug treatment for Parkinson\'s disease are diagnosed with it, and those who do not have parkinsonism.', 'output': 'INFORMATION SUMMARY: Parkinson\'s disease is a prevalent neurologic disorder among the elderly. The term "parkinsonism" encompasses any condition leading to movement abnormalities similar to those observed in Parkinson\'s disease. This condition arises from the damage or destruction of dopamine neurons in a specific brain reg

In [17]:
from math import ceil
from transformers import TrainingArguments, Trainer
import os
from tqdm.auto import tqdm 

# disable wandb logging
os.environ["WANDB_DISABLED"] = "true"

# define batch size
train_batch_size = 4

# set training arguments
training_args = TrainingArguments(
    output_dir="./bart-optimized",
    do_eval=True,
    eval_steps=None,  # will be set after initialization
    save_strategy="epoch",
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    gradient_accumulation_steps=2,
    save_total_limit=2,
    logging_steps=50,
    fp16=False,
    local_rank=-1,
    label_names=["labels"],  # required for peft models
    logging_dir="./logs",
    report_to=[]  # prevent integration with external loggers
)

# compute evaluation steps based on dataset size
eval_steps = ceil(len(train_tok) / train_batch_size)
training_args.eval_steps = eval_steps

# initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer
)

# start training - with tqdm
trainer.train()

# BART + LORA MODEL SAVED
# save final model and tokenizer
model.save_pretrained("./bart-optimized")
tokenizer.save_pretrained("./bart-optimized")


  trainer = Trainer(


In [18]:
import os
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from peft import get_peft_model, LoraConfig, TaskType
from evaluate import load
from tqdm.auto import tqdm
from datasets import load_dataset

# EVALUATING BART + LORA MODEL

# setting device
device = "cuda" if torch.cuda.is_available() else "cpu"

# specifying path to the trained model
model_path = "/kaggle/input/bart-optimized/pytorch/default/1"

# loading tokenizer and base model
tokenizer = BartTokenizer.from_pretrained(model_path)
base_model = BartForConditionalGeneration.from_pretrained(model_path)

# configuring lora for inference
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=True,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(base_model, peft_config)
model.to(device)
model.eval()

# defining list of target perspectives
perspectives = ["INFORMATION", "SUGGESTION", "EXPERIENCE", "QUESTION", "CAUSE"]

# generating summary for each perspective
def generate_summary(input_text):
    summaries = {}
    for perspective in perspectives:
        prompt = (
            f"Generate a {perspective} summary:\n"
            f"{input_text}\n"
            f"Provide a clear and structured {perspective.lower()} summary."
        )
        input_ids = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).input_ids.to(device)
        output_ids = model.generate(input_ids=input_ids, max_length=150, num_beams=5)
        generated = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
        summaries[perspective] = generated
    return summaries

# parsing reference summaries from formatted output
def parse_reference(output_text):
    refs = {}
    for line in output_text.strip().split("\n"):
        if "SUMMARY:" in line:
            parts = line.split("SUMMARY:", 1)
            if len(parts) == 2:
                label = parts[0].strip().split()[0].upper()
                summary = parts[1].strip()
                if summary not in ["False", "True", "No summary available."]:
                    refs[label] = summary
    return refs

# loading evaluation metrics
bleu = load("sacrebleu")
bertscore = load("bertscore")

# evaluating model using bleu and bertscore
def evaluate(dataset, name="test"):
    results = {p: {"references": [], "predictions": []} for p in perspectives}
    for ex in tqdm(dataset, desc=f"evaluating {name}"):
        ref = parse_reference(ex["output"])
        pred = generate_summary(ex["input"])
        for p in perspectives:
            if p in ref:
                results[p]["references"].append([ref[p]])
                results[p]["predictions"].append(pred[p])
    for p in perspectives:
        refs = results[p]["references"]
        preds = results[p]["predictions"]
        if refs and preds:
            bleu_score = bleu.compute(predictions=preds, references=refs)["score"]
            bert_result = bertscore.compute(predictions=preds, references=[r[0] for r in refs], lang="en")
            bert_avg = sum(bert_result["f1"]) / len(bert_result["f1"])
            print(f"{p} - BLEU: {bleu_score:.8f}, BERTScore: {bert_avg:.4f}")
        else:
            print(f"{p} - not having enough data.")

# saving generated predictions to file
def save_predictions(dataset, file="bart_test_predictions.txt"):
    with open(file, "w") as f:
        for ex in tqdm(dataset, desc="saving predictions"):
            f.write("INPUT:\n" + ex["input"] + "\n\n")
            pred = generate_summary(ex["input"])
            for p in perspectives:
                f.write(f"{p} SUMMARY:\n{pred[p]}\n\n")
            f.write("-" * 80 + "\n")

# evaluating and saving predictions
evaluate(test_dataset, "test")
save_predictions(test_dataset, "bart_test_predictions.txt")



In [19]:
from evaluate import load
from tqdm import tqdm
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"

# # FIND HARD EXAMPLES FROM THE TRAINING DATATSET FOR FINE TUNING - ADVERSARIAL MODEL

bleu = load("bleu")
bertscore = load("bertscore")


# check if a sample is hard based on BLEU or BERTScore
def is_hard_example(pred, ref, threshold=0.84, metric="bertscore"):
    if metric == "bleu":
        score = bleu.compute(predictions=[pred], references=[[ref]])["score"] / 100
    else:
        score = bertscore.compute(predictions=[pred], references=[ref], lang="en", device=device)["f1"][0]
        # print(score)
    return score < threshold

# count=0
# get all hard examples from a dataset
def get_hard_examples(dataset, perspectives, threshold=0.84, metric="bertscore"):
    hard_inputs = []
    for ex in tqdm(dataset, desc="Mining Hard Examples"):
        ref_summaries = parse_reference(ex["output"])
        gen_summaries = generate_summary(ex["input"])
        for p in perspectives:
            if p in ref_summaries:
                if is_hard_example(gen_summaries[p], ref_summaries[p], threshold, metric):
                    # count+=1
                    hard_inputs.append(ex)
                    break  # At least one hard perspective is enough
    print(f"Total Hard Examples: {len(hard_inputs)}")
    return Dataset.from_list(hard_inputs)


# identify hard examples
# val_subset = random.sample(list(val_dataset), int(len(val_dataset)))

hard_train_set = get_hard_examples(train_dataset, perspectives)

# print(count)

print(f"Total hard examples found: {len(hard_train_set)}")

# convert to list (only safe for small datasets)

# # save hard examples
Dataset.from_list(hard_train_set).to_json("hard_train_examples.json", indent=2)

Mining Hard Examples:   0%|          | 0/2236 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Mining Hard Examples: 100%|██████████| 2236/2236 [3:42:51<00:00,  5.98s/it]  

Total Hard Examples: 598
Total hard examples found: 598





Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1530230

In [38]:
from datasets import Dataset
import json


# loading data from a text file containing JSON lines
def load_data(file_path):
    examples = []
    with open(file_path, 'r') as file:
        content = file.readlines()
        # parsing each line as a JSON object with 'input' and 'output' fields
        for line in content:
            try:
                example = json.loads(line)
                examples.append(example)
            except json.JSONDecodeError:
                continue
    return examples

# converting the list of examples into a HuggingFace Dataset
hard_examples = load_data('/kaggle/input/hard-textfile/hard_examples.txt')
hard_train_set = Dataset.from_dict({
    "input": [example['input'] for example in hard_examples],
    "output": [example['output'] for example in hard_examples]
})

# applying preprocessing to the dataset
def preprocess(examples):
    model_inputs = tokenizer(examples["input"], padding="max_length", max_length=512, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], padding="max_length", max_length=256, truncation=True)

    # replacing padding tokens with -100 to ignore in loss computation
    model_inputs["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in labels_seq]
        for labels_seq in labels["input_ids"]
    ]
    return model_inputs

# mapping preprocessing function over the dataset
hard_tok = hard_train_set.map(preprocess, batched=True)

# now hard_tok is containing the preprocessed tokenized dataset


In [42]:
import json
import re


# CONVERTEDF THE HARD_TRAIN_EXAMPLES.JSON FILE TO TXT MANUALLY (there was some issue with the fjson object formatting)
# THEN CONVERTING THE TXT FILE TO JSON FILE AGAIN - CONVERTED_OUTPUT.JSON and using it for further work

def extract_json_objects_from_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    pattern = re.compile(r'\{\s*"input"\s*:.*?"output"\s*:.*?\}', re.DOTALL)
    matches = pattern.findall(text)

    json_objects = []
    for match in matches:
        try:
            json_obj = json.loads(match)
            json_objects.append(json_obj)
        except json.JSONDecodeError as e:
            print(f"Skipping malformed JSON object: {e}")
            continue

    return json_objects

def convert_txt_to_json(input_txt_path, output_json_path):
    data = extract_json_objects_from_text(input_txt_path)
   
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Converted {len(data)} examples to {output_json_path}")

convert_txt_to_json("/kaggle/input/hard-textfile/hard_examples.txt", "converted_output.json")


Converted 598 examples to converted_output.json


In [54]:
# load and preprocess hard examples
hard_dataset = load_dataset("json", data_files={"train": "/kaggle/working/converted_output.json"})["train"]

# print(hard_dataset[0])

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )

    # tokenize teh targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["output"],
            max_length=256,
            padding="max_length",
            truncation=True,
        )

    # replace pad token ids in labels with -100 (for ignoring in loss)
    labels_input_ids = labels["input_ids"]
    labels_input_ids = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label_seq]
        for label_seq in labels_input_ids
    ]

    model_inputs["labels"] = labels_input_ids
    return model_inputs


tokenized_hard_dataset = hard_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/598 [00:00<?, ? examples/s]

In [51]:
print(tokenized_hard_dataset[0])
# print(val_tok[0])

{'input': "Context: My mom always told me that I talk a lot in my sleep and sometimes I wake up shouting too, but recently my husband told me that every alternate night I start fighting with someone or the other - in very loud tones and most of the time I also swear and abuse the person I'm fighting with.\n\nThis is really freaking me out and I dont remember who I fight with or what I dream, but this also disturbs him and we dont have kids yet, but what will my babies think of me when they hear all kinds of profanity from their mommy's mouthwhile she's in dreamland! \n\nI need to make it stop but I dont even know where to begin. Somebody?\nQuestion: I scream, shout and swear in my sleep. How do I stop?\nAnswers: hmm i would say duck tape but thats a lil to extreme tell your husband to record you one day and then you guys watch the tape and if you think its needed take it to a Psyc Dr to see what he thinks shove a sock in your mouth marry as soon as possible heavy drugs I think that you

In [None]:
import os
import torch
from math import ceil
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from safetensors.torch import load_file

# setting the visible cuda device (ensuring index 6 exists on your system)
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

# defining all perspectives used for multi-perspective summarization
PERSPECTIVES = ["INFORMATION", "SUGGESTION", "EXPERIENCE", "QUESTION", "CAUSE"]

# defining helper function for cleaning summary text by filtering out empty or irrelevant strings
def clean_summary(text):
    if not text or not isinstance(text, str):
        return ""
    stripped = text.strip()
    if stripped.lower() in ["false", "true", "not_duplicate", "duplicate", "n/a", "", "no summary available."]:
        return ""
    return stripped

# defining function to format raw examples into consistent format with cleaned perspective summaries
def format_example(example):
    input_text = example["input"].strip()
    output_sections = example["output"].strip().split("\n")
    formatted_lines = []
    for label in PERSPECTIVES:
        matches = [line for line in output_sections if line.startswith(f"{label} SUMMARY:")]
        if matches:
            summary = clean_summary(matches[0].split("SUMMARY:", 1)[-1])
            if summary:
                formatted_lines.append(f"{label} SUMMARY: {summary}")
    return {"input": input_text, "output": "\n".join(formatted_lines)}

# loading tokenizer and base bart model
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
base_model = BartForConditionalGeneration.from_pretrained(model_name)

# applying lora (low-rank adaptation) for parameter-efficient fine-tuning
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(base_model, peft_config)

# loading lora weights from safetensors file
lora_weights_path = "./bart-optimized/adapter_model.safetensors"
lora_weights = load_file(lora_weights_path)
model.load_state_dict(lora_weights, strict=False)
model.train()

# loading raw dataset and formatting it for training
raw_data = load_dataset("json", data_files={"train": "./hard_examples.json"})["train"]
formatted_data = [format_example(ex) for ex in tqdm(raw_data, desc="formatting hard examples")]
dataset = Dataset.from_list(formatted_data)

# defining tokenization function for input-output pairs
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["input"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["output"],
            max_length=256,
            padding="max_length",
            truncation=True,
        )
    # masking pad tokens in labels for loss calculation
    labels_input_ids = labels["input_ids"]
    labels_input_ids = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label_seq]
        for label_seq in labels_input_ids
    ]
    model_inputs["labels"] = labels_input_ids
    return model_inputs

# applying preprocessing and removing original columns
tokenized = dataset.map(preprocess_function, batched=True)
tokenized = tokenized.remove_columns(dataset.column_names)

# defining training arguments for trainer api
training_args = TrainingArguments(
    output_dir="./hard_example_lora",
    do_eval=True,
    eval_steps=ceil(len(tokenized) / 4),
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_total_limit=2,
    logging_steps=10,
    fp16=False,
    local_rank=-1,
    label_names=["labels"],
    logging_dir="./logs",
    report_to=[],  # disabling reporting to external services like wandb
)

# initializing trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    tokenizer=tokenizer
)

# fine-tuning the model and saving artifacts
trainer.train()
model.save_pretrained("./bart-lora-hard")
tokenizer.save_pretrained("./bart-lora-hard")
torch.save(model.state_dict(), "bart-lora-hard.pt")

# to save lora weights in safetensors format instead:
# from safetensors.torch import save_file
# save_file(model.state_dict(), "bart-lora-hard.safetensors")


In [20]:
import os
import json
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm

# For evaluation
import evaluate


# --- Constants & Mappings ---

PERSPECTIVES = ["INFORMATION", "SUGGESTION", "EXPERIENCE", "QUESTION", "CAUSE"]
BIO_TAGS = ["O"] + [f"{tag}-{p}" for p in PERSPECTIVES for tag in ["B", "I"]]
perspective2id = {p: i for i, p in enumerate(PERSPECTIVES)}
id2perspective = {i: p for p, i in perspective2id.items()}
bio2id = {t: i for i, t in enumerate(BIO_TAGS)}
id2bio = {i: t for t, i in bio2id.items()}


def load_json(path):
    with open(path, "r") as f:
        return json.load(f)

def join_answers(entry):
    answers = entry.get("answers", [])
    if isinstance(answers, str):
        return answers.strip()
    if isinstance(answers, list):
        return " ".join(a for a in answers if isinstance(a, str)).strip()
    return ""

def get_reference_summaries(example):
    
    # Extract reference summaries from the test example.
    # sssuming each example may contain a "labelled_summaries" field with keys like "INFORMATION_SUMMARY".
    refs = {}
    labelled_summaries = example.get("labelled_summaries", {})
    for perspective in PERSPECTIVES:
        key = f"{perspective}_SUMMARY"
        ref = labelled_summaries.get(key, "").strip()
        if ref and ref.lower() not in ["false", "true", "not_duplicate", "n/a", "duplicate"]:
            refs[perspective] = ref
    return refs



# --- Classifier Model ---

from transformers import AutoTokenizer, AutoModel

class DualHeadClassifier(nn.Module):
    def __init__(self, model_name="roberta-base", num_perspectives=5, num_span_tags=len(BIO_TAGS)):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_perspectives)
        self.tagger = nn.Linear(hidden_size, num_span_tags)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state
        cls_token = last_hidden[:, 0, :]
        cls_logits = self.classifier(self.dropout(cls_token))
        tag_logits = self.tagger(self.dropout(last_hidden))
        return cls_logits, tag_logits



# --- Generator Function ---

from transformers import BartTokenizer, BartForConditionalGeneration

def generate_summary_for_perspective(input_text, perspective, generator_model, generator_tokenizer, device):
    prompt = (
        f"Generate a {perspective} summary:\n"
        f"{input_text}\n"
        f"Provide a clear and structured {perspective.lower()} summary."
    )
    inputs = generator_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    input_ids = inputs.input_ids.to(device)
    output_ids = generator_model.generate(input_ids=input_ids, max_length=150, num_beams=5)
    summary = generator_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return summary.strip()



# --- Evaluation Function ---

def evaluate_predictions(pipeline_results):
    # load evaluation metrics
    bleu_metric = evaluate.load("sacrebleu")
    bertscore_metric = evaluate.load("bertscore")
   
    # initialize results container per perspective.
    eval_results = {p: {"references": [], "predictions": []} for p in PERSPECTIVES}
   
    for item in pipeline_results:
        # pipeline_results items include a "reference_summaries" field (if available)
        ref_summaries = item.get("reference_summaries", {})
        pred_summaries = item.get("generated_summaries", {})
        for perspective in PERSPECTIVES:
            if perspective in ref_summaries and perspective in pred_summaries:
                eval_results[perspective]["references"].append(ref_summaries[perspective])
                eval_results[perspective]["predictions"].append(pred_summaries[perspective])
   
    # compute BLEU and BERTScore for each perspective
    for perspective in PERSPECTIVES:
        refs = eval_results[perspective]["references"]
        preds = eval_results[perspective]["predictions"]
        if refs and preds:
            bleu_score = bleu_metric.compute(
                predictions=preds,
                references=[[ref] for ref in refs]
            )["score"]
            bert_result = bertscore_metric.compute(
                predictions=preds,
                references=refs,
                lang="en"
            )
            bert_avg = np.mean(bert_result["f1"])
            print("{} - BLEU: {:.8f}, BERTScore: {:.4f}".format(perspective, bleu_score, bert_avg))
        else:
            print("{} - Not enough data for evaluation.".format(perspective))


    
# ---------------------------
# --- Main Pipeline ---
# ---------------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------- classifier model and tokenizer loading----------
    
classifier_model_path = "/kaggle/input/dual-classifier-model/pytorch/default/1/dual_classifier_final.pt"
classifier_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
classifier_model = DualHeadClassifier(model_name="roberta-base",
                                      num_perspectives=len(PERSPECTIVES),
                                      num_span_tags=len(BIO_TAGS))
classifier_state = torch.load(classifier_model_path, map_location=device)
classifier_model.load_state_dict(classifier_state)
classifier_model.to(device)
classifier_model.eval()


# ---------- generator model and tokenizer loading ----------

generator_model_path = "/kaggle/input/bard-lora-hard-2/pytorch/default/1"
generator_tokenizer = BartTokenizer.from_pretrained(generator_model_path)
generator_model = BartForConditionalGeneration.from_pretrained(generator_model_path)
generator_model.to(device)
generator_model.eval()

test_data_path = "/kaggle/input/nlp-project/test_project.json"
test_data = load_json(test_data_path)
# test_data = test_data[:int(0.01 * len(test_data))]

pipeline_results = []

for item in tqdm(test_data, desc="Processing Test Examples"):
    question = item.get("question", "").strip()
    answer = join_answers(item)

    # reference summaries
    reference_summaries = get_reference_summaries(item)

    if not question or not answer:
        continue

    # combine text input as in classifier prediction
    text = f"Question: {question} Answer: {answer}"

    # ---------- classifier prediction ----------
    
    encoding = classifier_tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512,
        return_offsets_mapping=True
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    
    with torch.no_grad():
        cls_logits, _ = classifier_model(input_ids, attention_mask)
        # apply sigmoid to get probabilities for multi-label classification
        pred_probs = torch.sigmoid(cls_logits).squeeze(0).cpu().numpy()

    # select predicted perspectives with probability > 0.5
    predicted_perspectives = [id2perspective[i] for i, prob in enumerate(pred_probs) if prob > 0.5]


    
    # ---------- generator summary for each predicted perspective ----------
    
    generated_summaries = {}
    if predicted_perspectives:
        for perspective in predicted_perspectives:
            summary = generate_summary_for_perspective(text, perspective, generator_model, generator_tokenizer, device)
            generated_summaries[perspective] = summary
    else:
        # in case no perspective exceeds threshold, generate summaries for all
        for perspective in PERSPECTIVES:
            summary = generate_summary_for_perspective(text, perspective, generator_model, generator_tokenizer, device)
            generated_summaries[perspective] = summary

    pipeline_results.append({
        "question": question,
        "answer": answer,
        "predicted_perspectives": predicted_perspectives,
        "generated_summaries": generated_summaries,
        "reference_summaries": reference_summaries
    })


# Saving pipeline results to file 
output_file = "pipeline_2_test_predictions.json"
with open(output_file, "w") as f:
    json.dump(pipeline_results, f, indent=2)
print(f"Pipeline complete. Results saved to {output_file}")


# ---------- Evaluatations ----------

evaluate_predictions(pipeline_results)



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  classifier_state = torch.load(classifier_model_path, map_location=device)
Processing Test Examples: 100%|██████████| 640/640 [26:48<00:00,  2.51s/it]


Pipeline complete. Results saved to pipeline_2_test_predictions.json


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


INFORMATION - BLEU: 10.43987691, BERTScore: 0.8770
SUGGESTION - BLEU: 6.43254635, BERTScore: 0.8635
EXPERIENCE - BLEU: 3.91803737, BERTScore: 0.8465
QUESTION - BLEU: 0.50806114, BERTScore: 0.8376
CAUSE - BLEU: 6.74203294, BERTScore: 0.8676
