In [1]:
print("HELLO WORLD")

HELLO WORLD


In [16]:
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_scheduler
from sklearn.metrics import classification_report
from tqdm import tqdm

#Constants
PERSPECTIVES = ["INFORMATION", "SUGGESTION", "EXPERIENCE", "QUESTION", "CAUSE"]
BIO_TAGS = ["O"] + [f"{tag}-{p}" for p in PERSPECTIVES for tag in ["B", "I"]]
perspective2id = {p: i for i, p in enumerate(PERSPECTIVES)}
bio2id = {t: i for i, t in enumerate(BIO_TAGS)}
id2bio = {i: t for t, i in bio2id.items()}

# Dataset 
class DualHeadDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = f"Question: {item['question']} Answer: {item['answer']}"
        encoding = self.tokenizer(text, padding="max_length", truncation=True,
                                  max_length=self.max_length, return_offsets_mapping=True)
        input_ids = encoding["input_ids"]
        attention_mask = encoding["attention_mask"]
        offsets = encoding["offset_mapping"]
        del encoding["offset_mapping"]

        # Classification labels
        perspective_labels = [0] * len(PERSPECTIVES)
        span_dict = item.get("labelled_answer_spans", {})
        for p, spans in span_dict.items():
            if p in perspective2id:
                perspective_labels[perspective2id[p]] = 1

        # BIO span tagging
        span_labels = ["O"] * len(input_ids)
        answer_text = item["answer"]
        for p, spans in span_dict.items():
            if p not in perspective2id:
                continue
            for span_entry in spans:
                span_txt = span_entry.get("txt", "").strip()
                start_idx = answer_text.find(span_txt)
                if start_idx == -1:
                    continue
                end_idx = start_idx + len(span_txt)
                for i, (start, end) in enumerate(offsets):
                    if start == 0 and end == 0:
                        continue  # special tokens
                    if end <= start_idx or start >= end_idx:
                        continue
                    tag = f"B-{p}" if start == start_idx else f"I-{p}"
                    span_labels[i] = tag

        span_label_ids = [bio2id.get(tag, 0) for tag in span_labels]

        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "perspective_labels": torch.tensor(perspective_labels, dtype=torch.float),
            "span_labels": torch.tensor(span_label_ids)
        }

# Model 
class DualHeadClassifier(nn.Module):
    def __init__(self, model_name="roberta-base", num_perspectives=5, num_span_tags=len(BIO_TAGS)):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_perspectives)
        self.tagger = nn.Linear(hidden_size, num_span_tags)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state
        cls_token = last_hidden[:, 0, :]
        cls_logits = self.classifier(self.dropout(cls_token))
        tag_logits = self.tagger(self.dropout(last_hidden))
        return cls_logits, tag_logits

# Utilities 
def load_json(path):
    with open(path) as f:
        return json.load(f)

def compute_metrics(preds, labels, threshold=0.5):
    preds_bin = (preds >= threshold).astype(int)
    report = classification_report(labels, preds_bin, target_names=PERSPECTIVES, zero_division=0, output_dict=True)
    print(json.dumps(report, indent=2))
    return report

# Training 
def train(model, dataloader, optimizer, scheduler, loss_fn_cls, loss_fn_tag, device, span_weight=0.5):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        perspective_labels = batch["perspective_labels"].to(device)
        span_labels = batch["span_labels"].to(device)

        cls_logits, tag_logits = model(input_ids, attention_mask)

        loss_cls = loss_fn_cls(cls_logits, perspective_labels)
        loss_tag = loss_fn_tag(tag_logits.view(-1, tag_logits.shape[-1]), span_labels.view(-1))
        loss = loss_cls + span_weight * loss_tag

        if torch.isnan(loss):
            print("Skipping NaN loss batch")
            continue

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Evaluation 
def evaluate(model, dataloader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["perspective_labels"].cpu().numpy()

            cls_logits, _ = model(input_ids, attention_mask)
            preds = torch.sigmoid(cls_logits).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels)

    compute_metrics(np.array(all_preds), np.array(all_labels))

# Main 
def join_answers(entry):
    answers = entry.get("answers", [])
    if isinstance(answers, str):
        return answers.strip()
    if isinstance(answers, list):
        return " ".join(a for a in answers if isinstance(a, str)).strip()
    return ""

def main():
    model_name = "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    raw_train = [ex for ex in load_json("/kaggle/input/puma-dataset/NLP_project_dataset/train.json") if isinstance(ex, dict)]
    raw_val = [ex for ex in load_json("/kaggle/input/puma-dataset/NLP_project_dataset/valid.json") if isinstance(ex, dict)]

    for entry in raw_train:
        entry["answer"] = join_answers(entry)
    for entry in raw_val:
        entry["answer"] = join_answers(entry)

    train_data = [ex for ex in raw_train if ex.get("question") and ex.get("answer")]
    val_data = [ex for ex in raw_val if ex.get("question") and ex.get("answer")]

    print(f"Loaded {len(train_data)} train and {len(val_data)} val before subsampling.")
    print(f"Using {len(train_data)} train and {len(val_data)} validation examples.")

    train_set = DualHeadDataset(train_data, tokenizer)
    val_set = DualHeadDataset(val_data, tokenizer)
    train_loader = DataLoader(train_set, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=8)

    model = DualHeadClassifier(model_name=model_name).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)
    loss_fn_cls = nn.BCEWithLogitsLoss()
    loss_fn_tag = nn.CrossEntropyLoss(ignore_index=0)

    for epoch in range(3):
        print(f"\nEpoch {epoch + 1}")
        train_loss = train(model, train_loader, optimizer, scheduler, loss_fn_cls, loss_fn_tag, device)
        print(f"Train Loss: {train_loss:.4f}")
        evaluate(model, val_loader, device)

    torch.save(model.state_dict(), "dual_classifier_final.pt")

if __name__ == "__main__":
    main()


Loaded 2236 train and 959 val before subsampling.
Using 2236 train and 959 validation examples.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1


Training: 100%|██████████| 280/280 [02:45<00:00,  1.69it/s]


Train Loss: 0.9225


Evaluating: 100%|██████████| 120/120 [00:15<00:00,  7.82it/s]


{
  "INFORMATION": {
    "precision": 0.8793324775353016,
    "recall": 0.9319727891156463,
    "f1-score": 0.9048877146631439,
    "support": 735
  },
  "SUGGESTION": {
    "precision": 0.9152249134948097,
    "recall": 0.8890756302521008,
    "f1-score": 0.9019607843137255,
    "support": 595
  },
  "EXPERIENCE": {
    "precision": 0.8821548821548821,
    "recall": 0.8291139240506329,
    "f1-score": 0.8548123980424144,
    "support": 316
  },
  "QUESTION": {
    "precision": 1.0,
    "recall": 0.00980392156862745,
    "f1-score": 0.01941747572815534,
    "support": 102
  },
  "CAUSE": {
    "precision": 0.5217391304347826,
    "recall": 0.5179856115107914,
    "f1-score": 0.5198555956678701,
    "support": 139
  },
  "micro avg": {
    "precision": 0.8639152258784161,
    "recall": 0.8208797032326444,
    "f1-score": 0.8418478260869565,
    "support": 1887
  },
  "macro avg": {
    "precision": 0.8396902807239552,
    "recall": 0.6355903752995598,
    "f1-score": 0.6401867936830618,

Training: 100%|██████████| 280/280 [02:46<00:00,  1.68it/s]


Train Loss: 0.6541


Evaluating: 100%|██████████| 120/120 [00:15<00:00,  7.75it/s]


{
  "INFORMATION": {
    "precision": 0.8476977567886659,
    "recall": 0.9768707482993197,
    "f1-score": 0.9077117572692794,
    "support": 735
  },
  "SUGGESTION": {
    "precision": 0.8543543543543544,
    "recall": 0.9563025210084034,
    "f1-score": 0.9024583663758923,
    "support": 595
  },
  "EXPERIENCE": {
    "precision": 0.8267045454545454,
    "recall": 0.9208860759493671,
    "f1-score": 0.87125748502994,
    "support": 316
  },
  "QUESTION": {
    "precision": 0.8947368421052632,
    "recall": 0.3333333333333333,
    "f1-score": 0.48571428571428565,
    "support": 102
  },
  "CAUSE": {
    "precision": 0.5850340136054422,
    "recall": 0.6187050359712231,
    "f1-score": 0.6013986013986015,
    "support": 139
  },
  "micro avg": {
    "precision": 0.8282926829268292,
    "recall": 0.8998410174880763,
    "f1-score": 0.8625857251714503,
    "support": 1887
  },
  "macro avg": {
    "precision": 0.8017055024616543,
    "recall": 0.7612195429123294,
    "f1-score": 0.75370

Training: 100%|██████████| 280/280 [02:47<00:00,  1.67it/s]


Train Loss: 0.5440


Evaluating: 100%|██████████| 120/120 [00:15<00:00,  7.76it/s]


{
  "INFORMATION": {
    "precision": 0.8754716981132076,
    "recall": 0.9469387755102041,
    "f1-score": 0.9098039215686275,
    "support": 735
  },
  "SUGGESTION": {
    "precision": 0.8938906752411575,
    "recall": 0.934453781512605,
    "f1-score": 0.913722267871816,
    "support": 595
  },
  "EXPERIENCE": {
    "precision": 0.8502994011976048,
    "recall": 0.8987341772151899,
    "f1-score": 0.8738461538461538,
    "support": 316
  },
  "QUESTION": {
    "precision": 0.8545454545454545,
    "recall": 0.46078431372549017,
    "f1-score": 0.5987261146496815,
    "support": 102
  },
  "CAUSE": {
    "precision": 0.65,
    "recall": 0.5611510791366906,
    "f1-score": 0.6023166023166023,
    "support": 139
  },
  "micro avg": {
    "precision": 0.8624091381100727,
    "recall": 0.8802331743508214,
    "f1-score": 0.8712300026226069,
    "support": 1887
  },
  "macro avg": {
    "precision": 0.8248414458194849,
    "recall": 0.760412425420036,
    "f1-score": 0.7796830120505762,
  

In [8]:
import json
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

#Constants
PERSPECTIVES = ["INFORMATION", "SUGGESTION", "EXPERIENCE", "QUESTION", "CAUSE"]
BIO_TAGS = ["O"] + [f"{tag}-{p}" for p in PERSPECTIVES for tag in ["B", "I"]]
perspective2id = {p: i for i, p in enumerate(PERSPECTIVES)}
id2perspective = {i: p for p, i in perspective2id.items()}
bio2id = {t: i for i, t in enumerate(BIO_TAGS)}
id2bio = {i: t for t, i in bio2id.items()}

def load_json(path):
    with open(path) as f:
        return json.load(f)

def join_answers(entry):
    answers = entry.get("answers", [])
    if isinstance(answers, str):
        return answers.strip()
    if isinstance(answers, list):
        return " ".join(a for a in answers if isinstance(a, str)).strip()
    return ""

class DualHeadClassifier(nn.Module):
    def __init__(self, model_name="roberta-base", num_perspectives=5, num_span_tags=len(BIO_TAGS)):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        hidden_size = self.encoder.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_perspectives)
        self.tagger = nn.Linear(hidden_size, num_span_tags)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state
        cls_token = last_hidden[:, 0, :]
        cls_logits = self.classifier(self.dropout(cls_token))
        tag_logits = self.tagger(self.dropout(last_hidden))
        return cls_logits, tag_logits

# Span decoding 
def decode_bio(tags, offsets, input_ids, tokenizer):
    spans = {p: [] for p in PERSPECTIVES}
    current_tag = None
    current_tokens = []
    for i, tag_id in enumerate(tags):
        tag = id2bio.get(tag_id, "O")
        if tag == "O":
            if current_tag and current_tokens:
                span_text = tokenizer.decode(input_ids[current_tokens[0]:current_tokens[-1]+1], skip_special_tokens=True).strip()
                spans[current_tag].append(span_text)
                current_tag = None
                current_tokens = []
            continue
        prefix, label = tag.split("-")
        if prefix == "B":
            if current_tag and current_tokens:
                span_text = tokenizer.decode(input_ids[current_tokens[0]:current_tokens[-1]+1], skip_special_tokens=True).strip()
                spans[current_tag].append(span_text)
            current_tag = label
            current_tokens = [i]
        elif prefix == "I" and current_tag == label:
            current_tokens.append(i)
        else:
            if current_tag and current_tokens:
                span_text = tokenizer.decode(input_ids[current_tokens[0]:current_tokens[-1]+1], skip_special_tokens=True).strip()
                spans[current_tag].append(span_text)
            current_tag = None
            current_tokens = []
    if current_tag and current_tokens:
        span_text = tokenizer.decode(input_ids[current_tokens[0]:current_tokens[-1]+1], skip_special_tokens=True).strip()
        spans[current_tag].append(span_text)
    return {k: v for k, v in spans.items() if v}

# Prediction loop 
def predict(model, data, tokenizer, device):
    model.eval()
    results = []
    for item in tqdm(data, desc="Predicting"):
        question = item["question"]
        answer = join_answers(item)
        text = f"Question: {question} Answer: {answer}"
        encoding = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=512, return_offsets_mapping=True)
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)
        offsets = encoding["offset_mapping"][0].tolist()

        with torch.no_grad():
            cls_logits, tag_logits = model(input_ids, attention_mask)
            pred_labels = torch.sigmoid(cls_logits).squeeze(0).cpu().numpy()
            pred_bio = torch.argmax(tag_logits, dim=-1).squeeze(0).cpu().numpy()

        predicted_perspectives = [id2perspective[i] for i, prob in enumerate(pred_labels) if prob > 0.5]
        spans = decode_bio(pred_bio, offsets, input_ids[0].tolist(), tokenizer)

        results.append({
            "question": question,
            "answer": answer,
            "predicted_perspectives": predicted_perspectives,
            "predicted_spans": spans
        })
    return results

# main 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = DualHeadClassifier()
model.load_state_dict(torch.load("dual_classifier_full.pt", map_location=device))
model.to(device)

test_data = load_json("/kaggle/input/puma-dataset/NLP_project_dataset/test.json")
for entry in test_data:
    entry["answer"] = join_answers(entry)
test_data = [ex for ex in test_data if ex.get("question") and ex.get("answer")]

print(f"Loaded {len(test_data)} test samples.")
predictions = predict(model, test_data, tokenizer, device)

# Save predictions
with open("predictions.json", "w") as f:
    json.dump(predictions, f, indent=2)

print("✅ Predictions saved to predictions.json")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("dual_classifier_full.pt", map_location=device))


Loaded 640 test samples.


Predicting: 100%|██████████| 640/640 [00:13<00:00, 48.97it/s]

✅ Predictions saved to predictions.json





In [None]:
!pip install -q peft accelerate evaluate datasets transformers sacrebleu bert-score
!pip install -U transformers
!pip install datasets
!pip install evaluate
!pip install bert-score
!pip install sacrebleu
