<a href="https://colab.research.google.com/github/ydeng9950/Learn/blob/main/Copy_of_DProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# File Uploading

In [None]:
# === Step 1: Upload the Excel file ===
from google.colab import files
import pandas as pd
import json

# Prompt file upload
uploaded = files.upload()

# Automatically get the uploaded filename
xlsx_filename = next(iter(uploaded))

# === Load Excel file ===
df = pd.read_csv(xlsx_filename)

# === Rename columns for consistency ===
# Eename 'dialogue_num' → 'Dialogue_ID' and 'text' → 'EDU_text'
df = df.rename(columns={
    "dialogue_num": "Dialogue_ID",
    "text": "EDU_text"
})

# === Check necessary columns exist ===
assert 'Dialogue_ID' in df.columns, "Missing column: 'Dialogue_ID'"
assert 'EDU_text' in df.columns, "Missing column: 'EDU_text'"

# === Group EDUs by dialogue ===
grouped = df.groupby("Dialogue_ID")["EDU_text"].apply(list)

# === Filter dialogues with >2 EDUs only ===
filtered_dialogues = [
    {"dialogue_id": str(dialogue_id), "edus": edus}
    for dialogue_id, edus in grouped.items()
    if 2 <= len(edus)
]

# === Save to JSON Lines format ===
output_path = "/content/preprocessed_dialogues.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for dialogue in filtered_dialogues:
        json.dump(dialogue, f, ensure_ascii=False)
        f.write("\n")

print(f"Preprocessing complete. {len(filtered_dialogues)} dialogues saved to: {output_path}")

Saving incoming_base_spect.csv to incoming_base_spect.csv
Preprocessing complete. 1101 dialogues saved to: /content/preprocessed_dialogues.jsonl


# BART Sentence Ordering Pre 5 Splits (NEW)

In [None]:
import json
import random
import pandas as pd
from collections import Counter
from pathlib import Path
from google.colab import files

# === Load CSV ===
df = pd.read_csv("incoming_base_spect.csv")  # Adjust path if needed

# === Group by dialogue (exclude rows with '0' relation) ===
grouped = df.groupby(['doc', 'dialogue_num'])

processed_dialogues = []

# === Preprocess each dialogue ===
for (doc_id, dlg_num), group in grouped:
    group = group.dropna(subset=['text', 'relation_type'])
    edus = [e.strip() for e in group['text'].tolist() if e.strip() and e.strip() != "0"]
    relations = [r.strip() for r in group['relation_type'].tolist() if r.strip() and r.strip() != "0"]

    if len(edus) < 3:
        continue

    shuffled = edus.copy()
    for _ in range(10):
        random.shuffle(shuffled)
        if shuffled != edus:
            break
    else:
        continue

    rel_counts = Counter(relations)
    if not rel_counts:
        continue

    max_count = max(rel_counts.values())
    dominant_relations = [rel for rel, count in rel_counts.items() if count == max_count]
    if len(dominant_relations) != 1:
        continue

    dominant_relation = dominant_relations[0]
    deduplicated_edus = list(dict.fromkeys(edus))

    processed_dialogues.append({
        "dialogue_id": f"{doc_id}_{dlg_num}",
        "source": " ||| ".join(shuffled).strip(),
        "target": " ||| ".join(deduplicated_edus).strip(),
        "dominant_relation": dominant_relation
    })

# === Save full dataset ===
with open("bart_sentence_ordering.jsonl", "w", encoding="utf-8") as f:
    for item in processed_dialogues:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")
files.download("bart_sentence_ordering.jsonl")

# === Split into 5 folds (train/test only) ===
def make_5_fold_train_test_splits(data, seed=42):
    random.seed(seed)
    random.shuffle(data)
    k = 5
    fold_size = len(data) // k
    remainder = len(data) % k

    fold_sizes = [fold_size] * k
    for i in range(remainder):
        fold_sizes[i] += 1

    folds = []
    idx = 0
    for size in fold_sizes:
        folds.append(data[idx:idx + size])
        idx += size

    return folds

# === Save all 5 train/test folds ===
def save_5_fold_splits(folds):
    for i in range(5):
        test = folds[i]
        train = [item for j, fold in enumerate(folds) if j != i for item in fold]

        for name, split_data in zip(["train", "test"], [train, test]):
            filename = f"bart_ordering_fold{i+1}_{name}.jsonl"
            with open(filename, "w", encoding="utf-8") as f:
                for item in split_data:
                    json.dump(item, f, ensure_ascii=False)
                    f.write("\n")
            print(f"✅ Fold {i+1} | {name}: {len(split_data)} examples → {filename}")
            files.download(filename)

# === Run everything ===
folds = make_5_fold_train_test_splits(processed_dialogues)
save_5_fold_splits(folds)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 1 | train: 572 examples → bart_ordering_fold1_train.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 1 | test: 143 examples → bart_ordering_fold1_test.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 2 | train: 572 examples → bart_ordering_fold2_train.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 2 | test: 143 examples → bart_ordering_fold2_test.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 3 | train: 572 examples → bart_ordering_fold3_train.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 3 | test: 143 examples → bart_ordering_fold3_test.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 4 | train: 572 examples → bart_ordering_fold4_train.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 4 | test: 143 examples → bart_ordering_fold4_test.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 5 | train: 572 examples → bart_ordering_fold5_train.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Fold 5 | test: 143 examples → bart_ordering_fold5_test.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# NEW BART Sentence Ordering Fine-tuning

In [None]:
# === Install Required Packages ===
!pip install transformers datasets scipy --quiet

# === Imports ===
import json
import gc
import torch
from torch.utils.data import Dataset
from transformers import (
    BartTokenizerFast, BartForConditionalGeneration,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# === Load Tokenizer and Add Custom Token ===
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base")
tokenizer.add_tokens(["[REL]"])

# === Dataset Class ===
class BartOrderingDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        source = item["source"]
        target = item["target"]

        inputs = self.tokenizer(
            source,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        targets = self.tokenizer(
            target,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        labels = targets.input_ids.squeeze()
        labels[labels == tokenizer.pad_token_id] = -100

        return {
            "input_ids": inputs.input_ids.squeeze(),
            "attention_mask": inputs.attention_mask.squeeze(),
            "labels": labels
        }

# === Helper to Load JSONL ===
def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line.strip()) for line in f]

# === 5-Fold Training Loop ===
for i in range(1, 6):
    print(f"\n🔁 Training Fold {i}...")

    torch.cuda.empty_cache()
    gc.collect()

    train_data = load_jsonl(f"bart_ordering_fold{i}_train.jsonl")
    test_data = load_jsonl(f"bart_ordering_fold{i}_test.jsonl")

    train_dataset = BartOrderingDataset(train_data, tokenizer)
    test_dataset = BartOrderingDataset(test_data, tokenizer)

    # ✅ Load smaller model
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    model.resize_token_embeddings(len(tokenizer))

    # Training arguments
    training_args = TrainingArguments(
        output_dir=f"bart_ordering_model_fold{i}",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=20,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        weight_decay=0.01,
        logging_dir=f"logs_fold{i}",
        logging_steps=10,
        fp16=True,
        label_smoothing_factor=0.1,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    model.save_pretrained(f"bart_ordering_model_fold{i}")
    tokenizer.save_pretrained(f"bart_ordering_model_fold{i}")

    del model
    del trainer
    torch.cuda.empty_cache()
    gc.collect()

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]


🔁 Training Fold 1...


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,5.3735,5.245552
2,5.32,5.101549
3,4.9565,5.060081
4,4.898,5.077864
5,4.5194,5.058058
6,4.4527,4.994711
7,4.5527,5.059235
8,4.5286,5.092228
9,4.1206,5.128806


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



🔁 Training Fold 2...


Epoch,Training Loss,Validation Loss
1,5.2805,5.362034
2,5.3022,5.231116
3,4.9754,5.157842
4,5.0727,5.132148
5,4.6323,5.114887
6,4.4376,5.111178
7,4.7154,5.136733
8,4.6502,5.133573
9,4.2719,5.176898


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



🔁 Training Fold 3...


Epoch,Training Loss,Validation Loss
1,5.3373,5.191348
2,5.3315,5.046008
3,5.0224,4.971747
4,5.1339,4.936211
5,4.6818,4.93367
6,4.574,4.92131
7,4.7434,4.978647
8,4.759,4.961386
9,4.3319,5.032495


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



🔁 Training Fold 4...


Epoch,Training Loss,Validation Loss
1,5.3413,5.236413
2,5.3081,5.079155
3,5.0156,5.031621
4,5.0365,5.046077
5,4.6795,5.026499
6,4.6657,5.072341
7,4.6457,5.102124
8,4.7433,5.118896


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



🔁 Training Fold 5...


Epoch,Training Loss,Validation Loss
1,5.4794,5.192919
2,5.1771,5.023526
3,5.0697,4.980801
4,5.0323,4.931146
5,4.7029,4.891454
6,4.6286,4.894261
7,4.5617,4.914441
8,4.7528,4.942461


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


# New BART Sentence Ordering Fine-tuing Evaluation

In [None]:
from collections import defaultdict
from scipy.stats import spearmanr
from difflib import SequenceMatcher
import math
import torch
import json
from transformers import BartTokenizerFast, BartForConditionalGeneration

def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line.strip()) for line in f]

def clean_edus_list(edus):
    return [e.strip() for e in edus if e.strip() and e.strip() != "0"]

def align_pred_to_gold(pred_edus, gold_edus):
    aligned = []
    used = set()
    for pred in pred_edus:
        best_match = None
        best_score = 0.0
        for gold in gold_edus:
            if gold in used:
                continue
            score = SequenceMatcher(None, pred, gold).ratio()
            if score > best_score:
                best_score = score
                best_match = gold
        if best_match:
            aligned.append(best_match)
            used.add(best_match)
    for g in gold_edus:
        if g not in aligned:
            aligned.append(g)
    return aligned[:len(gold_edus)]

# === Settings ===
max_len = 768
verbose = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Global Statistics ===
global_relation_scores = defaultdict(list)
global_relation_counts = defaultdict(int)
all_rhos = []

# === Run Evaluation for Each Fold ===
for i in range(1, 6):
    print(f"\n================ Fold {i} Evaluation ================\n")

    tokenizer = BartTokenizerFast.from_pretrained(f"bart_ordering_model_fold{i}")
    model = BartForConditionalGeneration.from_pretrained(f"bart_ordering_model_fold{i}")
    model.to(device)
    model.eval()

    test_data = load_jsonl(f"bart_ordering_fold{i}_test.jsonl")
    relation_scores = defaultdict(list)
    total_scores = []
    skipped = 0

    for example in test_data:
        gold_edus = clean_edus_list(example["target"].split("|||"))
        if len(gold_edus) < 3:
            skipped += 1
            continue

        inputs = tokenizer(
            example["source"],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_len
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            output_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=max_len,
                num_beams=4
            )
        pred_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        pred_edus = clean_edus_list(pred_text.split("|||"))
        aligned_pred = align_pred_to_gold(pred_edus, gold_edus)

        if len(aligned_pred) != len(gold_edus):
            skipped += 1
            continue

        try:
            pred_ranks = [gold_edus.index(e) for e in aligned_pred]
            gold_ranks = list(range(len(gold_edus)))
            rho, _ = spearmanr(gold_ranks, pred_ranks)
            if math.isnan(rho):
                skipped += 1
                continue
        except:
            skipped += 1
            continue

        rel = example.get("dominant_relation", "Unknown")
        relation_scores[rel].append(rho)
        total_scores.append(rho)

        # Global stats update
        global_relation_scores[rel].append(rho)
        global_relation_counts[rel] += 1
        all_rhos.append(rho)

        if verbose:
            print(f"\n🧩 Relation: {rel}")
            print(f"✅ Spearman: {rho:.4f}")
            print("Gold:", gold_edus)
            print("Pred:", aligned_pred)

    print("\n--- Average Spearman’s ρ by Relation Type ---")
    total_n = 0
    weighted_sum = 0.0
    for rel, scores in relation_scores.items():
        mean_rho = sum(scores) / len(scores)
        print(f"{rel:<20} n={len(scores):>3}  |  Mean ρ: {mean_rho:.4f}")
        total_n += len(scores)
        weighted_sum += mean_rho * len(scores)

    if total_n > 0:
        weighted_avg = weighted_sum / total_n
        overall_avg = sum(total_scores) / len(total_scores)
        print(f"\nWeighted average across all relations: {weighted_avg:.4f} (n={total_n})")
        print(f"Overall unweighted mean Spearman’s ρ: {overall_avg:.4f} (n={len(total_scores)})")
    else:
        print("No valid samples.")

    print(f"\n❗ Skipped examples: {skipped}")

# === Global Summary ===
print("\n================ Overall Evaluation Summary ================\n")
print("--- Average Spearman’s ρ by Relation Type (All Folds) ---")
total_weighted_sum = 0.0
total_count = 0
for rel, scores in global_relation_scores.items():
    count = global_relation_counts[rel]
    mean_rho = sum(scores) / len(scores)
    print(f"{rel:<25} n={count:<4} | Mean ρ: {mean_rho:.4f}")
    total_weighted_sum += mean_rho * count
    total_count += count

if total_count > 0:
    weighted_avg_all = total_weighted_sum / total_count
    unweighted_avg_all = sum(all_rhos) / len(all_rhos)
    print(f"\nWeighted average across all relations: {weighted_avg_all:.4f}")
    print(f"Overall unweighted mean Spearman’s ρ: {unweighted_avg_all:.4f}")
else:
    print("No valid overall statistics.")




--- Average Spearman’s ρ by Relation Type ---
Question_answer_pair n= 77  |  Mean ρ: 0.5042
Comment              n= 35  |  Mean ρ: 0.4218
Continuation         n=  5  |  Mean ρ: 0.4698
Acknowledgement      n= 18  |  Mean ρ: 0.3130
Q_Elab               n=  1  |  Mean ρ: -0.7143
Elaboration          n=  3  |  Mean ρ: 0.2307
Result               n=  1  |  Mean ρ: 0.9000
Background           n=  1  |  Mean ρ: 0.6220

Weighted average across all relations: 0.4473 (n=141)
Overall unweighted mean Spearman’s ρ: 0.4473 (n=141)

❗ Skipped examples: 2



--- Average Spearman’s ρ by Relation Type ---
Question_answer_pair n= 82  |  Mean ρ: 0.4801
Comment              n= 34  |  Mean ρ: 0.4192
Result               n=  1  |  Mean ρ: 0.1538
Acknowledgement      n= 13  |  Mean ρ: 0.3453
Q_Elab               n=  1  |  Mean ρ: 0.1773
Contrast             n=  1  |  Mean ρ: 0.2000
Background           n=  1  |  Mean ρ: 0.2286
Explanation          n=  2  |  Mean ρ: 0.6108
Elaboration          n=  2  |  Mea

# Baseline Evaluation

In [None]:
import json
import math
import os
from collections import defaultdict
from scipy.stats import spearmanr

# === Load Test Data ===
def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line.strip()) for line in f]

# === Spearman Helper ===
def rank_order(edus):
    return [edus.index(e) for e in edus]

# === Align Heuristic to Gold ===
def align_heuristic_to_gold(heuristic_edus, gold_edus):
    seen = set()
    aligned = [e for e in heuristic_edus if not (e in seen or seen.add(e))]
    for g in gold_edus:
        if g not in aligned:
            aligned.append(g)
    return aligned[:len(gold_edus)]

# === Evaluation Loop ===
print("\n--- Heuristic Baseline Reordering: Spearman Evaluation (All Folds) ---")
relation_scores = defaultdict(list)
total_scores = []
skipped = 0

for i in range(1, 6):
    print(f"\n========== Evaluating Fold {i} ==========\n")
    test_data = load_jsonl(f"bart_ordering_fold{i}_test.jsonl")

    for idx, example in enumerate(test_data):
        gold_edus = [e.strip() for e in example["target"].split("|||") if e.strip() and e.strip() != "0"]
        shuffled_edus = [e.strip() for e in example["source"].split("|||") if e.strip() and e.strip() != "0"]
        relation = example.get("dominant_relation", "Unknown")

        if len(gold_edus) < 3 or len(shuffled_edus) < 3:
            skipped += 1
            continue

        heuristic_ordered = sorted(shuffled_edus)
        aligned_pred = align_heuristic_to_gold(heuristic_ordered, gold_edus)

        try:
            pred_ranks = [gold_edus.index(e) for e in aligned_pred]
            gold_ranks = list(range(len(gold_edus)))
            rho, _ = spearmanr(gold_ranks, pred_ranks)
        except Exception:
            skipped += 1
            continue

        if math.isnan(rho):
            skipped += 1
            continue

        relation_scores[relation].append(rho)
        total_scores.append(rho)

# === Report Summary by Relation ===
print("\n--- Average Spearman’s ρ by Relation Type (All Folds) ---")
total_n = 0
weighted_sum = 0.0
for rel, scores in relation_scores.items():
    mean_rho = sum(scores) / len(scores)
    print(f"{rel:<25} n={len(scores):>3}  |  Mean ρ: {mean_rho:.4f}")
    total_n += len(scores)
    weighted_sum += mean_rho * len(scores)

# === Final Averages ===
if total_n > 0:
    weighted_avg = weighted_sum / total_n
    overall_avg = sum(total_scores) / len(total_scores)
    print(f"\nWeighted average across all relations: {weighted_avg:.4f} (n={total_n})")
    print(f"Overall unweighted mean Spearman’s ρ: {overall_avg:.4f} (n={len(total_scores)})")
else:
    print("\n No valid Spearman scores computed.")

print(f"\nTotal evaluated: {len(total_scores)}, Skipped: {skipped}")


--- Heuristic Baseline Reordering: Spearman Evaluation (All Folds) ---











--- Average Spearman’s ρ by Relation Type (All Folds) ---
Question_answer_pair      n=428  |  Mean ρ: 0.1808
Comment                   n=162  |  Mean ρ: 0.0564
Continuation              n= 28  |  Mean ρ: -0.0537
Acknowledgement           n= 62  |  Mean ρ: 0.0829
Q_Elab                    n=  4  |  Mean ρ: -0.0601
Elaboration               n= 12  |  Mean ρ: -0.1718
Result                    n=  4  |  Mean ρ: -0.0599
Background                n=  2  |  Mean ρ: 0.1362
Contrast                  n=  3  |  Mean ρ: 0.1354
Explanation               n=  4  |  Mean ρ: 0.3249
Correction                n=  1  |  Mean ρ: -0.0357
Clarification_question    n=  1  |  Mean ρ: 0.1033
Parallel                  n=  1  |  Mean ρ: -0.0286

Weighted average across all relations: 0.1259 (n=712)
Overall unweighted mean Spearman’s ρ: 0.1259 (n=712)

Total evaluated: 712, Skipped: 3


# RoBERTa Sentence Ordering Pre

In [None]:
import json
import random
import pandas as pd
from collections import Counter
from google.colab import files

# === Load and clean the dataset ===
df = pd.read_csv("incoming_base_spect.csv")
df = df[df['relation_type'].notna() & (df['relation_type'] != "0")]

# === Group by dialogue ===
grouped = df.groupby(['doc', 'dialogue_num'])

examples = []

# === Process each dialogue ===
for (doc_id, dlg_num), group in grouped:
    edus = [e.strip() for e in group['text'].dropna().tolist() if e.strip()]
    relations = [r.strip() for r in group['relation_type'].dropna().tolist() if r.strip()]

    if not (3 <= len(edus) <= 10):
        continue

    dialogue_id = f"{doc_id}_{dlg_num}"
    rel_counter = Counter(relations)
    dominant_rel = rel_counter.most_common(1)[0][0]

    # === Positive example ===
    gold_sequence = " [EDU_SEP] ".join(edus)
    examples.append({
        "dialogue_id": dialogue_id,
        "input": gold_sequence,
        "label": 1,
        "relation_type": dominant_rel
    })

    # === Generate 3 distinct negative samples ===
    seen = {tuple(edus)}
    for _ in range(3):
        shuffled = edus.copy()
        for _ in range(10):
            random.shuffle(shuffled)
            if tuple(shuffled) not in seen:
                break
        seen.add(tuple(shuffled))

        examples.append({
            "dialogue_id": dialogue_id,
            "input": " [EDU_SEP] ".join(shuffled),
            "label": 0,
            "relation_type": dominant_rel
        })

# === Save full dataset ===
with open("roberta_ordering.jsonl", "w", encoding="utf-8") as f:
    for item in examples:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

files.download("roberta_ordering.jsonl")
print(f"Saved {len(examples)} examples to roberta_ordering.jsonl")

# === Split into Train/Dev/Test ===
def split_json_data(data, output_prefix, seed=42, train_ratio=0.8, dev_ratio=0.1):
    random.seed(seed)
    random.shuffle(data)

    total = len(data)
    train_end = int(train_ratio * total)
    dev_end = train_end + int(dev_ratio * total)

    splits = {
        "train": data[:train_end],
        "dev": data[train_end:dev_end],
        "test": data[dev_end:]
    }

    for split_name, split_data in splits.items():
        path = f"{output_prefix}_{split_name}.jsonl"
        with open(path, "w", encoding="utf-8") as f:
            for item in split_data:
                json.dump(item, f, ensure_ascii=False)
                f.write("\n")
        print(f"📄 Saved {len(split_data)} → {path}")
        files.download(path)

# Run the split
split_json_data(examples, "roberta_ordering")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Saved 2300 examples to roberta_ordering.jsonl
📄 Saved 1840 → roberta_ordering_train.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📄 Saved 230 → roberta_ordering_dev.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📄 Saved 230 → roberta_ordering_test.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# RoBERTa Sentence Ordering Fine-tuning

In [None]:
from collections import Counter
print("Label distribution (train):", Counter([x['label'] for x in train_data]))
print("Label distribution (dev):", Counter([x['label'] for x in dev_data]))

Label distribution (train): Counter({0: 1398, 1: 442})
Label distribution (dev): Counter({0: 150, 1: 80})


In [None]:
import json
import torch
import numpy as np
from torch.utils.data import Dataset
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter

# === Load JSONL files ===
def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line.strip()) for line in f]

train_data = load_jsonl("roberta_ordering_train.jsonl")
dev_data = load_jsonl("roberta_ordering_dev.jsonl")

# === Display label distribution ===
print("Label distribution (train):", Counter([ex["label"] for ex in train_data]))
print("Label distribution (dev):", Counter([ex["label"] for ex in dev_data]))

# === Add [EDU_SEP] token ===
special_token = "[EDU_SEP]"
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
if special_token not in tokenizer.get_vocab():
    tokenizer.add_tokens([special_token])

# === Dataset class ===
class SentenceOrderingDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item["input"]
        label = item["label"]
        encoding = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        encoding["labels"] = torch.tensor(label)
        return encoding

# === Load datasets ===
train_dataset = SentenceOrderingDataset(train_data, tokenizer)
dev_dataset = SentenceOrderingDataset(dev_data, tokenizer)

# === Compute class weights ===
y = np.array([ex["label"] for ex in train_data])
weights = compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
class_weights = torch.tensor(weights, dtype=torch.float).to("cuda" if torch.cuda.is_available() else "cpu")

# === Load model and resize embeddings ===
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.resize_token_embeddings(len(tokenizer))

# === Custom trainer with weighted loss ===
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# === Metrics ===
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# === Training arguments ===
training_args = TrainingArguments(
    output_dir="roberta_ordering_model",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=1,
    seed=42,
    report_to="none"
)

# === Trainer setup ===
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# === Train ===
print("🟢 Using device:", training_args.device)
trainer.train()

# === Save model ===
model.save_pretrained("roberta_ordering_model")
tokenizer.save_pretrained("roberta_ordering_model")

Label distribution (train): Counter({0: 1398, 1: 442})
Label distribution (dev): Counter({0: 150, 1: 80})


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


🟢 Using device: cuda:0


TypeError: WeightedTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'

# RoBERTa Sentence Ordering Evaluation

In [None]:
# === Install & Import Required Libraries ===
!pip install transformers --quiet

import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from collections import defaultdict

# === Load Tokenizer and Model ===
model_dir = "roberta_pairwise_ordering_model"
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)
model.eval()
model.cuda()

# === Helper: Load JSONL ===
def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line.strip()) for line in f]

# === Dataset Class ===
class NSPDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encoding = tokenizer(
            item["input"],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "label": item["label"],
            "relation_type": item["relation_type"],
            "input": item["input"]
        }

# === Load and Prepare Test Set ===
test_data = load_jsonl("roberta_ordering_pairs_test.jsonl")
test_dataset = NSPDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=32)

# === Run Evaluation ===
all_preds, all_labels = [], []
rel_groups = defaultdict(lambda: {"preds": [], "labels": [], "examples": []})

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].cuda()
        attention_mask = batch["attention_mask"].cuda()
        labels = batch["label"]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1).cpu().tolist()

        all_preds.extend(preds)
        all_labels.extend(labels)

        for i in range(len(preds)):
            rel = batch["relation_type"][i]
            rel_groups[rel]["preds"].append(preds[i])
            rel_groups[rel]["labels"].append(labels[i])
            rel_groups[rel]["examples"].append((batch["input"][i], labels[i], preds[i]))

# === Report Metrics ===
acc = accuracy_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"\n=== Overall Results ===")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")

print(f"\n--- Results by Relation Type ---")
for rel, group in rel_groups.items():
    acc = accuracy_score(group["labels"], group["preds"])
    f1 = f1_score(group["labels"], group["preds"])
    print(f"{rel:<20} | n={len(group['labels']):<4} | Acc: {acc:.3f} | F1: {f1:.3f}")

# === Sample Qualitative Output ===
print("\n✅ Sample Correct Predictions:")
shown = 0
for rel in rel_groups:
    for inp, label, pred in rel_groups[rel]["examples"]:
        if label == pred:
            print(f"[{rel}] ✅ {inp}")
            shown += 1
        if shown >= 5: break
    if shown >= 5: break

print("\n❌ Sample Incorrect Predictions:")
shown = 0
for rel in rel_groups:
    for inp, label, pred in rel_groups[rel]["examples"]:
        if label != pred:
            print(f"[{rel}] ❌ Pred: {pred}, Gold: {label} | {inp}")
            shown += 1
        if shown >= 5: break
    if shown >= 5: break


=== Overall Results ===
Accuracy: 0.8233
F1 Score: 0.8302

--- Results by Relation Type ---
Question_answer_pair | n=650  | Acc: 0.817 | F1: 0.819
Acknowledgement      | n=139  | Acc: 0.727 | F1: 0.729
Clarification_question | n=58   | Acc: 0.793 | F1: 0.829
Comment              | n=336  | Acc: 0.753 | F1: 0.771
Continuation         | n=147  | Acc: 0.844 | F1: 0.835
Q_Elab               | n=135  | Acc: 0.837 | F1: 0.849
Result               | n=55   | Acc: 0.800 | F1: 0.814
Parallel             | n=20   | Acc: 0.800 | F1: 0.778
Elaboration          | n=238  | Acc: 0.924 | F1: 0.925
Explanation          | n=65   | Acc: 0.831 | F1: 0.841
Contrast             | n=69   | Acc: 0.855 | F1: 0.865
Alternation          | n=27   | Acc: 1.000 | F1: 1.000
Correction           | n=16   | Acc: 0.938 | F1: 0.947
Narration            | n=6    | Acc: 1.000 | F1: 1.000
Conditional          | n=30   | Acc: 0.967 | F1: 0.973
Background           | n=7    | Acc: 1.000 | F1: 1.000

✅ Sample Correct Predict

# RoBERTa Baseline Sentence Ordering Evaluation

In [None]:
import json
import math
import torch
import itertools
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from collections import defaultdict
from difflib import SequenceMatcher
from scipy.stats import spearmanr

# === Load test data ===
with open("roberta_ordering_test.jsonl", "r", encoding="utf-8") as f:
    test_data = [json.loads(line.strip()) for line in f]

# === Load baseline model and tokenizer ===
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)  # explicitly set 2 labels
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# === Helper functions ===
def clean_edus(text):
    return [e.strip() for e in text.split("|||") if e.strip() and e.strip() != "0"]

def align_pred_to_gold(pred_edus, gold_edus):
    aligned = []
    used = set()
    for pred in pred_edus:
        best_match, best_score = None, 0.0
        for gold in gold_edus:
            if gold in used:
                continue
            score = SequenceMatcher(None, pred, gold).ratio()
            if score > best_score:
                best_match = gold
                best_score = score
        if best_match:
            aligned.append(best_match)
            used.add(best_match)
    for g in gold_edus:
        if g not in aligned:
            aligned.append(g)
    return aligned[:len(gold_edus)]

def score_sequence(source_edus, candidate_edus):
    source_str = " </s> ".join(source_edus)
    candidate_str = " ||| ".join(candidate_edus)
    inputs = tokenizer(source_str, candidate_str, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        return outputs.logits[:, 1].item()  # take the logit for class 1

# === Evaluation ===
relation_scores = defaultdict(list)
total_scores = []

print("\n--- RoBERTa Sentence Ordering (Baseline): Evaluation ---")

for example in test_data:
    source_edus = clean_edus(example["source"].replace("</s>", "|||"))
    gold_edus = clean_edus(example["target"])
    rel = example.get("dominant_relation", "Unknown")

    if len(source_edus) != len(gold_edus) or len(source_edus) > 6:
        continue

    best_score = float("-inf")
    best_perm = None
    for perm in itertools.permutations(source_edus):
        score = score_sequence(source_edus, perm)
        if score > best_score:
            best_score = score
            best_perm = perm

    if best_perm is None:
        continue

    aligned_pred = align_pred_to_gold(best_perm, gold_edus)
    if len(aligned_pred) != len(gold_edus):
        continue

    try:
        pred_ranks = [gold_edus.index(e) for e in aligned_pred]
        gold_ranks = list(range(len(gold_edus)))
        rho, _ = spearmanr(gold_ranks, pred_ranks)
    except:
        continue

    if math.isnan(rho):
        continue

    relation_scores[rel].append(rho)
    total_scores.append(rho)

    print(f"\n🧩 Relation: {rel}")
    print(f"✅ Spearman: {rho:.4f}")
    print("Gold:", gold_edus)
    print("Pred:", aligned_pred)

# === Summary ===
print("\n--- Average Spearman’s ρ by Relation Type ---")
total_n = 0
weighted_sum = 0.0
for rel, scores in relation_scores.items():
    mean_rho = sum(scores) / len(scores)
    print(f"{rel:<20} n={len(scores):>3}  |  Mean ρ: {mean_rho:.4f}")
    total_n += len(scores)
    weighted_sum += mean_rho * len(scores)

if total_n > 0:
    weighted_avg = weighted_sum / total_n
    print(f"\nWeighted average across all relations: {weighted_avg:.4f} (n={total_n})")
    print(f"Overall unweighted mean Spearman’s ρ: {sum(total_scores) / len(total_scores):.4f} (n={len(total_scores)})")
else:
    print("❌ No valid samples evaluated.")




Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- RoBERTa Sentence Ordering (Baseline): Evaluation ---

🧩 Relation: Comment
✅ Spearman: 0.4000
Gold: ['enjoy my sheep', 'i intend to', 'blast', 'revenge']
Pred: ['blast', 'enjoy my sheep', 'i intend to', 'revenge']

🧩 Relation: Question_answer_pair
✅ Spearman: -0.0857
Gold: ['?', 'usual offer', 'sorry', ':)', 'I may, in my turn...', "i'm not needing sheep right now"]
Pred: [':)', 'sorry', 'usual offer', 'I may, in my turn...', "i'm not needing sheep right now", '?']

🧩 Relation: Question_answer_pair
✅ Spearman: 0.2000
Gold: ['yep', 'nope', 'Sorry,', 'wrong player']
Pred: ['yep', 'wrong player', 'Sorry,', 'nope']

🧩 Relation: Comment
✅ Spearman: 0.0000
Gold: ['rough game...', 'lol', 'none to give,', 'alas']
Pred: ['lol', 'alas', 'rough game...', 'none to give,']

🧩 Relation: Comment
✅ Spearman: 0.7000
Gold: ['clayless :/', 'afraid not, not seen 5 in a while', 'ironic', 'heh', 'indeed']
Pred: ['clayless :/', 'afraid not, not seen 5 in a while', 'indeed', 'ironic', 'heh']

🧩 Relation: 

# BART NSP Pre

In [None]:
import pandas as pd
import json
import random
from collections import defaultdict
from google.colab import files

# === Load and clean the dataset ===
df = pd.read_csv("incoming_base_spect.csv")

# Drop rows with missing or invalid values
df = df[df["text"].notna() & df["relation_type"].notna() & df["incoming_relation_id"].notna()]
df = df[
    (df["text"].astype(str).str.strip() != "0") &
    (df["relation_type"].astype(str).str.strip() != "0")
]

# === Group EDUs by dialogue ===
dialogues = defaultdict(list)
for _, row in df.iterrows():
    dlg_id = row["dialogue_num"] if "dialogue_num" in df.columns else row["doc"]
    dialogues[dlg_id].append({
        "text": row["text"].strip(),
        "relation": row["relation_type"].strip()
    })

# === Shuffle and split dialogues ===
dialogue_ids = list(dialogues.keys())
random.seed(42)
random.shuffle(dialogue_ids)
n = len(dialogue_ids)

split_map = {
    "train": dialogue_ids[:int(0.8 * n)],
    "dev": dialogue_ids[int(0.8 * n):int(0.9 * n)],
    "test": dialogue_ids[int(0.9 * n):]
}

# === Create NSP-style pairs ===
for split, dlg_ids in split_map.items():
    examples = []
    for dlg_id in dlg_ids:
        edus = dialogues[dlg_id]
        if len(edus) < 3:
            continue  # skip short dialogues

        for i in range(len(edus) - 1):
            context = " <EDU_SEP> ".join(e["text"] for e in edus[:i+1])
            target = edus[i + 1]["text"]
            rel = edus[i + 1]["relation"]

            # Positive example
            examples.append({
                "input": context + " <SEP> " + target,
                "label": 1,
                "relation": rel
            })

            # Negative example: random target from another dialogue
            neg_dlg_id = random.choice([d for d in dialogue_ids if d != dlg_id])
            neg_edus = dialogues[neg_dlg_id]
            neg_target = random.choice(neg_edus)["text"]
            examples.append({
                "input": context + " <SEP> " + neg_target,
                "label": 0,
                "relation": rel
            })

    # Save to file
    out_file = f"bart_nsp_{split}.jsonl"
    with open(out_file, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(ex) + "\n")
    print(f"✅ Saved {len(examples)} examples to {out_file}")
    files.download(out_file)

✅ Saved 15896 examples to bart_nsp_train.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Saved 1890 examples to bart_nsp_dev.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Saved 1776 examples to bart_nsp_test.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# BART NSP Fine-tuning

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
import os

# === Parameters ===
train_file = "bart_nsp_train.jsonl"
dev_file = "bart_nsp_dev.jsonl"
save_dir = "bart_nsp_model"
num_epochs = 3
batch_size = 4
lr = 2e-5
max_length = 512
gradient_accumulation_steps = 4

# === Custom Tokens ===
special_tokens = {"additional_special_tokens": ["[CTX_SEP]", "[EDU_SEP]"]}

# === Dataset ===
class BARTNSPDataset(Dataset):
    def __init__(self, path, tokenizer):
        self.data = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                item = json.loads(line)
                self.data.append(item)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encodings = self.tokenizer(
            item["input"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": torch.tensor(item["label"], dtype=torch.long)
        }

# === Tokenizer and Model ===
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
tokenizer.add_special_tokens(special_tokens)

model = BartForSequenceClassification.from_pretrained("facebook/bart-base", num_labels=2)
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# === DataLoaders ===
train_dataset = BARTNSPDataset(train_file, tokenizer)
dev_dataset = BARTNSPDataset(dev_file, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=lr)
best_dev_loss = float("inf")

# === Training Loop ===
for epoch in range(1, num_epochs + 1):
    model.train()
    train_loss = 0.0

    for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch}")):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps
        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f"✅ Epoch {epoch}: Training Loss = {avg_train_loss:.4f}")

    # === Evaluation ===
    model.eval()
    dev_loss = 0.0
    with torch.no_grad():
        for batch in dev_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            dev_loss += outputs.loss.item()

    avg_dev_loss = dev_loss / len(dev_loader)
    print(f"🧪 Epoch {epoch}: Dev Loss = {avg_dev_loss:.4f}")

    if avg_dev_loss < best_dev_loss:
        best_dev_loss = avg_dev_loss
        os.makedirs(save_dir, exist_ok=True)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"💾 Best model saved (Epoch {epoch})")

print("✅ Training complete.")

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Epoch 1: 100%|██████████| 3974/3974 [28:49<00:00,  2.30it/s]


✅ Epoch 1: Training Loss = 0.1674
🧪 Epoch 1: Dev Loss = 0.6541


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


💾 Best model saved (Epoch 1)


Epoch 2: 100%|██████████| 3974/3974 [28:58<00:00,  2.29it/s]


✅ Epoch 2: Training Loss = 0.1567
🧪 Epoch 2: Dev Loss = 0.6315
💾 Best model saved (Epoch 2)


Epoch 3: 100%|██████████| 3974/3974 [28:55<00:00,  2.29it/s]


✅ Epoch 3: Training Loss = 0.1448
🧪 Epoch 3: Dev Loss = 0.6094
💾 Best model saved (Epoch 3)
✅ Training complete.


# BART NSP Evaluation (Fine-tuned)

In [None]:
import json
from collections import defaultdict
from sklearn.metrics import accuracy_score, f1_score

# === Evaluation by relation type ===
model.eval()
all_preds = []
all_labels = []
rel_wise_data = defaultdict(lambda: {"labels": [], "preds": [], "examples": []})

with torch.no_grad():
    for batch in dev_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)

        # Add predictions and labels
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch["labels"].cpu().tolist())

# === Load relation types and input strings from dev set ===
with open(dev_file, "r", encoding="utf-8") as f:
    dev_data = [json.loads(line) for line in f]

for i, item in enumerate(dev_data):
    rel = item["relation"]
    label = all_labels[i]
    pred = all_preds[i]
    rel_wise_data[rel]["labels"].append(label)
    rel_wise_data[rel]["preds"].append(pred)
    rel_wise_data[rel]["examples"].append({
        "input": item["input"],
        "gold": label,
        "pred": pred
    })

# === Report by relation type ===
print("\n--- Evaluation by Discourse Relation Type ---")
for rel, vals in rel_wise_data.items():
    if len(set(vals["labels"])) > 1:
        acc = accuracy_score(vals["labels"], vals["preds"])
        f1 = f1_score(vals["labels"], vals["preds"])
        print(f"{rel:<25} | Accuracy: {acc:.3f} | F1: {f1:.3f}")
    else:
        print(f"{rel:<25} | Only one class in ground truth — skipping F1.")

# === Print examples ===
print("\n--- All Examples ---")
for rel, vals in rel_wise_data.items():
    print(f"\nRelation: {rel}")
    for ex in vals["examples"]:
        print(f"Input: {ex['input']}")
        print(f"Gold: {ex['gold']} | Pred: {ex['pred']}\n")

# === Overall scores ===
overall_acc = accuracy_score(all_labels, all_preds)
overall_f1 = f1_score(all_labels, all_preds)
print(f"\n✅ Overall Accuracy: {overall_acc:.4f}")
print(f"✅ Overall F1 Score: {overall_f1:.4f}")


--- Evaluation by Discourse Relation Type ---
Question_answer_pair      | Accuracy: 0.661 | F1: 0.631
Q_Elab                    | Accuracy: 0.681 | F1: 0.625
Elaboration               | Accuracy: 0.509 | F1: 0.424
Comment                   | Accuracy: 0.605 | F1: 0.544
Acknowledgement           | Accuracy: 0.724 | F1: 0.689
Parallel                  | Accuracy: 0.575 | F1: 0.605
Continuation              | Accuracy: 0.691 | F1: 0.636
Clarification_question    | Accuracy: 0.675 | F1: 0.618
Contrast                  | Accuracy: 0.603 | F1: 0.526
Result                    | Accuracy: 0.722 | F1: 0.727
Narration                 | Accuracy: 0.600 | F1: 0.556
Conditional               | Accuracy: 0.611 | F1: 0.588
Background                | Accuracy: 0.250 | F1: 0.000
Explanation               | Accuracy: 0.644 | F1: 0.579
Correction                | Accuracy: 0.548 | F1: 0.424
Alternation               | Accuracy: 0.500 | F1: 0.471

--- All Examples ---

Relation: Question_answer_pair
Inp

# BART NSP Baseline Evaluation

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from collections import defaultdict, Counter

# === Parameters ===
test_file = "bart_nsp_test.jsonl"
batch_size = 4
max_length = 512

# === Special tokens ===
special_tokens = {"additional_special_tokens": ["[CTX_SEP]", "[EDU_SEP]"]}

# === Dataset Class ===
class BARTNSPDataset(Dataset):
    def __init__(self, path, tokenizer):
        self.data = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                item = json.loads(line)
                self.data.append(item)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        encodings = self.tokenizer(
            item["input"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": encodings["input_ids"].squeeze(),
            "attention_mask": encodings["attention_mask"].squeeze(),
            "labels": torch.tensor(item["label"], dtype=torch.long),
            "relation": item["relation"],
            "input_text": item["input"]
        }

# === Load Tokenizer and Model ===
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
tokenizer.add_special_tokens(special_tokens)

model = BartForSequenceClassification.from_pretrained("facebook/bart-base", num_labels=2)
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# === Load Dataset ===
test_dataset = BARTNSPDataset(test_file, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# === Evaluation ===
all_preds = []
all_labels = []
all_rels = []
all_inputs = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
        all_rels.extend(batch["relation"])
        all_inputs.extend(batch["input_text"])

# === Class distribution check
print("✅ Prediction Distribution:", Counter(all_preds))
print("✅ Gold Label Distribution:", Counter(all_labels))

# === Evaluation by relation type ===
rel_wise = defaultdict(lambda: {"preds": [], "labels": [], "examples": []})
for inp, pred, label, rel in zip(all_inputs, all_preds, all_labels, all_rels):
    rel_wise[rel]["preds"].append(pred)
    rel_wise[rel]["labels"].append(label)
    rel_wise[rel]["examples"].append((inp, label, pred))

# === Print all examples first
print("\n--- All Evaluation Examples ---")
for rel, vals in rel_wise.items():
    print(f"\nRelation: {rel}")
    for inp, label, pred in vals["examples"]:
        print(f"Input: {inp}")
        print(f"Gold: {label} | Pred: {pred}\n")

# === Relation-wise scores before overall
print("\n--- Baseline Evaluation by Discourse Relation Type ---")
for rel, vals in rel_wise.items():
    acc = accuracy_score(vals["labels"], vals["preds"])
    f1 = f1_score(vals["labels"], vals["preds"], pos_label=1)
    print(f"{rel:<25} | Accuracy: {acc:.3f} | F1: {f1:.3f}")

# === Overall scores at the very bottom
overall_acc = accuracy_score(all_labels, all_preds)
overall_f1 = f1_score(all_labels, all_preds, pos_label=1)
print(f"\n✅ Baseline Overall Accuracy: {overall_acc:.4f}")
print(f"✅ Baseline Overall F1 Score: {overall_f1:.4f}")

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Prediction Distribution: Counter({0: 1319, 1: 457})
✅ Gold Label Distribution: Counter({1: 888, 0: 888})

--- All Evaluation Examples ---

Relation: Continuation
Input: no thanks <SEP> sheep?
Gold: 1 | Pred: 0

Input: no thanks <SEP> anyone need ore?
Gold: 0 | Pred: 0

Input: no thanks <EDU_SEP> sheep? <EDU_SEP> yes, <EDU_SEP> i want sheep:) <EDU_SEP> me too <EDU_SEP> me too <EDU_SEP> i want them too <EDU_SEP> lol <EDU_SEP> but i am trading... <EDU_SEP> robber <EDU_SEP> why put it on sheep? <EDU_SEP> it's literally at random where i put it <EDU_SEP> sorry <SEP> its no good!
Gold: 1 | Pred: 0

Input: no thanks <EDU_SEP> sheep? <EDU_SEP> yes, <EDU_SEP> i want sheep:) <EDU_SEP> me too <EDU_SEP> me too <EDU_SEP> i want them too <EDU_SEP> lol <EDU_SEP> but i am trading... <EDU_SEP> robber <EDU_SEP> why put it on sheep? <EDU_SEP> it's literally at random where i put it <EDU_SEP> sorry <SEP> Trade you an ore for a clay?
Gold: 0 | Pred: 0

Input: thanks!!!! <EDU_SEP> damn! <EDU_SEP> Nice one

# RoBERTa NSP Pre

In [None]:
import json
import random
import pandas as pd
from collections import Counter
from google.colab import files

# === Load and clean the dataset ===
df = pd.read_csv("incoming_base_spect.csv")
df = df[df['relation_type'].notna() & (df['relation_type'] != "0")]

# === Group by dialogue ===
grouped = df.groupby(['doc', 'dialogue_num'])

examples = []

# === Process each dialogue ===
for (doc_id, dlg_num), group in grouped:
    edus = group['text'].dropna().tolist()
    relations = group['relation_type'].dropna().tolist()

    if not (2 <= len(edus) <= 10):
        continue

    dialogue_id = f"{doc_id}_{dlg_num}"
    num_edus = len(edus)

    for i in range(2, min(num_edus, 9)):  # i is the index of the target EDU
        context = edus[:i]
        target = edus[i]
        rel_type = relations[i - 1]  # relation between last context EDU and target

        context_str = " [EDU_SEP] ".join(context)

        # === Positive example ===
        examples.append({
            "dialogue_id": dialogue_id,
            "input": context_str + " [CTX_SEP] " + target,
            "label": 1,
            "relation_type": rel_type
        })

        # === Negative example ===
        neg_candidates = [j for j in range(num_edus) if j != i and abs(j - i) > 1]
        if not neg_candidates:
            continue

        j = random.choice(neg_candidates)
        negative_target = edus[j]

        examples.append({
            "dialogue_id": dialogue_id,
            "input": context_str + " [CTX_SEP] " + negative_target,
            "label": 0,
            "relation_type": rel_type  # ✅ assign same relation type
        })

# === Save all NSP examples ===
full_path = "roberta_nsp.jsonl"
with open(full_path, "w", encoding="utf-8") as f:
    for item in examples:
        json.dump(item, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Saved {len(examples)} NSP examples → {full_path}")
files.download(full_path)

# === Split into Train/Dev/Test ===
def split_json_data(data, output_prefix, seed=42, train_ratio=0.8, dev_ratio=0.1):
    random.seed(seed)
    random.shuffle(data)

    total = len(data)
    train_end = int(train_ratio * total)
    dev_end = train_end + int(dev_ratio * total)

    splits = {
        "train": data[:train_end],
        "dev": data[train_end:dev_end],
        "test": data[dev_end:]
    }

    for split_name, split_data in splits.items():
        path = f"{output_prefix}_{split_name}.jsonl"
        with open(path, "w", encoding="utf-8") as f:
            for item in split_data:
                json.dump(item, f, ensure_ascii=False)
                f.write("\n")
        print(f"📄 Saved {len(split_data)} → {path}")
        files.download(path)

# Run the split
split_json_data(examples, "roberta_nsp")

✅ Saved 4520 NSP examples → roberta_nsp.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📄 Saved 3616 → roberta_nsp_train.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📄 Saved 452 → roberta_nsp_dev.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📄 Saved 452 → roberta_nsp_test.jsonl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# RoBERTa NSP Fine-tuning

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm

# === Parameters ===
train_file = "roberta_nsp_train.jsonl"
dev_file = "roberta_nsp_dev.jsonl"
batch_size = 8
lr = 2e-5
epochs = 3
max_length = 512

# === Load tokenizer and model ===
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer.add_special_tokens({"additional_special_tokens": ["[CTX_SEP]", "[EDU_SEP]"]})

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.resize_token_embeddings(len(tokenizer))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# === Dataset class ===
class RobertaNSPDataset(Dataset):
    def __init__(self, path, tokenizer):
        with open(path, "r", encoding="utf-8") as f:
            self.data = [json.loads(line) for line in f]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        enc = self.tokenizer(
            item["input"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(item["label"], dtype=torch.long)
        }

# === Load data ===
train_dataset = RobertaNSPDataset(train_file, tokenizer)
dev_dataset = RobertaNSPDataset(dev_file, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

# === Optimizer and loss ===
optimizer = AdamW(model.parameters(), lr=lr)

best_dev_loss = float("inf")
for epoch in range(1, epochs + 1):
    # === Training ===
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    print(f"✅ Epoch {epoch}: Training Loss = {avg_train_loss:.4f}")

    # === Validation ===
    model.eval()
    dev_loss = 0
    with torch.no_grad():
        for batch in dev_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            dev_loss += outputs.loss.item()
    avg_dev_loss = dev_loss / len(dev_loader)
    print(f"🧪 Epoch {epoch}: Dev Loss = {avg_dev_loss:.4f}")

    # === Save best model ===
    if avg_dev_loss < best_dev_loss:
        best_dev_loss = avg_dev_loss
        torch.save(model.state_dict(), "roberta_nsp_best.pt")
        print(f"💾 Best model saved (Epoch {epoch})")

print("✅ Training complete.")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: 'roberta_nsp_train.jsonl'

# RoBERTa NSP Evaluation (FIne-tuned)

In [None]:
import json
import torch
from collections import defaultdict, Counter
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score

# === Parameters ===
test_file = "roberta_nsp_test.jsonl"
batch_size = 8
max_length = 512

# === Load tokenizer and model ===
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer.add_special_tokens({"additional_special_tokens": ["[CTX_SEP]", "[EDU_SEP]"]})

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.resize_token_embeddings(len(tokenizer))
model.load_state_dict(torch.load("roberta_nsp_best.pt", map_location="cpu"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# === Dataset class ===
class RobertaNSPTestDataset(Dataset):
    def __init__(self, path, tokenizer):
        with open(path, "r", encoding="utf-8") as f:
            self.data = [json.loads(line) for line in f]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        enc = self.tokenizer(
            item["input"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "label": torch.tensor(item["label"], dtype=torch.long),
            "input": item["input"],
            "relation_type": item.get("relation_type", "Unknown")
        }

# === Load data ===
test_dataset = RobertaNSPTestDataset(test_file, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# === Evaluate ===
all_preds, all_labels, all_inputs, all_rels = [], [], [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
        all_inputs.extend(batch["input"])
        all_rels.extend(batch["relation_type"])

# === Organize results ===
rel_wise = defaultdict(lambda: {"preds": [], "labels": [], "examples": []})
for inp, pred, label, rel in zip(all_inputs, all_preds, all_labels, all_rels):
    rel_wise[rel]["preds"].append(pred)
    rel_wise[rel]["labels"].append(label)
    rel_wise[rel]["examples"].append((inp, label, pred))

# === Print a few examples per relation ===
print("\n--- Sample Evaluation Examples ---")
for rel, vals in rel_wise.items():
    print(f"\nRelation: {rel}")
    for i, (inp, gold, pred) in enumerate(vals["examples"][:5]):
        print(f"Input: {inp}")
        print(f"Gold: {gold} | Pred: {pred}\n")

# === Metrics by relation ===
print("\n--- Evaluation by Discourse Relation Type ---")
for rel, vals in rel_wise.items():
    labels = vals["labels"]
    preds = vals["preds"]
    if len(set(labels)) < 2:
        print(f"{rel:<25} | Only one class in gold — skipping F1.")
    else:
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds)
        print(f"{rel:<25} | Accuracy: {acc:.3f} | F1: {f1:.3f}")

# === Overall metrics at the bottom ===
print(f"\n✅ Prediction Distribution: {Counter(all_preds)}")
print(f"✅ Gold Label Distribution: {Counter(all_labels)}")
print(f"\n✅ Overall Accuracy: {accuracy_score(all_labels, all_preds):.4f}")
print(f"✅ Overall F1 Score: {f1_score(all_labels, all_preds):.4f}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: 'roberta_nsp_best.pt'

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score
from collections import defaultdict, Counter

# === Parameters ===
test_file = "roberta_nsp_test.jsonl"
batch_size = 8
max_length = 512

# === Load tokenizer and model ===
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer.add_special_tokens({"additional_special_tokens": ["[CTX_SEP]", "[EDU_SEP]"]})

model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
model.resize_token_embeddings(len(tokenizer))
model.load_state_dict(torch.load("roberta_nsp_best.pt", map_location="cpu"))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# === Dataset class ===
class RobertaNSPDataset(Dataset):
    def __init__(self, path, tokenizer):
        with open(path, "r", encoding="utf-8") as f:
            self.data = [json.loads(line) for line in f]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        enc = self.tokenizer(
            item["input"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )
        return {
            "input_ids": enc["input_ids"].squeeze(),
            "attention_mask": enc["attention_mask"].squeeze(),
            "labels": torch.tensor(item["label"], dtype=torch.long),
            "relation": item["relation_type"],
            "input": item["input"]
        }

# === Load test data ===
test_dataset = RobertaNSPDataset(test_file, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# === Evaluation ===
all_preds, all_labels, all_inputs, all_rels = [], [], [], []
rel_wise = defaultdict(lambda: {"preds": [], "labels": [], "examples": []})

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        for inp, rel, label, pred in zip(batch["input"], batch["relation"], labels.cpu(), preds.cpu()):
            all_inputs.append(inp)
            all_rels.append(rel)
            all_labels.append(label.item())
            all_preds.append(pred.item())
            rel_wise[rel]["labels"].append(label.item())
            rel_wise[rel]["preds"].append(pred.item())
            rel_wise[rel]["examples"].append((inp, label.item(), pred.item()))

# === Print all examples ===
print("\n--- All Evaluation Examples ---")
for rel, vals in rel_wise.items():
    print(f"\nRelation: {rel}")
    for inp, gold, pred in vals["examples"]:
        print(f"Input: {inp}")
        print(f"Gold: {gold} | Pred: {pred}\n")

# === Evaluation by relation type ===
print("\n--- Evaluation by Discourse Relation Type ---")
for rel, vals in rel_wise.items():
    labels = vals["labels"]
    preds = vals["preds"]
    if len(set(labels)) > 1:
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds)
        print(f"{rel:<25} | n={len(labels):<3} | Accuracy: {acc:.3f} | F1: {f1:.3f}")
    else:
        print(f"{rel:<25} | n={len(labels):<3} | Only one class in gold — skipping F1.")

# === Overall scores ===
print(f"\n Prediction Distribution: {Counter(all_preds)}")
print(f"Gold Label Distribution: {Counter(all_labels)}")
overall_acc = accuracy_score(all_labels, all_preds)
overall_f1 = f1_score(all_labels, all_preds)
print(f"\n Overall Accuracy: {overall_acc:.4f}")
print(f"Overall F1 Score: {overall_f1:.4f}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


FileNotFoundError: [Errno 2] No such file or directory: 'roberta_nsp_best.pt'

# RoBERTa NSP Baseline Evaluation

In [None]:
import json
import random
from collections import defaultdict, Counter
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# === Load test data ===
test_file = "roberta_nsp_test.jsonl"
with open(test_file, "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f]

# === Generate more robust truly random predictions (average over runs) ===
num_runs = 5
all_preds_runs = [
    [random.randint(0, 1) for _ in test_data]
    for _ in range(num_runs)
]
# Average and round predictions across runs
all_preds = np.round(np.mean(all_preds_runs, axis=0)).astype(int).tolist()

all_labels = [item["label"] for item in test_data]
all_inputs = [item["input"] for item in test_data]
all_relations = [item["relation_type"] for item in test_data]

# === Evaluation by relation type ===
rel_groups = defaultdict(lambda: {"preds": [], "labels": [], "examples": []})
for inp, pred, gold, rel in zip(all_inputs, all_preds, all_labels, all_relations):
    rel_groups[rel]["preds"].append(pred)
    rel_groups[rel]["labels"].append(gold)
    rel_groups[rel]["examples"].append((inp, gold, pred))

print("\n--- All Evaluation Examples ---")
for rel, group in rel_groups.items():
    print(f"\nRelation: {rel}")
    for inp, gold, pred in group["examples"]:
        print(f"Input: {inp}")
        print(f"Gold: {gold} | Pred: {pred}\n")

print("\n--- Evaluation by Discourse Relation Type ---")
for rel, group in rel_groups.items():
    preds = group["preds"]
    labels = group["labels"]
    n = len(labels)
    try:
        acc = accuracy_score(labels, preds)
        f1 = f1_score(labels, preds)
        print(f"{rel:<25} | n={n:<3} | Accuracy: {acc:.3f} | F1: {f1:.3f}")
    except:
        print(f"{rel:<25} | n={n:<3} | Only one class in gold — skipping F1.")

# === Overall metrics ===
print(f"\n✅ Prediction Distribution: {Counter(all_preds)}")
print(f"✅ Gold Label Distribution: {Counter(all_labels)}")

overall_acc = accuracy_score(all_labels, all_preds)
overall_f1 = f1_score(all_labels, all_preds)
print(f"\n✅ Overall Accuracy: {overall_acc:.4f}")
print(f"✅ Overall F1 Score: {overall_f1:.4f}")


--- All Evaluation Examples ---

Relation: Comment
Input: I'll give you ore/ [EDU_SEP] no [EDU_SEP] thanks [CTX_SEP] no
Gold: 1 | Pred: 0

Input: Already...!!! [EDU_SEP] :) [CTX_SEP] ha..
Gold: 1 | Pred: 1

Input: for wood or sheep? [EDU_SEP] i only have one and i want it, [EDU_SEP] sorry [CTX_SEP] for wood or sheep?
Gold: 0 | Pred: 1

Input: I'm after ore or clay [EDU_SEP] thanks [CTX_SEP] how about I give you a wood for a sheep?
Gold: 1 | Pred: 0

Input: ja [EDU_SEP] Wood? [EDU_SEP] ok [EDU_SEP] nope [EDU_SEP] i don't have any sheep [EDU_SEP] i do [EDU_SEP] Sorry, [CTX_SEP] i do
Gold: 0 | Pred: 0

Input: I don't have any, [EDU_SEP] sorry [EDU_SEP] nnnoooo [CTX_SEP] :)
Gold: 1 | Pred: 1

Input: i stole from the wrong guy ;) [EDU_SEP] none still [EDU_SEP] need to see a T [EDU_SEP] likewise [EDU_SEP] likewise [EDU_SEP] then i'll let you know you can rob me [EDU_SEP] oh well [CTX_SEP] 0
Gold: 1 | Pred: 1

Input: nope [EDU_SEP] sorry, [CTX_SEP] sheep or wheat to trade, am afraid
Gold: 0 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
