# 2025 COMP90042 Project Group 24
*Make sure you change the file name with your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement a program with Object Oriented Programming style, please put those the bottom of this ipynb file*

# 1. EDA


## 1.1 Exmain the Training Data

Each train data has:
1. claim_text
2. claim_label
3. (multiple) evidences

### for claim_text

In [None]:
claim_lengths = [len(item["claim_text"]) for item in train_data.values()]
print("total number of training claims:", len(train_data))
print("max training claim length:", max(claim_lengths))
print("min training claim length:", min(claim_lengths)) 
print("mean training claim length:", sum(claim_lengths) / len(claim_lengths))

total number of training claims: 1228
max training claim length: 332
min training claim length: 26
mean training claim length: 122.95521172638436


### for evidences

In [None]:
evi_counts = [len(item["evidences"]) for item in train_data.values()]
print("max evidence count for one training data:", max(evi_counts))
print("min evidence count for one training data:", min(evi_counts))
print("mean evidence count for one training data:", sum(evi_counts) / len(evi_counts))

max evidence count for one training data: 5
min evidence count for one training data: 1
mean evidence count for one training data: 3.3566775244299674


### for label distribution - unbalanced

In [None]:
from collections import Counter

labels = [item["claim_label"] for item in train_data.values()]
print(Counter(labels))

Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})


In [None]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:

import os

# Set directory path
data_dir = "/content/drive/MyDrive/NLP_content"

# Define file paths
dev_claims_path = os.path.join(data_dir, "dev-claims.json")
train_claims_path = os.path.join(data_dir, "train-claims.json")
test_claims_path = os.path.join(data_dir, "test-claims-unlabelled.json")
evidence_path = os.path.join(data_dir, "evidence.json")

with open(dev_claims_path, "r") as f:
    dev_data = json.load(f)
print(f"Loaded {len(dev_data)} dev claims.")

with open(train_claims_path, "r") as f:
    train_data = json.load(f)
print(f"Loaded {len(train_data)} train claims.")

with open(test_claims_path, "r") as f:
    test_data = json.load(f)
print(f"Loaded {len(test_data)} test claims.")

with open(evidence_path, "r") as f:
    evidence = json.load(f)
print(f"Loaded {len(evidence)} evidence entries.")


Loaded 154 dev claims.
Loaded 1228 train claims.
Loaded 153 test claims.
Loaded 1208827 evidence entries.


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata
import string

nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text, remove_stopwords=True, apply_lemma=True):
    # Normalize unicode characters
    text = unicodedata.normalize("NFKD", text)

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Lowercase
    text = text.lower()

    # Remove placeholder citations
    text = re.sub(r'\[citation needed\]', '', text, flags=re.IGNORECASE)

    # Remove bracketed content like [example]
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[\[\]]', '', text)

    # Remove repeated dots and normalize whitespace
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[\'"`“”‘’]', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove punctuation tokens
    tokens = [tok for tok in tokens if tok not in string.punctuation]

    # Lemmatize (optional)
    if apply_lemma:
        tokens = [lemmatizer.lemmatize(tok) for tok in tokens]

    # Remove stopwords (optional)
    if remove_stopwords:
        tokens = [tok for tok in tokens if tok not in stop_words]

    return " ".join(tokens)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from tqdm import tqdm
import os
import json

def apply_claim_preprocessing(claim_dict, preprocess_fn, field="claim_text"):
    result = {}
    for cid, data in claim_dict.items():
        processed = preprocess_fn(data[field])
        result[cid] = data.copy()
        result[cid][field] = processed
    return result

# Claim Preprocessing
train_p1 = apply_claim_preprocessing(train_data, lambda x: preprocess(x, remove_stopwords=True, apply_lemma=True))
train_p2 = apply_claim_preprocessing(train_data, lambda x: preprocess(x, remove_stopwords=True, apply_lemma=False))
train_p3 = apply_claim_preprocessing(train_data, lambda x: preprocess(x, remove_stopwords=False, apply_lemma=True))

dev_p1 = apply_claim_preprocessing(dev_data, lambda x: preprocess(x, remove_stopwords=True, apply_lemma=True))
dev_p2 = apply_claim_preprocessing(dev_data, lambda x: preprocess(x, remove_stopwords=True, apply_lemma=False))
dev_p3 = apply_claim_preprocessing(dev_data, lambda x: preprocess(x, remove_stopwords=False, apply_lemma=True))

test_p1 = apply_claim_preprocessing(test_data, lambda x: preprocess(x, remove_stopwords=True, apply_lemma=True))
test_p2 = apply_claim_preprocessing(test_data, lambda x: preprocess(x, remove_stopwords=True, apply_lemma=False))
test_p3 = apply_claim_preprocessing(test_data, lambda x: preprocess(x, remove_stopwords=False, apply_lemma=True))

# Evidence Preprocessing
evidence_p1, evidence_p2, evidence_p3 = {}, {}, {}

print("Applying preprocess_1 (lemma + stopwords) to evidence...")
for k in tqdm(evidence, desc="preprocess_1"):
    evidence_p1[k] = preprocess(evidence[k], remove_stopwords=True, apply_lemma=True)

print("Applying preprocess_2 (no lemma + stopwords) to evidence...")
for k in tqdm(evidence, desc="preprocess_2"):
    evidence_p2[k] = preprocess(evidence[k], remove_stopwords=True, apply_lemma=False)

print("Applying preprocess_3 (lemma + no stopwords) to evidence...")
for k in tqdm(evidence, desc="preprocess_3"):
    evidence_p3[k] = preprocess(evidence[k], remove_stopwords=False, apply_lemma=True)

json.dump(train_p1, open(os.path.join(data_dir, "train-claims-preprocessed1.json"), "w"), indent=2)
json.dump(train_p2, open(os.path.join(data_dir, "train-claims-preprocessed2.json"), "w"), indent=2)
json.dump(train_p3, open(os.path.join(data_dir, "train-claims-preprocessed3.json"), "w"), indent=2)

json.dump(dev_p1, open(os.path.join(data_dir, "dev-claims-preprocessed1.json"), "w"), indent=2)
json.dump(dev_p2, open(os.path.join(data_dir, "dev-claims-preprocessed2.json"), "w"), indent=2)
json.dump(dev_p3, open(os.path.join(data_dir, "dev-claims-preprocessed3.json"), "w"), indent=2)

json.dump(test_p1, open(os.path.join(data_dir, "test-claims-unlabelled-preprocessed1.json"), "w"), indent=2)
json.dump(test_p2, open(os.path.join(data_dir, "test-claims-unlabelled-preprocessed2.json"), "w"), indent=2)
json.dump(test_p3, open(os.path.join(data_dir, "test-claims-unlabelled-preprocessed3.json"), "w"), indent=2)

json.dump(evidence_p1, open(os.path.join(data_dir, "evidence-preprocessed1.json"), "w"), indent=2)
json.dump(evidence_p2, open(os.path.join(data_dir, "evidence-preprocessed2.json"), "w"), indent=2)
json.dump(evidence_p3, open(os.path.join(data_dir, "evidence-preprocessed3.json"), "w"), indent=2)


Applying preprocess_1 (lemma + stopwords) to evidence...


preprocess_1: 100%|██████████| 1208827/1208827 [04:16<00:00, 4710.82it/s]


Applying preprocess_2 (no lemma + stopwords) to evidence...


preprocess_2: 100%|██████████| 1208827/1208827 [02:42<00:00, 7450.72it/s]


Applying preprocess_3 (lemma + no stopwords) to evidence...


preprocess_3: 100%|██████████| 1208827/1208827 [04:14<00:00, 4757.09it/s]


In [None]:
# Preview of three preprocessing mthods
first_claim_id = next(iter(train_data))
first_evidence_id = next(iter(evidence))

original_claim = train_data[first_claim_id]["claim_text"]
preprocessed_claim_p1 = train_p1[first_claim_id]["claim_text"]
preprocessed_claim_p2 = train_p2[first_claim_id]["claim_text"]
preprocessed_claim_p3 = train_p3[first_claim_id]["claim_text"]

print("Original claim:")
print(original_claim)
print("\n Preprocessed claim (preprocess_1: lemma + stopwords removed):")
print(preprocessed_claim_p1)
print("\n Preprocessed claim (preprocess_2: no lemma + stopwords removed):")
print(preprocessed_claim_p2)
print("\n Preprocessed claim (preprocess_3: lemma + stopwords kept):")
print(preprocessed_claim_p3)

original_evidence = evidence[first_evidence_id]
preprocessed_evidence_p1 = evidence_p1[first_evidence_id]
preprocessed_evidence_p2 = evidence_p2[first_evidence_id]
preprocessed_evidence_p3 = evidence_p3[first_evidence_id]

print("\n\n Original evidence:")
print(original_evidence)
print("\n Preprocessed evidence (preprocess_1: lemma + stopwords removed):")
print(preprocessed_evidence_p1)
print("\n Preprocessed evidence (preprocess_2: no lemma + stopwords removed):")
print(preprocessed_evidence_p2)
print("\n Preprocessed evidence (preprocess_3: lemma + stopwords kept):")
print(preprocessed_evidence_p3)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm import tqdm
import os
import scipy.sparse
import joblib

def whitespace_tokenizer(text):
    return text.split()

train_p1_texts = [claim["claim_text"] for claim in train_p1.values()]
evidence_p1_ids = list(evidence_p1.keys())
evidence_p1_texts = [evidence_p1[eid] for eid in tqdm(evidence_p1_ids, desc="Collecting evidence_p1 texts")]

# TF-IDF Vectorization
print("\n TF-IDF Vectorizing...")

tfidf_vectorizer = TfidfVectorizer(max_features=5000, tokenizer=whitespace_tokenizer)
evidence_p1_tfidf = tfidf_vectorizer.fit_transform(tqdm(evidence_p1_texts, desc="Fitting TF-IDF on evidence"))
train_p1_tfidf = tfidf_vectorizer.transform(train_p1_texts)

# Save TF-IDF Results
scipy.sparse.save_npz(os.path.join(data_dir, "evidence_p1_tfidf.npz"), evidence_p1_tfidf)
scipy.sparse.save_npz(os.path.join(data_dir, "train_p1_tfidf.npz"), train_p1_tfidf)
joblib.dump(tfidf_vectorizer, os.path.join(data_dir, "tfidf_vectorizer.pkl"))
print(" TF-IDF vectors and model saved.")

# BoW Vectorization
print("\n BoW Vectorizing...")

bow_vectorizer = CountVectorizer(max_features=5000, tokenizer=whitespace_tokenizer)
evidence_p1_bow = bow_vectorizer.fit_transform(tqdm(evidence_p1_texts, desc="Fitting BoW on evidence"))
train_p1_bow = bow_vectorizer.transform(train_p1_texts)

# Save BoW Results
scipy.sparse.save_npz(os.path.join(data_dir, "evidence_p1_bow.npz"), evidence_p1_bow)
scipy.sparse.save_npz(os.path.join(data_dir, "train_p1_bow.npz"), train_p1_bow)
joblib.dump(bow_vectorizer, os.path.join(data_dir, "bow_vectorizer.pkl"))
print(" BoW vectors and model saved.")

# Vector shapes
print("\nVector Shapes:")
print(f"TF-IDF (train):    {train_p1_tfidf.shape}")
print(f"TF-IDF (evidence): {evidence_p1_tfidf.shape}")
print(f"BoW (train):       {train_p1_bow.shape}")
print(f"BoW (evidence):    {evidence_p1_bow.shape}")

Collecting evidence_p1 texts: 100%|██████████| 1208827/1208827 [00:00<00:00, 2058748.04it/s]



🔧 TF-IDF Vectorizing...


Fitting TF-IDF on evidence: 100%|██████████| 1208827/1208827 [00:08<00:00, 147232.71it/s]


✅ TF-IDF vectors and model saved.

🔧 BoW Vectorizing...


Fitting BoW on evidence: 100%|██████████| 1208827/1208827 [00:08<00:00, 147262.67it/s]


✅ BoW vectors and model saved.

📐 Vector Shapes:
TF-IDF (train):    (1228, 5000)
TF-IDF (evidence): (1208827, 5000)
BoW (train):       (1228, 5000)
BoW (evidence):    (1208827, 5000)


In [None]:
import os
import json
import nltk
import numpy as np
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from collections import Counter
#Building Vocalbulary for sequential models
nltk.download("punkt_tab")

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
min_freq = 5
special_tokens = ("<pad>", "<unk>", "<cls>")

# Load data
def load_json_file(filename):
    with open(os.path.join(data_dir, filename)) as f:
        return json.load(f)

train_data = load_json_file("train-claims-preprocessed2.json")
dev_data = load_json_file("dev-claims-preprocessed2.json")
test_data = load_json_file("test-claims-unlabelled-preprocessed2.json")
evidence_data = load_json_file("evidence-preprocessed2.json")

train_texts = [v["claim_text"] for v in train_data.values()]
dev_texts = [v["claim_text"] for v in dev_data.values()]
test_texts = [v["claim_text"] for v in test_data.values()]
evidence_texts = [v for v in evidence_data.values()]

# oken Iterator
def yield_tokens(data):
    for item in data:
        yield word_tokenize(item.lower())

# Build Vocabulary
def build_vocab_from_iterator(iterator, min_freq=5, special_tokens=("<pad>", "<unk>", "<cls>")):
    counter = Counter()
    for tokens in iterator:
        counter.update(tokens)

    vocab = {tok: idx for idx, tok in enumerate(special_tokens)}
    cur_idx = len(special_tokens)

    for token, freq in counter.items():
        if freq >= min_freq and token not in vocab:
            vocab[token] = cur_idx
            cur_idx += 1

    idx_to_token = {idx: tok for tok, idx in vocab.items()}
    return vocab, idx_to_token

print("Building vocabulary...")
vocab, idx_to_token = build_vocab_from_iterator(
    yield_tokens(train_texts + evidence_texts),
    min_freq=min_freq,
    special_tokens=special_tokens
)

print(f"Vocabulary size: {len(vocab)}")

def process_text(text, vocab):
    tokens = word_tokenize(text.lower())
    return [vocab["<cls>"]] + [vocab.get(tok, vocab["<unk>"]) for tok in tokens]

# Convert All to Indices
print("Processing texts to index sequences...")
train_texts_indices = [process_text(text, vocab) for text in tqdm(train_texts, desc="Train")]
dev_texts_indices = [process_text(text, vocab) for text in tqdm(dev_texts, desc="Dev")]
test_texts_indices = [process_text(text, vocab) for text in tqdm(test_texts, desc="Test")]
evidence_texts_indices = [process_text(text, vocab) for text in tqdm(evidence_texts, desc="Evidence")]

# Save indexed versions
np.save(os.path.join(data_dir, "train_claim_indices.npy"), np.array(train_texts_indices, dtype=object))
np.save(os.path.join(data_dir, "dev_claim_indices.npy"), np.array(dev_texts_indices, dtype=object))
np.save(os.path.join(data_dir, "test_claim_indices.npy"), np.array(test_texts_indices, dtype=object))
np.save(os.path.join(data_dir, "evidence_indices.npy"), np.array(evidence_texts_indices, dtype=object))

# Save vocabulary
with open(os.path.join(data_dir, "seq_models_vocab.json"), "w") as f:
    json.dump(vocab, f, indent=2)

print("All sequences processed and vocabulary saved.")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Building vocabulary...
Vocabulary size: 111585
Processing texts to index sequences...


Train: 100%|██████████| 1228/1228 [00:00<00:00, 8643.94it/s]
Dev: 100%|██████████| 154/154 [00:00<00:00, 11119.15it/s]
Test: 100%|██████████| 153/153 [00:00<00:00, 11839.35it/s]
Evidence: 100%|██████████| 1208827/1208827 [01:33<00:00, 12959.34it/s]


All sequences processed and vocabulary saved.


In [None]:
# Extra Preprocessing
import json
import os

data_dir = "/content/drive/MyDrive/NLP_content"

with open(os.path.join(data_dir, "evidence-preprocessed2.json")) as f:
    evidence = json.load(f)

indexed_evidence = {
    i: {
        "evidence_id": eid,
        "text": evidence[eid]
    }
    for i, eid in enumerate(evidence)
}

output_path = os.path.join(data_dir, "evidence-preprocessed2-indexed.json")
with open(output_path, "w") as f:
    json.dump(indexed_evidence, f, indent=2)

print(f"Saved indexed evidence to: {output_path}")

## 1.2 Exmain the Dev Data

In [None]:
evi_counts = [len(item["evidences"]) for item in dev_data.values()]
print("max evidence count for one dev data:", max(evi_counts))
print("min evidence count for one dev data:", min(evi_counts))
print("mean evidence count for one dev data:", sum(evi_counts) / len(evi_counts))

max evidence count for one training data: 5
min evidence count for one training data: 1
mean evidence count for one training data: 3.188311688311688


In [None]:
labels = [item["claim_label"] for item in dev_data.values()]
print(Counter(labels))

Counter({'SUPPORTS': 68, 'NOT_ENOUGH_INFO': 41, 'REFUTES': 27, 'DISPUTED': 18})


## 1.3 Data Preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import unicodedata


nltk.download("punkt_tab")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text, remove_stopwords=True):
    # Normalize unicode
    text = unicodedata.normalize("NFKD", text)

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Lowercase
    text = text.lower()

    # Remove speaker patterns like "Mark Latham said", "Chris Bowen claimed"
    text = re.sub(
        r'^[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\s'
        r'(?:said|says|claimed|claims|stated|states|argued|argues|asserts|asserted):?\s*',
        '',
        text
    )

    # Remove bracketed noise like [ ... ]
    text = re.sub(r'\[.*?\]', '', text)

    # Remove repeated dots and normalize whitespace
    text = re.sub(r'\.{2,}', '.', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[\'"`“”‘’]', '', text)  # removes any form of stray quotes from both ends


    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Lemmatize
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
    # Optional: remove stopwords
    if remove_stopwords:
        tokens = [tok for tok in tokens if tok not in stop_words]
    return " ".join(tokens)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Apply preprocessing
for cid in train_data:
    text = train_data[cid]["claim_text"]
    train_data[cid]["claim_text"] = preprocess(text, remove_stopwords=False)

with open("train-claims-cleaned.json", "w") as f:
    json.dump(train_data, f, indent=2)

In [None]:
with open("train-claims-cleaned.json", "r") as f:
    train_data_cleaned = json.load(f)

claim_lengths = [len(item["claim_text"]) for item in train_data_cleaned.values()]
print("max cleaned training claim length:", max(claim_lengths))
print("min cleaned training claim length:", min(claim_lengths))
print("mean cleaned training claim length:", sum(claim_lengths) / len(claim_lengths))

max cleaned training claim length: 329
min cleaned training claim length: 26
mean cleaned training claim length: 121.52524429967427


In [None]:
cleaned_evidence = {
    evid_id: preprocess(text) for evid_id, text in evidence_data.items()
}

with open("evidence-cleaned.json", "w") as f:
    json.dump(cleaned_evidence, f, indent=2)

# Task 1: Evidence Retrieval

## Model 1: BERTTopic

### Step 1: Clustering the training claims using BERTopic

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install bertopic

In [None]:
from bertopic import BERTopic
import json
import pandas as pd

# Load training claims
with open("train-claims.json", "r") as f:
    train_claims = json.load(f)

# Extract claim texts and IDs
claim_ids = list(train_claims.keys())
claim_texts = [train_claims[cid]["claim_text"] for cid in claim_ids]

# Create and fit BERTopic model
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")  # Small and Colab-friendly
topics, probs = topic_model.fit_transform(claim_texts)

# Save topic model for reuse
topic_model.save("bertopic_claims_model")

# Save clustering results
claim_cluster_df = pd.DataFrame({
    "claim_id": claim_ids,
    "claim_text": claim_texts,
    "topic": topics,
    "probability": probs
})
claim_cluster_df.to_csv("claim_clusters.csv", index=False)


In [None]:
topic_model = BERTopic.load("bertopic_claims_model")

topic_model.visualize_topics()


In [None]:
topic_model.visualize_heatmap()


In [None]:
n_clusters = len(set(claim_cluster_df["topic"])) - ("-1" in set(claim_cluster_df["topic"]))
print(f"Number of clusters (excluding outliers): {n_clusters}")

Number of clusters (excluding outliers): 22


### Step 2: Extract Evidence Text per Cluster

In [None]:
import json
import pandas as pd
from collections import defaultdict

# === LOAD DATA ===
with open("train-claims.json", "r") as f:
    train_claims = json.load(f)

with open("evidence.json", "r", encoding="utf-8") as f:
    evidence_data = json.load(f)

# Load claim cluster assignments
claim_cluster_df = pd.read_csv("claim_clusters.csv")  # must contain: claim_id, topic

# === MAP EACH TOPIC TO ITS EVIDENCE PASSAGES ===
cluster_evidence_map = defaultdict(list)

for _, row in claim_cluster_df.iterrows():
    claim_id = row["claim_id"]
    topic = row["topic"]

    if claim_id in train_claims:
        evidence_ids = train_claims[claim_id].get("evidences", [])
        for eid in evidence_ids:
            passage = evidence_data.get(eid)
            if passage:
                cluster_evidence_map[topic].append(passage)

# === OPTIONAL: Convert to a DataFrame for downstream use ===
clustered_evidence_df = pd.DataFrame([
    {"topic": topic, "evidence_text": evidence}
    for topic, evidences in cluster_evidence_map.items()
    for evidence in evidences
])

# === SAVE IF NEEDED ===
clustered_evidence_df.to_csv("clustered_evidence.csv", index=False)


### Step 3: Extract Evidence Keywords by Cluster

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from collections import defaultdict

# Load the evidence dataframe (sample already saved)
clustered_evidence_df = pd.read_csv("clustered_evidence.csv")

##################### Limit to first 2000 rows for sampling (per topic if needed)
#sampled_df = clustered_evidence_df.groupby("topic").head(2000)

# Build a mapping: topic -> list of evidence texts
topic_to_texts = defaultdict(list)
for _, row in clustered_evidence_df.iterrows():
    topic_to_texts[row["topic"]].append(row["evidence_text"])

# === Extract keywords using TF-IDF per topic ===
topic_keywords = {}

for topic, texts in topic_to_texts.items():
    # Combine all evidence into a single "document" per topic
    corpus = [" ".join(texts)]

    # Use TF-IDF to get important terms
    vectorizer = TfidfVectorizer(max_features=20, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)
    keywords = vectorizer.get_feature_names_out()

    topic_keywords[topic] = keywords.tolist()

# Convert to DataFrame for inspection
keywords_df = pd.DataFrame([
    {"topic": topic, "keywords": ", ".join(words)}
    for topic, words in topic_keywords.items()
])

# Save if needed
keywords_df.to_csv("topic_keywords_by_cluster.csv", index=False)

# Preview output
keywords_df.head()


Unnamed: 0,topic,keywords
0,0,"atmosphere, atmospheric, carbon, climate, co2,..."
1,-1,"carbon, change, changes, climate, earth, emiss..."
2,2,"assessment, caused, change, climate, consensus..."
3,3,"average, celsius, century, climate, data, deca..."
4,1,"antarctic, arctic, climate, glaciers, global, ..."


### Step 4: Extract Keywords from ALL Evidence Passages

In [None]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# === Load full evidence corpus ===
with open("evidence.json", "r", encoding="utf-8") as f:
    evidence_data = json.load(f)

# Convert to DataFrame
evidence_df = pd.DataFrame([
    {"evidence_id": eid, "text": text}
    for eid, text in evidence_data.items()
])

# === Apply TF-IDF per evidence passage ===

# Vectorize all passages together to ensure consistent vocabulary
vectorizer = TfidfVectorizer(max_features=1000, stop_words="english")
tfidf_matrix = vectorizer.fit_transform(evidence_df["text"])
feature_names = vectorizer.get_feature_names_out()

# Extract top N keywords per passage (set N = 15 here)
top_n = 15
evidence_keywords = []

for i, row in enumerate(tfidf_matrix):
    row_data = row.toarray().flatten()
    top_indices = row_data.argsort()[::-1][:top_n]
    top_words = [feature_names[idx] for idx in top_indices if row_data[idx] > 0]
    evidence_keywords.append(top_words)

evidence_df["keywords"] = evidence_keywords

# Save for matching later
evidence_df.to_csv("evidence_keywords.csv", index=False)

# Preview
evidence_df.head()


Unnamed: 0,evidence_id,text,keywords
0,evidence-0,"John Bennet Lawes, English entrepreneur and ag...","[john, english]"
1,evidence-1,Lindberg began his professional career at the ...,"[eventually, 1977, age, began, 16, career, pro..."
2,evidence-2,``Boston (Ladies of Cambridge)'' by Vampire We...,[]
3,evidence-3,"Gerald Francis Goyer (born October 20, 1936) w...","[hockey, 1936, 40, ice, 20, games, professiona..."
4,evidence-4,He detected abnormalities of oxytocinergic fun...,"[release, post]"


TFIDF perform terribly, so we alter back to the BertTopic Model (tradeoff b/w computation efficiency and prediction accuracy)

In [None]:
from bertopic import BERTopic
import json
import pandas as pd

# === Load evidence data ===
with open("evidence.json", "r", encoding="utf-8") as f:
    evidence_data = json.load(f)

evidence_ids = list(evidence_data.keys())
evidence_texts = list(evidence_data.values())

# === Train BERTopic model on all evidence ===
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", calculate_probabilities=False, verbose=True)
topics, _ = topic_model.fit_transform(evidence_texts)

# === Extract keywords for EACH evidence ===
evidence_keywords = []

for idx, text in enumerate(evidence_texts):
    topic = topics[idx]
    if topic == -1:
        # -1 is outlier topic — assign empty keywords or fallback
        keywords = []
    else:
        # Extract keywords for the current topic and filter those relevant to this doc
        topic_words = topic_model.get_topic(topic)
        doc_keywords = []
        for word, _ in topic_words:
            if word.lower() in text.lower():
                doc_keywords.append(word)
            if len(doc_keywords) >= 10:
                break
        keywords = doc_keywords

    evidence_keywords.append({
        "evidence_id": evidence_ids[idx],
        "text": text,
        "keywords": keywords
    })

# === Save as DataFrame ===
df = pd.DataFrame(evidence_keywords)
df.to_csv("evidence_keywords.csv", index=False)


### Step 5: Using NER-mased matching mechanism

get NER on claims

In [None]:
!pip install "numpy<2.0"



In [None]:
import json
import spacy
from tqdm import tqdm

# Load spaCy English NER model
nlp = spacy.load("en_core_web_sm")

# Load evidence data
with open("evidence.json", "r") as f:
    evidence_data = json.load(f)

# Extract NER for each evidence entry
evidence_ner = {}
for eid, text in tqdm(evidence_data.items(), desc="Extracting NER"):
    doc = nlp(text)
    ner_entities = list(set(ent.text for ent in doc.ents if ent.label_))  # deduplicated
    evidence_ner[eid] = ner_entities

# Save to JSON
with open("evidence_ner.json", "w") as f:
    json.dump(evidence_ner, f, indent=2)

In [None]:
import spacy
import json
from tqdm import tqdm

# Load spaCy and NER JSON
nlp = spacy.load("en_core_web_sm")

with open("dev-claims.json") as f:
    claims = json.load(f)

claim_ner = {}
for cid, entry in tqdm(claims.items()):
    doc = nlp(entry["claim_text"])
    ents = list(set(ent.text.lower() for ent in doc.ents if ent.label_))
    claim_ner[cid] = ents

with open("claim_ner.json", "w") as f:
    json.dump(claim_ner, f, indent=2)


100%|██████████| 154/154 [00:00<00:00, 227.69it/s]


evidence_ner.json is obtained in preprocess_evidence_for_ner.ipynb \
now load both of them

In [None]:
import json

with open("claim_ner.json", "r") as f:
    claim_ner = json.load(f)

with open("evidence_ner.json", "r") as f:
    evidence_ner = json.load(f)

Compute NER Overlap Score

In [None]:
def compute_ner_overlap(claim_entities, evidence_entities):
    claim_set = set(e.lower() for e in claim_entities)
    evidence_set = set(e.lower() for e in evidence_entities)
    overlap = claim_set & evidence_set
    return len(overlap) / (len(claim_set) + 1e-6)  # +1e-6 to avoid div-by-zero


Define the NER similarity scoring function

In [None]:
def ner_overlap_score(claim_ents, evidence_ents):
    if not claim_ents or not evidence_ents:
        return 0.0

    # Partial match allowed — token overlap
    count = 0
    for ce in claim_ents:
        for ee in evidence_ents:
            if ce in ee or ee in ce:
                count += 1
                break  # avoid multiple matches for same ce

    return count / len(claim_ents)


### Step 6: FineTune SentenceTransformer for Next Step

In [None]:
!pip install datasets

In [None]:
from datasets import Dataset
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
import json

# === Step 1: Load and prepare data ===
with open("train-claims.json", "r") as f:
    train_claims = json.load(f)
with open("evidence.json", "r") as f:
    evidence_data = json.load(f)

anchor_texts = []
positive_texts = []

for claim in train_claims.values():
    claim_text = claim["claim_text"]
    for eid in claim.get("evidences", []):
        if eid in evidence_data:
            anchor_texts.append(claim_text)
            positive_texts.append(evidence_data[eid])

dataset = Dataset.from_dict({
    "anchor": anchor_texts,
    "positive": positive_texts,
})

# === Step 2: Initialize model and loss ===
model = SentenceTransformer("all-MiniLM-L6-v2")
loss = MultipleNegativesRankingLoss(model)

# === Step 3: Training arguments ===
args = SentenceTransformerTrainingArguments(
    output_dir="fine-tuned-sentence-transformer",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    fp16=True,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_steps=10,
)

# === Step 4: Trainer and Training ===
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    loss=loss,
)
trainer.train()

# Save model
model.save("fine-tuned-sentence-transformer")

### Step 7: Evidence Retrieval via Weighted Keyword Similarity

For each test claim, match it to the most relevant evidence passages, using both:
- Keywords from the claim itself, and
- Keywords from the nearest cluster (with inverse distance weighting

In [None]:
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import ast

# === Load Data ===
with open("dev-claims.json", "r") as f:
    test_claims = json.load(f)

evidence_df = pd.read_csv("evidence_keywords.csv")
evidence_df["keywords"] = evidence_df["keywords"].apply(ast.literal_eval)

# === Embed Evidence Passages ===
embedder = SentenceTransformer("fine-tuned-sentence-transformer")
evidence_texts = evidence_df["text"].tolist()
evidence_embeddings = embedder.encode(evidence_texts, batch_size=256, show_progress_bar=True)

# === Process All Claims ===
retrieval_results = {}
top_k = 5

for idx, (cid, cinfo) in enumerate(test_claims.items()):
    claim_text = cinfo["claim_text"]
    print(f"\n[{idx+1}/{len(test_claims)}] Claim ID: {cid}")
    print(f"  → Claim text: {claim_text}")

    # Embed claim
    claim_embedding = embedder.encode([claim_text])[0]

    # Compute cosine similarity to all evidence passages
    similarities = cosine_similarity([claim_embedding], evidence_embeddings)[0]

    # Get top-k evidence
    top_indices = np.argsort(similarities)[::-1][:top_k]
    top_evidence_ids = evidence_df.iloc[top_indices]["evidence_id"].tolist()
    top_scores = similarities[top_indices]

    for i, eid in enumerate(top_evidence_ids):
        print(f"    Rank {i+1}: {eid} (score: {top_scores[i]:.4f})")

    retrieval_results[cid] = top_evidence_ids

# === Save Output in Required Format ===
formatted_predictions = {
    cid: {
        "claim_label": "NOT_ENOUGH_INFO",
        "evidences": evids
    }
    for cid, evids in retrieval_results.items()
}

with open("dev-claims-predictions.json", "w") as f:
    json.dump(formatted_predictions, f, indent=2)


Batches:   0%|          | 0/4722 [00:00<?, ?it/s]


[1/5] Claim ID: claim-752
  → Claim text: [South Australia] has the most expensive electricity in the world.
    Rank 1: evidence-67732 (score: 0.8175)
    Rank 2: evidence-572512 (score: 0.7730)
    Rank 3: evidence-780332 (score: 0.6659)
    Rank 4: evidence-1061888 (score: 0.6277)
    Rank 5: evidence-452156 (score: 0.6275)

[2/5] Claim ID: claim-375
  → Claim text: when 3 per cent of total annual global emissions of carbon dioxide are from humans and Australia prod­uces 1.3 per cent of this 3 per cent, then no amount of emissions reductio­n here will have any effect on global climate.
    Rank 1: evidence-647121 (score: 0.7270)
    Rank 2: evidence-559290 (score: 0.6956)
    Rank 3: evidence-415619 (score: 0.6742)
    Rank 4: evidence-361694 (score: 0.6684)
    Rank 5: evidence-949910 (score: 0.6644)

[3/5] Claim ID: claim-1266
  → Claim text: This means that the world is now 1C warmer than it was in pre-industrial times
    Rank 1: evidence-694262 (score: 0.6901)
    Rank 2: evid

Alternative: combine with NER

In [None]:
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import ast
from tqdm import tqdm

# === Load Data ===
with open("dev-claims.json", "r") as f:
    test_claims = json.load(f)

evidence_df = pd.read_csv("evidence_keywords.csv")
evidence_df["keywords"] = evidence_df["keywords"].apply(ast.literal_eval)

with open("claim_ner.json", "r") as f:
    claim_ner = json.load(f)

with open("evidence_ner.json", "r") as f:
    evidence_ner = json.load(f)

# === Load fine-tuned SentenceTransformer ===
embedder = SentenceTransformer("fine-tuned-sentence-transformer")
evidence_texts = evidence_df["text"].tolist()
evidence_ids = evidence_df["evidence_id"].tolist()
evidence_embeddings = embedder.encode(evidence_texts, batch_size=256, show_progress_bar=True)

# === Define NER overlap scoring ===
def ner_overlap_score(claim_ents, evidence_ents):
    if not claim_ents or not evidence_ents:
        return 0.0
    count = 0
    for ce in claim_ents:
        for ee in evidence_ents:
            if ce in ee or ee in ce:
                count += 1
                break
    return count / len(claim_ents)

# === Perform Retrieval with Combined Score ===
top_k = 5
alpha = 0.8  # adjust this weight between [0, 1] — more toward semantic

retrieval_results = {}

for idx, (cid, cinfo) in enumerate(tqdm(test_claims.items(), desc="Retrieving evidences")):
    claim_text = cinfo["claim_text"]
    claim_embedding = embedder.encode([claim_text])[0]
    claim_entities = claim_ner.get(cid, [])

    similarities = cosine_similarity([claim_embedding], evidence_embeddings)[0]

    combined_scores = []
    for i, eid in enumerate(evidence_ids):
        ner_entities = evidence_ner.get(eid, [])
        ner_score = ner_overlap_score(claim_entities, ner_entities)
        combined = alpha * similarities[i] + (1 - alpha) * ner_score
        combined_scores.append((eid, combined))

    # Rank by combined score
    top_evidences = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_k]
    retrieval_results[cid] = [eid for eid, _ in top_evidences]

# === Format and Save Output ===
formatted_predictions = {
    cid: {
        "claim_label": "NOT_ENOUGH_INFO",
        "evidences": evids
    }
    for cid, evids in retrieval_results.items()
}

with open("dev-claims-predictions_fine_tuned_with_ner.json", "w") as f:
    json.dump(formatted_predictions, f, indent=2)

print("✅ Retrieval complete. Output saved to dev-claims-predictions_fine_tuned.json")


In [None]:
import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import ast
from tqdm import tqdm

# === Load Data ===
with open("dev-claims.json", "r") as f:
    test_claims = json.load(f)

evidence_df = pd.read_csv("evidence_keywords.csv")
evidence_df["keywords"] = evidence_df["keywords"].apply(ast.literal_eval)

with open("claim_ner.json", "r") as f:
    claim_ner = json.load(f)

with open("evidence_ner.json", "r") as f:
    evidence_ner = json.load(f)

# === Load fine-tuned SentenceTransformer ===
embedder = SentenceTransformer("fine-tuned-sentence-transformer")
evidence_texts = evidence_df["text"].tolist()
evidence_ids = evidence_df["evidence_id"].tolist()
evidence_embeddings = embedder.encode(evidence_texts, batch_size=256, show_progress_bar=True)

# === Define NER overlap scoring ===
def ner_overlap_score(claim_ents, evidence_ents):
    if not claim_ents or not evidence_ents:
        return 0.0
    count = 0
    for ce in claim_ents:
        for ee in evidence_ents:
            if ce in ee or ee in ce:
                count += 1
                break
    return count / len(claim_ents)

# === Perform Retrieval with Combined Score ===
top_k = 5

retrieval_results = {}

for idx, (cid, cinfo) in enumerate(tqdm(test_claims.items(), desc="Retrieving evidences")):
    claim_text = cinfo["claim_text"]
    claim_embedding = embedder.encode([claim_text])[0]
    claim_entities = claim_ner.get(cid, [])

    similarities = cosine_similarity([claim_embedding], evidence_embeddings)[0]

    combined_scores = []
    for i, eid in enumerate(evidence_ids):
        ner_entities = evidence_ner.get(eid, [])
        ner_score = ner_overlap_score(claim_entities, ner_entities)
        ner_boosted = 0.6 + 0.5 * ner_score  # map [0,1] → [0.6,1.0]

        combined = max(similarities[i], ner_boosted)
        combined_scores.append((eid, combined))

    # Rank by combined score
    top_evidences = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_k]
    retrieval_results[cid] = [eid for eid, _ in top_evidences]

# === Format and Save Output ===
formatted_predictions = {
    cid: {
        "claim_label": "NOT_ENOUGH_INFO",
        "evidences": evids
    }
    for cid, evids in retrieval_results.items()
}

with open("dev-claims-predictions_fine_tuned_with_ner.json", "w") as f:
    json.dump(formatted_predictions, f, indent=2)

print("✅ Retrieval complete. Output saved to dev-claims-predictions_fine_tuned.json")


### Step 8: BERTTopic Evaluation

In [None]:
import json

# Load predictions and ground truth
with open("dev-claims-predictions_fine_tuned_with_ner.json", "r") as f:
    predictions = json.load(f)

with open("dev-claims.json", "r") as f:
    ground_truth = json.load(f)

# Evaluate prediction recall
total_gold = 0
total_matched = 0

print("Match count per claim:")

for cid, info in ground_truth.items():
    gold_evidence = set(info.get("evidences", []))
    pred_evidence = set(predictions.get(cid, {}).get("evidences", []))

    matched = len(gold_evidence & pred_evidence)
    total = len(gold_evidence)

    print(f"  {cid}: {matched}/{total}")
    total_matched += matched
    total_gold += total

print("\nSummary:")
print(f"  Total gold evidences: {total_gold}")
print(f"  Total matched evidences: {total_matched}")
print(f"  Recall: {total_matched / total_gold:.2%}" if total_gold > 0 else "  No gold evidences found.")


## Model 2: Pre-ranking + Re-ranking


### 2.1 Evidence Retrieval - Pre-ranking

In [None]:
def whitespace_tokenizer(text):
    return text.split()

#### 2.1.1 Experiments - TIDF/BoW Similarity Filtering

In [None]:
import os
import json
import numpy as np
import joblib
import scipy.sparse
from tqdm import tqdm
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# --- Config ---
top_k = 4
k_cap = 5000
force_include_gold = True
n_components = 200
data_dir = "/content/drive/MyDrive/NLP_content"

print("Loading saved TF-IDF vectors and model...")
evidence_p1_tfidf = scipy.sparse.load_npz(os.path.join(data_dir, "evidence_p1_tfidf.npz"))
tfidf_vectorizer = joblib.load(os.path.join(data_dir, "tfidf_vectorizer.pkl"))

# Reload if interrupt
train_p1 = json.load(open(os.path.join(data_dir, "train-claims-preprocessed1.json")))
evidence_p1 = json.load(open(os.path.join(data_dir, "evidence-preprocessed1.json")))
train_data = json.load(open(os.path.join(data_dir, "train-claims.json")))


train_claims_ids = list(train_p1.keys())
evidence_ids = list(evidence_p1.keys())

print("Vectorizing claims...")
train_claim_vectors = tfidf_vectorizer.transform(
    [train_p1[cid]["claim_text"] for cid in tqdm(train_claims_ids, desc="Transforming claims")]
)

# Apply SVD
print(f"\n Applying Truncated SVD: reducing to {n_components} dimensions")
svd = TruncatedSVD(n_components=n_components, random_state=42)
evidence_tfidf_reduced = svd.fit_transform(evidence_p1_tfidf)
train_tfidf_reduced = svd.transform(train_claim_vectors)

svd_path = os.path.join(data_dir, "tfidf_svd_model.pkl")
joblib.dump(svd, svd_path)
print(f" SVD model saved to: {svd_path}")

# Compute cosine similarity
print("\n Calculating cosine similarity (SVD-reduced TF-IDF)...")
cosine_similarities = cosine_similarity(train_tfidf_reduced, evidence_tfidf_reduced)

# Step 1: Find global max_k to cover all gold evidence
print(" Scanning to find maximum k needed to include all gold evidences...")
ranked_indices_all = np.argsort(-cosine_similarities, axis=1)
max_k = top_k
overflow_claims = []

for i, claim_id in enumerate(tqdm(train_claims_ids, desc="Scanning max k")):
    gold_evids = set(train_data[claim_id].get("evidences", []))
    if not force_include_gold or not gold_evids:
        continue

    ranked_ids = [evidence_ids[j] for j in ranked_indices_all[i]]
    dynamic_k = top_k
    while dynamic_k <= len(ranked_ids):
        if gold_evids.issubset(set(ranked_ids[:dynamic_k])):
            break
        dynamic_k += 1
        if dynamic_k > k_cap:
            overflow_claims.append(claim_id)
            break

    max_k = max(max_k, dynamic_k)

print(f"\n Global max_k needed to include all gold evidence: {max_k}")

# Step 2: Retrieve top max_k for all claims
print(f"\n Retrieving top-{max_k} evidence for each claim...")
top_k_evidence_tfidf = {
    claim_id: ranked_indices_all[i][:max_k].tolist()
    for i, claim_id in enumerate(train_claims_ids)
}

# Step 3: Build claim-evidence dictionary
train_claims_retrieved_tfidf = {
    claim_id: {
        "claim_text": train_p1[claim_id]["claim_text"],
        "claim_label": train_data[claim_id].get("claim_label"),
        "evidences": train_data[claim_id].get("evidences", []),
        "pre_ranked_evidences": [evidence_ids[i] for i in top_k_evidence_tfidf[claim_id]]
    }
    for claim_id in train_claims_ids
}

# Step 4: Save retrieved result
output_path = os.path.join(data_dir, "train_claims_retrieved_tfidf_svd.json")
with open(output_path, "w") as f:
    json.dump(train_claims_retrieved_tfidf, f, indent=2)
print(f"\n Saved SVD-based TF-IDF retrieval results to {output_path}")

k_path = os.path.join(data_dir, "train_claims_final_k_svd.json")
with open(k_path, "w") as f:
    json.dump({"final_k": max_k}, f, indent=2)
print(f" Saved final max_k to: {k_path}")

# Step 5: Analyze cosine threshold at cutoff
print("\n Analyzing cosine similarity thresholds at max_k cutoff...")

cutoff_similarities = []
for i in range(len(train_claims_ids)):
    sims = cosine_similarities[i]
    sorted_sims = np.sort(sims)[::-1]
    if len(sorted_sims) >= max_k:
        cutoff_similarities.append(sorted_sims[max_k - 1])
    else:
        cutoff_similarities.append(sorted_sims[-1])

avg_threshold = np.mean(cutoff_similarities)
min_threshold = np.min(cutoff_similarities)
max_threshold_sim = np.max(cutoff_similarities)

print(f" Final max_k = {max_k}")
print(f" Average similarity threshold at position {max_k}: {avg_threshold:.4f}")
print(f" Min similarity at cutoff: {min_threshold:.4f}")
print(f" Max similarity at cutoff: {max_threshold_sim:.4f}")

# --- Step 6: Report overflow statistics ---
num_overflows = len(overflow_claims)
total_claims = len(train_claims_ids)
percentage = 100 * num_overflows / total_claims

print(f"\n {num_overflows} claims failed to retrieve all gold evidence within k_cap = {k_cap}.")
print(f" This is {percentage:.2f}% of all training claims.")

📥 Loading saved TF-IDF vectors and model...
🔄 Vectorizing claims...


Transforming claims: 100%|██████████| 1228/1228 [00:00<00:00, 2167763.18it/s]


🔧 Applying Truncated SVD: reducing to 200 dimensions





✅ SVD model saved to: /content/drive/MyDrive/NLP_content/tfidf_svd_model.pkl

🔍 Calculating cosine similarity (SVD-reduced TF-IDF)...
📈 Scanning to find maximum k needed to include all gold evidences...


Scanning max k: 100%|██████████| 1228/1228 [09:11<00:00,  2.23it/s]



✅ Global max_k needed to include all gold evidence: 2001

📌 Retrieving top-2001 evidence for each claim...

✅ Saved SVD-based TF-IDF retrieval results to /content/drive/MyDrive/NLP_content/train_claims_retrieved_tfidf_svd.json
📊 Saved final max_k to: /content/drive/MyDrive/NLP_content/train_claims_final_k_svd.json

📊 Analyzing cosine similarity thresholds at max_k cutoff...
📐 Final max_k = 2001
📊 Average similarity threshold at position 2001: 0.6389
📉 Min similarity at cutoff: 0.3706
📈 Max similarity at cutoff: 0.9595

⚠️ 1158 claims failed to retrieve all gold evidence within k_cap = 2000.
📊 This is 94.30% of all training claims.


In [None]:
import os
import json
import numpy as np
import joblib
import scipy.sparse
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

top_k = 4
k_cap = 2000
force_include_gold = True
data_dir = "/content/drive/MyDrive/NLP_content"

# Load BoW vectors and vectorizer
print(" Loading saved BoW vectors and model...")
evidence_p1_bow = scipy.sparse.load_npz(os.path.join(data_dir, "evidence_p1_bow.npz"))
train_p1_bow = scipy.sparse.load_npz(os.path.join(data_dir, "train_p1_bow.npz"))
bow_vectorizer = joblib.load(os.path.join(data_dir, "bow_vectorizer.pkl"))

# Reload if interrupt
train_p1 = json.load(open(os.path.join(data_dir, "train-claims-preprocessed1.json")))
evidence_p1 = json.load(open(os.path.join(data_dir, "evidence-preprocessed1.json")))
train_data = json.load(open(os.path.join(data_dir, "train-claims.json")))

train_claims_ids = list(train_p1.keys())
evidence_ids = list(evidence_p1.keys())

# Step 1: Compute cosine similarity
print("\n Calculating cosine similarity (BoW)...")
bow_cosine_similarities = cosine_similarity(train_p1_bow, evidence_p1_bow)

# Step 2: Find global max_k to cover all gold evidence
print(" Scanning to find maximum k needed to include all gold evidences...")
ranked_indices_all_bow = np.argsort(-bow_cosine_similarities, axis=1)
max_k_bow = top_k
overflow_claims_bow = []

for i, claim_id in enumerate(tqdm(train_claims_ids, desc="Scanning max k (BoW)")):
    gold_evids = set(train_data[claim_id].get("evidences", []))
    if not force_include_gold or not gold_evids:
        continue

    ranked_ids = [evidence_ids[j] for j in ranked_indices_all_bow[i]]
    dynamic_k = top_k
    while dynamic_k <= len(ranked_ids):
        if gold_evids.issubset(set(ranked_ids[:dynamic_k])):
            break
        dynamic_k += 1
        if dynamic_k > k_cap:
            overflow_claims_bow.append(claim_id)
            break

    max_k_bow = max(max_k_bow, dynamic_k)

print(f"\n Global max_k (BoW) needed to include all gold evidence: {max_k_bow}")

# Step 3: Retrieve top max_k for each claim
top_k_evidence_bow = {
    claim_id: ranked_indices_all_bow[i][:max_k_bow].tolist()
    for i, claim_id in enumerate(train_claims_ids)
}

# Step 4: Build final claim-evidence map
train_claims_retrieved_bow = {
    claim_id: {
        "claim_text": train_p1[claim_id]["claim_text"],
        "claim_label": train_data[claim_id].get("claim_label"),
        "evidences": train_data[claim_id].get("evidences", []),
        "pre_ranked_evidences": [evidence_ids[i] for i in top_k_evidence_bow[claim_id]]
    }
    for claim_id in train_claims_ids
}

bow_output_path = os.path.join(data_dir, "train_claims_retrieved_bow.json")
with open(bow_output_path, "w") as f:
    json.dump(train_claims_retrieved_bow, f, indent=2)
print(f"\n Saved BoW retrieval results to {bow_output_path}")

k_bow_path = os.path.join(data_dir, "train_claims_final_k_bow.json")
with open(k_bow_path, "w") as f:
    json.dump({"final_k": max_k_bow}, f, indent=2)
print(f" Saved final max_k (BoW) to: {k_bow_path}")

# Report threshold statistics
print("\n Analyzing cosine similarity thresholds at BoW max_k cutoff...")

cutoff_similarities_bow = []
for i in range(len(train_claims_ids)):
    sims = bow_cosine_similarities[i]
    sorted_sims = np.sort(sims)[::-1]
    if len(sorted_sims) >= max_k_bow:
        cutoff_similarities_bow.append(sorted_sims[max_k_bow - 1])
    else:
        cutoff_similarities_bow.append(sorted_sims[-1])

avg_sim_bow = np.mean(cutoff_similarities_bow)
min_sim_bow = np.min(cutoff_similarities_bow)
max_sim_bow = np.max(cutoff_similarities_bow)

print(f" Final max_k (BoW) = {max_k_bow}")
print(f" Average similarity threshold at cutoff: {avg_sim_bow:.4f}")
print(f" Min similarity: {min_sim_bow:.4f}")
print(f" Max similarity: {max_sim_bow:.4f}")

# Report overflow stats
num_overflows = len(overflow_claims_bow)
total = len(train_claims_ids)
percent = 100 * num_overflows / total

print(f"\n {num_overflows} claims failed to retrieve gold under k_cap={k_cap} using BoW.")
print(f" That’s {percent:.2f}% of all claims.")

with open(os.path.join(data_dir, "overflow_claims_bow.json"), "w") as f:
    json.dump(overflow_claims_bow, f, indent=2)
print(" Saved BoW overflow claim IDs to overflow_claims_bow.json")


📥 Loading saved BoW vectors and model...

🔍 Calculating cosine similarity (BoW)...
📈 Scanning to find maximum k needed to include all gold evidences...


Scanning max k (BoW): 100%|██████████| 1228/1228 [03:28<00:00,  5.90it/s]



✅ Global max_k (BoW) needed to include all gold evidence: 2001

✅ Saved BoW retrieval results to /content/drive/MyDrive/NLP_content/train_claims_retrieved_bow.json
📊 Saved final max_k (BoW) to: /content/drive/MyDrive/NLP_content/train_claims_final_k_bow.json

📊 Analyzing cosine similarity thresholds at BoW max_k cutoff...
📐 Final max_k (BoW) = 2001
📊 Average similarity threshold at cutoff: 0.2076
📉 Min similarity: 0.0000
📈 Max similarity: 0.4714

⚠️ 879 claims failed to retrieve gold under k_cap=2000 using BoW.
📊 That’s 71.58% of all claims.
📝 Saved BoW overflow claim IDs to overflow_claims_bow.json


In [None]:
import os
import json
import numpy as np
import scipy.sparse
import joblib
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
n_components = 200
fixed_k_values = [5000, 10000, 15000, 20000, 25000, 50000, ]

# Load necessary files
train_data = json.load(open(os.path.join(data_dir, "train-claims.json")))
train_p1 = json.load(open(os.path.join(data_dir, "train-claims-preprocessed1.json")))
evidence_p1 = json.load(open(os.path.join(data_dir, "evidence-preprocessed1.json")))

train_claims_ids = list(train_p1.keys())
evidence_ids = list(evidence_p1.keys())

# Load vectorizer and matrix
tfidf_vectorizer = joblib.load(os.path.join(data_dir, "tfidf_vectorizer.pkl"))
evidence_p1_tfidf = scipy.sparse.load_npz(os.path.join(data_dir, "evidence_p1_tfidf.npz"))
svd = joblib.load(os.path.join(data_dir, "tfidf_svd_model.pkl"))

# Transform claim vectors and reduce dimensions
print("Vectorizing and reducing claims...")
train_claim_vectors = tfidf_vectorizer.transform(
    [train_p1[cid]["claim_text"] for cid in tqdm(train_claims_ids, desc="Transforming claims")]
)
evidence_tfidf_reduced = svd.transform(evidence_p1_tfidf)
train_tfidf_reduced = svd.transform(train_claim_vectors)

# Compute cosine similarities and sort
print("Calculating cosine similarities...")
cosine_similarities = cosine_similarity(train_tfidf_reduced, evidence_tfidf_reduced)
ranked_indices_all = np.argsort(-cosine_similarities, axis=1)

# Evaluate for each k
coverage_results = []

for fixed_k in fixed_k_values:
    missed = 0
    total = 0

    print(f"\nEvaluating top-{fixed_k} evidence coverage...")
    for i, claim_id in enumerate(tqdm(train_claims_ids, desc=f"Top-{fixed_k}")):
        gold_evidence = set(train_data[claim_id].get("evidences", []))
        if not gold_evidence:
            continue

        retrieved_ids = [evidence_ids[j] for j in ranked_indices_all[i][:fixed_k]]
        if not gold_evidence.issubset(set(retrieved_ids)):
            missed += 1
        total += 1

    coverage_percent = 100 * (1 - missed / total)
    coverage_results.append({
        "tested_k": fixed_k,
        "total_claims_with_gold": total,
        "claims_missing_gold": missed,
        "coverage_percent": round(coverage_percent, 2)
    })

# Output results as DataFrame
coverage_df = pd.DataFrame(coverage_results)
print("\nFixed-K Evidence Coverage Results:")
print(coverage_df.to_string(index=False))


Vectorizing and reducing claims...


Transforming claims: 100%|██████████| 1228/1228 [00:00<00:00, 1740657.42it/s]


Calculating cosine similarities...

Evaluating top-5000 evidence coverage...


Top-5000: 100%|██████████| 1228/1228 [00:04<00:00, 295.28it/s]



Evaluating top-10000 evidence coverage...


Top-10000: 100%|██████████| 1228/1228 [00:08<00:00, 149.35it/s]



Evaluating top-15000 evidence coverage...


Top-15000: 100%|██████████| 1228/1228 [00:12<00:00, 101.79it/s]



Evaluating top-20000 evidence coverage...


Top-20000: 100%|██████████| 1228/1228 [00:17<00:00, 72.11it/s]



Evaluating top-25000 evidence coverage...


Top-25000: 100%|██████████| 1228/1228 [00:20<00:00, 59.37it/s]



Evaluating top-50000 evidence coverage...


Top-50000: 100%|██████████| 1228/1228 [00:38<00:00, 31.94it/s]


Fixed-K Evidence Coverage Results:
 tested_k  total_claims_with_gold  claims_missing_gold  coverage_percent
     5000                    1228                 1092             11.07
    10000                    1228                 1011             17.67
    15000                    1228                  943             23.21
    20000                    1228                  884             28.01
    25000                    1228                  840             31.60
    50000                    1228                  649             47.15





In [None]:
import os
import json
import numpy as np
import scipy.sparse
import joblib
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
fixed_k_values = [5000, 10000, 15000, 20000, 25000, 50000]

# Load necessary files
train_data = json.load(open(os.path.join(data_dir, "train-claims.json")))
train_p1 = json.load(open(os.path.join(data_dir, "train-claims-preprocessed1.json")))
evidence_p1 = json.load(open(os.path.join(data_dir, "evidence-preprocessed1.json")))

train_claims_ids = list(train_p1.keys())
evidence_ids = list(evidence_p1.keys())

# Load BoW vectorizer and matrix
bow_vectorizer = joblib.load(os.path.join(data_dir, "bow_vectorizer.pkl"))
evidence_p1_bow = scipy.sparse.load_npz(os.path.join(data_dir, "evidence_p1_bow.npz"))
train_p1_bow = scipy.sparse.load_npz(os.path.join(data_dir, "train_p1_bow.npz"))

# Compute cosine similarities and sort
print("Calculating cosine similarities (BoW)...")
cosine_similarities = cosine_similarity(train_p1_bow, evidence_p1_bow)
ranked_indices_all = np.argsort(-cosine_similarities, axis=1)

# Evaluate for each k
coverage_results = []

for fixed_k in fixed_k_values:
    missed = 0
    total = 0

    print(f"\nEvaluating top-{fixed_k} evidence coverage (BoW)...")
    for i, claim_id in enumerate(tqdm(train_claims_ids, desc=f"Top-{fixed_k} BoW")):
        gold_evidence = set(train_data[claim_id].get("evidences", []))
        if not gold_evidence:
            continue

        retrieved_ids = [evidence_ids[j] for j in ranked_indices_all[i][:fixed_k]]
        if not gold_evidence.issubset(set(retrieved_ids)):
            missed += 1
        total += 1

    coverage_percent = 100 * (1 - missed / total)
    coverage_results.append({
        "tested_k": fixed_k,
        "total_claims_with_gold": total,
        "claims_missing_gold": missed,
        "coverage_percent": round(coverage_percent, 2)
    })

# Output results as DataFrame
coverage_df = pd.DataFrame(coverage_results)
print("\nFixed-K Evidence Coverage Results (BoW):")
print(coverage_df.to_string(index=False))

Calculating cosine similarities (BoW)...

Evaluating top-5000 evidence coverage (BoW)...


Top-5000 BoW: 100%|██████████| 1228/1228 [00:03<00:00, 313.59it/s]



Evaluating top-10000 evidence coverage (BoW)...


Top-10000 BoW: 100%|██████████| 1228/1228 [00:07<00:00, 163.00it/s]



Evaluating top-15000 evidence coverage (BoW)...


Top-15000 BoW: 100%|██████████| 1228/1228 [00:11<00:00, 109.57it/s]



Evaluating top-20000 evidence coverage (BoW)...


Top-20000 BoW: 100%|██████████| 1228/1228 [00:16<00:00, 74.86it/s]



Evaluating top-25000 evidence coverage (BoW)...


Top-25000 BoW: 100%|██████████| 1228/1228 [00:19<00:00, 63.46it/s]



Evaluating top-50000 evidence coverage (BoW)...


Top-50000 BoW: 100%|██████████| 1228/1228 [00:30<00:00, 39.94it/s]


Fixed-K Evidence Coverage Results (BoW):
 tested_k  total_claims_with_gold  claims_missing_gold  coverage_percent
     5000                    1228                  727             40.80
    10000                    1228                  632             48.53
    15000                    1228                  564             54.07
    20000                    1228                  522             57.49
    25000                    1228                  477             61.16
    50000                    1228                  365             70.28





#### 2.1.2 Hybrid Method - BM25 & MiniLM Bi-Encoder

##### 2.1.2.1. BM25

In [None]:
!apt-get update
!apt-get install -y openjdk-11-jdk
!update-alternatives --install /usr/bin/java java /usr/lib/jvm/java-11-openjdk-amd64/bin/java 1
!update-alternatives --set java /usr/lib/jvm/java-11-openjdk-amd64/bin/java
!java -version

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,676 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,934 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 https://

In [None]:
!pip install pyserini==0.17.0 --no-deps

Collecting pyserini==0.17.0
  Downloading pyserini-0.17.0-py3-none-any.whl.metadata (4.5 kB)
Downloading pyserini-0.17.0-py3-none-any.whl (109.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.5/109.5 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyserini
Successfully installed pyserini-0.17.0


In [None]:
!pip install cython
!pip install onnxruntime
!pip install pyjnius

Collecting onnxruntime
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pac

In [None]:
!pip install faiss-cpu
!pip install faiss-gpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m80.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

In [None]:
!rm -rf indexes/evidence_index
!python -m pyserini.index \
  --collection JsonCollection \
  --input corpus_json \
  --index indexes/evidence_index \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --storePositions --storeDocvectors --storeRaw

pyserini.index is deprecated, please use pyserini.index.lucene.
2025-05-13 03:53:07,649 INFO  [main] index.IndexCollection (IndexCollection.java:645) - Setting log level to INFO
2025-05-13 03:53:07,652 INFO  [main] index.IndexCollection (IndexCollection.java:648) - Starting indexer...
2025-05-13 03:53:07,653 INFO  [main] index.IndexCollection (IndexCollection.java:650) - DocumentCollection path: corpus_json
2025-05-13 03:53:07,653 INFO  [main] index.IndexCollection (IndexCollection.java:651) - CollectionClass: JsonCollection
2025-05-13 03:53:07,653 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Generator: DefaultLuceneDocumentGenerator
2025-05-13 03:53:07,654 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Threads: 4
2025-05-13 03:53:07,654 INFO  [main] index.IndexCollection (IndexCollection.java:654) - Language: en
2025-05-13 03:53:07,654 INFO  [main] index.IndexCollection (IndexCollection.java:655) - Stemmer: porter
2025-05-13 03:53:07,654 INFO  [main

In [None]:
!pip install onnxruntime

Collecting onnxruntime
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m136.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected pac

In [None]:
import os
import json

data_dir = "/content/drive/MyDrive/NLP_content"
evidence_path = os.path.join(data_dir, "evidence-preprocessed2.json")
output_jsonl_dir = "corpus_json"
output_jsonl_file = os.path.join(output_jsonl_dir, "evidence_corpus.jsonl")

os.makedirs(output_jsonl_dir, exist_ok=True)

with open(evidence_path, "r", encoding="utf-8") as f:
    evidence_corpus = json.load(f)

# Convert to Pyserini-compatible JSONL
with open(output_jsonl_file, "w", encoding="utf-8") as out_f:
    for evid, text in evidence_corpus.items():
        doc = {"id": str(evid), "contents": text}
        out_f.write(json.dumps(doc) + "\n")

print(f" Converted {len(evidence_corpus)} evidence entries to Pyserini JSONL format at {output_jsonl_file}")

 Converted 1208827 evidence entries to Pyserini JSONL format at corpus_json/evidence_corpus.jsonl


In [None]:
# Save the evidence BM25 index
import shutil
DATA_DIR = "/content/drive/MyDrive/NLP_content"
shutil.copytree("indexes/evidence_index", f"{DATA_DIR}/indexes/evidence_index")

'/content/drive/MyDrive/NLP_content/indexes/evidence_index'

In [None]:
import os
import json
import numpy as np
from tqdm import tqdm
from itertools import product
from concurrent.futures import ThreadPoolExecutor
from pyserini.search.lucene import LuceneSearcher

# Paths and grid
DATA_DIR = "/content/drive/MyDrive/NLP_content"
INDEX_PATH = "indexes/evidence_index"
SAVE_PATH = os.path.join(DATA_DIR, "bm25_grid_results.json")

PARAM_GRID = {
    "k1": [0.5, 1.0, 1.5, 2.0],
    "b": [0.3, 0.5, 0.7, 0.9],
    "top_k": [100, 500, 1000, 2000, 5000]
}

# Load data
def load_json_file(filename):
    with open(os.path.join(DATA_DIR, filename)) as f:
        return json.load(f)

train_data = load_json_file("train-claims-preprocessed2.json")

# Evaluate BM25
def evaluate_bm25(params):
    k1, b, top_k = params
    try:
        searcher = LuceneSearcher(INDEX_PATH)
        searcher.set_bm25(k1=k1, b=b)

        total_claims = 0
        total_gold = 0
        total_retrieved_gold = 0
        full_coverage_count = 0

        for claim_data in train_data.values():
            gold_evid_ids = set(claim_data.get("evidences", []))
            if not gold_evid_ids:
                continue

            hits = searcher.search(claim_data["claim_text"], top_k)
            retrieved_ids = set(hit.docid for hit in hits)
            matched = gold_evid_ids.intersection(retrieved_ids)

            total_gold += len(gold_evid_ids)
            total_retrieved_gold += len(matched)
            total_claims += 1
            if matched == gold_evid_ids:
                full_coverage_count += 1

        return {
            "k1": k1,
            "b": b,
            "top_k": top_k,
            "avg_retrieved_gold": round(total_retrieved_gold / total_claims, 2),
            "recall": round(total_retrieved_gold / total_gold * 100, 2),
            "full_coverage_pct": round(full_coverage_count / total_claims * 100, 2)
        }
    except Exception as e:
        return {"k1": k1, "b": b, "top_k": top_k, "error": str(e)}

# Load prior results
results = []
done_configs = set()
if os.path.exists(SAVE_PATH):
    with open(SAVE_PATH, "r") as f:
        results = json.load(f)
        done_configs = {(r["k1"], r["b"], r["top_k"]) for r in results}

# Run parallel search with checkpointing
grid = list(product(PARAM_GRID["k1"], PARAM_GRID["b"], PARAM_GRID["top_k"]))
grid = [params for params in grid if params not in done_configs]

print(f"Running parallel BM25 grid search on {len(grid)} configs...")

with ThreadPoolExecutor(max_workers=4) as executor:
    for result in tqdm(executor.map(evaluate_bm25, grid), total=len(grid)):
        results.append(result)
        with open(SAVE_PATH, "w") as f:
            json.dump(results, f, indent=2)


Running parallel BM25 grid search on 80 configs...


100%|██████████| 80/80 [36:57<00:00, 27.72s/it]


In [None]:
import json

# Load saved results
results_path = "/content/drive/MyDrive/NLP_content/bm25_grid_results.json"
with open(results_path, "r") as f:
    results = json.load(f)

# Filter for top_k = 5000 and sort by recall
results_k5000 = sorted(
    [r for r in results if r["top_k"] == 5000],
    key=lambda x: -x["recall"]
)

# Filter for top_k = 2000 and sort by recall
results_k2000 = sorted(
    [r for r in results if r["top_k"] == 2000],
    key=lambda x: -x["recall"]
)

# Print top 10 for k = 5000
print("Top 10 configurations for top_k = 5000:\n")
for res in results_k5000[:10]:
    print(res)

# Print top 10 for k = 2000
print("\nTop 10 configurations for top_k = 2000:\n")
for res in results_k2000[:10]:
    print(res)

Top 10 configurations for top_k = 5000:

{'k1': 0.5, 'b': 0.3, 'top_k': 5000, 'avg_retrieved_gold': 2.83, 'recall': 84.33, 'full_coverage_pct': 68.0}
{'k1': 0.5, 'b': 0.5, 'top_k': 5000, 'avg_retrieved_gold': 2.83, 'recall': 84.18, 'full_coverage_pct': 67.83}
{'k1': 1.0, 'b': 0.3, 'top_k': 5000, 'avg_retrieved_gold': 2.82, 'recall': 84.16, 'full_coverage_pct': 67.75}
{'k1': 0.5, 'b': 0.7, 'top_k': 5000, 'avg_retrieved_gold': 2.82, 'recall': 84.04, 'full_coverage_pct': 67.18}
{'k1': 1.0, 'b': 0.5, 'top_k': 5000, 'avg_retrieved_gold': 2.82, 'recall': 83.94, 'full_coverage_pct': 67.1}
{'k1': 1.5, 'b': 0.3, 'top_k': 5000, 'avg_retrieved_gold': 2.82, 'recall': 83.89, 'full_coverage_pct': 67.1}
{'k1': 0.5, 'b': 0.9, 'top_k': 5000, 'avg_retrieved_gold': 2.81, 'recall': 83.79, 'full_coverage_pct': 66.69}
{'k1': 1.0, 'b': 0.7, 'top_k': 5000, 'avg_retrieved_gold': 2.81, 'recall': 83.72, 'full_coverage_pct': 66.21}
{'k1': 1.5, 'b': 0.5, 'top_k': 5000, 'avg_retrieved_gold': 2.81, 'recall': 83.67, 

In [None]:
# Mining BM25 Hard Negatives (explicitly sorted by scores)
import os
import json
from pyserini.search.lucene import LuceneSearcher
from tqdm import tqdm

# Constants
DATA_DIR = "/content/drive/MyDrive/NLP_content"
INDEX_PATH = os.path.join(DATA_DIR, "indexes/evidence_index")
INPUT_PATH = os.path.join(DATA_DIR, "train-claims-preprocessed2.json")
OUTPUT_PATH = os.path.join(DATA_DIR, "train-claims-with-negatives-bm25.json")

# BM25 params
k1 = 0.5
b = 0.3
top_k = 5000

# Load training claims
with open(INPUT_PATH, 'r') as f:
    train_data = json.load(f)

# Init searcher
searcher = LuceneSearcher(INDEX_PATH)
searcher.set_bm25(k1=k1, b=b)

# Output
updated_data = {}

# Search and add BM25 evidence + hard negatives with scores
for claim_id, claim_data in tqdm(train_data.items(), desc="Mining BM25 and Hard Negatives"):
    claim_text = claim_data["claim_text"]
    gold_ids = set(claim_data.get("evidences", []))

    hits = searcher.search(claim_text, top_k)

    # Separate matched golds and hard negatives with score
    bm25_evidence = [
        {"id": hit.docid, "score": hit.score}
        for hit in hits if hit.docid in gold_ids
    ]
    hard_negatives = [
        {"id": hit.docid, "score": hit.score}
        for hit in hits if hit.docid not in gold_ids
    ]

    updated_data[claim_id] = {
        "claim_text": claim_text,
        "claim_label": claim_data.get("claim_label", ""),
        "evidences": list(gold_ids),
        "BM25_evidence": bm25_evidence,
        "hard_negative": hard_negatives
    }

# Save to JSON
with open(OUTPUT_PATH, "w") as f:
    json.dump(updated_data, f, indent=2)

print(f" Saved enriched claim data with BM25 scores to: {OUTPUT_PATH}")

Mining BM25 and Hard Negatives: 100%|██████████| 1228/1228 [05:32<00:00,  3.69it/s]


 Saved enriched claim data with BM25 scores to: /content/drive/MyDrive/NLP_content/train-claims-with-negatives-bm25.json


##### 2.1.2.2. MiniLM Bi-Encoder

In [None]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [None]:
!pip install wandb



In [None]:
import os
os.environ["WANDB_API_KEY"] = "d5028d81ac9f90338e2452cbdbb9635c9d506007"
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33meachann1018[0m ([33meachann1018-the-university-of-melbourne[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
!pip install faiss-cpu
!pip install faiss-gpu

[31mERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu[0m[31m
[0m

In [None]:
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import os
import json
import torch
import faiss
import random
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Config and Data Load
data_dir = "/content/drive/MyDrive/NLP_content"
triplet_source_file = os.path.join(data_dir, "train-claims-with-negatives.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_save_path = os.path.join(data_dir, "fine_tuned_dpr_triplet_model")
faiss_index_path = os.path.join(data_dir, "evidence_faiss.index")

with open(triplet_source_file, 'r') as f:
    claim_data = json.load(f)
with open(evidence_file, 'r') as f:
    evidence_corpus = json.load(f)

# Construct Triplet Training Examples
triplet_examples = []
for cid, item in tqdm(claim_data.items(), desc="Constructing triplets"):
    claim_text = item["claim_text"]
    bm25_evidence = item.get("BM25_evidence", [])
    hard_negatives = item.get("hard_negative", [])

    if not bm25_evidence or not hard_negatives:
        continue

    for pos_id in bm25_evidence:
        if pos_id not in evidence_corpus:
            continue
        pos_text = evidence_corpus[pos_id]

        sampled_negs = random.sample(hard_negatives, min(3, len(hard_negatives)))
        for neg_id in sampled_negs:
            if neg_id not in evidence_corpus:
                continue
            neg_text = evidence_corpus[neg_id]
            triplet_examples.append(InputExample(texts=[claim_text, pos_text, neg_text]))

print(f"Prepared {len(triplet_examples)} triplet examples.")

# Train with TripletLoss
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

train_dataloader = DataLoader(triplet_examples, shuffle=True, batch_size=8)
train_loss = losses.TripletLoss(
    model=model,
    distance_metric=losses.TripletDistanceMetric.COSINE,
    triplet_margin=0.3
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=4,
    optimizer_params={'lr': 2e-5},
    warmup_steps=300,
    show_progress_bar=True,
    output_path=model_save_path
)

print(f"Model saved to: {model_save_path}")

# Build and Save FAISS Index for Dense Retrieval
model = SentenceTransformer(model_save_path)
embedding_dim = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatIP(embedding_dim)
evid_ids = list(evidence_corpus.keys())
evid_texts = list(evidence_corpus.values())

print("Encoding evidence for FAISS index...")
evid_embeddings = model.encode(
    evid_texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True
)
index.add(np.array(evid_embeddings).astype('float32'))
faiss.write_index(index, faiss_index_path)

print(f"FAISS index built with {index.ntotal} documents and saved to: {faiss_index_path}")


Constructing triplets: 100%|██████████| 1228/1228 [00:00<00:00, 13728.07it/s]


Prepared 10428 triplet examples.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss
500,0.0521
1000,0.0353
1500,0.0259
2000,0.0161
2500,0.0146
3000,0.0094
3500,0.008
4000,0.0069
4500,0.0044
5000,0.0043


Model saved to: /content/drive/MyDrive/NLP_content/fine_tuned_dpr_triplet_model
Encoding evidence for FAISS index...


Batches:   0%|          | 0/18888 [00:00<?, ?it/s]

FAISS index built with 1208827 documents and saved to: /content/drive/MyDrive/NLP_content/evidence_faiss.index


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os
import json

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
train_claim_file = os.path.join(data_dir, "train-claims-preprocessed2.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_path = os.path.join(data_dir, "fine_tuned_dpr_triplet_model")
faiss_index_file = os.path.join(data_dir, "evidence_faiss_.index")

# Load model, FAISS index, and data
model = SentenceTransformer(model_path)
index = faiss.read_index(faiss_index_file)

with open(train_claim_file, 'r') as f:
    train_claims = json.load(f)
with open(evidence_file, 'r') as f:
    evidence_corpus = json.load(f)

evid_ids = list(evidence_corpus.keys())

# Dense retrieval function
def dense_retrieve(claim_text: str, top_k: int = 40):
    query_vec = model.encode([claim_text], normalize_embeddings=True).astype('float32')
    _, I = index.search(query_vec, top_k)
    return [evid_ids[i] for i in I[0]]

# Recall@K and Accuracy@K on Train Set
def evaluate_on_train_set(claims_data, k=40):
    total_claims = 0
    recall_hits = 0
    exact_hits = 0
    total_gold_evids = 0
    matched_gold_evids = 0

    for cid, entry in tqdm(claims_data.items(), desc=f"Evaluating @Top-{k} on train set"):
        claim_text = entry["claim_text"]
        gold_ids = set(entry.get("evidences", []))
        if not gold_ids:
            continue

        retrieved_ids = set(dense_retrieve(claim_text, top_k=k))
        matched = retrieved_ids & gold_ids

        total_claims += 1
        total_gold_evids += len(gold_ids)
        matched_gold_evids += len(matched)

        if matched:
            recall_hits += 1
        if matched == gold_ids:
            exact_hits += 1

    # Compute metrics
    item_level_recall = matched_gold_evids / total_gold_evids if total_gold_evids > 0 else 0
    exact_accuracy = exact_hits / total_claims if total_claims > 0 else 0
    recall_hit_rate = recall_hits / total_claims if total_claims > 0 else 0

    # Report
    print(f"\nTrain Set Evaluation @Top-{k}:")
    print(f"Claim-level Recall: {item_level_recall:.2%} ({matched_gold_evids}/{total_gold_evids} gold evidences matched)")
    print(f"Instance-level Accuracy (all gold matched): {exact_accuracy:.2%} ({exact_hits}/{total_claims} claims)")
    print(f"High-level Recall-hit rate (≥1 gold matched): {recall_hits}/{total_claims} ({recall_hit_rate:.2%})")


# Run evaluation
for k in [5, 50, 100, 500, 1000]:
    evaluate_on_train_set(train_claims, k)


Evaluating @Top-5 on train set: 100%|██████████| 1228/1228 [03:19<00:00,  6.14it/s]



Train Set Evaluation @Top-5:
Claim-level Recall: 24.48% (1009/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 8.47% (104/1228 claims)
High-level Recall-hit rate (≥1 gold matched): 680/1228 (55.37%)


Evaluating @Top-50 on train set: 100%|██████████| 1228/1228 [03:18<00:00,  6.18it/s]



Train Set Evaluation @Top-50:
Claim-level Recall: 59.70% (2461/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 35.50% (436/1228 claims)
High-level Recall-hit rate (≥1 gold matched): 1053/1228 (85.75%)


Evaluating @Top-100 on train set: 100%|██████████| 1228/1228 [03:18<00:00,  6.19it/s]



Train Set Evaluation @Top-100:
Claim-level Recall: 69.84% (2879/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 46.25% (568/1228 claims)
High-level Recall-hit rate (≥1 gold matched): 1114/1228 (90.72%)


Evaluating @Top-500 on train set: 100%|██████████| 1228/1228 [03:17<00:00,  6.21it/s]



Train Set Evaluation @Top-500:
Claim-level Recall: 86.73% (3575/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 72.07% (885/1228 claims)
High-level Recall-hit rate (≥1 gold matched): 1190/1228 (96.91%)


Evaluating @Top-1000 on train set: 100%|██████████| 1228/1228 [03:19<00:00,  6.16it/s]


Train Set Evaluation @Top-1000:
Claim-level Recall: 91.36% (3766/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 81.84% (1005/1228 claims)
High-level Recall-hit rate (≥1 gold matched): 1203/1228 (97.96%)





In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os
import json

data_dir = "/content/drive/MyDrive/NLP_content"
train_claim_file = os.path.join(data_dir, "train-claims-preprocessed2.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_path = os.path.join(data_dir, "fine_tuned_dpr_triplet_model")
faiss_index_file = os.path.join(data_dir, "evidence_faiss_minilm.index")
output_file = os.path.join(data_dir, "train-claims-pre-ranked-minilm.json")

# Load model, FAISS index, and data
model = SentenceTransformer(model_path)
index = faiss.read_index(faiss_index_file)

with open(train_claim_file, 'r') as f:
    train_claims = json.load(f)
with open(evidence_file, 'r') as f:
    evidence_corpus = json.load(f)

evid_ids = list(evidence_corpus.keys())

# Dense retrieval function
def dense_retrieve(claim_text: str, top_k: int = 100):
    query_vec = model.encode([claim_text], normalize_embeddings=True).astype('float32')
    D, I = index.search(query_vec, top_k)
    return [(evid_ids[i], float(D[0][idx])) for idx, i in enumerate(I[0])]

# Evaluation and export
def evaluate_and_export(claims_data, k=100):
    total_claims = 0
    recall_hits = 0
    exact_hits = 0
    total_gold_evids = 0
    matched_gold_evids = 0
    output_with_retrieval = {}

    for cid, entry in tqdm(claims_data.items(), desc=f"Evaluating & Saving @Top-{k}"):
        claim_text = entry["claim_text"]
        gold_ids = set(entry.get("evidences", []))
        if not gold_ids:
            continue

        retrieved = dense_retrieve(claim_text, top_k=k)
        retrieved_ids = [eid for eid, _ in retrieved]
        retrieved_set = set(retrieved_ids)
        matched = retrieved_set & gold_ids

        total_claims += 1
        total_gold_evids += len(gold_ids)
        matched_gold_evids += len(matched)

        if matched:
            recall_hits += 1
        if matched == gold_ids:
            exact_hits += 1

        output_with_retrieval[cid] = {
            "claim_text": claim_text,
            "claim_label": entry.get("claim_label", ""),
            "evidences": list(gold_ids),
            "re_ranked_evidence": retrieved_ids,
            "re_ranked_scores": [round(score, 5) for _, score in retrieved]
        }

    # Metrics
    item_level_recall = matched_gold_evids / total_gold_evids if total_gold_evids > 0 else 0
    exact_accuracy = exact_hits / total_claims if total_claims > 0 else 0
    recall_hit_rate = recall_hits / total_claims if total_claims > 0 else 0

    print(f"\nTrain Set Evaluation @Top-{k}:")
    print(f"Claim-level Recall: {item_level_recall:.2%} ({matched_gold_evids}/{total_gold_evids})")
    print(f"Instance-level Accuracy (all gold matched): {exact_accuracy:.2%} ({exact_hits}/{total_claims})")
    print(f"Recall-hit rate (≥1 gold matched): {recall_hits}/{total_claims} ({recall_hit_rate:.2%})")

    # Export to file
    with open(output_file, 'w') as f_out:
        json.dump(output_with_retrieval, f_out, indent=2)
    print(f"Output written to: {output_file}")

evaluate_and_export(train_claims, k=100)

Evaluating & Saving @Top-100: 100%|██████████| 1228/1228 [03:20<00:00,  6.13it/s]



Train Set Evaluation @Top-100:
Claim-level Recall: 69.84% (2879/4122)
Instance-level Accuracy (all gold matched): 46.25% (568/1228)
Recall-hit rate (≥1 gold matched): 1114/1228 (90.72%)
Output written to: /content/drive/MyDrive/NLP_content/train-claims-pre-ranked-minilm.json


##### 2.1.2.3 RoBERTa-base DPR Bi-Encoder

In [None]:
# RoBERTa-base DPR Bi-Encoder Training with TripletLoss and Dynamic Epochs, Early Stopping

from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.util import batch_to_device
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch
import os
import json
import math
import random
import faiss
import numpy as np

# Config and File Paths
data_dir = "/content/drive/MyDrive/NLP_content"
triplet_source_file = os.path.join(data_dir, "train-claims-with-negatives-bm25.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_save_path = os.path.join(data_dir, "roberta_dpr_biencoder")
faiss_index_path = os.path.join(data_dir, "roberta_faiss.index")

# Load claim/evidence data
with open(triplet_source_file, 'r') as f:
    claim_data = json.load(f)
with open(evidence_file, 'r') as f:
    evidence_corpus = json.load(f)

# Build Triplet Examples
triplet_examples = []
for cid, item in tqdm(claim_data.items(), desc="Constructing triplets"):
    claim_text = item["claim_text"]
    bm25_evidence = item.get("BM25_evidence", [])
    hard_negatives = item.get("hard_negative", [])

    if not bm25_evidence or not hard_negatives:
        continue

    for pos_item in bm25_evidence:
        pos_id = pos_item.get("id") if isinstance(pos_item, dict) else pos_item
        if not pos_id or pos_id not in evidence_corpus:
            continue
        pos_text = evidence_corpus[pos_id]

        sampled_negs = random.sample(hard_negatives, min(8, len(hard_negatives)))
        for neg_item in sampled_negs:
            neg_id = neg_item.get("id") if isinstance(neg_item, dict) else neg_item
            if not neg_id or neg_id not in evidence_corpus:
                continue
            neg_text = evidence_corpus[neg_id]

            triplet_examples.append(InputExample(texts=[claim_text, pos_text, neg_text]))

print(f"Prepared {len(triplet_examples)} RoBERTa triplet examples.")

# Initialize RoBERTa bi-encoder
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('sentence-transformers/roberta-base-nli-mean-tokens', device=device)

# Custom Training Loop with Gradient Accumulation and Early Stopping
batch_size = 8
accumulation_steps = 8
train_dataloader = DataLoader(
    triplet_examples,
    shuffle=True,
    batch_size=batch_size,
    collate_fn=model.smart_batching_collate  # keep this
)

# Dynamic epoch calculation
desired_updates = 10000
steps_per_epoch = len(train_dataloader) // accumulation_steps
epochs = math.ceil(desired_updates / steps_per_epoch)
print(f"Training for {epochs} dynamically calculated epochs")

loss_fn = losses.TripletLoss(model=model, distance_metric=losses.TripletDistanceMetric.COSINE, triplet_margin=0.3)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Early stopping config
patience = 3
best_loss = float('inf')
early_stop_counter = 0
max_epochs = 10

model.train()
for epoch in range(min(epochs, max_epochs)):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    optimizer.zero_grad()
    epoch_loss = 0

    for step, batch in enumerate(tqdm(train_dataloader, desc="Training")):
        features, labels = batch
        features = [batch_to_device(f, model.device) for f in features]

        loss = loss_fn(features, labels)
        epoch_loss += loss.item()
        loss = loss / accumulation_steps
        loss.backward()

        if (step + 1) % accumulation_steps == 0 or (step + 1) == len(train_dataloader):
            optimizer.step()
            optimizer.zero_grad()

    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Average epoch loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        best_loss = avg_loss
        early_stop_counter = 0
        model.save(model_save_path)
        print(f"Model improved and saved to {model_save_path}")
    else:
        early_stop_counter += 1
        print(f"No improvement. Early stop counter: {early_stop_counter}/{patience}")
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break


# Build FAISS index
model = SentenceTransformer(model_save_path)
embedding_dim = model.get_sentence_embedding_dimension()
index = faiss.IndexFlatIP(embedding_dim)
evid_ids = list(evidence_corpus.keys())
evid_texts = list(evidence_corpus.values())

print("Encoding evidence for FAISS index...")
evid_embeddings = model.encode(
    evid_texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True
)
index.add(np.array(evid_embeddings).astype('float32'))
faiss.write_index(index, faiss_index_path)
print(f"FAISS index saved to {faiss_index_path} with {index.ntotal} documents.")

Constructing triplets: 100%|██████████| 1228/1228 [00:00<00:00, 14729.65it/s]


Prepared 27808 RoBERTa triplet examples.
Training for 24 dynamically calculated epochs

Epoch 1/24


Training: 100%|██████████| 3476/3476 [05:57<00:00,  9.73it/s]


Average epoch loss: 0.0379
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 2/24


Training: 100%|██████████| 3476/3476 [05:56<00:00,  9.74it/s]


Average epoch loss: 0.0125
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 3/24


Training: 100%|██████████| 3476/3476 [05:57<00:00,  9.72it/s]


Average epoch loss: 0.0064
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 4/24


Training: 100%|██████████| 3476/3476 [05:56<00:00,  9.75it/s]


Average epoch loss: 0.0038
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 5/24


Training: 100%|██████████| 3476/3476 [05:55<00:00,  9.77it/s]


Average epoch loss: 0.0023
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 6/24


Training: 100%|██████████| 3476/3476 [05:56<00:00,  9.76it/s]


Average epoch loss: 0.0018
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 7/24


Training: 100%|██████████| 3476/3476 [05:55<00:00,  9.77it/s]


Average epoch loss: 0.0014
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 8/24


Training: 100%|██████████| 3476/3476 [05:54<00:00,  9.81it/s]


Average epoch loss: 0.0010
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 9/24


Training: 100%|██████████| 3476/3476 [05:53<00:00,  9.84it/s]


Average epoch loss: 0.0010
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder

Epoch 10/24


Training: 100%|██████████| 3476/3476 [05:54<00:00,  9.80it/s]


Average epoch loss: 0.0008
Model improved and saved to /content/drive/MyDrive/NLP_content/roberta_dpr_biencoder
Encoding evidence for FAISS index...


Batches:   0%|          | 0/18888 [00:00<?, ?it/s]

FAISS index saved to /content/drive/MyDrive/NLP_content/roberta_faiss.index with 1208827 documents.


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os
import json

# Config for RoBERTa DPR
data_dir = "/content/drive/MyDrive/NLP_content"
train_claim_file = os.path.join(data_dir, "train-claims-preprocessed2.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_path = os.path.join(data_dir, "roberta_dpr_biencoder")
faiss_index_file = os.path.join(data_dir, "roberta_faiss.index")

# Load model, FAISS index, and data
model = SentenceTransformer(model_path)
index = faiss.read_index(faiss_index_file)

with open(train_claim_file, 'r') as f:
    train_claims = json.load(f)
with open(evidence_file, 'r') as f:
    evidence_corpus = json.load(f)

evid_ids = list(evidence_corpus.keys())

# Dense retrieval function
def dense_retrieve(claim_text: str, top_k: int = 40):
    query_vec = model.encode([claim_text], normalize_embeddings=True).astype('float32')
    _, I = index.search(query_vec, top_k)
    return [evid_ids[i] for i in I[0]]

# Evaluation on Train Set
def evaluate_on_train_set(claims_data, k=40):
    total_claims = 0
    recall_hits = 0
    exact_hits = 0
    total_gold_evids = 0
    matched_gold_evids = 0

    for cid, entry in tqdm(claims_data.items(), desc=f"Evaluating @Top-{k} on train set"):
        claim_text = entry["claim_text"]
        gold_ids = set(entry.get("evidences", []))
        if not gold_ids:
            continue

        retrieved_ids = set(dense_retrieve(claim_text, top_k=k))
        matched = retrieved_ids & gold_ids

        total_claims += 1
        total_gold_evids += len(gold_ids)
        matched_gold_evids += len(matched)

        if matched:
            recall_hits += 1
        if matched == gold_ids:
            exact_hits += 1

    # Metrics
    item_level_recall = matched_gold_evids / total_gold_evids if total_gold_evids > 0 else 0
    exact_accuracy = exact_hits / total_claims if total_claims > 0 else 0
    recall_hit_rate = recall_hits / total_claims if total_claims > 0 else 0

    # Report
    print(f"\n[RoBERTa DPR] Train Set Evaluation @Top-{k}:")
    print(f"Claim-level Recall: {item_level_recall:.2%} ({matched_gold_evids}/{total_gold_evids} gold evidences matched)")
    print(f"Instance-level Accuracy (all gold matched): {exact_accuracy:.2%} ({exact_hits}/{total_claims} claims)")
    print(f"Recall-hit rate (≥1 gold matched): {recall_hits}/{total_claims} ({recall_hit_rate:.2%})")

# Run evaluation
for k in [5, 50, 100, 500, 1000]:
    evaluate_on_train_set(train_claims, k)


Evaluating @Top-5 on train set: 100%|██████████| 1228/1228 [04:33<00:00,  4.50it/s]



[RoBERTa DPR] Train Set Evaluation @Top-5:
Claim-level Recall: 34.28% (1413/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 12.79% (157/1228 claims)
Recall-hit rate (≥1 gold matched): 872/1228 (71.01%)


Evaluating @Top-50 on train set: 100%|██████████| 1228/1228 [04:31<00:00,  4.52it/s]



[RoBERTa DPR] Train Set Evaluation @Top-50:
Claim-level Recall: 66.98% (2761/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 41.21% (506/1228 claims)
Recall-hit rate (≥1 gold matched): 1124/1228 (91.53%)


Evaluating @Top-100 on train set: 100%|██████████| 1228/1228 [04:34<00:00,  4.47it/s]



[RoBERTa DPR] Train Set Evaluation @Top-100:
Claim-level Recall: 75.42% (3109/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 52.69% (647/1228 claims)
Recall-hit rate (≥1 gold matched): 1154/1228 (93.97%)


Evaluating @Top-500 on train set: 100%|██████████| 1228/1228 [04:35<00:00,  4.46it/s]



[RoBERTa DPR] Train Set Evaluation @Top-500:
Claim-level Recall: 88.91% (3665/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 76.30% (937/1228 claims)
Recall-hit rate (≥1 gold matched): 1201/1228 (97.80%)


Evaluating @Top-1000 on train set: 100%|██████████| 1228/1228 [04:36<00:00,  4.44it/s]


[RoBERTa DPR] Train Set Evaluation @Top-1000:
Claim-level Recall: 91.63% (3777/4122 gold evidences matched)
Instance-level Accuracy (all gold matched): 82.33% (1011/1228 claims)
Recall-hit rate (≥1 gold matched): 1210/1228 (98.53%)





In [None]:
# Retrieve the chosen k = 100
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os
import json

data_dir = "/content/drive/MyDrive/NLP_content"
train_claim_file = os.path.join(data_dir, "train-claims-preprocessed2.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_path = os.path.join(data_dir, "roberta_dpr_biencoder")
faiss_index_file = os.path.join(data_dir, "roberta_faiss.index")
output_path = os.path.join(data_dir, "train-claims-pre-ranked-roberta.json")

model = SentenceTransformer(model_path)
index = faiss.read_index(faiss_index_file)

with open(train_claim_file, 'r') as f:
    train_claims = json.load(f)
with open(evidence_file, 'r') as f:
    evidence_corpus = json.load(f)

evid_ids = list(evidence_corpus.keys())

# Dense retrieval function
def dense_retrieve_with_scores(claim_text: str, top_k: int = 100):
    query_vec = model.encode([claim_text], normalize_embeddings=True).astype('float32')
    scores, indices = index.search(query_vec, top_k)
    result = [(evid_ids[i], float(scores[0][idx])) for idx, i in enumerate(indices[0])]
    return result

# Main retrieval
def attach_top_k_retrieval(claims_data, k=100, output_file=output_path):
    output_with_retrieval = {}

    for cid, entry in tqdm(claims_data.items(), desc=f"Retrieving Top-{k} evidences"):
        claim_text = entry["claim_text"]
        gold_ids = set(entry.get("evidences", []))
        retrieved = dense_retrieve_with_scores(claim_text, top_k=k)

        retrieved_ids_ordered = [docid for docid, _ in retrieved]
        retrieved_scores = [round(score, 5) for _, score in retrieved]

        output_with_retrieval[cid] = {
            "claim_text": claim_text,
            "claim_label": entry.get("claim_label", ""),
            "evidences": list(gold_ids),
            "re_ranked_evidence": retrieved_ids_ordered,
            "re_ranked_scores": retrieved_scores
        }

    with open(output_file, 'w') as f:
        json.dump(output_with_retrieval, f, indent=2)
    print(f"\nSaved retrieval output to: {output_file}")

attach_top_k_retrieval(train_claims, k=100)


Retrieving Top-100 evidences: 100%|██████████| 1228/1228 [06:35<00:00,  3.11it/s]



Saved retrieval output to: /content/drive/MyDrive/NLP_content/train-claims-pre-ranked-roberta.json


In [None]:
import os
import json

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
file_path = os.path.join(data_dir, "train-claims-pre-ranked-roberta.json")

# Load JSON
with open(file_path, 'r') as f:
    data = json.load(f)

unique_retrieved_ids = set()
gold_evidence_ids = set()
matched_gold_ids = set()

for entry in data.values():
    pool = entry.get("re_ranked_evidence", [])
    gold = entry.get("evidences", [])

    unique_retrieved_ids.update(pool)
    gold_evidence_ids.update(gold)

    for eid in gold:
        if eid in pool:
            matched_gold_ids.add(eid)

# Output
print(f"Total unique evidence IDs in pre_ranked_pool: {len(unique_retrieved_ids)}")
print(f"Total unique gold evidence IDs: {len(gold_evidence_ids)}")
print(f"Matched gold evidence IDs in pre_ranked_pool: {len(matched_gold_ids)}")
print(f"Coverage of gold evidences: {len(matched_gold_ids) / len(gold_evidence_ids):.4f}")

export_path = os.path.join(data_dir, "unique_retrieved_evidence_ids_bm25.json")
with open(export_path, "w") as f:
    json.dump(sorted(list(unique_retrieved_ids)), f, indent=2)

print(f"Exported {len(unique_retrieved_ids)} unique evidence IDs to: {export_path}")

Total unique evidence IDs in pre_ranked_pool: 25641
Total unique gold evidence IDs: 3121
Matched gold evidence IDs in pre_ranked_pool: 2468
Coverage of gold evidences: 0.7908
Exported 25641 unique evidence IDs to: /content/drive/MyDrive/NLP_content/unique_retrieved_evidence_ids_bm25.json


In [None]:
import os
import json

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
file_path = os.path.join(data_dir, "train-claims-pre-ranked-minilm.json")

# Load JSON
with open(file_path, 'r') as f:
    data = json.load(f)

unique_retrieved_ids = set()
gold_evidence_ids = set()
matched_gold_ids = set()

for entry in data.values():
    pool = entry.get("re_ranked_evidence", [])
    gold = entry.get("evidences", [])

    unique_retrieved_ids.update(pool)
    gold_evidence_ids.update(gold)

    for eid in gold:
        if eid in pool:
            matched_gold_ids.add(eid)

# Output
print(f"Total unique evidence IDs in pre_ranked_pool: {len(unique_retrieved_ids)}")
print(f"Total unique gold evidence IDs: {len(gold_evidence_ids)}")
print(f"Matched gold evidence IDs in pre_ranked_pool: {len(matched_gold_ids)}")
print(f"Coverage of gold evidences: {len(matched_gold_ids) / len(gold_evidence_ids):.4f}")
export_path = os.path.join(data_dir, "unique_retrieved_evidence_ids_minilm.json")
with open(export_path, "w") as f:
    json.dump(sorted(list(unique_retrieved_ids)), f, indent=2)

print(f"Exported {len(unique_retrieved_ids)} unique evidence IDs to: {export_path}")

Total unique evidence IDs in pre_ranked_pool: 28461
Total unique gold evidence IDs: 3121
Matched gold evidence IDs in pre_ranked_pool: 2203
Coverage of gold evidences: 0.7059
Exported 28461 unique evidence IDs to: /content/drive/MyDrive/NLP_content/unique_retrieved_evidence_ids_minilm.json


#### 2.1.3 Hybrid Union - Pre-rank
The choice of is based on the evaluation on the same dev set in 2.1.2 and 2.1.3.

In [None]:
import os
import json

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
bm25_file_path = os.path.join(data_dir, "train-claims-with-negatives-bm25.json")
biencoder_file_path = os.path.join(data_dir, "train-claims-pre-ranked-minilm.json")
output_path = os.path.join(data_dir, "merged_pre_rank_pool.json")

# Load files
with open(bm25_file_path, 'r') as f:
    bm25_data = json.load(f)
with open(biencoder_file_path, 'r') as f:
    biencoder_data = json.load(f)

final_merged_data = {}
total_duplicates = 0
total_pool_length = 0
total_bm25_size = 0
total_dense_size = 0
total_combined_size = 0
total_bm25_unique = 0
total_dense_unique = 0
num_claims = 0

for cid in bm25_data:
    claim_text = bm25_data[cid]["claim_text"]
    claim_label = bm25_data[cid].get("claim_label", "")
    gold_evidences = bm25_data[cid].get("evidences", [])

    # BM25: gold + hard negatives (up to 100)
    bm25_gold = [item["id"] for item in bm25_data[cid].get("BM25_evidence", []) if isinstance(item, dict) and "id" in item]
    num_bm25_gold = len(bm25_gold)
    hard_negatives = [item["id"] for item in bm25_data[cid].get("hard_negative", []) if isinstance(item, dict) and "id" in item]
    needed_negatives = max(0, 100 - num_bm25_gold)
    bm25_pool = bm25_gold + hard_negatives[:needed_negatives]

    # Dense retrieval pool
    dense_raw = biencoder_data.get(cid, {}).get("re_ranked_evidence", [])[:100]
    dense_pool = []
    for ev in dense_raw:
        if isinstance(ev, str):
            dense_pool.append(ev)
        elif isinstance(ev, dict) and "id" in ev:
            dense_pool.append(ev["id"])

    # Combine and deduplicate, tracking contributions
    combined_pool = bm25_pool + dense_pool
    seen = set()
    pre_ranked_pool = []
    bm25_unique = 0
    dense_unique = 0

    for ev in combined_pool:
        if isinstance(ev, str) and ev not in seen:
            seen.add(ev)
            pre_ranked_pool.append(ev)
            if ev in bm25_pool:
                bm25_unique += 1
            if ev in dense_pool:
                dense_unique += 1
        elif isinstance(ev, str):
            total_duplicates += 1

    final_merged_data[cid] = {
        "claim_text": claim_text,
        "claim_label": claim_label,
        "evidences": gold_evidences,
        "pre_ranked_pool": pre_ranked_pool
    }

    # Track sizes and contributions
    total_bm25_size += len(bm25_pool)
    total_dense_size += len(dense_pool)
    total_combined_size += len(combined_pool)
    total_pool_length += len(pre_ranked_pool)
    total_bm25_unique += bm25_unique
    total_dense_unique += dense_unique
    num_claims += 1

# Save merged result
with open(output_path, 'w') as f:
    json.dump(final_merged_data, f, indent=2)

# Compute and print statistics
avg_duplicates = total_duplicates / num_claims if num_claims > 0 else 0
avg_pool_length = total_pool_length / num_claims if num_claims > 0 else 0
avg_bm25_size = total_bm25_size / num_claims if num_claims > 0 else 0
avg_dense_size = total_dense_size / num_claims if num_claims > 0 else 0
avg_combined_size = total_combined_size / num_claims if num_claims > 0 else 0
avg_bm25_unique = total_bm25_unique / num_claims if num_claims > 0 else 0
avg_dense_unique = total_dense_unique / num_claims if num_claims > 0 else 0

print(f"Average BM25 pool size: {avg_bm25_size:.2f}")
print(f"Average dense pool size: {avg_dense_size:.2f}")
print(f"Average combined pool size: {avg_combined_size:.2f}")
print(f"Average BM25 unique items: {avg_bm25_unique:.2f}")
print(f"Average dense unique items: {avg_dense_unique:.2f}")
print(f"Average duplicates per claim: {avg_duplicates:.2f}")
print(f"Average evidence pool length: {avg_pool_length:.2f}")
print(f"Merged pre-rank pool saved: {output_path}")

Average BM25 pool size: 100.00
Average dense pool size: 100.00
Average combined pool size: 200.00
Average BM25 unique items: 100.00
Average dense unique items: 100.00
Average duplicates per claim: 16.24
Average evidence pool length: 183.76
Merged pre-rank pool saved: /content/drive/MyDrive/NLP_content/merged_pre_rank_pool.json


In [None]:
# Evaluation of gold evidence coverage

In [None]:
evaluate_pre_ranked_retrieval(
    data_dir="/content/drive/MyDrive/NLP_content",
    gold_claim_filename="train-claims-preprocessed2.json",
    pre_rank_filename="merged_pre_rank_pool.json"
)

Evaluating merged_pre_rank_pool.json: 100%|██████████| 1228/1228 [00:00<00:00, 70254.05it/s]


--- Evaluation: merged_pre_rank_pool.json ---
Recall@100: 0.8948
Precision@100: 0.0164
F1@100: 0.0321
Accuracy@100 (all gold in top-k): 0.7516





### 2.2 Evidence Retrieval - Re-ranking

#### 2.2.1 Preprocessing for Two-Phase Cross-Encoder


In [None]:
import os
import json
import random
from tqdm import tqdm

# Configuration
data_dir = "/content/drive/MyDrive/NLP_content"
merged_file = os.path.join(data_dir, "merged_pre_rank_pool.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
output_phase1 = os.path.join(data_dir, "training_rerank_phase1.json")
output_phase2 = os.path.join(data_dir, "training_rerank_phase2.json")

# Tuning
NUM_RANDOM_NEGATIVES = 5
NUM_HARD_NEGATIVES = 10  # 5 from BM25 + 5 from Dense

# Load input files
with open(merged_file) as f:
    merged_data = json.load(f)

with open(evidence_file) as f:
    evidence_corpus = json.load(f)

phase1_data = []
phase2_data = []
all_ids = set(evidence_corpus.keys())

# Process with progress bar
for cid, entry in tqdm(merged_data.items(), desc="Processing claims"):
    claim = entry["claim_text"]
    gold_ids = set(entry.get("evidences", []))
    pre_ranked_list = entry.get("pre_ranked_pool", [])

    if not gold_ids:
        continue

    # Gold evidence
    gold_texts = [
        {"text": evidence_corpus[eid], "label": 1}
        for eid in gold_ids if eid in evidence_corpus
    ]

    # Phase 1: random negatives
    possible_randoms = list(all_ids - set(pre_ranked_list) - gold_ids)
    sampled_random_ids = random.sample(possible_randoms, min(NUM_RANDOM_NEGATIVES, len(possible_randoms)))

    random_neg_texts = [
        {"text": evidence_corpus[eid], "label": 0}
        for eid in sampled_random_ids if eid in evidence_corpus
    ]

    phase1_data.append({
        "claim": claim,
        "candidates": gold_texts + random_neg_texts
    })

    # Phase 2: hard negatives split from BM25 and Dense
    filtered_hard_ids = [eid for eid in pre_ranked_list if eid not in gold_ids]
    hard_candidate_ids = filtered_hard_ids[24:183]  # Rank 25–183

    # Split assumed BM25/Dense
    split_index = 92  # Fixed half of 184
    bm25_range = hard_candidate_ids[:split_index - 24]
    dense_range = hard_candidate_ids[split_index - 24:]

    bm25_samples = random.sample(bm25_range, min(5, len(bm25_range)))
    dense_samples = random.sample(dense_range, min(5, len(dense_range)))
    sampled_hard_ids = bm25_samples + dense_samples

    hard_neg_texts = [
        {"text": evidence_corpus[eid], "label": 0}
        for eid in sampled_hard_ids if eid in evidence_corpus
    ]

    if len(phase1_data) == 1:
        print(f"\nSample claim {cid}:")
        print(f"  Gold IDs: {gold_ids}")
        print(f"  Sampled BM25 IDs: {bm25_samples[:3]}")
        print(f"  Sampled Dense IDs: {dense_samples[:3]}")
        print(f"  Hard negative text (1st): {hard_neg_texts[0] if hard_neg_texts else 'None'}")

    phase2_data.append({
        "claim": claim,
        "candidates": gold_texts + hard_neg_texts
    })

# Save output files
with open(output_phase1, 'w') as f:
    json.dump(phase1_data, f, indent=2)
print(f"\nSaved Phase 1 (gold + random) to: {output_phase1}")

with open(output_phase2, 'w') as f:
    json.dump(phase2_data, f, indent=2)
print(f"Saved Phase 2 (gold + hard) to: {output_phase2}")

# Stats
avg_phase1 = sum(len(d['candidates']) for d in phase1_data) / len(phase1_data) if phase1_data else 0
avg_phase2 = sum(len(d['candidates']) for d in phase2_data) / len(phase2_data) if phase2_data else 0
print(f"Phase 1: {len(phase1_data)} claims, avg {avg_phase1:.2f} candidates")
print(f"Phase 2: {len(phase2_data)} claims, avg {avg_phase2:.2f} candidates")


Processing claims:   0%|          | 1/1228 [00:00<07:29,  2.73it/s]


Sample claim claim-1937:
  Gold IDs: {'evidence-442946', 'evidence-12171', 'evidence-1194317'}
  Sampled BM25 IDs: ['evidence-36224', 'evidence-1008043', 'evidence-373200']
  Sampled Dense IDs: ['evidence-881617', 'evidence-631684', 'evidence-761183']
  Hard negative text (1st): {'text': 'united states electric power plants emit 2.4 billion tons carbon dioxide carbon dioxide year roughly 40 percent nations total emissions', 'label': 0}


Processing claims: 100%|██████████| 1228/1228 [06:20<00:00,  3.23it/s]



Saved Phase 1 (gold + random) to: /content/drive/MyDrive/NLP_content/training_rerank_phase1.json
Saved Phase 2 (gold + hard) to: /content/drive/MyDrive/NLP_content/training_rerank_phase2.json
Phase 1: 1228 claims, avg 8.36 candidates
Phase 2: 1228 claims, avg 13.36 candidates


In [None]:
# Validate rerank two-phase training data
import json
import os

data_dir = "/content/drive/MyDrive/NLP_content"
phase1_path = os.path.join(data_dir, "training_rerank_phase1.json")
phase2_path = os.path.join(data_dir, "training_rerank_phase2.json")

for file_path in [phase1_path, phase2_path]:
    try:
        with open(file_path, 'r') as f:
            json.load(f)
        print(f"{file_path} is valid JSON")
    except json.JSONDecodeError as e:
        print(f"{file_path} is invalid JSON: {e}")
    except FileNotFoundError:
        print(f"{file_path} not found")

/content/drive/MyDrive/NLP_content/training_rerank_phase1.json is valid JSON
/content/drive/MyDrive/NLP_content/training_rerank_phase2.json is valid JSON


In [None]:
#Class Imbalance
from collections import Counter

data_dir = "/content/drive/MyDrive/NLP_content"
phase1_path = os.path.join(data_dir, "training_rerank_phase1.json")
phase2_path = os.path.join(data_dir, "training_rerank_phase2.json")

def compute_label_distribution(*datasets):
    counter = Counter()
    for data in datasets:
        for entry in data:
            for cand in entry["candidates"]:
                counter[cand["label"]] += 1
    return counter

# Load and compute
with open(phase1_path) as f1, open(phase2_path) as f2:
    phase1 = json.load(f1)
    phase2 = json.load(f2)

label_counts = compute_label_distribution(phase1, phase2)
total = sum(label_counts.values())
class_weights = [total / label_counts[i] for i in sorted(label_counts)]

print("Class counts:", label_counts)
print("Class weights:", class_weights)

#### 2.2.2 MiniLM Cross-Encoder

In [None]:
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [None]:
import os
import json
from datasets import Dataset
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from tqdm import tqdm
import numpy as np
import torch
import datasets

# # Verify versions
# print("NumPy version:", np.__version__)
# print("Transformers version:", transformers.__version__)
# print("Datasets version:", datasets.__version__)

# Configuration
data_dir = "/content/drive/MyDrive/NLP_content"
phase1_path = os.path.join(data_dir, "training_rerank_phase1.json")
phase2_path = os.path.join(data_dir, "training_rerank_phase2.json")
pre_rank_file = os.path.join(data_dir, "merged_pre_rank_pool.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

def load_train_pairs_json_array(phase_path):
    result = []
    try:
        with open(phase_path, 'r') as f:
            entries = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error loading {phase_path}: {e}")
        raise
    except FileNotFoundError:
        print(f"{phase_path} not found")
        raise
    for entry in entries:
        claim = entry.get("claim") or entry.get("claim_text", "")
        candidates = entry.get("candidates", [])
        if not claim or not candidates:
            print(f"Skipping invalid entry in {phase_path}: claim={claim}, candidates={len(candidates)}")
            continue
        for cand in candidates:
            text = cand.get("text", "")
            label = cand.get("label", 0)
            if text:
                result.append((claim, text, label))
    return result

# Load and validate data
print("Loading Phase 1 data...")
train_pairs_phase1 = load_train_pairs_json_array(phase1_path)
print("Loading Phase 2 data...")
train_pairs_phase2 = load_train_pairs_json_array(phase2_path)

print("First Phase 1 tuple:", train_pairs_phase1[0] if train_pairs_phase1 else "No tuples")
print("First Phase 2 tuple:", train_pairs_phase2[0] if train_pairs_phase2 else "No tuples")

print("Converting to Hugging Face datasets...")
train_dataset_phase1 = Dataset.from_list([
    {"claim": c, "evidence": e, "label": float(lbl)} for (c, e, lbl) in tqdm(train_pairs_phase1)
])
train_dataset_phase2 = Dataset.from_list([
    {"claim": c, "evidence": e, "label": float(lbl)} for (c, e, lbl) in tqdm(train_pairs_phase2)
])

print(f"Phase 1 examples: {len(train_dataset_phase1)}")
print(f"Phase 2 examples: {len(train_dataset_phase2)}")
print("First Phase 1 item:", train_dataset_phase1[0] if train_dataset_phase1 else "No data")
print("Phase 1 keys:", list(train_dataset_phase1[0].keys()) if train_dataset_phase1 else "No keys")
print("Dataset features:", train_dataset_phase1.features if train_dataset_phase1 else "No features")

# Initialize model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

def tokenize_function(examples):
    encodings = tokenizer(examples["claim"], examples["evidence"], padding='max_length', truncation=True, max_length=256, return_tensors="pt")
    return {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "label": torch.tensor(examples["label"], dtype=torch.float)
    }

# Tokenize datasets
print("Tokenizing datasets...")
train_dataset_phase1 = train_dataset_phase1.map(tokenize_function, batched=True, remove_columns=["claim", "evidence"])
train_dataset_phase2 = train_dataset_phase2.map(tokenize_function, batched=True, remove_columns=["claim", "evidence"])

# Set format for training
train_dataset_phase1.set_format("torch", columns=["input_ids", "attention_mask", "label"])
train_dataset_phase2.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Verify dataset
print("Post-tokenization Phase 1 item:", train_dataset_phase1[0])

# Phase 1 training
training_args_phase1 = TrainingArguments(
    output_dir="reranker_model_phase1",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    warmup_steps=100,
    logging_steps=50,
    logging_dir="logs_phase1",
    save_strategy="no",
    fp16=True
)

trainer_phase1 = Trainer(
    model=model,
    args=training_args_phase1,
    train_dataset=train_dataset_phase1,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

print("\nTraining Phase 1: Gold vs Random Negatives...")
trainer_phase1.train()
print("Saving model after Phase 1...")
trainer_phase1.save_model("reranker_model_phase1_final")

# Phase 2 training
training_args_phase2 = TrainingArguments(
    output_dir="reranker_model_phase2",
    overwrite_output_dir=True,
    eval_strategy="no",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    warmup_steps=50,
    logging_steps=50,
    logging_dir="logs_phase2",
    save_strategy="no",
    fp16=True
)

trainer_phase2 = Trainer(
    model=model,
    args=training_args_phase2,
    train_dataset=train_dataset_phase2,
    processing_class=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

print("\nTraining Phase 2: Gold vs Hard Negatives...")
trainer_phase2.train()
print("Saving final model after Phase 2...")
trainer_phase2.save_model("reranker_model_final_minilm1")

# Evaluation
print("\nEvaluating Full Metrics on Training Set...")
try:
    with open(pre_rank_file) as f:
        rerank_pool = json.load(f)
    with open(evidence_file) as f:
        evidence_corpus = json.load(f)
except json.JSONDecodeError as e:
    print(f"Error loading evaluation files: {e}")
    raise
except FileNotFoundError as e:
    print(f"Evaluation file not found: {e}")
    raise

model.eval()
recalls = []
precisions = []
f_scores = []
task4_data = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for cid, entry in tqdm(rerank_pool.items(), desc="Scoring candidates"):
    claim_text = entry.get("claim_text", "")
    claim_label = entry["claim_label"]
    if claim_label is None:
        print(f"Missing claim_label for claim {cid}")
        continue
    gold_ids = set(entry.get("evidences", []))
    candidates = entry.get("pre_ranked_pool", [])
    if not claim_text or not gold_ids or not candidates:
        print(f"Skipping claim {cid}: invalid data")
        continue
    texts = [evidence_corpus[eid] for eid in candidates if eid in evidence_corpus]
    pairs = [(claim_text, txt) for txt in texts]
    if not pairs:
        continue

    encoded = tokenizer.batch_encode_plus(pairs, padding=True, truncation=True, max_length=256, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**{k: v.to(device) for k, v in encoded.items()})
    scores = outputs.logits.squeeze()
    topk_indices = torch.topk(scores, k=min(5, len(scores))).indices.tolist()
    topk_ids = [candidates[i] for i in topk_indices]

    evidence_correct = sum(1 for g in gold_ids if g in topk_ids)
    evidence_recall = evidence_correct / len(gold_ids) if gold_ids else 0.0
    recalls.append(evidence_recall)
    evidence_precision = evidence_correct / len(topk_ids) if topk_ids else 0.0
    precisions.append(evidence_precision)
    evidence_fscore = (2 * evidence_precision * evidence_recall) / (evidence_precision + evidence_recall) if (evidence_precision + evidence_recall) > 0 else 0.0
    f_scores.append(evidence_fscore)

    task4_data.append({
        "claim_id": cid,
        "claim_text": claim_text,
        "claim_label": claim_label,
        "top_evidence_ids": topk_ids,
        "gold_evidence_ids": list(gold_ids)
    })

mean_f = np.mean(f_scores if f_scores else [0.0])
mean_recall = np.mean(recalls if recalls else [0.0])
mean_precision = np.mean(precisions if precisions else [0.0])

print(f"Evidence Retrieval F-score (F): {mean_f:.4f}")
print(f"Evidence Retrieval Recall: {mean_recall:.4f}")
print(f"Evidence Retrieval Precision: {mean_precision:.4f}")

# Save Task 4 dataset
with open("task4_input_minilm.json", "w") as f:
    json.dump(task4_data, f, indent=2)

NumPy version: 2.0.2
Transformers version: 4.51.3
Datasets version: 3.6.0
Loading Phase 1 data...
Loading Phase 2 data...
First Phase 1 tuple: ('scientific evidence carbon dioxide pollutant higher carbon dioxide concentrations actually help ecosystems support plant animal life', 'higher carbon dioxide concentrations favourably affect plant growth demand water', 1)
First Phase 2 tuple: ('scientific evidence carbon dioxide pollutant higher carbon dioxide concentrations actually help ecosystems support plant animal life', 'higher carbon dioxide concentrations favourably affect plant growth demand water', 1)
Converting to Hugging Face datasets...


100%|██████████| 10261/10261 [00:00<00:00, 1859001.92it/s]
100%|██████████| 16402/16402 [00:00<00:00, 1711445.49it/s]

Phase 1 examples: 10261
Phase 2 examples: 16402
First Phase 1 item: {'claim': 'scientific evidence carbon dioxide pollutant higher carbon dioxide concentrations actually help ecosystems support plant animal life', 'evidence': 'higher carbon dioxide concentrations favourably affect plant growth demand water', 'label': 1.0}
Phase 1 keys: ['claim', 'evidence', 'label']
Dataset features: {'claim': Value(dtype='string', id=None), 'evidence': Value(dtype='string', id=None), 'label': Value(dtype='float64', id=None)}





Tokenizing datasets...


Map:   0%|          | 0/10261 [00:00<?, ? examples/s]

Map:   0%|          | 0/16402 [00:00<?, ? examples/s]

Post-tokenization Phase 1 item: {'label': tensor(1.), 'input_ids': tensor([  101,  4045,  3350,  6351, 14384,  8554, 13210,  3372,  3020,  6351,
        14384, 14061,  2941,  2393, 20440,  2490,  3269,  4111,  2166,   102,
         3020,  6351, 14384, 14061,  7927,  8231,  7461,  3269,  3930,  5157,
         2300,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     

Step,Training Loss
50,29.3622
100,0.5306
150,0.1666
200,0.1536
250,0.1306
300,0.1254
350,0.0996
400,0.0866
450,0.076
500,0.069


Saving model after Phase 1...

Training Phase 2: Gold vs Hard Negatives...


Step,Training Loss
50,0.3152
100,0.166
150,0.177
200,0.1736
250,0.1628
300,0.1602
350,0.1602
400,0.1609
450,0.1697
500,0.1611


Saving final model after Phase 2...

Evaluating Full Metrics on Training Set...


Scoring candidates: 100%|██████████| 1228/1228 [41:35<00:00,  2.03s/it]

Evidence Retrieval F-score (F): 0.2045
Evidence Retrieval Recall: 0.2902
Evidence Retrieval Precision: 0.1743





In [None]:
#NEED TO BE UPDATED - IMBALANCE CLASS + GRADIENT
import os
import json
from collections import Counter
import numpy as np
import torch
from tqdm import tqdm
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    TrainerCallback
)
from torch.nn import BCEWithLogitsLoss

# Configuration
data_dir = "/content/drive/MyDrive/NLP_content"
phase1_path = os.path.join(data_dir, "training_rerank_phase1.json")
phase2_path = os.path.join(data_dir, "training_rerank_phase2.json")
pre_rank_file = os.path.join(data_dir, "merged_pre_rank_pool.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Load training pairs
def load_train_pairs_json_array(path):
    with open(path) as f:
        entries = json.load(f)
    return [(entry["claim"], cand["text"], cand["label"]) for entry in entries for cand in entry["candidates"]]

print("Loading training data...")
train_pairs_phase1 = load_train_pairs_json_array(phase1_path)
train_pairs_phase2 = load_train_pairs_json_array(phase2_path)

# Compute class weights for BCE
all_labels = [lbl for (_, _, lbl) in train_pairs_phase1 + train_pairs_phase2]
label_counts = Counter(all_labels)
total = sum(label_counts.values())
pos_weight = total / label_counts[1.0]
print("Class counts:", label_counts)
print("Pos weight (for BCE):", pos_weight)

# Convert to datasets
train_dataset_phase1 = Dataset.from_list([
    {"claim": c, "evidence": e, "label": float(lbl)} for (c, e, lbl) in train_pairs_phase1
])
train_dataset_phase2 = Dataset.from_list([
    {"claim": c, "evidence": e, "label": float(lbl)} for (c, e, lbl) in train_pairs_phase2
])

# Tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

def tokenize_function(examples):
    encodings = tokenizer(examples["claim"], examples["evidence"], padding='max_length', truncation=True, max_length=256)
    return {**encodings, "label": examples["label"]}

# Preprocess
train_dataset_phase1 = train_dataset_phase1.map(tokenize_function, batched=True, remove_columns=["claim", "evidence"])
train_dataset_phase2 = train_dataset_phase2.map(tokenize_function, batched=True, remove_columns=["claim", "evidence"])

train_dataset_phase1.set_format("torch")
train_dataset_phase2.set_format("torch")

# Loss callback
class LogLossCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            print(f"Step {state.global_step} - loss: {logs['loss']:.4f}")

# Custom trainer with weighted BCE loss
class BCETrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels").unsqueeze(1)  # (batch_size, 1)
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight).to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Phase 1 Training Args
training_args_phase1 = TrainingArguments(
    output_dir=os.path.join(data_dir, "reranker_model_phase1_minilm2"),
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    warmup_steps=100,
    logging_steps=50,
    logging_dir=os.path.join(data_dir, "logs_phase1"),
    save_strategy="no",
    fp16=True
)

# Phase 2 Training Args
training_args_phase2 = TrainingArguments(
    output_dir=os.path.join(data_dir, "reranker_model_phase2_minilm2"),
    overwrite_output_dir=True,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    warmup_steps=50,
    logging_steps=50,
    logging_dir=os.path.join(data_dir, "logs_phase2"),
    save_strategy="no",
    fp16=True
)

# Phase 1 Training
print("\nTraining Phase 1: Gold vs Random Negatives...")
trainer_phase1 = BCETrainer(
    model=model,
    args=training_args_phase1,
    train_dataset=train_dataset_phase1,
    data_collator=DataCollatorWithPadding(tokenizer),
    callbacks=[LogLossCallback()]
)
trainer_phase1.train()
trainer_phase1.save_model(os.path.join(data_dir, "reranker_model_phase1_final_minilm2"))

# Reload model
print("\nTraining Phase 2: Gold vs Hard Negatives...")
model = AutoModelForSequenceClassification.from_pretrained(os.path.join(data_dir, "reranker_model_phase1_final_minilm2"))
trainer_phase2 = BCETrainer(
    model=model,
    args=training_args_phase2,
    train_dataset=train_dataset_phase2,
    data_collator=DataCollatorWithPadding(tokenizer),
    callbacks=[LogLossCallback()]
)
trainer_phase2.train()
trainer_phase2.save_model(os.path.join(data_dir, "reranker_model_final_minilm2"))


In [None]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
model_path = os.path.join(data_dir, "reranker_model_final_minilm2")
pre_rank_file = os.path.join(data_dir, "merged_pre_rank_pool.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
output_path = os.path.join(data_dir, "classification_input_minilm.json")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load data
with open(pre_rank_file) as f:
    rerank_pool = json.load(f)
with open(evidence_file) as f:
    evidence_corpus = json.load(f)

recalls, precisions, f_scores = [], [], []
task4_data = []

# Scoring loop
for cid, entry in tqdm(rerank_pool.items(), desc="Scoring candidates"):
    claim_text = entry.get("claim_text", "")
    claim_label = entry.get("claim_label", None)
    gold_ids = set(entry.get("evidences", []))
    candidates = entry.get("pre_ranked_pool", [])

    if not claim_text or not candidates or claim_label is None:
        continue

    texts = [evidence_corpus[eid] for eid in candidates if eid in evidence_corpus]
    pairs = [(claim_text, txt) for txt in texts]
    if not pairs:
        continue

    encoded = tokenizer.batch_encode_plus(pairs, padding=True, truncation=True, max_length=256, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**{k: v.to(device) for k, v in encoded.items()})
        probs = outputs.logits.squeeze()

    topk_indices = torch.topk(scores, k=min(5, len(scores))).indices.tolist()
    topk_ids = [candidates[i] for i in topk_indices]

    correct = sum(1 for g in gold_ids if g in topk_ids)
    recall = correct / len(gold_ids) if gold_ids else 0.0
    precision = correct / len(topk_ids) if topk_ids else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    recalls.append(recall)
    precisions.append(precision)
    f_scores.append(f1)

    task4_data.append({
        "claim_id": cid,
        "claim_text": claim_text,
        "claim_label": claim_label,
        "top_evidence_ids": topk_ids,
        "gold_evidence_ids": list(gold_ids)
    })

# Report
print(f"Evidence Retrieval F-score (F): {np.mean(f_scores):.4f}")
print(f"Evidence Retrieval Recall: {np.mean(recalls):.4f}")
print(f"Evidence Retrieval Precision: {np.mean(precisions):.4f}")

# Save JSON
with open(output_path, "w") as f:
    json.dump(task4_data, f, indent=2)
print(f"Saved classification-ready file to: {output_path}")

#### 2.2.3 DistilBERT Cross-Encoder

In [None]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from collections import Counter
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.nn import CrossEntropyLoss

# Paths
data_dir = "/content/drive/MyDrive/NLP_content"
phase1_path = os.path.join(data_dir, "training_rerank_phase1.json")
phase2_path = os.path.join(data_dir, "training_rerank_phase2.json")
pre_rank_file = os.path.join(data_dir, "merged_pre_rank_pool.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_name = "distilbert-base-uncased"

# Load training data
with open(phase1_path) as f:
    phase1_raw = json.load(f)
with open(phase2_path) as f:
    phase2_raw = json.load(f)

def convert_to_pairs(raw):
    return [(entry["claim"], cand["text"], cand["label"])
            for entry in raw for cand in entry["candidates"]]

train_pairs_phase1 = convert_to_pairs(phase1_raw)
train_pairs_phase2 = convert_to_pairs(phase2_raw)

train_dataset_phase1 = Dataset.from_list([
    {"claim": c, "evidence": e, "label": lbl} for (c, e, lbl) in train_pairs_phase1
])
train_dataset_phase2 = Dataset.from_list([
    {"claim": c, "evidence": e, "label": lbl} for (c, e, lbl) in train_pairs_phase2
])

print(f"Phase1 examples: {len(train_dataset_phase1)}")
print(f"Phase2 examples: {len(train_dataset_phase2)}")
print(train_dataset_phase1[0])

# Compute class weights from both phases
def compute_label_distribution(*datasets):
    counter = Counter()
    for data in datasets:
        for entry in data:
            for cand in entry["candidates"]:
                counter[cand["label"]] += 1
    return counter

label_counts = compute_label_distribution(phase1_raw, phase2_raw)
total = sum(label_counts.values())
class_weights = [total / label_counts[i] for i in sorted(label_counts)]
print("Class counts:", label_counts)
print("Class weights:", class_weights)

# Tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Encode function
def encode_batch(batch):
    return tokenizer(batch["claim"], batch["evidence"],
                     padding='max_length', truncation=True,
                     max_length=256, return_tensors="pt")

# Custom trainer with class-weighted loss
class WeightedTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        w0, w1 = class_weights.get(0.0, 1.0), class_weights.get(1.0, 1.0)
        self.weight_tensor = torch.tensor([w0, w1])

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("label").view(-1)
        outputs = model(**inputs)
        logits = outputs.logits.view(-1)
        weights = torch.where(labels == 1.0, self.weight_tensor[1], self.weight_tensor[0]).to(logits.device)
        loss = BCEWithLogitsLoss(weight=weights)(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Phase 1 training
training_args_phase1 = TrainingArguments(
    output_dir="distilbert_reranker_phase1",
    overwrite_output_dir=True,
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    warmup_steps=100,
    logging_steps=50,
    logging_dir="logs_distil_phase1",
    save_strategy="no",
    fp16=True
)

trainer_phase1 = WeightedTrainer(
    model=model,
    args=training_args_phase1,
    train_dataset=train_dataset_phase1,
    tokenizer=tokenizer,
    data_collator=lambda data: encode_batch(data),
    class_weights=class_weights
)

print("Training Phase 1 (DistilBERT): Gold vs Random Negatives...")
trainer_phase1.train()

# Phase 2 training
training_args_phase2 = TrainingArguments(
    output_dir="distilbert_reranker_phase2",
    overwrite_output_dir=True,
    evaluation_strategy="no",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    warmup_steps=50,
    logging_steps=50,
    logging_dir="logs_distil_phase2",
    save_strategy="no",
    fp16=True
)

trainer_phase2 = WeightedTrainer(
    model=model,
    args=training_args_phase2,
    train_dataset=train_dataset_phase2,
    tokenizer=tokenizer,
    data_collator=lambda data: encode_batch(data),
    class_weights=class_weights
)

print("Training Phase 2 (DistilBERT): Gold vs Hard Negatives...")
trainer_phase2.train()

# Save final model
final_model_path = os.path.join(data_dir, "reranker_model_final_distilbert")
trainer_phase2.save_model(final_model_path)
print(f"Saved final model to: {final_model_path}")

In [None]:
# EVALUATION BLOCK
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
model_path = os.path.join(data_dir, "reranker_model_final_distilbert")
pre_rank_file = os.path.join(data_dir, "merged_pre_rank_pool.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
output_path = os.path.join(data_dir, "classification_input_distilbert.json")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load data
with open(pre_rank_file) as f:
    rerank_pool = json.load(f)
with open(evidence_file) as f:
    evidence_corpus = json.load(f)

recalls, precisions, f_scores = [], [], []
task4_data = []

# Scoring loop
for cid, entry in tqdm(rerank_pool.items(), desc="Scoring candidates"):
    claim_text = entry.get("claim_text", "")
    claim_label = entry.get("claim_label", None)
    gold_ids = set(entry.get("evidences", []))
    candidates = entry.get("pre_ranked_pool", [])

    if not claim_text or not candidates or claim_label is None:
        continue

    texts = [evidence_corpus[eid] for eid in candidates if eid in evidence_corpus]
    pairs = [(claim_text, txt) for txt in texts]
    if not pairs:
        continue

    encoded = tokenizer.batch_encode_plus(pairs, padding=True, truncation=True, max_length=256, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**{k: v.to(device) for k, v in encoded.items()})
        scores = torch.softmax(outputs.logits, dim=1)[:, 1]

    topk_indices = torch.topk(scores, k=min(5, len(scores))).indices.tolist()
    topk_ids = [candidates[i] for i in topk_indices]

    correct = sum(1 for g in gold_ids if g in topk_ids)
    recall = correct / len(gold_ids) if gold_ids else 0.0
    precision = correct / len(topk_ids) if topk_ids else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    recalls.append(recall)
    precisions.append(precision)
    f_scores.append(f1)

    task4_data.append({
        "claim_id": cid,
        "claim_text": claim_text,
        "claim_label": claim_label,
        "top_evidence_ids": topk_ids,
        "gold_evidence_ids": list(gold_ids)
    })

# Report
print(f"Evidence Retrieval F-score (F): {np.mean(f_scores):.4f}")
print(f"Evidence Retrieval Recall: {np.mean(recalls):.4f}")
print(f"Evidence Retrieval Precision: {np.mean(precisions):.4f}")

# Save JSON
with open(output_path, "w") as f:
    json.dump(task4_data, f, indent=2)
print(f"Saved classification-ready file to: {output_path}")

In [None]:
import json
import os
import numpy as np
from tqdm import tqdm

def evaluate_reranker_results(json_path, top_k=5):
    with open(json_path, "r") as f:
        data = json.load(f)

    recalls = []
    precisions = []
    f1s = []
    accuracies = []

    for entry in tqdm(data, desc=f"Evaluating {os.path.basename(json_path)}"):
        gold_ids = set(entry["gold_evidence_ids"])
        predicted_ids = entry["top_evidence_ids"][:top_k]

        if not gold_ids or not predicted_ids:
            continue

        correct = sum(1 for gid in gold_ids if gid in predicted_ids)
        recall = correct / len(gold_ids)
        precision = correct / len(predicted_ids)
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        accuracy = int(gold_ids.issubset(set(predicted_ids)))  # all golds found in top-k

        recalls.append(recall)
        precisions.append(precision)
        f1s.append(f1)
        accuracies.append(accuracy)

    print(f"\n--- Evaluation: {os.path.basename(json_path)} ---")
    print(f"Recall@{top_k}: {np.mean(recalls):.4f}")
    print(f"Precision@{top_k}: {np.mean(precisions):.4f}")
    print(f"F1@{top_k}: {np.mean(f1s):.4f}")
    print(f"Accuracy@{top_k} (all gold in top-k): {np.mean(accuracies):.4f}")


In [None]:
data_dir = "/content/drive/MyDrive/NLP_content"

evaluate_reranker_results(os.path.join(data_dir, "classification_input_minilm1.json"))
evaluate_reranker_results(os.path.join(data_dir, "classification_input_minilm2.json"))
evaluate_reranker_results(os.path.join(data_dir, "classification_input_distilbert.json"))


Evaluating classification_input_minilm1.json: 100%|██████████| 1228/1228 [00:00<00:00, 351145.71it/s]



--- Evaluation: classification_input_minilm1.json ---
Recall@5: 0.2841
Precision@5: 0.1715
F1@5: 0.2009
Accuracy@5 (all gold in top-k): 0.1059


Evaluating classification_input_minilm2.json: 100%|██████████| 1228/1228 [00:00<00:00, 377444.33it/s]



--- Evaluation: classification_input_minilm2.json ---
Recall@5: 0.3545
Precision@5: 0.2156
F1@5: 0.2522
Accuracy@5 (all gold in top-k): 0.1368


Evaluating classification_input_distilbert.json: 100%|██████████| 1228/1228 [00:00<00:00, 383144.04it/s]


--- Evaluation: classification_input_distilbert.json ---
Recall@5: 0.4613
Precision@5: 0.2780
F1@5: 0.3251
Accuracy@5 (all gold in top-k): 0.1808





# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## 3.1 Evidence Retrieval - Pre-ranking

### 3.1.1 Baseline Model - BoW

In [None]:
import os
import json
import joblib
import numpy as np
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Settings
top_k = 5
data_dir = "/content/drive/MyDrive/NLP_content"

# Load vectoriser and BoW matrix for evidence
print("Loading BoW vectoriser and evidence matrix...")
bow_vectorizer = joblib.load(os.path.join(data_dir, "bow_vectorizer.pkl"))
evidence_bow = scipy.sparse.load_npz(os.path.join(data_dir, "evidence_p1_bow.npz"))

# Load evidence and dev claim data
evidence_data = json.load(open(os.path.join(data_dir, "evidence-preprocessed1.json")))
dev_claims_data = json.load(open(os.path.join(data_dir, "dev-claims-preprocessed1.json")))

evidence_ids = list(evidence_data.keys())
dev_claim_ids = list(dev_claims_data.keys())
dev_claim_texts = [dev_claims_data[cid]["claim_text"] for cid in dev_claim_ids]

# Vectorise dev claims using existing vectoriser
print("Vectorising dev claims...")
dev_bow = bow_vectorizer.transform(dev_claim_texts)

# Compute cosine similarity
print("Computing cosine similarity...")
cosine_sim = cosine_similarity(dev_bow, evidence_bow)
ranked_indices = np.argsort(-cosine_sim, axis=1)

# Retrieve top-k evidence for each dev claim
print(f"Retrieving top {top_k} evidence IDs per dev claim...")
top_k_evidence = {
    cid: [evidence_ids[i] for i in ranked_indices[idx][:top_k]]
    for idx, cid in enumerate(dev_claim_ids)
}

# Build and save final output
dev_claims_retrieved = {
    cid: {
        "claim_text": dev_claims_data[cid]["claim_text"],
        "pre_ranked_evidences": top_k_evidence[cid]
    }
    for cid in dev_claim_ids
}

output_path = os.path.join(data_dir, "dev_claims_retrieved_bow_top5.json")
with open(output_path, "w") as f:
    json.dump(dev_claims_retrieved, f, indent=2)

print(f"\nSaved top-5 BoW evidence retrieval results to: {output_path}")


Loading BoW vectoriser and evidence matrix...
Vectorising dev claims...
Computing cosine similarity...
Retrieving top 5 evidence IDs per dev claim...

Saved top-5 BoW evidence retrieval results to: /content/drive/MyDrive/NLP_content/dev_claims_retrieved_bow_top5.json


In [None]:
import os
import json
import numpy as np
from tqdm import tqdm

# File paths
data_dir = "/content/drive/MyDrive/NLP_content"
gold_claim_file = os.path.join(data_dir, "dev-claims-preprocessed1.json")
pre_rank_file = os.path.join(data_dir, "dev_claims_retrieved_bow_top5.json")
pre_rank_filename = os.path.basename(pre_rank_file)

# Load data
with open(gold_claim_file) as f:
    gold_claims = json.load(f)
with open(pre_rank_file) as f:
    pre_ranked = json.load(f)

# Initialise metrics
recalls, precisions, f1s = [], [], []
accurate_full_match = 0
total = 0

# Evaluate
for cid, entry in tqdm(pre_ranked.items(), desc=f"Evaluating {pre_rank_filename}"):
    claim_text = entry.get("claim_text", "")
    pre_ranked_pool = set(entry.get("pre_ranked_evidences", []))  # <-- adapted key
    gold_ids = set(gold_claims.get(cid, {}).get("evidences", []))

    if not claim_text or not gold_ids or not pre_ranked_pool:
        continue

    correct = gold_ids & pre_ranked_pool
    recall = len(correct) / len(gold_ids)
    precision = len(correct) / len(pre_ranked_pool)
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    recalls.append(recall)
    precisions.append(precision)
    f1s.append(f1)

    if gold_ids.issubset(pre_ranked_pool):
        accurate_full_match += 1
    total += 1

# Report
print(f"\n--- Evaluation: {pre_rank_filename} ---")
print(f"Recall@{len(pre_ranked_pool)}: {np.mean(recalls):.4f}")
print(f"Precision@{len(pre_ranked_pool)}: {np.mean(precisions):.4f}")
print(f"F1@{len(pre_ranked_pool)}: {np.mean(f1s):.4f}")
print(f"Accuracy@{len(pre_ranked_pool)} (all gold in top-k): {accurate_full_match / total:.4f}")


Evaluating dev_claims_retrieved_bow_top5.json: 100%|██████████| 154/154 [00:00<00:00, 220075.92it/s]


--- Evaluation: dev_claims_retrieved_bow_top5.json ---
Recall@5: 0.0752
Precision@5: 0.0390
F1@5: 0.0467
Accuracy@5 (all gold in top-k): 0.0325





In [None]:
import os
import json

# Paths
data_dir = "/content/drive/MyDrive/NLP_content"
retrieved_file = os.path.join(data_dir, "dev_claims_retrieved_bow_top5.json")
gold_file = os.path.join(data_dir, "dev-claims.json")
output_path = os.path.join(data_dir, "dev_task2_input_bow.json")  # final output

# Load retrieved and gold data
with open(retrieved_file, 'r') as f:
    retrieved_data = json.load(f)
with open(gold_file, 'r') as f:
    gold_data = json.load(f)

final_output = {}

for cid, entry in retrieved_data.items():
    claim_text = entry.get("claim_text", "")
    pre_ranked = entry.get("pre_ranked_evidences", [])
    gold_evidences = gold_data.get(cid, {}).get("evidences", [])
    label = gold_data.get(cid, {}).get("claim_label", "")

    # Merge: ensure gold evidences are included
    merged_set = []
    seen = set()
    for eid in pre_ranked + gold_evidences:
        if eid not in seen:
            seen.add(eid)
            merged_set.append(eid)

    final_output[cid] = {
        "claim_text": claim_text,
        "claim_label": label,
        "evidences": gold_evidences,
        "pre_ranked_pool": merged_set  # renamed for classification input
    }

# Save
with open(output_path, 'w') as f:
    json.dump(final_output, f, indent=2)

print(f"Final classification input (Task 2 - BoW) saved to: {output_path}")
print(f"Total claims processed: {len(final_output)}")


Final classification input (Task 2 - BoW) saved to: /content/drive/MyDrive/NLP_content/dev_task2_input_bow.json
Total claims processed: 154


### 3.1.1 BM25

In [None]:
import os
import json
from tqdm import tqdm
from pyserini.search.lucene import LuceneSearcher

# Config
DATA_DIR = "/content/drive/MyDrive/NLP_content"
INDEX_PATH = os.path.join(DATA_DIR, "indexes/evidence_index")
DEV_INPUT = os.path.join(DATA_DIR, "dev-claims-preprocessed2.json")
DEV_OUTPUT = os.path.join(DATA_DIR, "dev-claims-preranked-bm25.json")

K1 = 0.5
B = 0.3
TOP_K = 100

# Load Dev JSON
def load_json_file(filename):
    with open(filename, 'r') as f:
        return json.load(f)

dev_data = load_json_file(DEV_INPUT)

# Initialize BM25 Searcher
searcher = LuceneSearcher(INDEX_PATH)
searcher.set_bm25(k1=K1, b=B)

# BM25 Retrieval for Dev Claims (as Test)
bm25_outputs = {}

for claim_id, claim_data in tqdm(dev_data.items(), desc="BM25 Retrieval on Dev (Test) Set"):
    claim_text = claim_data["claim_text"]

    hits = searcher.search(claim_text, TOP_K)
    retrieved_ids = [hit.docid for hit in hits]

    bm25_outputs[claim_id] = {
        "claim_text": claim_text,
        "claim_label": claim_data.get("claim_label", ""),
        "pre_ranked_pool": retrieved_ids
    }

# Save
with open(DEV_OUTPUT, "w") as f:
    json.dump(bm25_outputs, f, indent=2)

print(f"\n Saved BM25 pre-ranked dev set (top {TOP_K}) to: {DEV_OUTPUT}")


BM25 Retrieval on Dev (Test) Set: 100%|██████████| 154/154 [00:01<00:00, 78.71it/s]


 Saved BM25 pre-ranked dev set (top 100) to: /content/drive/MyDrive/NLP_content/dev-claims-preranked-bm25.json





### 3.1.2 MiniLM Bi-Encoder

In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os
import json

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
dev_claim_file = os.path.join(data_dir, "dev-claims-preprocessed2.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_path = os.path.join(data_dir, "fine_tuned_dpr_triplet_model")
faiss_index_file = os.path.join(data_dir, "evidence_faiss_minilm.index")
output_file = os.path.join(data_dir, "dev-preranked-minilm.json")
top_k = 100

# Load model, FAISS index, and data
model = SentenceTransformer(model_path)
index = faiss.read_index(faiss_index_file)

with open(dev_claim_file, 'r') as f:
    dev_claims = json.load(f)
with open(evidence_file, 'r') as f:
    evidence_corpus = json.load(f)

evid_ids = list(evidence_corpus.keys())

# Dense retrieval
def dense_retrieve(claim_text: str, top_k: int):
    query_vec = model.encode([claim_text], normalize_embeddings=True).astype('float32')
    D, I = index.search(query_vec, top_k)
    return [(evid_ids[i], float(D[0][idx])) for idx, i in enumerate(I[0])]

# Pre-rank only (no eval)
pre_ranked_output = {}

for cid, entry in tqdm(dev_claims.items(), desc=f"Retrieving top-{top_k} with MiniLM FAISS"):
    claim_text = entry["claim_text"]
    retrieved = dense_retrieve(claim_text, top_k=top_k)
    pre_ranked_output[cid] = {
        "claim_text": claim_text,
        "claim_label": entry.get("claim_label", ""),
        "pre_ranked_pool": [eid for eid, _ in retrieved]
    }

# Save output
with open(output_file, 'w') as f:
    json.dump(pre_ranked_output, f, indent=2)

print(f"Saved MiniLM pre-ranked top-{top_k} evidence to: {output_file}")

Retrieving top-100 with MiniLM FAISS: 100%|██████████| 154/154 [00:24<00:00,  6.32it/s]

Saved MiniLM pre-ranked top-100 evidence to: /content/drive/MyDrive/NLP_content/dev-preranked-minilm.json





In [None]:
import os
import json
import numpy as np
from tqdm import tqdm

def evaluate_pre_ranked_retrieval(data_dir, gold_claim_filename, pre_rank_filename):
    """
    Evaluate retrieval effectiveness of a pre-ranked evidence pool.

    Args:
        data_dir (str): Path to the directory containing JSON files.
        gold_claim_filename (str): Filename of the gold claim file (e.g. "dev-claims-preprocessed2.json").
        pre_rank_filename (str): Filename of the pre-ranked file (e.g. "dev-preranked-minilm.json").

    Prints:
        Mean Recall, Precision, F1, and Accuracy (all gold retrieved).
    """

    gold_claim_file = os.path.join(data_dir, gold_claim_filename)
    pre_rank_file = os.path.join(data_dir, pre_rank_filename)

    # Load data
    with open(gold_claim_file) as f:
        gold_claims = json.load(f)
    with open(pre_rank_file) as f:
        pre_ranked = json.load(f)

    recalls, precisions, f1s = [], [], []
    accurate_full_match = 0
    total = 0

    for cid, entry in tqdm(pre_ranked.items(), desc=f"Evaluating {pre_rank_filename}"):
        claim_text = entry.get("claim_text", "")
        pre_ranked_pool = set(entry.get("pre_ranked_pool", []))
        gold_ids = set(gold_claims.get(cid, {}).get("evidences", []))

        if not claim_text or not gold_ids or not pre_ranked_pool:
            continue

        correct = gold_ids & pre_ranked_pool
        recall = len(correct) / len(gold_ids)
        precision = len(correct) / len(pre_ranked_pool)
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        recalls.append(recall)
        precisions.append(precision)
        f1s.append(f1)

        if gold_ids.issubset(pre_ranked_pool):
            accurate_full_match += 1
        total += 1

    print(f"\n--- Evaluation: {pre_rank_filename} ---")
    print(f"Recall@100: {np.mean(recalls):.4f}")
    print(f"Precision@100: {np.mean(precisions):.4f}")
    print(f"F1@100: {np.mean(f1s):.4f}")
    print(f"Accuracy@100 (all gold in top-k): {accurate_full_match / total:.4f}")

In [None]:
evaluate_pre_ranked_retrieval(
    data_dir="/content/drive/MyDrive/NLP_content",
    gold_claim_filename="dev-claims-preprocessed2.json",
    pre_rank_filename="dev-claims-preranked-bm25.json"
)

evaluate_pre_ranked_retrieval(
    data_dir="/content/drive/MyDrive/NLP_content",
    gold_claim_filename="dev-claims-preprocessed2.json",
    pre_rank_filename="dev-preranked-minilm.json"
)

Evaluating dev-claims-preranked-bm25.json: 100%|██████████| 154/154 [00:00<00:00, 79002.30it/s]



--- Evaluation: dev-claims-preranked-bm25.json ---
Recall@100: 0.5389
Precision@100: 0.0163
F1@100: 0.0314
Accuracy@100 (all gold in top-k): 0.2857


Evaluating dev-preranked-minilm.json: 100%|██████████| 154/154 [00:00<00:00, 92274.69it/s]


--- Evaluation: dev-preranked-minilm.json ---
Recall@100: 0.5939
Precision@100: 0.0182
F1@100: 0.0350
Accuracy@100 (all gold in top-k): 0.3636





### 3.1.3 Constuct Pre-rank Pool

In [None]:
import os
import json

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
bm25_file_path = os.path.join(data_dir, "dev-claims-preranked-bm25.json")
dense_file_path = os.path.join(data_dir, "dev-preranked-minilm.json")
gold_file_path = os.path.join(data_dir, "dev-claims.json")  # gold evidence here
output_path = os.path.join(data_dir, "merged_dev_prerank_pool.json")

# Load data
with open(bm25_file_path, 'r') as f:
    bm25_data = json.load(f)
with open(dense_file_path, 'r') as f:
    dense_data = json.load(f)
with open(gold_file_path, 'r') as f:
    gold_data = json.load(f)

merged_output = {}
total_pool_lengths = []
total_duplicates = 0
missing_gold = 0

for cid in bm25_data:
    bm25_entry = bm25_data[cid]
    dense_entry = dense_data.get(cid, {})
    gold_entry = gold_data.get(cid, {})

    bm25_pool = bm25_entry.get("pre_ranked_pool", [])[:100]
    dense_pool = dense_entry.get("pre_ranked_pool", [])[:100]

    # Merge pools without duplicates
    seen = set()
    merged_pool = []
    for eid in bm25_pool + dense_pool:
        if eid not in seen:
            seen.add(eid)
            merged_pool.append(eid)
        else:
            total_duplicates += 1

    gold_evidences = gold_entry.get("evidences", [])
    if not gold_evidences:
        missing_gold += 1

    merged_output[cid] = {
        "claim_text": bm25_entry.get("claim_text", ""),
        "claim_label": bm25_entry.get("claim_label", ""),
        "evidences": gold_evidences,
        "pre_ranked_pool": merged_pool
    }
    total_pool_lengths.append(len(merged_pool))

# Save
with open(output_path, 'w') as f:
    json.dump(merged_output, f, indent=2)

print(f"Merged dev pre-rank pool saved to: {output_path}")
print(f"Average pool length: {sum(total_pool_lengths) / len(total_pool_lengths):.2f}")
print(f"Total duplicate removals: {total_duplicates}")
print(f"Claims missing gold evidence: {missing_gold}")


Merged dev pre-rank pool saved to: /content/drive/MyDrive/NLP_content/merged_dev_prerank_pool.json
Average pool length: 184.62
Total duplicate removals: 2369
Claims missing gold evidence: 0


### 3.1.4 RoBERTa DPR Bi-Encoder

In [None]:
# THIS SHOULD NOT BE USED!!! DELETE LATER
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os
import json

data_dir = "/content/drive/MyDrive/NLP_content"
dev_claim_file = os.path.join(data_dir, "dev-claims-preprocessed2.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
model_path = os.path.join(data_dir, "roberta_dpr_biencoder")
faiss_index_file = os.path.join(data_dir, "roberta_faiss.index")
output_file = os.path.join(data_dir, "dev-pre-ranked-roberta.json")

# Load model, FAISS index, and data
model = SentenceTransformer(model_path)
index = faiss.read_index(faiss_index_file)

with open(dev_claim_file, 'r') as f:
    dev_claims = json.load(f)
with open(evidence_file, 'r') as f:
    evidence_corpus = json.load(f)

evid_ids = list(evidence_corpus.keys())


# Dense retrieval function
def dense_retrieve(claim_text: str, top_k: int = 100):
    query_vec = model.encode([claim_text], normalize_embeddings=True).astype('float32')
    D, I = index.search(query_vec, top_k)
    evid_id_score_pairs = [(evid_ids[i], float(D[0][idx])) for idx, i in enumerate(I[0])]
    return evid_id_score_pairs

# Evaluation + attach retrieval results
def evaluate_on_dev_set(claims_data, k=100):
    total_claims = 0
    recall_hits = 0
    exact_hits = 0
    total_gold_evids = 0
    matched_gold_evids = 0
    output_with_retrieval = {}

    for cid, entry in tqdm(claims_data.items(), desc=f"Evaluating @Top-{k} on dev set"):
        claim_text = entry["claim_text"]
        gold_ids = set(entry.get("evidences", []))
        if not gold_ids:
            continue

        retrieved = dense_retrieve(claim_text, top_k=k)
        retrieved_ids_ordered = [eid for eid, _ in retrieved]
        retrieved_set = set(retrieved_ids_ordered)
        matched = retrieved_set & gold_ids

        total_claims += 1
        total_gold_evids += len(gold_ids)
        matched_gold_evids += len(matched)

        if matched:
            recall_hits += 1
        if matched == gold_ids:
            exact_hits += 1

        # Save result in output
        output_with_retrieval[cid] = {
            "claim_text": claim_text,
            "claim_label": entry.get("claim_label", ""),
            "evidences": list(gold_ids),
            "re_ranked_evidence": retrieved_ids_ordered,
            "re_ranked_scores": [round(score, 5) for _, score in retrieved]
        }

    # Metrics
    item_level_recall = matched_gold_evids / total_gold_evids if total_gold_evids > 0 else 0
    exact_accuracy = exact_hits / total_claims if total_claims > 0 else 0
    recall_hit_rate = recall_hits / total_claims if total_claims > 0 else 0

    print(f"\n[RoBERTa DPR] Dev Set Evaluation @Top-{k}:")
    print(f"Claim-level Recall: {item_level_recall:.2%} ({matched_gold_evids}/{total_gold_evids} gold evidences matched)")
    print(f"Instance-level Accuracy (all gold matched): {exact_accuracy:.2%} ({exact_hits}/{total_claims} claims)")
    print(f"Recall-hit rate (≥1 gold matched): {recall_hits}/{total_claims} ({recall_hit_rate:.2%})")

    # Save to JSON
    with open(output_file, 'w') as f_out:
        json.dump(output_with_retrieval, f_out, indent=2)
    print(f"\nOutput written to: {output_file}")

evaluate_on_dev_set(dev_claims, k=100)


Evaluating @Top-100 on dev set: 100%|██████████| 154/154 [00:47<00:00,  3.25it/s]


[RoBERTa DPR] Dev Set Evaluation @Top-100:
Claim-level Recall: 37.88% (186/491 gold evidences matched)
Instance-level Accuracy (all gold matched): 18.18% (28/154 claims)
Recall-hit rate (≥1 gold matched): 102/154 (66.23%)

Output written to: /content/drive/MyDrive/NLP_content/dev-pre-ranked-roberta.json





## 3.2 Evidence Retrieval - Re-ranking

### 3.2.1 MiniLM Cross-Encoder

In [None]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
model_path = os.path.join(data_dir, "reranker_model_final_minilm2")
input_file = os.path.join(data_dir, "merged_dev_prerank_pool.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
output_path = os.path.join(data_dir, "task4_input_minilm_from_dev.json")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load data
with open(input_file) as f:
    rerank_pool = json.load(f)
with open(evidence_file) as f:
    evidence_corpus = json.load(f)

task4_data = []
recalls = []
precisions = []
f1_scores = []

for cid, entry in tqdm(rerank_pool.items(), desc="Scoring with MiniLM reranker"):
    claim_text = entry["claim_text"]
    claim_label = entry.get("claim_label", "")
    candidates = entry.get("pre_ranked_pool", [])
    gold_ids = set(entry.get("evidences", []))

    if not claim_text or not candidates:
        continue

    texts = [evidence_corpus[eid] for eid in candidates if eid in evidence_corpus]
    pairs = [(claim_text, txt) for txt in texts]
    if not pairs:
        continue

    encoded = tokenizer.batch_encode_plus(
        pairs,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**{k: v.to(device) for k, v in encoded.items()})
        # For regression-style model: use logits.squeeze()
        # For classification model (num_labels=2): use class 1 probs
        if outputs.logits.shape[-1] == 1:
            scores = outputs.logits.squeeze()
        else:
            scores = torch.softmax(outputs.logits, dim=1)[:, 1]

    topk_indices = torch.topk(scores, k=min(5, len(scores))).indices.tolist()
    topk_ids = [candidates[i] for i in topk_indices]

    # Evaluation
    if gold_ids:
        matched = sum(1 for g in gold_ids if g in topk_ids)
        recall = matched / len(gold_ids)
        precision = matched / len(topk_ids)
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        recalls.append(recall)
        precisions.append(precision)
        f1_scores.append(f1)

    task4_data.append({
        "claim_id": cid,
        "claim_text": claim_text,
        "claim_label": claim_label,
        "top_evidence_ids": topk_ids
    })

# Save classification-ready export
with open(output_path, "w") as f:
    json.dump(task4_data, f, indent=2)

# Print evaluation summary
print("\n[MiniLM Cross-Encoder Reranker Evaluation]")
print(f"Avg Recall:    {np.mean(recalls):.4f}")
print(f"Avg Precision: {np.mean(precisions):.4f}")
print(f"Avg F1-score:  {np.mean(f1_scores):.4f}")
print(f"Saved Task 4 input to: {output_path}")

Scoring with MiniLM reranker: 100%|██████████| 154/154 [00:11<00:00, 13.38it/s]


[MiniLM Cross-Encoder Reranker Evaluation]
Avg Recall:    0.3118
Avg Precision: 0.1818
Avg F1-score:  0.2143
Saved Task 4 input to: /content/drive/MyDrive/NLP_content/task4_input_minilm_from_dev.json





### 3.2.2 DistilBERT Cross-Encoder

In [None]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Config
data_dir = "/content/drive/MyDrive/NLP_content"
model_path = os.path.join(data_dir, "reranker_model_final_distilbert")
input_file = os.path.join(data_dir, "merged_dev_prerank_pool.json")
evidence_file = os.path.join(data_dir, "evidence-preprocessed2.json")
output_path = os.path.join(data_dir, "task4_input_distilbert_from_dev.json")

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load data
with open(input_file) as f:
    rerank_pool = json.load(f)
with open(evidence_file) as f:
    evidence_corpus = json.load(f)

task4_data = []
recalls = []
precisions = []
f1_scores = []

for cid, entry in tqdm(rerank_pool.items(), desc="Scoring and evaluating"):
    claim_text = entry["claim_text"]
    claim_label = entry.get("claim_label", "")
    candidates = entry.get("pre_ranked_pool", [])
    gold_ids = set(entry.get("evidences", []))

    if not claim_text or not candidates:
        continue

    texts = [evidence_corpus[eid] for eid in candidates if eid in evidence_corpus]
    pairs = [(claim_text, txt) for txt in texts]
    if not pairs:
        continue

    encoded = tokenizer.batch_encode_plus(
        pairs,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**{k: v.to(device) for k, v in encoded.items()})
        scores = torch.softmax(outputs.logits, dim=1)[:, 1]  # class 1 = relevant

    topk_indices = torch.topk(scores, k=min(5, len(scores))).indices.tolist()
    topk_ids = [candidates[i] for i in topk_indices]

    # Evaluation metrics
    if gold_ids:
        matched = sum(1 for g in gold_ids if g in topk_ids)
        recall = matched / len(gold_ids)
        precision = matched / len(topk_ids)
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        recalls.append(recall)
        precisions.append(precision)
        f1_scores.append(f1)

    # Output
    task4_data.append({
        "claim_id": cid,
        "claim_text": claim_text,
        "claim_label": claim_label,
        "top_evidence_ids": topk_ids
    })

# Save classification-ready output
with open(output_path, "w") as f:
    json.dump(task4_data, f, indent=2)

# Print evaluation metrics
print(f"\n[DistilBERT Cross-Encoder Re-ranker Evaluation]")
print(f"Avg Recall:    {np.mean(recalls):.4f}")
print(f"Avg Precision: {np.mean(precisions):.4f}")
print(f"Avg F1-score:  {np.mean(f1_scores):.4f}")
print(f"Saved Task 4 classification input to: {output_path}")

Scoring and evaluating: 100%|██████████| 154/154 [00:32<00:00,  4.70it/s]


[DistilBERT Cross-Encoder Re-ranker Evaluation]
Avg Recall:    0.2184
Avg Precision: 0.1182
Avg F1-score:  0.1439
Saved Task 4 classification input to: /content/drive/MyDrive/NLP_content/task4_input_distilbert_from_dev.json





## 3.3 Four-Class Classification

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*