In [1]:
import json
import torch
import joblib
import numpy as np
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def _load_json(path):
    """Read and parse one JSON file.

    The file is opened with an explicit UTF-8 encoding so that parsing does not
    depend on the platform's default locale encoding.
    """
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)


# Claims with labels and gold evidence ids (used for training).
train_claims = _load_json("train-claims.json")

# Evidence id -> evidence text lookup.
evidence_data = _load_json("evidence.json")

# Per-claim retrieval output; read later via "claim_text" and "filtered_evidences".
retrieval_data = _load_json("filtered_retrieval_output.json")

# Development claims (used for prediction and as ground truth for eval.py).
dev_data = _load_json("dev-claims.json")

In [3]:
# Sentence encoder shared by the prediction cells further down.
E5_CHECKPOINT = "intfloat/e5-large-v2"
embedder = SentenceTransformer(E5_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [4]:
# Integer class ids used as classifier targets.
label_map = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2, "DISPUTED": 3}

# Reuse the encoder already loaded as `embedder` instead of instantiating the
# same "intfloat/e5-large-v2" checkpoint a second time; `model` stays available
# under its original name for the cells that call `model.encode`.
model = embedder

# One training row per (claim, gold evidence) pair.
X_texts, y_labels = [], []

for cid, entry in train_claims.items():
    claim_text = entry["claim_text"]
    label = entry["claim_label"]
    for evid_id in entry.get("evidences", []):
        # Evidence ids that are absent from the evidence store are skipped.
        if evid_id not in evidence_data:
            continue
        ev_text = evidence_data[evid_id]
        X_texts.append(f"query: {claim_text} passage: {ev_text}")
        # Every pair inherits the label of its claim.
        y_labels.append(label_map[label])

In [5]:
# Embed every "query: ... passage: ..." string and turn the targets into an array.
X_embeddings = model.encode(X_texts)
y_labels = np.array(y_labels)

# Stratified 80/20 hold-out split with a fixed seed.
# NOTE(review): rows are (claim, evidence) pairs and the split is row-wise, so
# pairs built from the same claim can end up on both sides of the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings,
    y_labels,
    test_size=0.2,
    random_state=42,
    stratify=y_labels,
)

In [6]:
from imblearn.over_sampling import RandomOverSampler

# Resample the training split only; X_test / y_test are left untouched.
ros = RandomOverSampler(random_state=42)
resampled = ros.fit_resample(X_train, y_train)
X_train_bal, y_train_bal = resampled

In [7]:
from sklearn.neural_network import MLPClassifier

# MLP with a single 256-unit hidden layer, fitted on the oversampled split.
# NOTE(review): `clf` is rebound to a LogisticRegression in the next cell before
# any evaluation, so this MLP is never scored or saved in the visible notebook.
clf = MLPClassifier(
    hidden_layer_sizes=(256,),
    max_iter=500,
    random_state=42,
)
clf.fit(X_train_bal, y_train_bal)

In [8]:
# Logistic-regression classifier; this is the model that is evaluated and saved
# by the cells below.
# NOTE(review): it is fitted on the original split (X_train), not on the
# oversampled X_train_bal -- confirm that this is intentional.
clf = LogisticRegression(max_iter=1000)
clf.fit(X=X_train, y=y_train)

In [9]:
# Score the current `clf` on the held-out split.
y_pred = clf.predict(X_test)

# Integer class id -> label name (also used by later cells).
inv_label_map = {v: k for k, v in label_map.items()}

# Pass the label ids explicitly so each row of the report is tied to the right
# name even if a class is missing from y_test / y_pred, and report 0 (without a
# warning) for classes that are never predicted.
class_ids = sorted(inv_label_map)
print(
    classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=[inv_label_map[i] for i in class_ids],
        zero_division=0,
    )
)

                 precision    recall  f1-score   support

       SUPPORTS       0.61      0.65      0.63       269
        REFUTES       0.69      0.24      0.35        92
NOT_ENOUGH_INFO       0.63      0.83      0.72       386
       DISPUTED       0.00      0.00      0.00        78

       accuracy                           0.63       825
      macro avg       0.48      0.43      0.43       825
   weighted avg       0.57      0.63      0.58       825



In [10]:
# Persist the fitted classifier; the list of written file names is the cell output.
joblib.dump(value=clf, filename="logistic_model_improve.joblib")

['logistic_model_improve.joblib']

In [19]:
# Load trained classifier
# NOTE: joblib.load unpickles the file -- only load model files you created yourself.
clf = joblib.load("logistic_model_improve.joblib")

# Embed only claim_texts (without evidence).
# All claims are encoded in one batched call instead of one call per claim.
dev_claim_ids = list(dev_data)
dev_claim_texts = [f"query: {dev_data[cid]['claim_text']}" for cid in dev_claim_ids]
dev_claim_embeddings = np.asarray(embedder.encode(dev_claim_texts))

# Predict using trained model
pred_labels = clf.predict(dev_claim_embeddings)
# `inv_label_map` (class id -> label name) comes from the evaluation cell above.
pred_label_names = [inv_label_map[int(label)] for label in pred_labels]

# Claim id -> predicted label name
predictions_without_retrieval = dict(zip(dev_claim_ids, pred_label_names))

# Save to file
with open("predictions_without_retrieval.json", "w") as f:
    json.dump(predictions_without_retrieval, f, indent=2)

print("Predictions without retrieval saved.")

Predictions without retrieval saved.


In [16]:
# Evaluating through retrieval outputs

label_map_rev = {0: "SUPPORTS", 1: "REFUTES", 2: "NOT_ENOUGH_INFO", 3: "DISPUTED"}

# Number of highest-scoring evidence passages kept per claim.
N_TOP_EVIDENCE = 5

predictions = {}

for cid, entry in retrieval_data.items():
    claim_text = entry["claim_text"]

    # Drop ids that are missing from the evidence store *before* building the
    # texts, so that positions in `evid_ids`, `evidence_texts` and `pair_vecs`
    # always refer to the same passage.
    evid_ids = [eid for eid in entry["filtered_evidences"] if eid in evidence_data]
    if not evid_ids:
        # No usable evidence: the claim is left out of `predictions`.
        continue
    evidence_texts = [evidence_data[eid] for eid in evid_ids]

    claim_vec = embedder.encode(f"query: {claim_text}")
    inputs = [f"query: {claim_text} passage: {ev}" for ev in evidence_texts]
    pair_vecs = embedder.encode(inputs)

    # Rank the claim+evidence vectors by cosine similarity to the claim-only
    # vector and keep the best ones, highest score first.
    cos_scores = cosine_similarity([claim_vec], pair_vecs)[0]
    top_k = np.argsort(cos_scores)[-N_TOP_EVIDENCE:][::-1]

    top_vecs = [pair_vecs[i] for i in top_k]
    pred_probs = clf.predict_proba(top_vecs)
    avg_probs = np.mean(pred_probs, axis=0)

    # predict_proba columns are ordered like clf.classes_, so translate the
    # winning column into its class id instead of assuming column i == label i.
    best_class = int(clf.classes_[np.argmax(avg_probs)])
    final_label = label_map_rev[best_class]
    top_evid_ids = [evid_ids[i] for i in top_k]

    predictions[cid] = {
        "claim_label": final_label,
        "evidences": top_evid_ids
    }

In [17]:
# Write the per-claim predictions (label + evidence ids) as indented JSON.
eval_ready_path = "predictions_eval_ready.json"
with open(eval_ready_path, "w") as out_file:
    json.dump(predictions, out_file, indent=2)

print("Saved predictions to predictions_eval_ready.json")

Saved predictions to predictions_eval_ready.json


In [18]:
# Run eval.py on the saved predictions, using dev-claims.json as ground truth.
!python eval.py --predictions predictions_eval_ready.json --groundtruth dev-claims.json

Evidence Retrieval F-score (F)    = 0.13520923520923522
Claim Classification Accuracy (A) = 0.42207792207792205
Harmonic Mean of F and A          = 0.20480943189384412
