# 2025 COMP90042 Project
*Make sure you rename this file to include your group id.*

# Readme
*If there is something to be noted for the marker, please mention here.*

*If you are planning to implement your program in an object-oriented programming style, please put that code at the bottom of this ipynb file.*

# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

In [None]:
import json
from collections import Counter
import numpy as np

# Load data
# The encoding is pinned to UTF-8 so that loading does not depend on the
# platform's default locale encoding (e.g. cp1252 on Windows), which can raise
# UnicodeDecodeError or silently garble non-ASCII characters in the text.
# train / dev: claim id -> info dict (claim_text, claim_label, evidences).
with open("data/train-claims.json", encoding="utf-8") as f:
    train = json.load(f)
with open("data/dev-claims.json", encoding="utf-8") as f:
    dev = json.load(f)
# test: the unlabelled split (presumably claim id -> info dict without labels).
with open("data/test-claims-unlabelled.json", encoding="utf-8") as f:
    test = json.load(f)
# evidence: evidence id -> passage text.
with open("data/evidence.json", encoding="utf-8") as f:
    evidence = json.load(f)

# Claim counts and class distribution
def summarize_claims(claim_dict):
    """Print the size, label distribution and mean evidence count of a claim set.

    Args:
        claim_dict: Mapping of claim id to an info dict. ``claim_label`` and
            ``evidences`` are read when present; claims lacking them (e.g. an
            unlabelled split) are still counted, contribute no label, and
            count as having zero evidences.

    Returns:
        None. The three summary lines are written to stdout.
    """
    claims = list(claim_dict.values())
    labels = [info["claim_label"] for info in claims if "claim_label" in info]
    evid_counts = [len(info.get("evidences", ())) for info in claims]
    # np.mean([]) emits a RuntimeWarning before returning NaN; report NaN
    # directly so an empty claim set is summarised without the warning.
    avg_evidences = np.mean(evid_counts) if evid_counts else float("nan")
    print(f"→ #claims: {len(claims)}")
    print("→ class dist:", Counter(labels))
    print(f"→ avg. evidences/claim: {avg_evidences:.2f}")

# Per-split claim summaries; the DEV header carries the blank separator line.
for header, claims in (("TRAIN set:", train), ("\nDEV set:", dev)):
    print(header)
    summarize_claims(claims)

# Evidence corpus stats: whitespace-token count of every passage.
lengths = list(map(lambda passage: len(passage.split()), evidence.values()))
n_passages = len(lengths)
mean_len, longest, shortest = np.mean(lengths), np.max(lengths), np.min(lengths)
print(f"\n#evidence passages: {n_passages}")
print(f"passage length (tokens) — avg: {mean_len:.1f}, max: {longest}, min: {shortest}")


TRAIN set:
→ #claims: 1228
→ class dist: Counter({'SUPPORTS': 519, 'NOT_ENOUGH_INFO': 386, 'REFUTES': 199, 'DISPUTED': 124})
→ avg. evidences/claim: 3.36

DEV set:
→ #claims: 154
→ class dist: Counter({'SUPPORTS': 68, 'NOT_ENOUGH_INFO': 41, 'REFUTES': 27, 'DISPUTED': 18})
→ avg. evidences/claim: 3.19

#evidence passages: 1208827
passage length (tokens) — avg: 19.7, max: 479, min: 1


# 2. Model Implementation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Baseline Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1. Fit TF-IDF on evidence
# ev_texts[i] and ev_ids[i] refer to the same passage, as does row i of ev_tfidf.
ev_texts = list(evidence.values())
ev_ids   = list(evidence.keys())
ev_vec   = TfidfVectorizer(max_features=50000).fit(ev_texts)
ev_tfidf = ev_vec.transform(ev_texts)


def _claims_with_top1_evidence(claims):
    """Concatenate every claim with its most similar evidence passage.

    Args:
        claims: Mapping of claim id to an info dict holding ``claim_text``
            and ``claim_label``.

    Returns:
        A ``(docs, labels)`` pair of equal-length lists: ``docs[i]`` is the
        claim text, a space, and the text of the top-1 retrieved passage;
        ``labels[i]`` is that claim's ``claim_label``.
    """
    docs, labels = [], []
    for info in claims.values():
        # Score one claim at a time: each call yields a single dense row over
        # the whole evidence corpus, so memory stays bounded per claim.
        claim_tfidf = ev_vec.transform([info["claim_text"]])
        sims = cosine_similarity(claim_tfidf, ev_tfidf)[0]
        top1 = np.argmax(sims)
        docs.append(info["claim_text"] + " " + ev_texts[top1])
        labels.append(info["claim_label"])
    return docs, labels


# 2. For each train claim, retrieve top-1 evidence
# (the claims are loaded as `train`; iterating an undefined `train_claims`
# raised NameError here).
train_docs, train_labels = _claims_with_top1_evidence(train)

# 3. Vectorize claim+evidence concatenations
clf_vec = TfidfVectorizer(max_features=50000)
X_train = clf_vec.fit_transform(train_docs)

# 4. Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, train_labels)

# 5. Evaluate on dev set similarly
# Dev documents are built with the same retrieval step and projected with the
# vectorizer fitted on train (transform only, never re-fit on dev).
dev_docs, dev_labels = _claims_with_top1_evidence(dev)
X_dev = clf_vec.transform(dev_docs)
dev_acc = clf.score(X_dev, dev_labels)  # mean accuracy over the dev claims
print(f"Baseline dev accuracy: {dev_acc:.4f}")


# 3.Testing and Evaluation
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

## Object Oriented Programming codes here

*You can use multiple code snippets. Just add more if needed*