# 2025 COMP90042 Project

Group 24 — FAISS / SimCSE preprocessing pipeline



# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

### Notebook: Ppl_preprocessing.ipynb

This notebook is used to compute the perplexity for each evidence text, which is helpful for downstream preprocessing tasks. The notebook demonstrates perplexity computation using gold evidence from the training set as an example.

Computing perplexity for all 1.2 million evidence texts would take approximately 7 hours on a T4 GPU. To speed up the process, we distribute the workload across multiple accounts.

The final result, `evidence_perplexity.json`, which contains perplexity scores for 1.2 million pieces of evidence, will be available via a Google Drive link provided in the README.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from accelerate.test_utils.testing import get_backend
from multiprocessing import Pool, cpu_count
from functools import partial
import json

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Report the number of CPU cores and the selected device.
print(cpu_count(), device)
# Report the number of CUDA devices visible to PyTorch.
print(torch.cuda.device_count())

2 cuda
1


### Upload Data Files

Upload the following files to the notebook's working directory (the code below opens them by bare file name, without a folder prefix):

- `train-claims.json`
- `dev-claims.json`
- `test-claims-unlabelled.json`
- `evidence.json`

### Output Files

- `evidence_subset.json`: Contains the gold evidence referenced by `train-claims.json`, in the format `{evidence_id: text}`
- `claims.json`: Contains the claims from the train, dev, and test files, in the format `{claim_id: {"claim_text": text}}`

In [None]:
# Input files, opened relative to the current working directory.
train_json_path = "train-claims.json"  # claims with gold evidence ids
dev_json_path = "dev-claims.json"  # claims with gold evidence ids
test_json_path = "test-claims-unlabelled.json"  # claims only
evidence_json_path = "evidence.json"  # evidence id -> evidence text
# Output files.
output_evidence_set_path = "evidence_subset.json"
output_claim_set_path = "claims.json"

with open(train_json_path, "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open(dev_json_path, "r", encoding="utf-8") as f:
    dev_data = json.load(f)
with open(test_json_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)
with open(evidence_json_path, "r", encoding="utf-8") as f:
    evidence_data = json.load(f)

# Only the training split contributes gold evidence here. To include the dev
# split as well, use: merged_data = {**train_data, **dev_data}
merged_data = train_data

# Collect the text of every gold evidence id referenced by a claim.
# NOTE: the misspelled name `evicence_set` is kept on purpose; it is a
# module-level name, so renaming it could break code outside this cell.
evicence_set = {}
for claim_info in merged_data.values():
    for pos_id in claim_info["evidences"]:
        if pos_id not in evidence_data:
            # Report and skip ids that have no entry in the evidence file.
            print(f"Warning: Evidence ID {pos_id} not found in evidence data.")
            continue
        evicence_set[pos_id] = evidence_data[pos_id]

print(len(evicence_set))
# Save the evidence set to a JSON file
with open(output_evidence_set_path, "w", encoding="utf-8") as f:
    json.dump(evicence_set, f, ensure_ascii=False, indent=4)


# Placeholder; rebound to the merged claim dictionary once all splits are processed.
claim_set = {}


def get_claims(data):
    """Extract the claim text of every claim in ``data``.

    Args:
        data: Mapping from claim id to a record that contains a
            ``"claim_text"`` entry.

    Returns:
        A new dict mapping each claim id to ``{"claim_text": <text>}``.
    """
    return {
        claim_id: {"claim_text": record["claim_text"]}
        for claim_id, record in data.items()
    }

# Extract the claim texts of each split, then merge them into one dictionary.
# On duplicate claim ids, later splits override earlier ones (train, dev, test).
train_claim_set, dev_claim_set, test_claim_set = map(
    get_claims, (train_data, dev_data, test_data)
)
claim_set = dict(train_claim_set)
claim_set.update(dev_claim_set)
claim_set.update(test_claim_set)
print(len(claim_set))
# Persist the merged claims as JSON.
with open(output_claim_set_path, "w", encoding="utf-8") as f:
    json.dump(claim_set, f, ensure_ascii=False, indent=4)

3121
1535


In [None]:
# Checkpoint used to score evidence texts.
model_id = "openai-community/gpt2-large"

# Tokenizer: reuse the end-of-sequence token as the padding token so that
# texts can be padded into batches.
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Model: load the weights, move them to the selected device, and mirror the
# tokenizer's padding choice in the model configuration.
model = GPT2LMHeadModel.from_pretrained(model_id)
model = model.to(device)
model.config.pad_token_id = model.config.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Perplexity Computation

The function `fast_perplexity_batch` computes the perplexity of a list of input texts using a language model. For a sequence $W$ of $m$ tokens to which the model assigns probability $P(W)$, the perplexity is given by:

$$
PP(W) = \sqrt[m]{\frac{1}{P(W)}}
$$

Taking the logarithm of both sides:

$$
\log{PP(W)} = -\frac{1}{m} \log{P(W)}
$$

In the implementation:

- Cross-entropy loss is used to estimate the negative log-likelihood.
- The loss is masked by the attention mask to ignore padding tokens.
- The mean per-token loss of each sequence is exponentiated to get that sequence's perplexity.

In [None]:
def fast_perplexity_batch(text_list, tokenizer, model, device, max_length=128):
    """Compute one perplexity score per text with a causal language model.

    The texts are tokenized as one padded batch (truncated to ``max_length``
    tokens). The next-token cross-entropy is averaged over the non-padding
    target positions of each sequence, and the mean is exponentiated.

    Args:
        text_list: List of strings to score.
        tokenizer: Callable invoked with ``return_tensors="pt"``,
            ``padding=True``, ``truncation=True`` and ``max_length``; its
            result must expose ``input_ids`` and ``attention_mask`` tensors.
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes ``logits`` indexed as (batch, position, vocab).
        device: Device the token tensors are moved to before the forward pass.
        max_length: Maximum number of tokens kept per text.

    Returns:
        A list of floats, one per input text, in input order. A sequence with
        no scorable target position (its shifted attention mask is all zero)
        yields NaN, because its mean loss is 0 / 0. An empty ``text_list``
        yields an empty list.
    """
    # Nothing to score: skip the tokenizer and the forward pass entirely.
    if not text_list:
        return []

    encodings = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    input_ids = encodings.input_ids.to(device)
    attention_mask = encodings.attention_mask.to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # The logits at position t are scored against the token at position t + 1,
    # so drop the last logit and the first label.
    shift_logits = logits[:, :-1, :]
    shift_labels = input_ids[:, 1:]
    target_mask = attention_mask[:, 1:].bool()

    token_loss = torch.nn.functional.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
        reduction="none",
    ).view(shift_labels.size())

    # Select rather than multiply: a non-finite loss at a padded position
    # would otherwise turn the whole sequence into NaN (inf * 0 == NaN).
    token_loss = torch.where(target_mask, token_loss, torch.zeros_like(token_loss))

    seq_loss = token_loss.sum(dim=1) / target_mask.sum(dim=1)
    return torch.exp(seq_loss).tolist()

In [None]:
def process_one_batch(batch_ids, batch_texts, tokenizer, model, device, max_length=128):
    """Score one batch of texts and pair every id with its outcome.

    Args:
        batch_ids: Ids of the texts in this batch (strings or ints).
        batch_texts: Texts to score, aligned with ``batch_ids``.
        tokenizer: Tokenizer forwarded to ``fast_perplexity_batch``.
        model: Language model forwarded to ``fast_perplexity_batch``.
        device: Device forwarded to ``fast_perplexity_batch``.
        max_length: Token limit forwarded to ``fast_perplexity_batch``.

    Returns:
        A list of ``(eid, record)`` tuples. On success each record is
        ``{"text": ..., "ppl": ...}``. If scoring raises, every item of the
        batch gets ``{"text": ..., "error": "PPL error: <message>"}`` instead,
        so one failing batch does not abort the whole run.
    """
    results = []
    try:
        scores = fast_perplexity_batch(batch_texts, tokenizer, model, device, max_length=max_length)
        results.extend(
            (eid, {"text": text, "ppl": score})
            for eid, text, score in zip(batch_ids, batch_texts, scores)
        )
    except Exception as e:
        # Best effort: record the failure on each item rather than re-raising.
        results.extend(
            (eid, {"text": text, "error": f"PPL error: {e}"})
            for eid, text in zip(batch_ids, batch_texts)
        )
    return results

### Run Full Evidence Perplexity Computation

To compute perplexity for the full set of 1.2 million evidence texts, uncomment the lines in the cell below that load `evidence.json`, and use them in place of the lines that load `evidence_subset.json`.

In [None]:
import json

result = {}  # results accumulated since the last partial file was written
save_every = 640  # write a partial file once this many results have accumulated
batch_size = 64  # number of texts scored per forward pass
batch_id = 0  # index of the next partial output file

# with open("evidence.json", "r", encoding="utf-8") as f:
#     evidence = json.load(f)

with open("evidence_subset.json", "r", encoding="utf-8") as f:
    evidence = json.load(f)

total = len(evidence)
print(f"Total samples: {total}")

eids = list(evidence.keys())
texts = list(evidence.values())

for i in range(0, total, batch_size):
    batch_eids = eids[i:i + batch_size]
    batch_texts = texts[i:i + batch_size]

    batch_results = process_one_batch(batch_eids, batch_texts, tokenizer, model, device)
    result.update(batch_results)

    # Count the samples actually consumed so the last, shorter batch does not
    # report more samples than exist.
    processed = i + len(batch_eids)
    print(f"Progress: {processed}/{total}")

    # Flush on the accumulated size rather than on the loop index, so partial
    # files are still written when save_every is not a multiple of batch_size.
    if len(result) >= save_every or processed >= total:
        print(f'Saving batch {batch_id}: {processed} / {total}')
        with open(f"colab_partial_{batch_id}.json", "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        # with open(f"drive/MyDrive/nlp_data/colab_partial_{batch_id}.json", "w", encoding="utf-8") as f:
        #     json.dump(result, f, ensure_ascii=False, indent=2)
        batch_id += 1
        result.clear()

Total samples: 3121
Progress: 64/3121
Progress: 128/3121
Progress: 192/3121
Progress: 256/3121
Progress: 320/3121
Progress: 384/3121
Progress: 448/3121
Progress: 512/3121
Progress: 576/3121
Progress: 640/3121
Saving batch 0: 640 / 3121
Progress: 704/3121
Progress: 768/3121
Progress: 832/3121
Progress: 896/3121
Progress: 960/3121
Progress: 1024/3121
Progress: 1088/3121
Progress: 1152/3121
Progress: 1216/3121
Progress: 1280/3121
Saving batch 1: 1280 / 3121
Progress: 1344/3121
Progress: 1408/3121
Progress: 1472/3121
Progress: 1536/3121
Progress: 1600/3121
Progress: 1664/3121
Progress: 1728/3121
Progress: 1792/3121
Progress: 1856/3121
Progress: 1920/3121
Saving batch 2: 1920 / 3121
Progress: 1984/3121
Progress: 2048/3121
Progress: 2112/3121
Progress: 2176/3121
Progress: 2240/3121
Progress: 2304/3121
Progress: 2368/3121
Progress: 2432/3121
Progress: 2496/3121
Progress: 2560/3121
Saving batch 3: 2560 / 3121
Progress: 2624/3121
Progress: 2688/3121
Progress: 2752/3121
Progress: 2816/3121
Progr

In [None]:
import glob

# Merge every partial result file into a single dictionary. The file list is
# sorted so the merge order, and therefore the key order of the output and the
# winner on duplicate ids, is the same on every run.
# NOTE(review): the pattern also matches partial files left over from earlier
# runs in the same directory; delete stale colab_partial_*.json files first if
# the merged entry count is larger than expected.
result_files = sorted(glob.glob("colab_partial_*.json"))
final = {}
for file in result_files:
    with open(file, "r", encoding="utf-8") as f:
        final.update(json.load(f))

# ensure_ascii=False keeps non-ASCII text readable, matching how the partial
# files are written.
with open("evidence_perplexity.json", "w", encoding="utf-8") as f:
    json.dump(final, f, ensure_ascii=False, indent=2)

# Analysis of Gold-Standard Evidence


In [None]:
import json
# Reload the merged perplexity results; this rebinds the module-level name
# `evidence` to the contents of evidence_perplexity.json.
with open("evidence_perplexity.json", "r", encoding="utf-8") as f:
    evidence = json.load(f)


In [None]:
# Number of entries loaded from the merged perplexity file.
len(evidence)

3364

In [None]:
# Split the scored evidence by perplexity: entries scoring below the threshold
# go to good_one, everything else goes to bad_one.
ppl_threshold = 85
good_one = {}
bad_one = {}
for eid, data in evidence.items():
    ppl = data.get("ppl")
    # An entry without a "ppl" value was never scored, so it must not be
    # counted as low-perplexity evidence. A NaN score also fails the
    # comparison and lands in bad_one.
    if ppl is not None and ppl < ppl_threshold:
        good_one[eid] = data
    else:
        bad_one[eid] = data
# Guard the ratio so an empty result file does not raise ZeroDivisionError.
bad_ratio = len(bad_one) / len(evidence) if evidence else 0.0
print(len(bad_one), bad_ratio)

220 0.06539833531510107
