# NLP Assignment 01

## Setup

### General Setup

In [96]:
# import libraries
import sys, subprocess, shlex, os, warnings
from typing import List

# helper: to run commands
def run(cmd: str) -> int:
    """Echo a command line, run it without a shell and return its exit code.

    Args:
        cmd: Command line; it is tokenised with ``shlex.split``.

    Returns:
        The process exit code, or 127 when the executable cannot be found
        (e.g. probing for ``nvidia-smi`` on a machine without it), so that
        callers comparing the result against 0 keep working instead of
        crashing with ``FileNotFoundError``.
    """
    print(f"$ {cmd}")
    try:
        return subprocess.run(shlex.split(cmd), check=False).returncode
    except FileNotFoundError as exc:
        # 127 mirrors the conventional shell status for "command not found"
        print(f"command not found: {exc.filename}")
        return 127

# helper : to install pip packages
def pip_install(args: List[str], index_url: str | None = None, quiet: bool = True):
    """Run ``pip install`` for *args* with the current interpreter.

    Args:
        args: Requirement strings / extra arguments appended to the command.
        index_url: When given, passed to pip as ``--index-url``.
        quiet: When true, ``-q`` is added to the command.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    quiet_flag = ["-q"] if quiet else []
    index_flag = ["--index-url", index_url] if index_url else []
    cmd = [
        sys.executable, "-m", "pip", "install",
        *quiet_flag,
        "--upgrade-strategy", "only-if-needed",
        *args,
        *index_flag,
    ]
    print("$", " ".join(cmd))
    subprocess.check_call(cmd)

# helper : to uninstall pip packages
def pip_uninstall(pkgs: List[str], quiet: bool = True):
    """Run ``pip uninstall -y`` for *pkgs*; does nothing for an empty list.

    The pip exit status is not checked (``check=False``), so a failing
    uninstall does not raise.
    """
    if pkgs:
        quiet_flag = ["-q"] if quiet else []
        cmd = [sys.executable, "-m", "pip", "uninstall", "-y", *quiet_flag, *pkgs]
        print("$", " ".join(cmd))
        subprocess.run(cmd, check=False)

# distribution metadata: prefer the stdlib module, otherwise use the PyPI
# backport, which is imported under its own top-level name `importlib_metadata`
# (retrying `from importlib import metadata` after installing it would fail
# exactly like the first attempt)
try:
    from importlib import metadata as importlib_metadata
except ImportError:
    try:
        import importlib_metadata
    except ImportError:
        pip_install(["importlib_metadata>=6.0"])
        import importlib_metadata

# requirement parsing: install `packaging` on demand
# (ModuleNotFoundError is a subclass of ImportError, so one clause is enough)
try:
    from packaging.requirements import Requirement
except ImportError:
    pip_install(["packaging>=24.0"])
    from packaging.requirements import Requirement

# helper : check for package requirements
def satisfies(requirement_str: str) -> bool:
    """Return whether the installed distribution meets *requirement_str*.

    Args:
        requirement_str: A requirement such as ``"pandas>=2.2"`` or ``"torch"``.

    Returns:
        ``False`` when the distribution is not installed; ``True`` when the
        requirement has no version specifier or the installed version matches
        it. Installed pre-release/dev versions (e.g. nightly builds) are
        accepted when they match: the plain ``in`` operator excludes
        pre-releases by default, which would trigger a reinstall on every run.
    """
    req = Requirement(requirement_str)
    try:
        installed = importlib_metadata.version(req.name)
    except importlib_metadata.PackageNotFoundError:
        return False
    if not req.specifier:
        return True
    return req.specifier.contains(installed, prereleases=True)

# helper : check or install for package requirements
def ensure(requirements: List[str], index_url: str | None = None):
    """Install whichever of *requirements* are not satisfied yet.

    Requirements are checked with ``satisfies``; the unmet ones are handed to
    ``pip_install`` (optionally with *index_url*). When everything is already
    satisfied only a confirmation line is printed.
    """
    unmet = [spec for spec in requirements if not satisfies(spec)]
    if not unmet:
        print(f"✓ All satisfied: {', '.join(requirements)}")
        return
    pip_install(unmet, index_url=index_url)

# make sure pip, setuptools, wheel and jedi meet the minimum versions below;
# any requirement that is missing or too old is installed through pip
ensure(["pip>=23", "setuptools>=68", "wheel>=0.41", "jedi>=0.16"])

✓ All satisfied: pip>=23, setuptools>=68, wheel>=0.41, jedi>=0.16


### Requirements

In [97]:
# check for gpu: hardware is probed through `nvidia-smi -L`, and an already
# installed torch is asked which CUDA version it was built against
gpu_hw = run("nvidia-smi -L") == 0
try:
    import torch as _torch
except ModuleNotFoundError:
    # torch is not installed yet
    torch_cuda = None
else:
    torch_cuda = _torch.version.cuda
# target a GPU build when either signal is present
gpu_target = bool(gpu_hw or torch_cuda)
print(f"GPU hardware: {gpu_hw} | torch CUDA: {torch_cuda} | target GPU build: {gpu_target}")

# specs for torch version
# is_colab is a manual switch: False selects the unpinned torch packages from
# the nightly wheel indexes, True selects the pinned 2.4 series
is_colab = False

if not is_colab:
    TORCH_SPECS = ["torch", "torchvision", "torchaudio"]
    GPU_INDEX_PRIMARY = "https://download.pytorch.org/whl/nightly/cu128"
    # NOTE(review): "cu12x" does not look like a published PyTorch index tag
    # (it resembles the CuPy package naming) — confirm this URL resolves
    GPU_INDEX_ALT     = "https://download.pytorch.org/whl/nightly/cu12x"
else:
    # alternative pin that is currently disabled:
    # TORCH_SPECS = ["torch>=2.6,<2.8", "torchvision>=0.21,<0.23", "torchaudio>=2.6,<2.8"]
    # GPU_INDEX_PRIMARY = "https://download.pytorch.org/whl/cu124"
    # GPU_INDEX_ALT     = "https://download.pytorch.org/whl/cu121"

    TORCH_SPECS = ["torch==2.4.*", "torchvision==0.19.*", "torchaudio==2.4.*"]
    GPU_INDEX_PRIMARY = "https://download.pytorch.org/whl/cu121"
    # no secondary GPU index for this configuration
    GPU_INDEX_ALT     = None

# wheel index used for CPU-only torch builds
CPU_INDEX = "https://download.pytorch.org/whl/cpu"

# helper : check for mismatch between gpu/cpu
def current_torch_build():
    """Classify the installed torch build.

    Returns:
        ``"gpu"`` when ``torch.version.cuda`` is set, ``"cpu"`` when it is
        empty/None, and ``None`` when torch is not installed.
    """
    try:
        import torch
    except ModuleNotFoundError:
        return None
    if torch.version.cuda:
        return "gpu"
    return "cpu"

# compare the wanted build flavour with the installed one; a wrong-flavour
# torch is removed so that the install step below fetches the right wheels
tgt_build = "gpu" if gpu_target else "cpu"
cur_build = current_torch_build()
build_mismatch = bool(cur_build) and cur_build != tgt_build
if build_mismatch:
    print(f"Detected mismatched torch build ({cur_build}) → uninstalling torch/vision/audio...")
    pip_uninstall(["torch", "torchvision", "torchaudio"])

# install torch
# CPU target: use the CPU wheel index.
# GPU target: try every configured GPU index in order; when none of them works
# (including the case where no alternative index is configured) fall back to
# the CPU wheels, so the `import torch` below never runs against a missing
# package because an install error was silently swallowed.
if tgt_build != "gpu":
    ensure(TORCH_SPECS, index_url=CPU_INDEX)
else:
    _gpu_indexes = [url for url in (GPU_INDEX_PRIMARY, GPU_INDEX_ALT) if url]
    for _gpu_index in _gpu_indexes:
        try:
            ensure(TORCH_SPECS, index_url=_gpu_index)
        except subprocess.CalledProcessError:
            print(f"Torch install from {_gpu_index} failed → trying next option...")
        else:
            break
    else:
        # no GPU index succeeded: clear any partial install, then use CPU wheels
        pip_uninstall(["torch", "torchvision", "torchaudio"])
        ensure(TORCH_SPECS, index_url=CPU_INDEX)

import torch
print("Torch:", torch.__version__, "| CUDA runtime:", torch.version.cuda, "| cuda.is_available:", torch.cuda.is_available())

# install dependency packages
ensure(["ftfy>=6.2", "requests>=2.32", "pandas>=2.2", "tqdm>=4.66", "scikit-learn"])

# install spacy
ensure(["spacy", "thinc", "numpy"])

# pick the CuPy wheel matching the CUDA major version torch was built with;
# without a CUDA 11/12 runtime, spaCy is pinned to 3.7.* and runs on CPU
cuda_ver = torch.version.cuda or ""
_cupy_by_cuda_major = {
    "12": "cupy-cuda12x>=12.0",
    "11": "cupy-cuda11x>=12.0",
}
_cupy_req = _cupy_by_cuda_major.get(cuda_ver[:2])
if _cupy_req is not None:
    ensure([_cupy_req])
else:
    print("No CUDA runtime in PyTorch → spaCy will use CPU.")
    ensure(["spacy==3.7.*"])

# helper : check whether a spaCy model is installed as a package
def has_spacy_model_pkg(name: str) -> bool:
    """Return True when a distribution called *name* has installed metadata.

    The lookup uses ``importlib.metadata.version``; a failed lookup surfaces
    as ``ModuleNotFoundError`` and is reported as ``False``.
    """
    from importlib import metadata as im

    try:
        im.version(name)
    except ModuleNotFoundError:
        return False
    return True

import spacy

# download the small English model unless its package is already installed
MODEL = "en_core_web_sm"
if has_spacy_model_pkg(MODEL):
    print(f"✓ spaCy {spacy.__version__} model {MODEL} already present")
else:
    rc = run(f'"{sys.executable}" -m spacy download {MODEL}')
    if rc != 0:
        raise RuntimeError(f"Failed to download {MODEL}; exit code {rc}")
    print(f"✓ Installed {MODEL} for spaCy {spacy.__version__}")

# best-effort GPU activation: any failure is only reported, never raised
try:
    used_gpu = spacy.prefer_gpu()
    if used_gpu:
        import cupy
        print("CuPy devices:", cupy.cuda.runtime.getDeviceCount())
except Exception as exc:
    print("GPU check note:", exc)

$ nvidia-smi -L
GPU hardware: True | torch CUDA: 12.8 | target GPU build: True
✓ All satisfied: torch, torchvision, torchaudio
Torch: 2.9.0.dev20250830+cu128 | CUDA runtime: 12.8 | cuda.is_available: True
✓ All satisfied: ftfy>=6.2, requests>=2.32, pandas>=2.2, tqdm>=4.66, scikit-learn
✓ All satisfied: spacy, thinc, numpy
✓ All satisfied: cupy-cuda12x>=12.0
✓ spaCy 3.8.7 model en_core_web_sm already present
CuPy devices: 1


In [98]:
# print(torch.cuda.device_count())
# print(torch.cuda.is_available())
# print(torch.cuda.current_device())
# print(torch.cuda.device(0))
# print(torch.cuda.get_device_name(0))

## Import Libraries

In [99]:
import tarfile, gzip, shutil, requests, re, html, unicodedata
from collections import defaultdict
from ftfy import fix_text
import pandas as pd

## Preprocessing

### Download the Dataset

In [100]:
# helper: to download the required files
def download(url, out_path, chunk_size=1 << 20):
    """Download *url* to *out_path* unless that file already exists.

    The response body is streamed to ``<out_path>.part`` in chunks and renamed
    to *out_path* only after the transfer completed. This keeps memory use
    bounded for large archives and guarantees that an interrupted download
    never leaves a truncated file that a later run would treat as complete.

    Args:
        url: Address of the file to fetch.
        out_path: Destination path; an existing file is left untouched.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    if os.path.exists(out_path):
        return

    tmp_path = f"{out_path}.part"
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        # atomic rename: out_path appears only once it is complete
        os.replace(tmp_path, out_path)
    finally:
        # after a successful rename the temp file is gone; after a failure
        # remove whatever was written so far
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

# download the required files
# the full passage collection is only needed while neither the sampled
# collection nor the extracted full collection is on disk
if not (os.path.exists("collection.top3.tsv") or os.path.exists("collection.tsv")):
    download("https://msmarco.z22.web.core.windows.net/msmarcoranking/collection.tar.gz", "collection.tar.gz")
    with tarfile.open("collection.tar.gz", "r:gz") as archive:
        archive.extractall(path=".", filter="data")

# only the training queries are extracted from the queries archive
if not os.path.exists("queries.train.tsv"):
    download("https://msmarco.z22.web.core.windows.net/msmarcoranking/queries.tar.gz", "queries.tar.gz")
    with tarfile.open("queries.tar.gz", "r:gz") as archive:
        archive.extract(archive.getmember("queries.train.tsv"), path=".", filter="data")

if not os.path.exists("qidpidtriples.top3.tsv"):
    # maximum number of rows kept per (qid, pid_pos) pair
    DUPLICATE_INDEX = 3

    src = "qidpidtriples.train.full.2.tsv"
    out = "qidpidtriples.top3.tsv"

    if not os.path.exists(src):
        download("https://msmarco.z22.web.core.windows.net/msmarcoranking/qidpidtriples.train.full.2.tsv.gz",
                 "qidpidtriples.train.full.2.tsv.gz")

        # decompress into a temp file and rename afterwards, so an interrupted
        # run cannot leave a truncated `src` that passes the exists() check
        _src_tmp = src + ".part"
        with gzip.open("qidpidtriples.train.full.2.tsv.gz", "rb") as f_in, \
             open(_src_tmp, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(_src_tmp, src)

    # reduce qidpidtriples to only keep up to 3 (qid, pid_pos, pid_neg)
    counts = defaultdict(int)  # (qid, pid_pos) -> rows written so far

    # same temp-file-then-rename approach for the reduced output
    _out_tmp = out + ".part"
    with open(src, "r", encoding="utf-8", newline="") as fin, \
         open(_out_tmp, "w", encoding="utf-8", newline="") as fout:
        for line in fin:
            parts = line.rstrip("\r\n").split("\t")
            if len(parts) != 3:
                continue  # skip rows that are not exactly three fields
            qid_s, pid_pos_s, pid_neg_s = parts
            key = (int(qid_s), int(pid_pos_s))
            if counts[key] < DUPLICATE_INDEX:
                # keep this row; only a final line without newline needs one added
                fout.write(line if line.endswith("\n") else line + "\n")
                counts[key] += 1
    os.replace(_out_tmp, out)

### Load and Sample the Dataset

In [101]:
# define the sample size
# the number of triple rows loaded below is derived from the product of these two values
K_SAMPLE = 10000
# presumably the per-(qid, pid_pos) row cap used when the top3 file was built — confirm
DUPLICATE_INDEX = 3

In [102]:
# load and sample the qidpidtriples dataset
_triple_cols = ["qid", "pid_pos", "pid_neg"]
qidpidtriples = pd.read_csv(
    "qidpidtriples.top3.tsv",
    sep="\t",
    header=None,
    encoding="utf-8",
    names=_triple_cols,
    dtype=dict.fromkeys(_triple_cols, "int32"),
)

# keep the first rows in qid order
# NOTE(review): the "- 1" drops one row from the last (qid, pid_pos) group —
# confirm this is intended and not an off-by-one
_n_rows = K_SAMPLE * DUPLICATE_INDEX - 1
qidpidtriples = (
    qidpidtriples
    .sort_values("qid", kind="stable")
    .head(_n_rows)
    .reset_index(drop=True)
)

print(qidpidtriples.shape)
qidpidtriples.head(5)

(29999, 3)


Unnamed: 0,qid,pid_pos,pid_neg
0,3,1142680,970814
1,3,1142680,4148285
2,3,1142680,2279447
3,4,5613529,967278
4,4,5613529,139284


In [103]:
# define the collection pid/passage ids
if os.path.exists("collection.top3.tsv"):
    # a previously sampled collection is on disk: just load it
    collection = pd.read_csv(
        "collection.top3.tsv",
        sep="\t",
        header=None,
        encoding="utf-8",
        names=["pid", "passage"],
        dtype={"pid": "int32", "passage": "string"},
    )
else:
    # pid_pos first
    pid_pos = qidpidtriples["pid_pos"].astype(int).drop_duplicates()

    # pid_neg that weren't already in pid_pos
    _neg_all = qidpidtriples["pid_neg"].astype(int)
    pid_neg = _neg_all[~_neg_all.isin(pid_pos)].drop_duplicates()

    # every passage id referenced by the sampled triples
    pids = set(pid_pos.tolist())
    pids.update(pid_neg.tolist())
    print(len(pids))

    # sample the collection from the pid: one pass over the full collection,
    # stopping as soon as every wanted pid has been found
    keep_passage = []
    with open("collection.tsv", "r", encoding="utf-8") as f:
        for line in f:
            pid_str, passage = line.rstrip("\n").split("\t", 1)
            pid = int(pid_str)
            if pid not in pids:
                continue
            keep_passage.append((pid, passage))  # keep as a row
            pids.remove(pid)
            if not pids:
                break

    collection = pd.DataFrame(keep_passage, columns=["pid", "passage"])

    out = "collection.top3.tsv"
    collection.to_csv(out, sep="\t", header=False, index=False, encoding="utf-8")
    print(f"Saved: {out}")

print(collection.shape)
collection.sort_values("pid", kind="stable").head(5)

38355
Saved: collection.top3.tsv
(38355, 2)


Unnamed: 0,pid,passage
0,196,The Expected Family Contribution (EFC) is deri...
1,212,"Without Amazon Prime membership, you will be p..."
2,289,About the Downtown Cruise Ship Berths. Juneau ...
3,905,Amsterdam: Annual Weather Averages. August is ...
4,984,"Lynda Carter was born on July 24, 1951 in Phoe..."


In [104]:
# define the sample qid/query ids
qids = set(qidpidtriples["qid"].astype(int).drop_duplicates().to_list())
print(len(qids))

# sample the queries from the qid: one pass over the query file, stopping as
# soon as every wanted qid has been found
keep_query = []
with open("queries.train.tsv", "r", encoding="utf-8") as f:
    for line in f:
        qid_str, query = line.rstrip("\n").split("\t", 1)
        qid = int(qid_str)
        if qid not in qids:
            continue
        keep_query.append((qid, query))  # keep as a row
        qids.remove(qid)
        if not qids:
            break

queries = pd.DataFrame(keep_query, columns=["qid", "query"])

print(queries.shape)
queries.sort_values("qid", kind="stable").head(5)

9649
(9649, 2)


Unnamed: 0,qid,query
437,3,Another name for the primary visual cortex is
5417,4,Defining alcoholism as a disease is associate...
3346,5,ECT is a treatment that is used for
7016,6,"Ebolavirus is an enveloped virus, which means"
7457,8,"In humans, the normal set point for body temp..."


In [109]:
# inputs
#   qidpidtriples: qid, pid_pos, pid_neg
#   collection:    pid, passage
#   queries:       qid, query

# remember the original row order in `_ord`, then attach the query text
df = (qidpidtriples
      .reset_index(names="_ord")
      .merge(queries, how="left", on="qid", validate="m:1"))

# attach the passage text for the positive and for the negative passage id;
# validate="m:1" makes pandas raise if an id is duplicated on the lookup side
col_pos = collection.rename(columns={"pid": "pid_pos", "passage": "passage_pos"})
col_neg = collection.rename(columns={"pid": "pid_neg", "passage": "passage_neg"})
df = df.merge(col_pos, how="left", on="pid_pos", validate="m:1")
df = df.merge(col_neg, how="left", on="pid_neg", validate="m:1")

# compiled datasets: original order restored, helper column dropped
_compiled_cols = ["qid", "query", "pid_pos", "passage_pos", "pid_neg", "passage_neg"]
compiled_df = df.sort_values("_ord")[_compiled_cols].reset_index(drop=True)

print(compiled_df.shape)
compiled_df.head(5)

(29999, 6)


Unnamed: 0,qid,query,pid_pos,passage_pos,pid_neg,passage_neg
0,3,Another name for the primary visual cortex is,1142680,The primary (parts of the cortex that receive ...,970814,The frontal lobe of the brain. The frontal lob...
1,3,Another name for the primary visual cortex is,1142680,The primary (parts of the cortex that receive ...,4148285,Damage to the primary motor cortex results onl...
2,3,Another name for the primary visual cortex is,1142680,The primary (parts of the cortex that receive ...,2279447,Primary motor cortex (Brodmann area 4) of the ...
3,4,Defining alcoholism as a disease is associate...,5613529,The formation of AA â Alcoholics Anonymous â...,967278,"Alcoholism, also known as alcohol use disorder..."
4,4,Defining alcoholism as a disease is associate...,5613529,The formation of AA â Alcoholics Anonymous â...,139284,Apologetics may be simply defined as the defen...


### Normalise the Dataset

In [106]:
def normalize_series(s: pd.Series, use_ftfy=True, keep_na=True) -> pd.Series:
    """Normalise a text column for matching.

    Steps, applied to every non-missing value: optional ``ftfy`` repair of
    mis-decoded text, HTML entity unescaping, Unicode NFKC normalisation,
    collapsing of whitespace runs to a single space, trimming and casefolding.

    Missing values are skipped by the per-value steps (calling ``fix_text`` or
    ``html.unescape`` on ``pd.NA`` raises) and therefore stay missing.

    Args:
        s: Input column; it is converted to pandas ``string`` dtype first.
        use_ftfy: Apply ``ftfy.fix_text`` before the other steps.
        keep_na: When true, positions that were missing in the input are
            explicitly reset to ``pd.NA`` at the end.

    Returns:
        A new Series of ``string`` dtype with the same index as *s*.
    """
    s = s.astype("string")
    na_mask = s.isna()

    def _map_present(series: pd.Series, fn) -> pd.Series:
        # apply fn to real strings only; Series.map yields object dtype, so
        # cast back to keep the vectorised .str methods NA-aware
        mapped = series.map(lambda v: v if pd.isna(v) else fn(v))
        return mapped.astype("string")

    if use_ftfy:
        s = _map_present(s, fix_text)
    s = _map_present(s, html.unescape)

    s = s.str.normalize("NFKC")
    s = s.str.replace(r"\s+", " ", regex=True).str.strip().str.casefold()

    if keep_na:
        s = s.mask(na_mask, other=pd.NA)
    return s

In [113]:
# one normalised row per (qid, pid_pos) pair: query text plus positive passage
id_cols = ["qid", "pid_pos"]
text_cols = ["query", "passage_pos"]
cols = [*id_cols, *text_cols]

# first occurrence of each pair in original row order
_first_per_pair = df.sort_values("_ord").drop_duplicates(subset=id_cols, keep="first")
qidpid_clean = _first_per_pair[cols].reset_index(drop=True)
qidpid_clean[text_cols] = qidpid_clean[text_cols].apply(normalize_series)

# NOTE(review): written as UTF-16 while the input files are read as UTF-8 —
# confirm the consumer of this file expects that encoding
out = "qidpid.clean.tsv"
qidpid_clean.to_csv(out, sep="\t", header=False, index=False, encoding="utf-16")
print(f"Saved: {out}")

print(qidpid_clean.shape)
qidpid_clean.head(5)

Saved: qidpid.clean.tsv
(10000, 4)


Unnamed: 0,qid,pid_pos,query,passage_pos
0,3,1142680,another name for the primary visual cortex is,the primary (parts of the cortex that receive ...
1,4,5613529,defining alcoholism as a disease is associated...,the formation of aa – alcoholics anonymous – i...
2,5,4956428,ect is a treatment that is used for,electroconvulsive therapy (ect). guide. electr...
3,6,1931409,"ebolavirus is an enveloped virus, which means","background. the ebola virus is an ""enveloped v..."
4,8,1094214,"in humans, the normal set point for body tempe...","at the normal set point, the body temperature,..."


In [107]:
# normalise the three text columns of the compiled triples in place
text_cols = ["query", "passage_pos", "passage_neg"]
compiled_df[text_cols] = compiled_df[text_cols].apply(normalize_series)

# text-only view of the triples (ids dropped)
qidpidtriples_clean = compiled_df[text_cols].copy()

# NOTE(review): written as UTF-16 while the input files are read as UTF-8 —
# confirm the consumer of this file expects that encoding
out = "qidpidtriples.top3.clean.tsv"
qidpidtriples_clean.to_csv(out, sep="\t", header=False, index=False, encoding="utf-16")
print(f"Saved: {out}")

print(qidpidtriples_clean.shape)
qidpidtriples_clean.head(5)

Saved: qidpidtriples.top3.clean.tsv
(29999, 3)


Unnamed: 0,query,passage_pos,passage_neg
0,another name for the primary visual cortex is,the primary (parts of the cortex that receive ...,the frontal lobe of the brain. the frontal lob...
1,another name for the primary visual cortex is,the primary (parts of the cortex that receive ...,damage to the primary motor cortex results onl...
2,another name for the primary visual cortex is,the primary (parts of the cortex that receive ...,primary motor cortex (brodmann area 4) of the ...
3,defining alcoholism as a disease is associated...,the formation of aa – alcoholics anonymous – i...,"alcoholism, also known as alcohol use disorder..."
4,defining alcoholism as a disease is associated...,the formation of aa – alcoholics anonymous – i...,apologetics may be simply defined as the defen...
