In [7]:
import os
import requests
import tarfile
import gzip, shutil
%load_ext autoreload
%autoreload 2

# Remote archives to fetch. "name" is the local filename used for the
# downloaded .gz; the decompressed file drops the trailing ".gz".
files = [
    {
        "url": "https://msmarco.z22.web.core.windows.net/msmarcoranking/qidpidtriples.train.full.2.tsv.gz",
        "name": "qidpidtriples.train.full.2.tsv.gz"
    }
]

# Download and decompress each archive. Every step is skipped when its output
# is already on disk, so the cell can be re-run safely.
for file in files:
    archive = file["name"]
    # Strip only a trailing ".gz" (str.replace would also remove one mid-name).
    filename = archive[: -len(".gz")] if archive.endswith(".gz") else archive

    if os.path.exists(filename):
        print(f"File {filename} already exists.")
        continue

    if os.path.exists(archive):
        print(f"File {archive} already exists.")
    else:
        # Stream to a temporary ".part" file so the (large) archive is never
        # held in memory and an interrupted download is not mistaken for a
        # complete one on the next run.
        partial = archive + ".part"
        with requests.get(file["url"], stream=True, timeout=60) as response:
            # Fail loudly instead of saving an HTTP error page as the archive.
            response.raise_for_status()
            with open(partial, "wb") as f:
                for block in response.iter_content(chunk_size=1 << 20):
                    f.write(block)
        os.replace(partial, archive)

    if filename == archive:
        # Not a ".gz" name: nothing to decompress.
        continue

    # Decompress to a temporary file first for the same reason as above.
    partial = filename + ".part"
    with gzip.open(archive, "rb") as src, open(partial, "wb") as dst:
        shutil.copyfileobj(src, dst)
    os.replace(partial, filename)
    print(f"Decompressed to {filename}")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
File qidpidtriples.train.full.2.tsv.gz already exists.
File qidpidtriples.train.full.2.tsv already exists.


In [None]:
import pandas as pd
import os
qrels_dev = "qrels.dev.tsv"
collection_tsv = "collection.tsv"

# Read qrels (judged passages)
qrels = pd.read_csv(
    qrels_dev,
    sep="\t",
    names=["qid", "_", "pid", "rel"],
    dtype={"qid": str, "pid": str, "rel": int}
)

# Passage ids that have at least one relevance judgement.
judged = set(qrels["pid"])

out_tsv = "common_dataset.tsv"

# total size of out_tsv will be greater than this once the distractor cell
# below has appended its rows
target_size = 20_000

if os.path.exists(out_tsv):
    # Cached run: recover the pids already on disk so that later cells which
    # read `written` also work after a kernel restart.
    written = set(
        pd.read_csv(
            out_tsv,
            sep="\t",
            names=["pid", "text"],
            usecols=["pid"],
            dtype={"pid": str},
        )["pid"]
    )
else:
    written = set()
    limit = min(target_size, len(judged))  # only take up to target_size judged passages
    with open(out_tsv, "w", encoding="utf-8") as out, pd.read_csv(
        collection_tsv,
        sep="\t",
        names=["pid", "text"],
        dtype={"pid": str, "text": str},
        chunksize=1_000_000,
        quoting=3,  # csv.QUOTE_NONE: treat quote characters in passages literally
        on_bad_lines="skip"
    ) as reader:
        for chunk in reader:
            if len(written) >= limit:
                break
            # Filter judged not yet written
            remaining = limit - len(written)
            keep = chunk[chunk["pid"].isin(judged - written)]
            if keep.empty:
                continue
            if len(keep) > remaining:
                keep = keep.iloc[:remaining]
            keep.to_csv(out, sep="\t", header=False, index=False)
            written.update(keep["pid"])
            # Stop right away once the quota is met; checking only at the top
            # of the loop would parse one more 1M-row chunk for nothing.
            if len(written) >= limit:
                break


In [14]:
# Load the passages written so far back from disk and report (rows, columns).
common_dataset = pd.read_csv(
    out_tsv,
    sep="\t",
    names=["pid", "text"],
    dtype={"pid": str, "text": str},
)
print(common_dataset.shape)

(20000, 2)


In [15]:
target_size = 40_000
seed=42

# Take the set of already-written pids from the file itself rather than from
# kernel state: the cell then works after a restart (when out_tsv was cached
# and `written` was never built) and re-running it cannot append duplicates.
written = set(
    pd.read_csv(
        out_tsv,
        sep="\t",
        names=["pid", "text"],
        usecols=["pid"],
        dtype={"pid": str},
    )["pid"]
)

# add random distractors to reach target_size
need = max(0, target_size - len(written))
if need > 0:
    # Open the output once for the whole pass and let the context managers
    # close both the reader and the file, including on early exit.
    with pd.read_csv(collection_tsv, sep="\t",
                     names=["pid","text"], dtype={"pid":str,"text":str},
                     chunksize=1_000_000, quoting=3, on_bad_lines="skip") as reader, \
            open(out_tsv, "a", encoding="utf-8") as out:
        for chunk in reader:
            # Candidates are passages not yet present in out_tsv.
            cand = chunk[~chunk["pid"].isin(written)]
            if len(cand) == 0:
                continue
            take = min(need, len(cand))
            # Fixed random_state keeps the sampled distractors reproducible.
            samp = cand.sample(n=take, random_state=seed)
            samp.to_csv(out, sep="\t", header=False, index=False)
            written.update(samp["pid"])
            need -= take
            if need == 0:
                break
    

In [16]:
# Re-read the passage file after the append step and report (rows, columns).
common_dataset = pd.read_csv(
    out_tsv,
    sep="\t",
    names=["pid", "text"],
    dtype={"pid": str, "text": str},
)
print(common_dataset.shape)

(40000, 2)


In [None]:
from tqdm.auto import tqdm
# Build merged_full by streaming the triples file in chunks (with a progress
# bar) and keeping only the triples whose positive AND negative passages are
# both present in common_dataset. The whole file is scanned; there is no early
# stop.
# drop_duplicates: Series.map needs a uniquely indexed lookup series.
pid_text = common_dataset.drop_duplicates("pid").set_index("pid")["text"]
known_pids = pid_text.index
filename = "qidpidtriples.train.full.2.tsv"

parts = []
total = 0  # running count of matching triples collected so far

reader = pd.read_csv(
    filename,
    sep="\t",
    names=["qid", "pos_pid", "neg_pid"],
    dtype={"qid": str, "pos_pid": str, "neg_pid": str},
    chunksize=500_000
)

for triples_chunk in tqdm(reader, desc="Reading chunks"):
    mask = triples_chunk["pos_pid"].isin(known_pids) & triples_chunk["neg_pid"].isin(known_pids)
    if not mask.any():
        continue
    # Copy so the text columns are added to an independent frame, not a view.
    sub = triples_chunk.loc[mask].copy()
    sub["pos_text"] = sub["pos_pid"].map(pid_text)
    sub["neg_text"] = sub["neg_pid"].map(pid_text)
    parts.append(sub)
    total += len(sub)

if parts:
    merged_full = pd.concat(parts, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the expected columns so downstream cells still run.
    merged_full = pd.DataFrame(
        columns=["qid", "pos_pid", "neg_pid", "pos_text", "neg_text"]
    )

print(merged_full.shape)
print(merged_full.head())

Reading chunks: 0it [00:00, ?it/s]

Reading chunks: 40it [00:16,  2.54it/s]

Collected 1000/1000 (100.0%)


Reading chunks: 796it [05:05,  2.61it/s]

Target reached.
(18691, 5)
       qid  pos_pid  neg_pid  \
0  1150887  1504586  7669864   
1  1150887  1504586  7622515   
2  1150887  1504586  7684386   
3  1150887  1504586  7672621   
4   242510  6054949  7329242   

                                            pos_text  \
0  The NVL function. You can use the NVL function...   
1  The NVL function. You can use the NVL function...   
2  The NVL function. You can use the NVL function...   
3  The NVL function. You can use the NVL function...   
4  Seagulls live anywhere between 5 to 15 years d...   

                                            neg_text  
0  The Excel Len function is a very useful functi...  
1  In math, inverse is explained as the opposite ...  
2  Method is from the object oriented paradigm an...  
3  Recursive function. Recursive function, in log...  
4  Answer by soundoff (121) Pit Bulls live betwee...  





In [18]:
# Persist the merged triples as CSV; the row index is written as the first,
# unnamed column (index=True is the pandas default, spelled out here).
merged_full.to_csv(
    "merged_full.csv",
    index=True,
)