In [1]:
import pandas as pd
from sklearn.model_selection import KFold
import yaml
import pyterrier as pt
import os
import json

In [2]:
# Start the JVM with the terrier-prf plugin on the classpath.
# `pt.started()` / `pt.init(boot_packages=...)` are deprecated; PyTerrier's own
# deprecation notice recommends `pt.java.add_package` + `pt.java.init` instead.
# Packages can only be registered before Java starts, hence the guard.
if not pt.java.started():
    pt.java.add_package("com.github.terrierteam", "terrier-prf", "-SNAPSHOT")
    pt.java.init()

  if not pt.started():
Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.10 (build: craigm 2024-08-22 17:33), helper_version=0.0.8]
The following code will have the same effect:
pt.java.add_package('com.github.terrierteam', 'terrier-prf', '-SNAPSHOT')
pt.java.init() # optional, forces java initialisation
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])


In [3]:
# Root data directory; the metadata file, the document/topic/qrels files and
# the index output paths used in this notebook are all built relative to it.
BASE_PATH = "../data"

In [4]:
# Load the LongEval metadata into `config`; the functions below look up file
# locations per sub-collection under config["subcollections"].
# NOTE(review): yaml.FullLoader can construct more than plain data types; if
# metadata.yml only holds plain mappings/strings, yaml.safe_load would be the
# safer choice - confirm before switching.
with open(BASE_PATH + "/LongEval/metadata.yml", "r") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)

In [5]:
def doc_itter(sub_collection, fold: set):
    """Yield the documents of a sub-collection whose id is NOT in ``fold``.

    Every file in the sub-collection's English JSON document directory (as
    configured in ``config``) is parsed as a JSON list of documents. Each
    document whose ``"id"`` is not contained in ``fold`` is yielded as a dict
    in the ``{'docno': ..., 'text': ...}`` shape consumed by ``create_index``.

    Args:
        sub_collection: Key of the sub-collection in ``config["subcollections"]``.
        fold: Collection of document ids to leave out.

    Yields:
        dict: ``{'docno': <doc["id"]>, 'text': <doc["contents"]>}``.
    """
    documents_path = os.path.join(
        BASE_PATH, config["subcollections"][sub_collection]["documents"]["json"]["en"]
    )
    # os.listdir returns entries in arbitrary order; sort them so documents
    # are always produced (and therefore indexed) in the same order.
    for file_name in sorted(os.listdir(documents_path)):
        doc_split_path = os.path.join(documents_path, file_name)
        print(doc_split_path)
        # Read with an explicit encoding instead of the platform default.
        # NOTE(review): assumes the JSON dumps are UTF-8 encoded - confirm.
        with open(doc_split_path, "r", encoding="utf-8") as f:
            docs = json.load(f)
        # The file is fully parsed at this point, so it is closed before
        # yielding; no handle stays open while the consumer holds the generator.
        for doc in docs:
            docno = doc["id"]
            if docno not in fold:
                yield {'docno': docno, 'text': doc["contents"]}

In [6]:
def create_index(sub_collection, split_id, fold):
    """Index a sub-collection, leaving out the documents listed in ``fold``.

    The index is written below ``BASE_PATH`` in ``index/<sub_collection>_<split_id>``
    and is fed from ``doc_itter(sub_collection, fold)``.

    Args:
        sub_collection: Sub-collection key, forwarded to ``doc_itter``.
        split_id: Identifier of the split; only used in the index directory name.
        fold: Document ids to exclude, forwarded to ``doc_itter``.

    Returns:
        The value returned by ``pt.IterDictIndexer.index``.
    """
    index_location = BASE_PATH + f"/index/{sub_collection}_{split_id}"
    # Metadata fields stored in the index together with their length limits.
    meta_lengths = {'docno': 20, 'text': 8096}
    indexer = pt.IterDictIndexer(index_location, meta=meta_lengths, verbose=True)
    return indexer.index(doc_itter(sub_collection, fold))

In [7]:
def load_data(sub_collection):
    """Load topics, qrels and the id maps to the following sub-collection.

    Args:
        sub_collection: Sub-collection name ending in its number (e.g. ``"t0"``).
            It is used both as a key into ``config["subcollections"]`` and as a
            column name in the two id-map CSV files.

    Returns:
        tuple: ``(topics, qrels, docid_map_patch, queryid_map)`` where

        * ``topics`` is a DataFrame with columns ``qid`` and ``query``,
        * ``qrels`` is a DataFrame with columns ``qid``, ``Q0``, ``docno`` and
          ``relevance``,
        * ``docid_map_patch`` maps values of the ``sub_collection`` column to
          values of the following sub-collection's column in
          ``document-groups-relevant.csv.gz`` (rows with a missing value in
          either column are dropped),
        * ``queryid_map`` is the same kind of mapping built from
          ``query_id_map.csv``.

    Raises:
        ValueError: If ``sub_collection`` does not end in a number.
        KeyError: If one of the two columns is missing from an id-map file.
    """
    sub_config = config["subcollections"][sub_collection]

    # Test collection
    topics = pd.read_csv(
        BASE_PATH + "/" + sub_config["topics"]["test"]["tsv"]["en"],
        sep="\t",
        names=["qid", "query"],
    )
    qrels = pd.read_csv(
        BASE_PATH + "/" + sub_config["qrels"]["test"],
        sep=" ",
        names=["qid", "Q0", "docno", "relevance"],
    )

    # Column name of the following sub-collection. The whole trailing number
    # is parsed (not only the last character) so that e.g. "t10" maps to "t11".
    name_prefix = sub_collection.rstrip("0123456789")
    next_collection = "t" + str(int(sub_collection[len(name_prefix):]) + 1)

    def _id_map(frame):
        # Map ids of this sub-collection to ids of the following one.
        return (
            frame[[sub_collection, next_collection]]
            .dropna()
            .set_index(sub_collection)
            .to_dict()[next_collection]
        )

    # ID maps - located via BASE_PATH like every other input of the notebook.
    docid_map = pd.read_csv(
        BASE_PATH + "/document-groups-relevant.csv.gz", compression="gzip"
    )
    queryid_map = pd.read_csv(BASE_PATH + "/query_id_map.csv")

    return topics, qrels, _id_map(docid_map), _id_map(queryid_map)

In [8]:
def split_sub_collection(sub_collection, k=3):
    """Split the relevant documents of a sub-collection into ``k`` folds.

    For every topic listed in the query-id map, the documents judged relevant
    (``relevance > 0``) are split with ``KFold(n_splits=k)``. The ``i``-th
    train/test split of every topic is merged into fold ``i``. A document that
    fold ``i`` already holds on one side is never added to the other side, so
    the train and test sets of a fold stay disjoint. Topics with fewer than
    ``k`` relevant documents are skipped.

    A short report (sizes, overlap, train ratio) is printed per fold.

    Args:
        sub_collection: Sub-collection name, forwarded to ``load_data``.
        k: Number of folds.

    Returns:
        dict: ``{fold_index: {"train": set_of_docnos, "test": set_of_docnos}}``
        for ``fold_index`` in ``range(k)``.

    Raises:
        ValueError: From ``KFold`` if ``k`` is smaller than 2.
    """
    folds = {i: {"train": set(), "test": set()} for i in range(k)}

    # Load data for sub-collection (the doc-id map is not needed here).
    topics, qrels, _docid_map_patch, queryid_map = load_data(sub_collection)

    # Relevant judgements of topics that exist in the topic file. This does
    # not depend on the topic, so it is computed once instead of per topic.
    rel_docs = qrels.merge(topics, on="qid")
    rel_docs = rel_docs[rel_docs["relevance"] > 0]
    print("\nDocs rel for more topics:", rel_docs.duplicated(subset=["docno"]).sum() , "/", len(rel_docs))

    kf = KFold(n_splits=k)

    # Perform splits on topic level
    for topic in queryid_map.keys():
        topic_docs = rel_docs[rel_docs["qid"] == topic]

        # KFold needs at least k samples; skip topics with fewer relevant docs.
        if len(topic_docs) < k:
            continue

        for i, (train_index, test_index) in enumerate(kf.split(topic_docs)):
            train_ids = set(topic_docs.iloc[train_index]["docno"])
            test_ids = set(topic_docs.iloc[test_index]["docno"])

            # A document relevant to several topics may already be in this
            # fold. It keeps the side it was first assigned to, so it is
            # dropped from the opposite side of the new split.
            test_ids -= folds[i]["train"]
            train_ids -= folds[i]["test"]
            # Guard against the same docno appearing twice for one topic and
            # landing on both sides of the split.
            train_ids -= test_ids

            folds[i]["test"].update(test_ids)
            folds[i]["train"].update(train_ids)

    # report on folds
    for i in range(k):
        fold_overlap = len(folds[i]["train"].intersection(folds[i]["test"]))
        train_size = len(folds[i]["train"])
        test_size = len(folds[i]["test"])
        total_size = train_size + test_size
        # No topic may have qualified; avoid dividing by zero in that case.
        ratio = train_size / total_size if total_size else float("nan")
        print(f"Fold {i}: train size: {train_size}, test size: {test_size}, overlap: {fold_overlap}, ratio: {ratio:.2f}")

    return folds

In [9]:
# Split every sub-collection (in the same order as before) and keep each
# result, instead of overwriting a single variable five times.
folds_by_sub_collection = {
    sub_collection: split_sub_collection(sub_collection, k=3)
    for sub_collection in ("t0", "t1", "t2", "t3", "t4")
}
# `folds` used to end up holding the result of the last call ("t4"); keep that
# binding so any code relying on it sees the same value.
folds = folds_by_sub_collection["t4"]


Docs rel for more topics: 18 / 400
Fold 0: train size: 84, test size: 60, overlap: 0, ratio: 0.58
Fold 1: train size: 98, test size: 46, overlap: 0, ratio: 0.68
Fold 2: train size: 106, test size: 38, overlap: 0, ratio: 0.74

Docs rel for more topics: 565 / 3370
Fold 0: train size: 500, test size: 338, overlap: 0, ratio: 0.60
Fold 1: train size: 570, test size: 268, overlap: 0, ratio: 0.68
Fold 2: train size: 606, test size: 232, overlap: 0, ratio: 0.72

Docs rel for more topics: 683 / 3835
Fold 0: train size: 365, test size: 230, overlap: 0, ratio: 0.61
Fold 1: train size: 396, test size: 199, overlap: 0, ratio: 0.67
Fold 2: train size: 429, test size: 166, overlap: 0, ratio: 0.72

Docs rel for more topics: 1296 / 4362
Fold 0: train size: 547, test size: 328, overlap: 0, ratio: 0.63
Fold 1: train size: 592, test size: 283, overlap: 0, ratio: 0.68
Fold 2: train size: 611, test size: 264, overlap: 0, ratio: 0.70

Docs rel for more topics: 580 / 2689
Fold 0: train size: 829, test size: 