In [1]:
import sys
sys.path.append('../')

from sklearn.model_selection import KFold
import yaml
import json

from src.create_index import load_data

Java started and loaded: pyterrier.java, pyterrier.terrier.java [version=5.10 (build: craigm 2024-08-22 17:33), helper_version=0.0.8]
The following code will have the same effect:
pt.java.add_package('com.github.terrierteam', 'terrier-prf', '-SNAPSHOT')
pt.java.init() # optional, forces java initialisation
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])


In [2]:
# Root data folder, relative to the notebook's working directory.
BASE_PATH = "../data"
# Results folder, located directly below BASE_PATH.
RESULTS_PATH = f"{BASE_PATH}/results"

In [3]:
# Read the LongEval metadata file into `config`.
# NOTE(review): FullLoader is used here; if the file only holds plain YAML
# types, yaml.safe_load would be the more conservative choice - confirm.
with open(f"{BASE_PATH}/LongEval/metadata.yml") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)

In [4]:
def split_sub_collection(sub_collection, topics, queryid_map, qrels, k=3):
    """Split the relevant documents of every topic into ``k`` train/test folds.

    For each topic in ``queryid_map`` the documents judged relevant
    (``relevance > 0``) are partitioned with an unshuffled ``KFold``. The
    per-topic partitions are accumulated into ``k`` global folds. A document
    that an earlier topic already placed on one side of a fold keeps that
    side, so the train and test sets of a fold never overlap.

    Args:
        sub_collection: Identifier of the sub-collection. Not needed for the
            split itself; kept so existing callers keep working.
        topics: DataFrame with at least a ``qid`` column; joined to ``qrels``.
        queryid_map: Mapping whose keys are the topic ids to split.
        qrels: DataFrame with ``qid``, ``docno`` and ``relevance`` columns.
        k: Number of folds. Topics with fewer than ``k`` relevant documents
            are skipped.

    Returns:
        dict: ``{fold_no: {"train": set, "test": set}}`` of docnos for every
        ``fold_no`` in ``range(k)``.
    """
    folds = {fold_no: {"train": set(), "test": set()} for fold_no in range(k)}

    # The join and the relevance filter do not depend on the topic, so they
    # are computed once instead of once per topic.
    relevant = qrels.merge(topics, on="qid")
    relevant = relevant[relevant["relevance"] > 0]
    print("\nDocs rel for more topics:", relevant.duplicated(subset=["docno"]).sum(), "/", len(relevant))

    # Perform splits on topic level
    for topic in queryid_map.keys():
        topic_docs = relevant[relevant["qid"] == topic]

        # KFold needs at least k samples, so topics with fewer relevant
        # documents than k are skipped.
        if len(topic_docs) < k:
            continue

        kf = KFold(n_splits=k)
        for fold_no, (train_index, test_index) in enumerate(kf.split(topic_docs)):
            fold = folds[fold_no]
            train_ids = topic_docs.iloc[train_index]["docno"].to_list()
            test_ids = topic_docs.iloc[test_index]["docno"].to_list()

            # A docno that already sits on the opposite side of this fold
            # (assigned by an earlier topic) stays where it is and is dropped
            # from the new candidates. Filtering test against the already
            # updated train set also covers a docno listed twice for a topic.
            new_train = [docno for docno in train_ids if docno not in fold["test"]]
            fold["train"].update(new_train)
            new_test = [docno for docno in test_ids if docno not in fold["train"]]
            fold["test"].update(new_test)

    # report on folds
    for fold_no in range(k):
        fold = folds[fold_no]
        overlap = len(fold["train"].intersection(fold["test"]))
        train_size = len(fold["train"])
        test_size = len(fold["test"])
        total = train_size + test_size
        # An empty fold (e.g. every topic was skipped) has no defined ratio.
        ratio = train_size / total if total else float("nan")
        print(f"Fold {fold_no}: train size: {train_size}, test size: {test_size}, overlap: {overlap}, ratio: {ratio:.2f}")

    return folds

In [5]:
k = 3
sub_collections = ["t1", "t2", "t3", "t4", "t5"]  # we skip t0 as it is the base collection and has no history
splits = {}

for sub_collection in sub_collections:
    # Load data for sub-collection
    topics, qrels, docid_map_patch, queryid_map = load_data(sub_collection)

    # for pyterrier: strip apostrophes and slashes from the query text.
    # Both calls must go through the `.str` accessor; a plain Series.replace
    # only matches whole cell values and would leave "a/b" untouched.
    topics["query"] = (
        topics["query"]
        .str.replace("'", "", regex=False)
        .str.replace("/", "", regex=False)
    )

    # Create Folds (pass the configured k instead of a hard-coded literal)
    folds = split_sub_collection(sub_collection, topics, queryid_map, qrels, k=k)

    # Sets are not JSON serialisable, so store the docnos as lists.
    splits[sub_collection] = {
        fold_no: {
            "train": list(folds[fold_no]["train"]),
            "test": list(folds[fold_no]["test"]),
        }
        for fold_no in range(k)
    }


Docs rel for more topics: 565 / 3370
Fold 0: train size: 535, test size: 348, overlap: 0, ratio: 0.61
Fold 1: train size: 592, test size: 291, overlap: 0, ratio: 0.67
Fold 2: train size: 639, test size: 244, overlap: 0, ratio: 0.72

Docs rel for more topics: 683 / 3835
Fold 0: train size: 570, test size: 380, overlap: 0, ratio: 0.60
Fold 1: train size: 651, test size: 299, overlap: 0, ratio: 0.69
Fold 2: train size: 679, test size: 271, overlap: 0, ratio: 0.71

Docs rel for more topics: 1296 / 4362
Fold 0: train size: 544, test size: 313, overlap: 0, ratio: 0.63
Fold 1: train size: 577, test size: 280, overlap: 0, ratio: 0.67
Fold 2: train size: 593, test size: 264, overlap: 0, ratio: 0.69

Docs rel for more topics: 580 / 2689
Fold 0: train size: 476, test size: 263, overlap: 0, ratio: 0.64
Fold 1: train size: 494, test size: 245, overlap: 0, ratio: 0.67
Fold 2: train size: 508, test size: 231, overlap: 0, ratio: 0.69

Docs rel for more topics: 2337 / 10259
Fold 0: train size: 1005, t

In [6]:
# Persist the fold assignment of all sub-collections as a single JSON file.
with open(f"{BASE_PATH}/splits.json", "w") as f:
    json.dump(splits, f)