In [2]:
import sqlite3
import sys

import pandas as pd
import yaml
from sklearn.model_selection import KFold

# Put the parent directory on the import path before importing from `src`.
sys.path.append('../')

from src.create_index import load_data

terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


In [3]:
# Data directory (relative to this notebook) and the results sub-directory beneath it.
BASE_PATH = "../data"
RESULTS_PATH = f"{BASE_PATH}/results"

In [4]:
# Read the LongEval metadata file into `config`.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's locale default.
# NOTE(review): yaml.load with FullLoader accepts more than plain YAML types;
# if the metadata only holds plain mappings/lists, yaml.safe_load is preferable.
with open(BASE_PATH + "/LongEval/metadata.yml", "r", encoding="utf-8") as yamlfile:
    config = yaml.load(yamlfile, Loader=yaml.FullLoader)

In [5]:
def split_sub_collection(sub_collection, topics, queryid_map, qrels, k=3):
    """Split the relevant documents of a sub-collection into ``k`` train/test folds.

    Relevant documents (``relevance > 0``) are split per topic with an
    unshuffled ``KFold`` and the per-topic splits are accumulated into ``k``
    global folds. A document that is relevant for several topics stays on the
    side of a fold it was assigned to first, so the train and test sets of a
    fold never share a document.

    Prints the number of relevant rows whose ``docno`` repeats, followed by one
    summary line per fold.

    Args:
        sub_collection: Name of the sub-collection. Not used by the split
            itself; kept so existing callers keep working.
        topics: DataFrame with at least a ``qid`` column.
        queryid_map: Mapping whose keys are the topic ids (``qid``) to split.
        qrels: DataFrame with ``qid``, ``docno`` and ``relevance`` columns.
        k: Number of folds. Topics with fewer than ``k`` distinct relevant
            documents are skipped.

    Returns:
        dict: ``{fold_no: {"train": set, "test": set}}`` of docnos for every
        ``fold_no`` in ``range(k)``.
    """
    folds = {fold_no: {"train": set(), "test": set()} for fold_no in range(k)}

    # The qrels/topics join does not depend on the topic, so build it once.
    rel_docs = qrels.merge(topics, on="qid")
    rel_docs = rel_docs[rel_docs["relevance"] > 0]
    print("\nDocs rel for more topics:", rel_docs.duplicated(subset=["docno"]).sum() , "/", len(rel_docs))

    # Relevant docnos per topic, in their original row order.
    docnos_by_topic = {
        qid: docnos.to_list()
        for qid, docnos in rel_docs.groupby("qid", sort=False)["docno"]
    }

    kf = KFold(n_splits=k)

    # Perform splits on topic level
    for topic in queryid_map.keys():
        # Drop repeated docnos (keeping first-seen order): a repeated docno could
        # otherwise land in both the train and the test part of the same split.
        docnos = list(dict.fromkeys(docnos_by_topic.get(topic, [])))

        # If we have fewer relevant docs than k, we skip this topic
        if len(docnos) < k:
            continue

        for fold_no, (train_index, test_index) in enumerate(kf.split(docnos)):
            fold = folds[fold_no]

            # A docno already placed on the opposite side of this fold (through
            # another topic) is left where it is, keeping the fold disjoint.
            train_ids = {docnos[pos] for pos in train_index} - fold["test"]
            test_ids = {docnos[pos] for pos in test_index} - fold["train"]

            fold["train"].update(train_ids)
            fold["test"].update(test_ids)

    # report on folds
    for fold_no in range(k):
        train, test = folds[fold_no]["train"], folds[fold_no]["test"]
        n_overlap = len(train & test)
        train_size, test_size = len(train), len(test)
        total = train_size + test_size
        # No topic may have qualified; avoid dividing by zero in that case.
        ratio = train_size / total if total else float("nan")
        print(f"Fold {fold_no}: train size: {train_size}, test size: {test_size}, overlap: {n_overlap}, ratio: {ratio:.2f}")

    return folds

In [39]:
k = 3
sub_collections = ["t1", "t2", "t3", "t4", "t5"]  # we skip t0 as it is the base collection and has no history
splits = {}

for sub_collection in sub_collections:
    splits[sub_collection] = {}
    # Load data for sub-collection
    topics, qrels, docid_map_patch, queryid_map = load_data(sub_collection)

    # for pyterrier: remove apostrophes and slashes from the query text.
    # Both calls must go through `.str` so they act on substrings; a plain
    # Series.replace would only replace values that are exactly "/".
    topics["query"] = (
        topics["query"]
        .str.replace("'", "", regex=False)
        .str.replace("/", "", regex=False)
    )

    # Create Folds (use the configured k rather than a second literal)
    folds = split_sub_collection(sub_collection, topics, queryid_map, qrels, k=k)

    # Store the folds as lists of docnos
    for fold_no in range(0, k):
        splits[sub_collection][fold_no] = {
            "train": list(folds[fold_no]["train"]),
            "test": list(folds[fold_no]["test"])
        }


Docs rel for more topics: 565 / 3370
Fold 0: train size: 535, test size: 348, overlap: 0, ratio: 0.61
Fold 1: train size: 592, test size: 291, overlap: 0, ratio: 0.67
Fold 2: train size: 639, test size: 244, overlap: 0, ratio: 0.72

Docs rel for more topics: 683 / 3835
Fold 0: train size: 570, test size: 380, overlap: 0, ratio: 0.60
Fold 1: train size: 651, test size: 299, overlap: 0, ratio: 0.69
Fold 2: train size: 679, test size: 271, overlap: 0, ratio: 0.71

Docs rel for more topics: 1296 / 4362
Fold 0: train size: 544, test size: 313, overlap: 0, ratio: 0.63
Fold 1: train size: 577, test size: 280, overlap: 0, ratio: 0.67
Fold 2: train size: 593, test size: 264, overlap: 0, ratio: 0.69

Docs rel for more topics: 580 / 2689
Fold 0: train size: 476, test size: 263, overlap: 0, ratio: 0.64
Fold 1: train size: 494, test size: 245, overlap: 0, ratio: 0.67
Fold 2: train size: 508, test size: 231, overlap: 0, ratio: 0.69

Docs rel for more topics: 2337 / 10259
Fold 0: train size: 1005, t

In [7]:
# Sub-collections for which folds were stored. `folds` only holds the folds of
# the last processed sub-collection (keyed by fold number), so look at `splits`.
splits.keys()

dict_keys(['t1', 't2', 't3', 't4', 't5'])

In [None]:
# ID maps
# Column name of the preceding sub-collection, derived from the trailing digit
# of the current name.
previous_sub_collection = "t" + str(int(sub_collection[-1]) - 1)

docid_map = pd.read_csv(
    BASE_PATH + "/document-groups-relevant.csv.gz", compression="gzip"
)
# Rows that have an id in both sub-collections, as {current id: previous id}
docid_pairs = docid_map[[sub_collection, previous_sub_collection]].dropna()
docid_map_patch = docid_pairs.set_index(sub_collection).to_dict()[
    previous_sub_collection
]

queryid_map = pd.read_csv(BASE_PATH + "/query_id_map.csv")
# Same reduction for the query ids
queryid_pairs = queryid_map[[sub_collection, previous_sub_collection]].dropna()
queryid_map = queryid_pairs.set_index(sub_collection).to_dict()[
    previous_sub_collection
]

In [9]:
# Number of folds stored for sub-collection "t1". The per-sub-collection
# results live in `splits`; `folds` is keyed by fold number, not by name.
len(splits["t1"])

3

In [16]:
import pandas as pd
import sqlite3

In [17]:
# Open the SQLite database stored in the data directory.
conn = sqlite3.connect(f"{BASE_PATH}/database.db")


In [46]:
history = ["t1", "t2", "t3"]

# One "?" placeholder per sub-collection in `history`.
placeholders = ",".join("?" for _ in history)

# Self-join on the French query text: pairs every topic with the topics of the
# history sub-collections that have identical text.
query = """SELECT topic.queryid as qid_1, T2.queryid as qid_2 from topic
JOIN topic as T2
ON topic.text_fr = T2.text_fr
WHERE T2.sub_collection IN (%s)""" % placeholders

query_map = pd.read_sql_query(query, conn, params=history)

In [77]:
# Load the document-id groups and the query-id map from the data directory.
docid_map = pd.read_csv(
    f"{BASE_PATH}/document-groups-relevant.csv.gz", compression="gzip"
)
queryid_map = pd.read_csv(f"{BASE_PATH}/query_id_map.csv")

In [74]:
sub_collection = "t4"
history = ["t1", "t2", "t3"]

# load_data also returns a query-id map. Bind it to its own name so it does not
# replace the `queryid_map` DataFrame read from query_id_map.csv, which the
# following cells index by sub-collection column.
topics, qrels, docid_map_patch, queryid_map_patch = load_data(sub_collection)

In [59]:
# Inspect the topics table (columns: qid, query).
topics

Unnamed: 0,qid,query
0,q062345,water agency
1,q0623107,free antivirus
2,q0623129,colloidal silver
3,q0623312,office valley
4,q0623345,gift fete des meres
...,...,...
402,q062360129545292,leek pie
403,q062360129545306,Housing Tax 2022
404,q062360129545367,terreal
405,q062360129545387,total energy


In [98]:
# Rows that have an id in the current sub-collection, indexed by that id and
# reduced to the history columns.
a = (
    queryid_map
    .dropna(subset=[sub_collection])
    .set_index(sub_collection)[history]
)

In [117]:
# Keep only rows with an id in the current sub-collection, index them by that
# id and keep the history columns.
# NOTE: this rebinds `queryid_map`; the sub-collection column becomes the index,
# so the cell cannot be re-run without reloading the original frame.
queryid_rows = queryid_map.dropna(subset=[sub_collection])
queryid_map = queryid_rows.set_index(sub_collection)[history]


In [118]:
# Topics whose row in `queryid_map` is entirely missing
ids_without_history = queryid_map[queryid_map.isna().all(axis=1)].index
new_topics = topics[topics["qid"].isin(ids_without_history)]

# Topics with an id in at least one history sub-collection
ids_with_history = queryid_map.dropna(subset=history, how="all").index
extended_topics = topics[topics["qid"].isin(ids_with_history)]

['q072222493', 'q092218988']

In [148]:
train_docids = splits[sub_collection][0]["train"]

# Rows of the document map whose current-sub-collection id is a fold-0 training
# document, reduced to the history columns; rows without any history id are
# dropped and the remaining cells are flattened into one list.
train_rows = docid_map[docid_map[sub_collection].isin(train_docids)]
history_ids = train_rows[history].dropna(subset=history, how="all")
history_doc_ids_filtered = history_ids.values.flatten().tolist()

In [150]:
# De-duplicate while keeping first-seen order, so the list is the same on every
# run (iteration order of a set of strings can differ between interpreter runs).
history_doc_ids_filtered = list(dict.fromkeys(history_doc_ids_filtered))

In [164]:
# Inspect the topics selected into `extended_topics`.
extended_topics

Unnamed: 0,qid,query
1,q0623107,free antivirus
2,q0623129,colloidal silver
13,q06231275,potato patty
15,q06231297,apple gateau
17,q06231315,coconut gateau
...,...,...
402,q062360129545292,leek pie
403,q062360129545306,Housing Tax 2022
404,q062360129545367,terreal
405,q062360129545387,total energy


In [175]:
# Inspect the full topics table again for comparison (columns: qid, query).
topics

Unnamed: 0,qid,query
0,q062345,water agency
1,q0623107,free antivirus
2,q0623129,colloidal silver
3,q0623312,office valley
4,q0623345,gift fete des meres
...,...,...
402,q062360129545292,leek pie
403,q062360129545306,Housing Tax 2022
404,q062360129545367,terreal
405,q062360129545387,total energy


In [174]:
# Query template; the %s is filled with one "?" placeholder per query id.
rel_docs_sql = """SELECT url, text_en, qrel.docid
    FROM qrel
    JOIN document ON qrel.docid = document.docid
    WHERE queryid IN (%s)
    AND relevance > 0"""

for topic in extended_topics["qid"].to_list():
    # Ids of this topic in the history sub-collections (missing ones dropped)
    queries_to_extend = queryid_map.loc[topic].dropna().to_list()

    placeholders = ",".join(["?"] * len(queries_to_extend))
    query = rel_docs_sql % placeholders

    # Relevant documents of those queries, restricted to the ids collected in
    # `history_doc_ids_filtered`; print how many remain for this topic.
    rel_docs = pd.read_sql_query(query, conn, params=queries_to_extend)
    in_filtered_history = rel_docs["docid"].isin(history_doc_ids_filtered)
    rel_docs = rel_docs[in_filtered_history]
    print(len(rel_docs))
    
    
    

0
0
1
4
2
0
3
4
4
0
4
0
5
2
3
4
0
6
0
1
2
1
0
0
0
0
4
8
1
0
5
2
1
2
1
0
0
1
3
4
2
0
0
1
1
0
0
0
0
3
5
10
3
4
0
1
0
5
0
0
5
2
3
1
0
4
0
1
3
4
0
0
0
2
2
3
2
2
1
3
4
3
2
4
0
6
0
0
4
3
3
7
0
0
0
2
3
5
9
0
2
1
4
0
0
7
2
1
1
0
1
0
2
0
2
0
0
2
0
0
1
2
3
5
0
1
0
6
0
4
1
5
0
0
3
0
0
0
1
10
3
3
2
4
0
0
3
0
1
0
1
2
3
4
0
0
6
0
0
0
6
0
1
0
0
5
5
0
2
7
4
2
1
0
1
4
3
5
0
2
3
4
4
3
1
6
1
5
2
0
1
0


Unnamed: 0,url,text_en,docid


In [155]:
# Drop every missing id. Filtering with pd.notna works whether the list holds
# zero, one or several NaN values and does not rely on NaN object identity,
# unlike list.remove(nan), which raises when no NaN is present.
history_doc_ids_filtered = [
    doc_id for doc_id in history_doc_ids_filtered if pd.notna(doc_id)
]

In [156]:
# Inspect the collected history document ids.
history_doc_ids_filtered

['doc012308900372',
 'doc012302005243',
 'doc072207803651',
 'doc092208300744',
 'doc092203607460',
 'doc012302114772',
 'doc012300218105',
 'doc012304304043',
 'doc012312210620',
 'doc012303101744',
 'doc092210600313',
 'doc012304102490',
 'doc012303707552',
 'doc072212400696',
 'doc092204900865',
 'doc012311608452',
 'doc092201605256',
 'doc012307813497',
 'doc072209307026',
 'doc012302308927',
 'doc092208102054',
 'doc012302714498',
 'doc012302309338',
 'doc012302714841',
 'doc092208101835',
 'doc012308601843',
 'doc012306406969',
 'doc012309103104',
 'doc072204503433',
 'doc012307806444',
 'doc012309507887',
 'doc012303014621',
 'doc012304405849',
 'doc012304313537',
 'doc092208400294',
 'doc012303908365',
 'doc092203008753',
 'doc072201201921',
 'doc012303115409',
 'doc012307813727',
 'doc012307102592',
 'doc012305204855',
 'doc012304817378',
 'doc072203201348',
 'doc012310401936',
 'doc012311305956',
 'doc012312300597',
 'doc012306405220',
 'doc072215803774',
 'doc092203410483',


In [33]:
# Every judged document with relevance > 0, joined with its url and English text.
# No query-id filter is applied here.
query = """SELECT url, text_en, document.docid
FROM qrel
JOIN document ON qrel.docid = document.docid
AND relevance > 0"""

rel_docs = pd.read_sql_query(sql=query, con=conn)

In [34]:
# Inspect the query result (columns: url, text_en, docid).
rel_docs

Unnamed: 0,url,text_en,docid
0,https://www.bois-direct-scierie.fr/carport-abr...,BOIS\nDIRECT\nSCIENCE shelter camping car carp...,doc062200204465
1,http://abri-proteccar.com/,"Car shelter, campsite, terrace …\nProtec Car\n...",doc062200205493
2,https://www.aeroports-voyages.fr/fr/aeroport/b...,"Volotea\nat Bordeaux Airport: flights, timetab...",doc062200116555
3,https://www.bordeaux.aeroport.fr/,Bordeaux Airport\n-\nMerignac\n- Official webs...,doc062200116273
4,https://www.presse-citron.net/comparatif-meill...,"Best antivirus 2022 (comparative): Secure PC, ...",doc062200209981
...,...,...,...
117333,https://jeparticipe-evenement.pole-emploi.fr/v...,\n,doc082301304186
117334,https://fr.wikipedia.org/wiki/Dioc%C3%A8se_de_...,\n,doc082315210469
117335,https://www.emploi-store.fr/portail/accueil,Web and mobile employment services\n| Employme...,doc082303012218
117336,https://www.pole-emploi.fr/accueil,\n,doc082303906500


In [45]:
# Inspect the training docnos of fold 0 for sub-collection "t1".
splits["t1"][0]["train"]

['doc072207501357',
 'doc072211304301',
 'doc072207504706',
 'doc072204503888',
 'doc072205301783',
 'doc072212400775',
 'doc072201202457',
 'doc072210607931',
 'doc072207504811',
 'doc072215206132',
 'doc072203201268',
 'doc072206606766',
 'doc072207500864',
 'doc072212601615',
 'doc072211304027',
 'doc072206208771',
 'doc072209307035',
 'doc072202501784',
 'doc072212601077',
 'doc072201701394',
 'doc072203201769',
 'doc072201203221',
 'doc072205600542',
 'doc072215702055',
 'doc072201701223',
 'doc072212401132',
 'doc072207501338',
 'doc072207501077',
 'doc072202501213',
 'doc072210000987',
 'doc072213307764',
 'doc072202308518',
 'doc072207501230',
 'doc072215702561',
 'doc072206607389',
 'doc072200305226',
 'doc072201601566',
 'doc072205301569',
 'doc072211501590',
 'doc072201201921',
 'doc072201700886',
 'doc072203801050',
 'doc072205600007',
 'doc072213307834',
 'doc072210607739',
 'doc072204503670',
 'doc072203201348',
 'doc072206208624',
 'doc072215804015',
 'doc072203800910',


In [176]:
from sklearn.model_selection import KFold

In [177]:
# 3-fold splitter with default settings, used for the toy example below.
kf = KFold(n_splits=3)

In [179]:
# Toy data: the integers 1..9; the cell displays the number of splits `kf` reports.
rel_docs = list(range(1, 10))
kf.get_n_splits(rel_docs)

3

In [183]:
# Print the train/test index arrays of every split; the fold counter is not
# needed here, so iterate over the splits directly.
for train_index, test_index in kf.split(rel_docs):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [181]:
# Inspect the training indices of the most recently unpacked split.
train_index

array([3, 4, 5, 6, 7, 8])

In [182]:
# Inspect the test indices of the most recently unpacked split.
test_index

array([0, 1, 2])