In [None]:
import pandas as pd
import glob

In [None]:
# Per-edition input settings, keyed by TREC edition name:
#   RUN  - glob pattern matching the gzipped run files of that edition
#   QREL - path of the relevance-judgement file (named after its topic range)
#   SEP  - separator handed to pd.read_csv when the run files are parsed
# 'trec19' uses '\t| ', which is longer than one character; pandas therefore
# interprets it as a regular expression (a tab or a single space).
config = {
    edition: {
        'RUN':  './' + edition + '/web.adhoc/*.gz',
        'QREL': './qrels/qrels.web.' + topics + '.txt',
        'SEP':  separator,
    }
    for edition, topics, separator in [
        ('trec19', '51-100',  '\t| '),
        ('trec20', '101-150', '\t'),
        ('trec21', '151-200', '\t'),
        ('trec22', '201-250', '\t'),
        ('trec23', '251-300', '\t'),
    ]
}

In [None]:
# For every TREC edition in `config`: load all run files, join them with the
# edition's relevance judgements, keep the 20 best-ranked judged documents per
# (Topic, Run) and write the result to '<edition>.parquet'.
for trec in config.keys():
    settings = config[trec]

    # glob returns matches in arbitrary order; sorting keeps the concatenation
    # order (and therefore the written file) reproducible.
    run_files = sorted(glob.glob(settings['RUN']))
    if not run_files:
        # Without this guard pd.concat([]) fails with an opaque
        # "No objects to concatenate" ValueError.
        raise FileNotFoundError(
            f"No run files matched {settings['RUN']!r} for {trec}"
        )

    # One frame per run file; the 'Iteration' column is not used and is dropped.
    run_frames = [
        (
            pd.read_csv(file, sep = settings['SEP'], header = None)
            .rename({0: 'Topic', 1: 'Iteration', 2: 'Document', 3: 'Rank', 4: 'Score', 5: 'Run'}, axis = 1)
            .drop('Iteration', axis = 1)
            .astype({"Topic": int, "Document": str, "Rank": int, "Score": float, "Run": str})
        )
        for file in run_files
    ]
    runs = pd.concat(run_frames)

    # Shift 0-based ranks to 1-based. The shift is applied to the whole
    # edition at once, so it never changes the relative order inside a run;
    # the final Rank column is recomputed after the top-20 cut below.
    if runs['Rank'].min() == 0:
        runs['Rank'] = runs['Rank'] + 1

    qrels = (
        pd.read_csv(settings['QREL'], sep = " ", header = None)
        .rename({0: 'Topic', 1: 'Iteration', 2: 'Document', 3: 'Relevancy'}, axis = 1)
        .drop('Iteration', axis = 1)
        .astype({"Topic": int, "Document": str, "Relevancy": int})
    )

    df = (
        # Rows without a judgement (Relevancy) or without a retrieval (Score)
        # are produced by the outer join and removed again by dropna, so only
        # judged, retrieved documents survive.
        pd.merge(qrels, runs, on = ['Topic', 'Document'], how = 'outer')
        .drop_duplicates()
        .dropna(subset = ['Relevancy', 'Score'])
        .sort_values(['Topic', 'Run', 'Rank'])
        .groupby(['Topic', 'Run'])
        .head(20)
    )

    # Re-number the surviving rows 1..k within each (Topic, Run) and tag the
    # edition. assign() builds real columns on a fresh frame; plain attribute
    # assignment (df.TREC = ...) would not create a column.
    df = df.assign(
        Rank = df.groupby(['Topic', 'Run']).cumcount() + 1,
        TREC = str(trec),
    )
    df.reset_index(drop=True).to_parquet(trec + '.parquet', compression = 'GZIP')

In [None]:
# Re-load exactly the per-edition files ('<edition>.parquet') named by `config`.
# A bare '*.parquet' glob would also pick up 'runs.parquet', which this
# notebook writes in its last cell, as well as any unrelated parquet file in
# the working directory, and would return them in arbitrary order.
df = pd.concat(
    [pd.read_parquet(f'{trec}.parquet') for trec in config],
    ignore_index = True,
)

In [None]:
# Keep only fully populated, de-duplicated rows, then drop the Score column
# and index the frame by (TREC, Topic, Run).
complete_rows = df.dropna(subset = ['Run', 'Relevancy', 'Score']).drop_duplicates()
df = (
    complete_rows
    .drop(columns = 'Score')
    .set_index(['TREC', 'Topic', 'Run'])
)

In [None]:
# Rows of every (TREC, Topic, Run) group that does not hold exactly 20 rows.
incomplete_rows = (
    df
    .groupby(['TREC', 'Topic', 'Run'])
    .filter(lambda rows: len(rows) != 20)
    .reset_index()
)

# Names of the runs owning at least one such group. unique() already returns
# each name once, in order of first appearance, so no prior de-duplication
# on (Topic, Run) is needed.
runs_omitted = incomplete_rows['Run'].unique()

runs_omitted

In [None]:
# Turn the (TREC, Topic, Run) index back into columns, then discard every row
# whose run name is listed in `runs_omitted`.
df = (
    df
    .reset_index()
    .loc[lambda rows: ~rows['Run'].isin(runs_omitted)]
)

In [None]:
# Write the final table, leaving out topic 109.
# NOTE(review): the reason for excluding topic 109 is not visible in this notebook — confirm.
(
    df
    .loc[df['Topic'] != 109]
    .to_parquet('runs.parquet', compression = 'GZIP')
)