In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols, mixedlm
import statsmodels.formula.api as smf
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Data Import

In [None]:
# Load the score table and relabel the TREC edition ids (trec19 ... trec23) with the
# corresponding year strings ("2010" ... "2014"); edition n maps to year 1991 + n.
df = (
    pd.read_parquet('data/scores.parquet')
    .reset_index(drop=True)
    .replace({f"trec{edition}": str(1991 + edition) for edition in range(19, 24)})
)

# NDCG Domain

In [None]:
# For every TREC edition: take the minimum WDCG per topic, then report the share of topics
# whose minimum falls below 0 and below -1 (rounded to two decimals).
# NOTE(review): the column labels read "≤" while the comparisons are strict ("<") — confirm
# which of the two is intended.
(
    pd.merge(
        (
            df
            .loc[:, ["WDCG", "TREC", "Topic"]]
            .groupby(["TREC", "Topic"])
            .min()
            .groupby(["TREC"])
            # mean of a boolean mask == fraction of topics satisfying the condition
            .apply(lambda per_topic: (per_topic["WDCG"] < 0).mean())
            .reset_index()
            .rename({0: "≤ 0"}, axis=1)
        ),
        (
            df
            .loc[:, ["WDCG", "TREC", "Topic"]]
            .groupby(["TREC", "Topic"])
            .min()
            .groupby(["TREC"])
            .apply(lambda per_topic: (per_topic["WDCG"] < -1).mean())
            .reset_index()
            .rename({0: "≤ -1"}, axis=1)
        ),
    )
    .set_index("TREC")
    .round(2)
)

# Rank Correlation

In [None]:
# Spearman rank correlation between NDCG_min and NDCG_org for every (TREC, K) combination,
# laid out as a TREC x K table.
(
    df
    .groupby(["TREC", "K"])
    # Correlate only the two score columns: running corr() over the whole frame fails on
    # non-numeric columns in recent pandas, and the pairwise result is the same either way.
    [["NDCG_min", "NDCG_org"]]
    .corr(method='spearman')
    .NDCG_min
    .unstack()
    .NDCG_org
    .reset_index()
    # pivot() takes keyword-only arguments in pandas >= 2.0; keywords work in older versions too.
    .pivot(index="TREC", columns="K", values="NDCG_org")
    .round(2)
)

# Reliability Evaluation
Adapted from *Evangelos Kanoulas, Javed A. Aslam: Empirical justification of the gain and discount function for nDCG. CIKM 2009: 611-620*

###### Parameters

In [None]:
# Brennan (2001): Generalizability Theory, New York: Springer. Eq. 3.1
# "Score ~ (1 | Person) + (1 | Task) + (1 | Person:Task)"
# Task -> Topic, Person -> Run
# Model formula string: Run and Topic main effects plus their interaction, with the
# intercept removed by the trailing "-1".
# NOTE(review): `dependability` below spells this formula out itself instead of reading `f`,
# and `f` is rebound to a number in the Stability section — confirm this variable is still needed.
f = 'Score ~ Run + Topic + Run*Topic -1'

###### Method Implementation

In [None]:
def dependability(data):
    """Estimate the dependability of a metric from a Run x Topic ANOVA.

    Fits an OLS model ``Score ~ Run + Topic + Run*Topic - 1`` and derives variance
    components from the mean squares of the sequential (type I) ANOVA table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Score``, ``Run`` and ``Topic``. The frame is only
        read; it is not modified.

    Returns
    -------
    float
        ``var_sys / (var_sys + var_topic + var_sys_topic / n_topics)``.

    Notes
    -----
    NOTE(review): textbook formulations of the dependability coefficient divide the
    topic variance by the number of topics as well; here only the interaction term is
    divided. Left as is to keep results unchanged — confirm against the cited paper.
    """
    n_topics = data.Topic.nunique()
    n_runs = data.Run.nunique()

    # The model needs only Score/Run/Topic, so no helper column is written into `data`
    # (the caller's frame stays untouched).
    result = ols(formula='Score ~ Run + Topic + Run*Topic -1', data=data).fit()

    mean_sq = sm.stats.anova_lm(result, typ=1)['mean_sq']
    # abs() keeps the estimated components non-negative when a difference comes out negative.
    var_sys = abs(mean_sq['Run'] - mean_sq['Run:Topic']) / n_topics
    var_topic = abs(mean_sq['Topic'] - mean_sq['Run:Topic']) / n_runs
    var_sys_topic = abs(mean_sq['Run:Topic'] - mean_sq['Residual'])

    return var_sys / (var_sys + var_topic + var_sys_topic / n_topics)

###### Output

In [None]:
# Dependability per (TREC, K) for each NDCG variant, stacked into one table keyed by metric.
reliability_metrics = ["NDCG_org", "NDCG_min", "NDCG_0"]

pd.concat(
    [
        (
            df
            # dependability() expects the metric under the generic column name "Score"
            .rename({metric: 'Score'}, axis=1)
            .groupby(['TREC', 'K'])
            .apply(dependability)
            .reset_index()
            .rename({0: metric}, axis=1)
            # pivot() takes keyword-only arguments in pandas >= 2.0
            .pivot(index="TREC", columns="K", values=metric)
        )
        for metric in reliability_metrics
    ],
    keys=reliability_metrics
).round(4)

# Sensitivity Evaluation 
*Tetsuya Sakai: Evaluating evaluation metrics based on the bootstrap. SIGIR 2006: 525-532*

###### Parameters

In [None]:
# Number of bootstrap resamples drawn per pair of runs; read as a global by bootstrap().
B = 1000

###### Method Implementation

In [None]:
def bootstrap(data, n_resamples=None, random_state=None):
    """Paired bootstrap hypothesis test; returns the achieved significance level (ASL).

    The per-row differences ``z = X - Y`` are studentised into an observed statistic
    ``t(z)``. The differences are then centred on zero and resampled with replacement;
    the ASL is the share of resamples whose studentised mean is at least as extreme as
    the observed one.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per paired observation, with the two scores in columns ``X`` and ``Y``.
    n_resamples : int, optional
        Number of bootstrap resamples. Defaults to the notebook-level constant ``B``.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for reproducible resampling. ``None`` keeps using NumPy's
        global random state.

    Returns
    -------
    float
        ASL in ``[0, 1]``. ``1.0`` when the two score vectors are identical (there is no
        difference to detect), ``nan`` when fewer than two pairs are available (the
        statistic is undefined).
    """
    z = data.X - data.Y
    n_pairs = len(z)
    if n_pairs < 2:
        return float("nan")

    sn = np.sqrt(n_pairs)
    z_mean = z.mean()
    z_std = z.std()
    if z_mean == 0 and z_std == 0:
        # Identical runs: t(z) would be 0/0 = NaN, every comparison below would be False
        # and the ASL would come out as 0.0, i.e. "maximally significant".
        return 1.0
    t_z = abs(z_mean / (z_std / sn))

    if n_resamples is None:
        n_resamples = B
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        # One generator shared by all draws; re-seeding per draw would repeat the same resample.
        rng = np.random.RandomState(random_state)

    w = z - z_mean  # differences shifted so that the null hypothesis (mean 0) holds
    n_extreme = 0
    for _ in range(n_resamples):
        w_star = w.sample(frac=1, replace=True, random_state=rng)
        if abs(w_star.mean() / (w_star.std() / sn)) >= t_z:
            n_extreme += 1
    return n_extreme / n_resamples


###### Applied to Data

In [None]:
# Run the paired bootstrap test for every pair of runs, separately for each
# metric / TREC edition / cutoff K, and collect one record per comparison.
bootstrap_records = []
for metric, trec, k in itertools.product(["NDCG_org", "NDCG_0", "NDCG_min"], df.TREC.unique(), df.K.unique()):
    in_slice = (df.TREC == trec) & (df.K == k)
    for x, y in itertools.combinations(df[df.TREC == trec].Run.unique(), 2):
        scores_x = df.loc[in_slice & (df.Run == x), [metric, 'Topic']].drop_duplicates()
        scores_y = df.loc[in_slice & (df.Run == y), [metric, 'Topic']].drop_duplicates()
        # Pair the two runs topic by topic; merge suffixes the clashing metric column with _x/_y.
        data = (
            pd.merge(scores_x, scores_y, on='Topic')
            [[metric + '_x', metric + '_y']]
            .rename(columns={metric + '_x': 'X', metric + '_y': 'Y'})
        )
        bootstrap_records.append({
            'Metric': metric,
            'TREC': trec,
            'K': k,
            'X': x,
            'Y': y,
            'ASL': bootstrap(data),
        })

bootstrap_data = pd.DataFrame(bootstrap_records)

###### Output

In [None]:
# Cumulative, normalised step histograms of the ASL values: one panel per (TREC, K),
# one line per metric.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases — confirm the pinned version.
g = sns.FacetGrid(
    bootstrap_data,
    row="TREC",
    col="K",
    hue="Metric",
    legend_out=True,
    height=1.75,
    aspect=1.5,
)
g.map(
    sns.distplot,
    "ASL",
    bins=np.arange(0, 0.25, 0.001),
    kde=False,
    rug=False,
    norm_hist=True,
    hist_kws={"cumulative": True, "histtype": "step", "alpha": 1},
)
g.add_legend()
g.set_axis_labels("Level", "Ratio")

# Stability Evaluation
*Chris Buckley, Ellen M. Voorhees: Evaluating Evaluation Measure Stability. SIGIR Forum 51(2): 235-242 (2017)*

###### Parameters

In [None]:
# Relative margin used by error_rate(): a run counts as better on a topic only if its score
# exceeds the other run's score by more than this fraction.
# NOTE(review): this rebinds `f`, which held the model formula string in the Reliability section.
f = 0.05
# Number of repetitions of the topic sampling in error_rate().
m = 200

###### Method Implementation

In [None]:
def error_rate(data, fuzziness=None, n_iterations=None, random_state=None):
    """Estimate the error rate of pairwise run comparisons as a function of topic-set size.

    For every repetition and every topic-set size ``n`` (from 5 up to the number of
    distinct topics) a random set of ``n`` distinct topics is drawn. For each pair of runs
    the topics on which one run beats the other by more than the fuzziness margin are
    counted; the smaller of the two counts divided by ``n`` is that pair's error rate.
    Rates are averaged over pairs, then over repetitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows with the columns ``Topic``, ``Run`` and ``Score``.
    fuzziness : float, optional
        Relative margin a score must exceed to count as better. Defaults to the
        notebook-level ``f``.
    n_iterations : int, optional
        Number of repetitions per topic-set size. Defaults to the notebook-level ``m``.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for reproducible topic sampling. ``None`` keeps using NumPy's
        global random state.

    Returns
    -------
    pandas.Series
        Unnamed series of mean error rates indexed by ``N`` (topic-set size). Empty when
        there are fewer than five topics or fewer than two runs.
    """
    fuzz = f if fuzziness is None else fuzziness
    iterations = m if n_iterations is None else n_iterations
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Sample from the DISTINCT topics. The Topic column repeats every topic once per run,
    # so sampling its rows directly would usually yield fewer than n different topics
    # while the error rate is still divided by n.
    topic_pool = pd.Series(data.Topic.dropna().unique())
    runs = data.Run.unique()

    result = []
    for i in range(iterations):
        for n in range(5, len(topic_pool) + 1):
            print(i, n, end=" \r")
            topics = topic_pool.sample(n=n, random_state=rng)
            subset = data[data.Topic.isin(topics)]
            # Build each run's topic-indexed scores once instead of once per pair.
            scores = {run: subset[subset.Run == run].set_index('Topic').Score for run in runs}
            for x, y in itertools.combinations(runs, 2):
                X, Y = scores[x], scores[y]
                result.append({
                    'X': x,
                    'Y': y,
                    'N': n,
                    'M': i,
                    'X Better': int((X > (Y + Y * fuzz)).sum()),
                    'Y Better': int((Y > (X + X * fuzz)).sum()),
                })

    if not result:
        return pd.Series(dtype=float, index=pd.Index([], name='N'))

    outcomes = pd.DataFrame(result)
    pair_error = outcomes[['X Better', 'Y Better']].min(axis=1) / outcomes['N']
    rates = (
        pair_error
        .groupby([outcomes['N'], outcomes['M']])
        .mean()               # average over run pairs within one sampled topic set
        .groupby(level='N')
        .mean()               # average over repetitions
    )
    # Callers wrap the result in a DataFrame and expect the default column label 0.
    rates.name = None
    return rates

###### Applied to Data

In [None]:
# Long-format score table: one row per (TREC, Topic, K, Run, Metric), excluding run 'uogTrB47Vm'.
tmp = pd.melt(
    df.loc[df.Run != 'uogTrB47Vm', ['TREC', 'K', 'Topic', 'Run', 'NDCG_org', 'NDCG_0', 'NDCG_min']],
    id_vars=['TREC', 'Topic', 'K', 'Run'],
    var_name='Metric',
    value_name='Score',
).drop_duplicates()

# One work item per (metric, TREC, K) slice, numbered consecutively.
data_pool = []

i = 0
for metric, trec, k in itertools.product(tmp.Metric.unique(), tmp.TREC.unique(), tmp.K.unique()):
    in_slice = (tmp.K == k) & (tmp.TREC == trec) & (tmp.Metric == metric)
    data_pool.append((i, tmp.loc[in_slice, :].reset_index()))
    i += 1

In [None]:
from multiprocessing.pool import ThreadPool

def execution_wrapper(data):
    """Run error_rate on one work item and tag the result with the slice it came from.

    Parameters
    ----------
    data : tuple
        ``(index, frame)`` pair; only the frame (second element) is used.

    Returns
    -------
    pandas.DataFrame
        The error rates plus ``TREC``, ``K`` and ``Metric`` columns, each filled with the
        first distinct value of that column in the frame.
    """
    subset = data[1]
    tagged = pd.DataFrame(error_rate(subset))
    for column in ('TREC', 'K', 'Metric'):
        tagged[column] = subset[column].unique()[0]
    return tagged

# Evaluate all work items on a pool of 8 worker threads. The context manager shuts the
# pool down once map() has returned every result, so no idle workers are left behind.
with ThreadPool(8) as threads:
    results = threads.map(execution_wrapper, data_pool)

# Flatten the per-slice results into one tidy frame: TREC, K, Metric, N, Error Rate.
tmp = (
    pd.concat(results)
    .reset_index()
    .rename({0: 'Error Rate'}, axis=1)
    .loc[:, ['TREC', 'K', 'Metric', 'N', 'Error Rate']]
    .reset_index(drop=True)
)

###### Output

In [None]:
# Error rate against topic-set size, one panel per (TREC, K). All metrics are drawn in
# black and told apart by line style.
kw = {"color": ["k"] * 3, "linestyle": ["-", "--", ":"]}
g = sns.FacetGrid(
    tmp.rename(columns={"N": "Number of Topics"}),
    row="TREC",
    col="K",
    hue="Metric",
    hue_kws=kw,
    legend_out=True,
    height=1.75,
    aspect=1.5,
)
g = g.map(plt.plot, "Number of Topics", "Error Rate").add_legend()