# Evaluation

## Imports

In [1]:
import os
from pathlib import Path
import codecs
import re
from math import sqrt
from statistics import mean, stdev

import pandas as pd
from scipy.stats import ttest_ind
from tabulate import tabulate
pd.set_option('display.max_rows', 5)

## Workspace Directories

In [2]:
# Identifier of the experiment run whose evaluation files are read below.
TIMESTAMP = "2020-05-06-04-13"

# All paths are derived from the directory the notebook kernel runs in.
NOTEBOOKS_DIR = Path.cwd()
WORKSPACE_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = WORKSPACE_DIR.joinpath("data")
# Strict resolution fails early if the experiment directory does not exist.
EVALUATION_DIR = DATA_DIR.joinpath("experiments", TIMESTAMP).resolve(strict=True)
# Generated LaTeX tables are written here.
OUTPUT_DIR = DATA_DIR.joinpath("figures")
OUTPUT_DIR.mkdir(exist_ok=True)
OUTPUT_DIR

WindowsPath('C:/Users/Heinrich/Development/uni-halle-big-data/sigir20-sampling-bias-due-to-near-duplicates-in-learning-to-rank/data/figures')

## Name mappings

In [3]:
# Maps the evaluation identifier (the suffix of the JSON-Lines file name and
# the key passed to `table`/`write_table`) to the LaTeX caption used as the
# table's value-column header. `\ndcg` and `\map` are macros defined in the
# preamble emitted by `write_table`.
evaluation_names = {
    "ndcg@10-per-topic": r"\ndcg{10}~Performance",
    "ndcg@20-per-topic": r"\ndcg{20}~Performance",
    "ndcg-per-topic": r"\ndcg{}~Performance",
    "map-per-topic": r"\map{}~Performance",
    "first-wikipedia-rank-per-topic": r"Mean First Rank of Wikipedia Documents",
    "first-irrelevant-wikipedia-rank-per-topic": r"Mean First Rank of Irrelevant Wikipedia Documents",
    # NOTE(review): the two "duplicate" evaluations below reuse the "Wikipedia"
    # captions verbatim. This looks like a copy-paste leftover (one would
    # expect "Duplicate Documents") -- confirm before publishing the tables.
    "first-duplicate-rank-per-topic": r"Mean First Rank of Wikipedia Documents",
    "first-irrelevant-duplicate-rank-per-topic": r"Mean First Rank of Irrelevant Wikipedia Documents",
    "domain-fairness-per-topic": r"Fairness of Exposure Across Domains"
}

# Raw corpus identifier -> LaTeX display label.
# The insertion order of each mapping in this cell also defines the sort order
# of the corresponding ordered categorical dtype built via `categorical_type`.
corpus_names = {
    "clueweb09": r"ClueWeb~09",
    "gov2": r"GOV2"
}

# Raw `runSampling` value -> LaTeX display label.
run_sampling_names = {
    "identity": r"Duplicates Unmodified",
    "duplicates-irrelevant": r"Duplicates Irrelevant",
    "remove-duplicates": r"Duplicates Removed"
}

# Raw `ranker` value -> LaTeX display label.
# Rankers missing from this mapping (e.g. "rank-net", which occurs in the raw
# data) are mapped to "" and dropped by `get_evaluation_labeled`.
ranker_names = {
    "bm25": r"BM25",
    "ada-rank": r"AdaRank",
    "coordinate-ascent": r"Coor.~Ascent",
    "lambda-mart": r"LambdaMART",
    "list-net": r"ListNET",
    "rank-boost": r"RankBoost",
    "linear-regression": r"Regression"
}

# (underSampling, overSampling, featureMutation) triple -> LaTeX display label.
# Two different under-sampling strategies share the label "0\,\%", so after
# labeling they are indistinguishable. Triples that are commented out or absent
# are mapped to "" and dropped by `get_evaluation_labeled`.
sampling_names = {
    ("identity", "identity", "identity"): r"100\,\%",
    ("no-wikipedia-redundancy", "identity", "identity"): r"0\,\%",
    ("filter-canonical", "identity", "identity"): r"0\,\%",
#     ("identity", "identity", "novelty-relevance-feedback-null"): r"NOV\textsubscript{0}",
#     ("identity", "identity", "novelty-relevance-feedback-null-novelty-feature"): r"NOV\textsubscript{0,F}",
#     ("identity", "identity", "novelty-relevance-feedback-scale"): r"NOV\textsubscript{S}",
    ("identity", "identity", "novelty-relevance-feedback-scale-novelty-feature"): r"NOV\textsubscript{S,F}",
}

# Raw `trainTestSplit` value -> LaTeX display label.
# All folds of one cross-validation share a single label; because
# `get_evaluation_aggregated` groups by the label, the per-topic results of
# those folds are concatenated into one list per table row.
# Commented-out entries are alternative labelings that are currently disabled;
# splits without an active entry are dropped by `get_evaluation_labeled`.
split_names = {
    "most-redundant-training": r"Worst-Case Scenario",
#     "3-fold-cross-validation-1": r"3-Fold Cross Validation",
#     "3-fold-cross-validation-2": r"3-Fold Cross Validation",
#     "3-fold-cross-validation-3": r"3-Fold Cross Validation",
    "5-fold-cross-validation-1": r"5-Fold Cross Validation",
    "5-fold-cross-validation-2": r"5-Fold Cross Validation",
    "5-fold-cross-validation-3": r"5-Fold Cross Validation",
    "5-fold-cross-validation-4": r"5-Fold Cross Validation",
    "5-fold-cross-validation-5": r"5-Fold Cross Validation",
    "clueweb09-mostredundanttraining": r"Worst-Case Scenario",
    "clueweb09-fold1": r"5-Fold Cross Validation",
    "clueweb09-fold2": r"5-Fold Cross Validation",
    "clueweb09-fold3": r"5-Fold Cross Validation",
    "clueweb09-fold4": r"5-Fold Cross Validation",
    "clueweb09-fold5": r"5-Fold Cross Validation",
    "letor-trec-millionquery2007-fold-1": r"5-Fold Cross Validation MQ\,2007",
    "letor-trec-millionquery2007-fold-2": r"5-Fold Cross Validation MQ\,2007",
    "letor-trec-millionquery2007-fold-3": r"5-Fold Cross Validation MQ\,2007",
    "letor-trec-millionquery2007-fold-4": r"5-Fold Cross Validation MQ\,2007",
    "letor-trec-millionquery2007-fold-5": r"5-Fold Cross Validation MQ\,2007",
    "letor-trec-millionquery2008-fold-1": r"5-Fold Cross Validation MQ\,2008",
    "letor-trec-millionquery2008-fold-2": r"5-Fold Cross Validation MQ\,2008",
    "letor-trec-millionquery2008-fold-3": r"5-Fold Cross Validation MQ\,2008",
    "letor-trec-millionquery2008-fold-4": r"5-Fold Cross Validation MQ\,2008",
    "letor-trec-millionquery2008-fold-5": r"5-Fold Cross Validation MQ\,2008",
    "trec-millionquery2007-fold1": r"5-Fold Cross Validation",
    "trec-millionquery2007-fold2": r"5-Fold Cross Validation",
    "trec-millionquery2007-fold3": r"5-Fold Cross Validation",
    "trec-millionquery2007-fold4": r"5-Fold Cross Validation",
    "trec-millionquery2007-fold5": r"5-Fold Cross Validation",
    "trec-millionquery2008-fold1": r"5-Fold Cross Validation",
    "trec-millionquery2008-fold2": r"5-Fold Cross Validation",
    "trec-millionquery2008-fold3": r"5-Fold Cross Validation",
    "trec-millionquery2008-fold4": r"5-Fold Cross Validation",
    "trec-millionquery2008-fold5": r"5-Fold Cross Validation"
#     "trec-millionquery2007-fold1": r"5-Fold Cross Validation MQ\,2007",
#     "trec-millionquery2007-fold2": r"5-Fold Cross Validation MQ\,2007",
#     "trec-millionquery2007-fold3": r"5-Fold Cross Validation MQ\,2007",
#     "trec-millionquery2007-fold4": r"5-Fold Cross Validation MQ\,2007",
#     "trec-millionquery2007-fold5": r"5-Fold Cross Validation MQ\,2007",
#     "trec-millionquery2008-fold1": r"5-Fold Cross Validation MQ\,2008",
#     "trec-millionquery2008-fold2": r"5-Fold Cross Validation MQ\,2008",
#     "trec-millionquery2008-fold3": r"5-Fold Cross Validation MQ\,2008",
#     "trec-millionquery2008-fold4": r"5-Fold Cross Validation MQ\,2008",
#     "trec-millionquery2008-fold5": r"5-Fold Cross Validation MQ\,2008"
}

# Known evaluation and corpus identifiers, in declaration order.
evaluations = [*evaluation_names]

corpora = [*corpus_names]

# Training metric whose models are kept for each evaluation. Evaluations
# without an explicit entry fall back to models trained for "ndcg@20".
evaluation_filter_metrics = {
    "ndcg@10-per-topic": "ndcg@10",
}
evaluation_filter_metrics = {
    evaluation_key: evaluation_filter_metrics.get(evaluation_key, "ndcg@20")
    for evaluation_key in evaluations
}

## Configuration

In [4]:
# Baselines are expressed as display labels because `table` compares them
# against the already-labeled `ranker` and `sampling` columns.
baseline_ranker = ranker_names["bm25"]
baseline_sampling = sampling_names[("identity",) * 3]

## Parse evaluation data frame

In [5]:
# Read from JSON-Lines file.
def get_evaluation_raw(name):
    """Load the raw evaluation results for one evaluation as a data frame.

    Parameters
    ----------
    name : str
        Evaluation identifier; the file read is
        ``EVALUATION_DIR / "evaluation-of-experiments-<name>.jsonl"``.

    Returns
    -------
    pandas.DataFrame
        One row per line of the JSON-Lines file.

    Raises
    ------
    FileNotFoundError
        If no evaluation file exists for ``name``.
    """
    file = EVALUATION_DIR / ("evaluation-of-experiments-" + name + ".jsonl")
    # Open explicitly as UTF-8 (the platform default encoding is not UTF-8
    # everywhere, e.g. on Windows) and close the handle deterministically
    # instead of leaking it to the garbage collector.
    with file.open(encoding="utf-8") as lines:
        return pd.read_json(lines, lines=True)

# Only print for debugging.
get_evaluation_raw(evaluations[0])

Unnamed: 0,corpus,trainTestSplit,ranker,metric,underSampling,overSampling,featureMutation,trial,runSampling,evaluation,test-set-result,train-set-result
0,clueweb09,clueweb09-fold2,rank-net,ndcg@20,filter-canonical,identity,novelty-relevance-feedback-null-novelty-feature,trial-0,remove-duplicates,ndcg@10-per-topic,"[0.0, 0.0, 0.09111153494792501, 0.138375429649...",
1,clueweb09,clueweb09-fold2,rank-net,ndcg@20,filter-canonical,identity,novelty-relevance-feedback-null-novelty-feature,trial-0,duplicates-irrelevant,ndcg@10-per-topic,"[0.0, 0.0, 0.09111153494792501, 0.138375429649...",
...,...,...,...,...,...,...,...,...,...,...,...,...
23038,gov2,trec-millionquery2008-fold5,list-net,ndcg@10,identity,identity,identity,trial-3,duplicates-irrelevant,ndcg@10-per-topic,"[0.5, 0.386852807234541, 0.93277838931011, 0.6...",
23039,gov2,trec-millionquery2008-fold5,list-net,ndcg@10,identity,identity,identity,trial-3,identity,ndcg@10-per-topic,"[0.5, 0.386852807234541, 0.93277838931011, 0.8...",


In [6]:
def get_evaluation(evaluation_name, corpus=None):
    """Load one evaluation and reduce it to the columns needed for the tables.

    The training-set results, the ``evaluation`` column and the ``trial``
    column are dropped. Rows are restricted to ``corpus`` (if given) and to
    the training metric configured in ``evaluation_filter_metrics``. The three
    sampling columns are merged into one ``sampling`` column holding
    ``(underSampling, overSampling, featureMutation)`` tuples.

    Parameters
    ----------
    evaluation_name : str
        Evaluation identifier, a key of ``evaluation_filter_metrics``.
    corpus : str, optional
        Raw corpus identifier. When given, only its rows are kept and the
        ``corpus`` column is removed; otherwise the column is retained.

    Returns
    -------
    pandas.DataFrame
    """
    sampling_columns = ["underSampling", "overSampling", "featureMutation"]

    # Columns that play no role in the tables.
    frame = get_evaluation_raw(evaluation_name).drop(
        columns=["train-set-result", "evaluation", "trial"]
    )

    # Optional restriction to a single corpus.
    if corpus:
        frame = frame.loc[frame["corpus"] == corpus].drop(columns=["corpus"])

    # Keep only models trained for the metric configured for this evaluation.
    wanted_metric = evaluation_filter_metrics[evaluation_name]
    frame = frame.loc[frame["metric"] == wanted_metric].drop(columns=["metric"])

    # Collapse the three sampling columns into a single tuple-valued column.
    frame["sampling"] = frame[sampling_columns].aggregate(tuple, axis=1)
    return frame.drop(columns=sampling_columns)

# Only print for debugging.
get_evaluation(evaluations[0], corpora[0])

Unnamed: 0,trainTestSplit,ranker,runSampling,test-set-result,sampling
90,clueweb09-fold2,rank-net,remove-duplicates,"[0.0, 0.0, 0.09111153494792501, 0.138375429649...","(filter-canonical, identity, novelty-relevance..."
91,clueweb09-fold2,rank-net,duplicates-irrelevant,"[0.0, 0.0, 0.09111153494792501, 0.138375429649...","(filter-canonical, identity, novelty-relevance..."
...,...,...,...,...,...
8638,clueweb09-fold4,list-net,duplicates-irrelevant,"[0.334022806616867, 0.0, 0.38156289940793703, ...","(identity, identity, identity)"
8639,clueweb09-fold4,list-net,identity,"[0.29131539076245705, 0.0, 0.251687887754597, ...","(identity, identity, identity)"


In [7]:
def get_evaluation_labeled(evaluation_name, corpus=None):
    """Replace raw identifiers by display labels and drop unlabeled rows.

    Every value of the ``corpus`` (if present), ``trainTestSplit``,
    ``ranker``, ``runSampling`` and ``sampling`` columns is looked up in the
    matching ``*_names`` mapping. Values without an entry become ``""``, and
    rows containing such an empty label are removed.

    Parameters
    ----------
    evaluation_name : str
        Evaluation identifier, forwarded to ``get_evaluation``.
    corpus : str, optional
        Raw corpus identifier, forwarded to ``get_evaluation``.

    Returns
    -------
    pandas.DataFrame
    """
    labeled = get_evaluation(evaluation_name, corpus)

    # Column -> label mapping; the corpus column only exists when no single
    # corpus was selected.
    label_maps = []
    if "corpus" in labeled.columns:
        label_maps.append(("corpus", corpus_names))
    label_maps += [
        ("trainTestSplit", split_names),
        ("ranker", ranker_names),
        ("runSampling", run_sampling_names),
        ("sampling", sampling_names),
    ]

    # Map names.
    for column, names in label_maps:
        labeled[column] = labeled[column].map(
            lambda raw, names=names: names.get(raw, "")
        )

    # Filter empty (ignored) names.
    for column, _ in label_maps:
        labeled = labeled[labeled[column] != ""]

    return labeled

# Only print for debugging.
get_evaluation_labeled(evaluations[0], corpora[0])

Unnamed: 0,trainTestSplit,ranker,runSampling,test-set-result,sampling
300,5-Fold Cross Validation,Regression,Duplicates Removed,"[0.562913917456436, 0.0, 0.023363055231470004,...","0\,\%"
301,5-Fold Cross Validation,Regression,Duplicates Irrelevant,"[0.562913917456436, 0.0, 0.023363055231470004,...","0\,\%"
...,...,...,...,...,...
8638,5-Fold Cross Validation,ListNET,Duplicates Irrelevant,"[0.334022806616867, 0.0, 0.38156289940793703, ...","100\,\%"
8639,5-Fold Cross Validation,ListNET,Duplicates Unmodified,"[0.29131539076245705, 0.0, 0.251687887754597, ...","100\,\%"


In [8]:
def categorical_type(categories):
    """Build an ordered categorical dtype from possibly repeated labels.

    Duplicates are removed while the order of first occurrence is kept; that
    order becomes the category order of the returned dtype.

    Parameters
    ----------
    categories : iterable of hashable
        Category labels, possibly containing repetitions.

    Returns
    -------
    pandas.api.types.CategoricalDtype
        Ordered dtype whose categories are the distinct labels.
    """
    # A dict keeps its keys in insertion order, so this de-duplicates stably.
    distinct_labels = list(dict.fromkeys(categories))
    return pd.api.types.CategoricalDtype(categories=distinct_labels, ordered=True)

# Categories: one ordered dtype per label column, ordered like the label
# mappings they are derived from.
(
    corpus_categorical_type,
    split_categorical_type,
    ranker_categorical_type,
    run_sampling_categorical_type,
    sampling_categorical_type,
) = (
    categorical_type(names.values())
    for names in (
        corpus_names,
        split_names,
        ranker_names,
        run_sampling_names,
        sampling_names,
    )
)

def get_evaluation_aggregated(evaluation_name, corpus=None):
    """Return labeled results with one row per table cell.

    The label columns are cast to their ordered categorical dtypes, the frame
    is sorted by them, and all rows sharing the same labels are merged by
    concatenating their ``test-set-result`` lists.

    Parameters
    ----------
    evaluation_name : str
        Evaluation identifier, forwarded to ``get_evaluation_labeled``.
    corpus : str, optional
        Raw corpus identifier, forwarded to ``get_evaluation_labeled``.

    Returns
    -------
    pandas.DataFrame
        The label columns followed by the merged ``test-set-result`` column.
    """
    labeled = get_evaluation_labeled(evaluation_name, corpus)

    group_columns = ["trainTestSplit", "ranker", "runSampling", "sampling"]
    dtypes = {
        "trainTestSplit": split_categorical_type,
        "ranker": ranker_categorical_type,
        "runSampling": run_sampling_categorical_type,
        "sampling": sampling_categorical_type,
    }
    # The corpus column is only present when no single corpus was selected;
    # then it becomes the outermost sort and grouping key.
    if "corpus" in labeled.columns:
        dtypes["corpus"] = corpus_categorical_type
        group_columns = ["corpus"] + group_columns

    # Make types categorical, then sort.
    ordered = labeled.astype(dtypes).sort_values(by=group_columns)

    # Aggregate trials: flatten the per-row result lists of each group.
    # NOTE(review): the grouping keys are categorical, so how unobserved label
    # combinations are treated depends on pandas' `observed` default -- verify
    # when upgrading pandas.
    merged = ordered.groupby(group_columns).aggregate(
        lambda lists: [item for sublist in lists for item in sublist]
    )
    return merged.dropna().reset_index()

# Only print for debugging.
get_evaluation_aggregated(evaluations[0], corpora[0])

Unnamed: 0,trainTestSplit,ranker,runSampling,sampling,test-set-result
0,Worst-Case Scenario,BM25,Duplicates Unmodified,"100\,\%","[0.06968312249684201, 0.0, 0.140698320123606, ..."
1,Worst-Case Scenario,BM25,Duplicates Unmodified,"0\,\%","[0.06968312249684201, 0.0, 0.140698320123606, ..."
...,...,...,...,...,...
124,5-Fold Cross Validation,Regression,Duplicates Removed,"0\,\%","[0.562913917456436, 0.0, 0.023363055231470004,..."
125,5-Fold Cross Validation,Regression,Duplicates Removed,"NOV\textsubscript{S,F}","[0.253991566674711, 0.0, 0.009088684028421001,..."


## Statistical utilities

In [9]:
# Significance level for the t-test below.
MAX_P_VALUE = 0.05

def significantly_better(compare, baseline):
    """Tell whether ``compare`` is significantly larger than ``baseline``.

    Runs ``scipy.stats.ttest_ind`` with its default settings and requires a
    positive test statistic together with a p-value of at most
    ``MAX_P_VALUE``.

    Parameters
    ----------
    compare, baseline : sequence of float
        The two samples to compare.

    Returns
    -------
    bool-like
        Truthy only if the statistic is positive and the p-value is small
        enough.
    """
    statistic, pvalue = ttest_ind(compare, baseline)
    return statistic > 0 and pvalue <= MAX_P_VALUE

def cohens_d(compare, baseline):
    """Compute Cohen's d effect size of ``compare`` relative to ``baseline``.

    The difference of the sample means is divided by the root of the average
    of the two sample variances.

    Parameters
    ----------
    compare, baseline : sequence of numbers
        The two samples; each needs at least two values.

    Returns
    -------
    float
        Positive if ``compare`` has the larger mean. If both samples have zero
        variance the quotient is undefined: ``0.0`` is returned for equal
        means and a signed infinity otherwise, instead of raising
        ``ZeroDivisionError``.

    Raises
    ------
    statistics.StatisticsError
        If a sample has fewer than two values (raised by ``stdev``).
    """
    mean_difference = mean(compare) - mean(baseline)
    pooled_deviation = sqrt((stdev(compare) ** 2 + stdev(baseline) ** 2) / 2)
    if pooled_deviation == 0:
        # Both samples are constant: no spread to normalize by.
        if mean_difference == 0:
            return 0.0
        return float("inf") if mean_difference > 0 else float("-inf")
    return mean_difference / pooled_deviation

## Generate LaTeX table from data frame

In [18]:
def empty_columns(n):
    """Return a list of ``n`` empty strings, used as blank table cells."""
    return ["" for _ in range(n)]

def table(name, corpus=None, decimals=3):
    """Render the aggregated results of one evaluation as a LaTeX table.

    The table has one row per (split, ranker) pair -- prefixed by the corpus
    when ``corpus`` is not given -- and one column per
    (run sampling, sampling) pair. Cells other than the baseline sampling show
    the mean result, the Cohen's d effect size against the baseline sampling
    and are set in bold when significantly better than it.

    Parameters
    ----------
    name : str
        Evaluation identifier, a key of ``evaluation_names``.
    corpus : str, optional
        Raw corpus identifier. When omitted, all corpora are included and a
        leading corpus column is added.
    decimals : int
        Number of decimal places used for the mean values.

    Returns
    -------
    str
        The table as produced by ``tabulate`` with the ``latex_raw`` format.
    """
    evaluation = get_evaluation_aggregated(name, corpus)
    
    # Distinct labels in order of first appearance in the sorted frame; they
    # determine the table's rows (rankers) and columns (samplings).
    rankers = evaluation["ranker"].unique()
    run_samplings = evaluation["runSampling"].unique()
    samplings = evaluation["sampling"].unique()


    def table_head():
        """First header row: bold column titles, padded with blank cells."""
        if not corpus:
            head = ["Corpus", "Split", "Algorithm"]
        else: 
            head = ["Split", "Algorithm"]
        head.append(evaluation_names[name])
        # The evaluation title occupies the first value column; the remaining
        # value columns stay blank.
        head += empty_columns(len(samplings) * len(run_samplings) - 1)
        head = list(map(lambda item : r"\textbf{" + item + r"}" if len(item) > 0 else item, head))
        return head
    
    
    def table_subhead():
        """Second header row: one run-sampling label per group of columns."""
        head = empty_columns(3 if not corpus else 2)
        for run_sampling in run_samplings:
            head.append(run_sampling)
            head += empty_columns(len(samplings) - 1)
        return head
    
    
    def table_subsubhead():
        """Third header row: the sampling labels, repeated per run sampling."""
        head = empty_columns(3 if not corpus else 2)
        for _ in run_samplings:
            for sampling in samplings:
                head.append(sampling)
        return head


    def table_cell(baseline, compare):
        """Format one non-baseline cell as inline math.

        Shows the mean of ``compare`` followed by the Cohen's d effect size
        against ``baseline`` (as ``\\updiff``, ``\\downdiff`` or ``\\nodiff``),
        wrapped in ``\\mathbf`` when ``compare`` is significantly better.
        """
        column = r"\("

        significant = significantly_better(compare, baseline)
        if significant:
            column += r"\mathbf{"

        column += ("{:." + str(decimals) + "f}").format(mean(compare))

        d = cohens_d(compare, baseline)
        if d > 0:
            column += r"\updiff{"
            column += "{:.1f}".format(d)
            column += r"}"
        elif d < 0:
            # The direction is carried by the macro, so print the magnitude.
            column += r"\downdiff{"
            column += "{:.1f}".format(-d)
            column += r"}"
        else:
            column += r"\nodiff{"
            column += "{:.1f}".format(d)
            column += r"}"

        if significant:
            column += r"}"

        column += r"\)"
        return column


    def table_row(split, split_tex, ranker, row_corpus=None):
        """Build the cells of one body row.

        ``split`` selects the data, ``split_tex`` is the text shown in the
        split column (blank for all but the first row of a split).

        NOTE(review): per run sampling the baseline cell is emitted first and
        the other samplings follow in ``samplings`` order, while the header
        lists ``samplings`` as-is; the two only line up if
        ``baseline_sampling`` is the first entry of ``samplings`` -- confirm.
        The ``.iloc[0]`` lookups fail if a label combination has no row.
        """
        if row_corpus:
            row = [row_corpus, split_tex, ranker]
        else:
            row = [split_tex, ranker]
        for run_sampling in run_samplings:
            # Narrow the frame down to the rows of this table cell group.
            df = evaluation
            if row_corpus:
                df = df[df["corpus"] == row_corpus]
            df = df[df["trainTestSplit"] == split]
            df = df[df["ranker"] == ranker]
            df = df[df["runSampling"] == run_sampling]
            if row_corpus:
                drop_columns = ["corpus", "trainTestSplit", "ranker", "runSampling"]
            else: 
                drop_columns = ["trainTestSplit", "ranker", "runSampling"]
            df = df.drop(columns=drop_columns)
            baseline_result = df[df["sampling"] == baseline_sampling]["test-set-result"].iloc[0]
            
            # The baseline cell shows the plain mean, without effect size.
            row.append(r"\(" + ("{:." + str(decimals) + "f}").format(mean(baseline_result)) + r"\)")
            for sampling in samplings:
                if sampling != baseline_sampling:
                    if ranker == baseline_ranker:
                        # We don't see sampling differences in BM25 Ranking, 
                        # as those don't depend on training data.
                        # Therefore hide all except the first.
                        row.append(r"---")
                    else:
                        compare_result = df[df["sampling"] == sampling]["test-set-result"].iloc[0]
                        row.append(table_cell(baseline_result, compare_result))
        return row


    def table_rows():
        """Build all body rows, grouped by corpus (if shown) and split."""
        def split_rotated(split_name, num_rankers):
            """Wrap the split label in a rotated, bold ``\\multirow`` cell
            spanning ``num_rankers`` rows."""
            return r"\multirow{" + str(num_rankers) +\
                r"}{*}{\rotatebox[origin=c]{90}{\parbox[c]{" +\
                str(num_rankers + 1) +\
                r"em}{\centering \textbf{" + split_name + "}}}}"
        
        rows = []
        if not corpus:
            for corp in evaluation["corpus"].unique():
                corpus_df = evaluation[evaluation["corpus"] == corp]
                for split in corpus_df["trainTestSplit"].unique():
                    split_tex = split_rotated(split, len(rankers))
                    for ranker in rankers:
                        rows.append(table_row(split, split_tex, ranker, corp))
                        # Only the first row of a split carries the label.
                        split_tex = ""
        else:
            for split in evaluation["trainTestSplit"].unique():
                split_tex = split_rotated(split, len(rankers))
                for ranker in rankers:
                    rows.append(table_row(split, split_tex, ranker))
                    # Only the first row of a split carries the label.
                    split_tex = ""
        return rows


    # Three header rows followed by the body rows.
    table_data = [
        table_head(),
        table_subhead(),
        table_subsubhead()
    ] + table_rows()

    return tabulate(table_data, tablefmt="latex_raw")

def write_table(evaluation, corpus=None, decimals=3):
    """Render one evaluation table and write it as a standalone LaTeX file.

    The file is written to ``OUTPUT_DIR`` and named
    ``<corpus>-<evaluation>.tex``, or ``<evaluation>.tex`` when no corpus is
    given. It contains a minimal preamble defining the macros used by the
    table cells, followed by the table itself.

    Parameters
    ----------
    evaluation : str
        Evaluation identifier, forwarded to ``table``.
    corpus : str, optional
        Raw corpus identifier, forwarded to ``table``. ``None`` (the default)
        produces a table covering all corpora.
    decimals : int
        Number of decimal places, forwarded to ``table``.
    """
    # Only prefix the file name when a corpus is given; concatenating the
    # default ``None`` directly would raise a TypeError.
    prefix = corpus + "-" if corpus else ""
    file_name = OUTPUT_DIR / (prefix + evaluation + ".tex")

    # Render before opening the file, so that a failure while building the
    # table does not leave a truncated output file behind.
    content = table(evaluation, corpus, decimals)
    # Collapse the column padding added by tabulate.
    content = re.sub(r"\s+&\s+", " & ", content)
    content = re.sub(r"\s+\\\\", r" \\\\", content)

    preamble_lines = [
        r"\documentclass[preview]{standalone}",
        r"\usepackage{amsmath}",
        r"\usepackage{graphicx}",
        r"\newcommand{\ndcg}[1]{nDCG\def\tempndcg{#1}\ifx\tempndcg\empty\else{@}\tempndcg\fi}",
        r"\newcommand{\map}{MAP}",
        r"\newcommand{\updiff}[1]{^{\text{↑}#1}}",
        r"\newcommand{\downdiff}[1]{^{\text{↓}#1}}",
        r"\newcommand{\nodiff}[1]{^{\text{=}#1}}",
        r"\begin{document}",
    ]

    with codecs.open(file_name, 'w', 'utf-8') as file:
        file.write("\n".join(preamble_lines) + "\n")
        file.write(content)
        file.write(r"\end{document}")

## Generate tables

In [19]:
# Fairness-of-exposure tables, one per corpus.
# The commented-out call is the disabled variant covering all corpora.
# write_table("domain-fairness-per-topic")
write_table("domain-fairness-per-topic", corpus="gov2")
write_table("domain-fairness-per-topic", corpus="clueweb09")

In [20]:
# Retrieval effectiveness tables (nDCG@10 and nDCG@20), one per corpus.
# Commented-out calls are disabled variants (MAP, and tables covering all
# corpora).
# write_table("map-per-topic")
# write_table("map-per-topic", corpus="gov2")
# write_table("map-per-topic", corpus="clueweb09")
# write_table("ndcg@10-per-topic")
write_table("ndcg@10-per-topic", corpus="gov2")
write_table("ndcg@10-per-topic", corpus="clueweb09")
# write_table("ndcg@20-per-topic")
write_table("ndcg@20-per-topic", corpus="gov2")
write_table("ndcg@20-per-topic", corpus="clueweb09")

In [21]:
# First-rank tables; means are printed without decimal places (decimals=0).
# Commented-out calls are disabled variants.
# write_table("first-wikipedia-rank-per-topic", decimals=0, corpus="clueweb09")
write_table("first-irrelevant-wikipedia-rank-per-topic", decimals=0, corpus="clueweb09")
# write_table("first-duplicate-rank-per-topic", decimals=0)
# write_table("first-duplicate-rank-per-topic", decimals=0, corpus="gov2")
# write_table("first-duplicate-rank-per-topic", decimals=0, corpus="clueweb09")
# write_table("first-irrelevant-duplicate-rank-per-topic", decimals=0)
write_table("first-irrelevant-duplicate-rank-per-topic", decimals=0, corpus="gov2")
write_table("first-irrelevant-duplicate-rank-per-topic", decimals=0, corpus="clueweb09")