In [1]:
from typing import Dict, Optional

import numpy as np
import pandas as pd

# Render every float in displayed frames/series with three decimals.
pd.set_option("display.float_format", "{:.3f}".format)


In [2]:
# Load the three pickled frames used throughout the notebook:
#   sentences - provides the `avg` column that add_info attaches as `gt`
#   pairs     - retrieval output labelled "BERTa" in the results table
#   pairs_QA  - retrieval output labelled "multi-qa" in the results table
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
sentences, pairs, pairs_QA = (
    pd.read_pickle("samples/df_sentences.pkl"),
    pd.read_pickle("samples/pairs_sentences.pkl"),
    pd.read_pickle("samples/pairs_sentences_QA.pkl"),
)


In [3]:
# Raw layout: column 0 mirrors the row index, column 1 holds five candidate
# ids and column 2 the matching scores (renamed `top5` / `score` by add_info).
pairs.head(n=5)


Unnamed: 0,0,1,2
0,0,"[200, 114, 314, 324, 124]","[0.96460503, 0.7580036, 0.74133974, 0.73496366..."
1,1,"[201, 389, 347, 165, 230]","[0.87307894, 0.84328634, 0.8360214, 0.8266194,..."
2,2,"[202, 363, 271, 118, 163]","[0.98577964, 0.8576993, 0.8569913, 0.85006714,..."
3,3,"[203, 240, 121, 40, 321]","[0.9916861, 0.8930633, 0.869796, 0.8686416, 0...."
4,4,"[204, 99, 184, 94, 312]","[0.90511703, 0.83434254, 0.82449657, 0.8137258..."


In [4]:
def add_info(
    df: pd.DataFrame, sentences_df: Optional[pd.DataFrame] = None
) -> pd.DataFrame:
    """Annotate raw retrieval results with ground truth and hit flags.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df
        Raw results with integer column labels: ``0`` (dropped), ``1`` (list of
        candidate ids, renamed ``top5``) and ``2`` (their scores, renamed
        ``score``). Row labels are used as the query ids.
    sentences_df
        Frame with an ``avg`` column. Defaults to the notebook-level
        ``sentences`` frame so existing one-argument calls keep working.

    Returns
    -------
    pd.DataFrame
        Columns ``top5``, ``score``, ``gt``, ``norm``, ``hit@1``, ``hit@5``.
        ``norm`` holds ``|candidate - query| - n_sentences`` per candidate, so
        a value of 0 marks the candidate sitting exactly ``n_sentences`` rows
        away from the query.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly two rows per row of ``sentences_df``.
    """
    if sentences_df is None:
        # Backward-compatible default: the frame loaded earlier in the notebook.
        sentences_df = sentences
    n_sentences = sentences_df.shape[0]

    # Fail loudly instead of letting a length mismatch pad columns with NaN.
    if df.shape[0] != 2 * n_sentences:
        raise ValueError(
            f"expected {2 * n_sentences} rows (two per sentence), got {df.shape[0]}"
        )

    out = df.drop(columns=0).rename(columns={1: "top5", 2: "score"})
    # Rows i and i + n_sentences are paired, so `avg` is tiled twice.
    out["gt"] = 2 * sentences_df["avg"].tolist()
    out["top5"] = out["top5"].apply(np.array)

    # Computed row by row rather than through object-array/Index arithmetic.
    norm = [
        np.abs(candidates - query_id) - n_sentences
        for query_id, candidates in zip(out.index, out["top5"])
    ]
    out["norm"] = norm
    # hit@1: the best-ranked candidate is the partner; an empty list is a miss.
    out["hit@1"] = [bool(row.size and row[0] == 0) for row in norm]
    # hit@5: the partner appears anywhere among the candidates.
    out["hit@5"] = [bool((row == 0).any()) for row in norm]
    return out


In [5]:
# Annotate the full BERTa frame, then keep the first five rows for inspection.
sample = add_info(pairs).head(n=5)
sample


Unnamed: 0,top5,score,gt,norm,hit@1,hit@5
0,"[200, 114, 314, 324, 124]","[0.96460503, 0.7580036, 0.74133974, 0.73496366...",3.0,"[0, -86, 114, 124, -76]",True,True
1,"[201, 389, 347, 165, 230]","[0.87307894, 0.84328634, 0.8360214, 0.8266194,...",2.0,"[0, 188, 146, -36, 29]",True,True
2,"[202, 363, 271, 118, 163]","[0.98577964, 0.8576993, 0.8569913, 0.85006714,...",4.0,"[0, 161, 69, -84, -39]",True,True
3,"[203, 240, 121, 40, 321]","[0.9916861, 0.8930633, 0.869796, 0.8686416, 0....",5.0,"[0, 37, -82, -163, 118]",True,True
4,"[204, 99, 184, 94, 312]","[0.90511703, 0.83434254, 0.82449657, 0.8137258...",3.0,"[0, -105, -20, -110, 108]",True,True


In [6]:
def get_metrics(df: pd.DataFrame) -> Dict:
    """Collapse an annotated retrieval frame into a flat dict of summary metrics.

    Reads the ``score``, ``gt``, ``hit@1`` and ``hit@5`` columns.

    Returned keys, in insertion order:
      * ``max_score``    - mean of the first score of each row
      * ``other_scores`` - mean over score positions 1 to 4 of every row
      * ``hit@1`` / ``hit@5`` - mean of the respective boolean column
      * ``miss@1_sum`` / ``miss@5_sum`` - number of rows where the flag is False
      * ``hit@k_gt`` / ``miss@k_gt`` - mean ``gt`` over hit / missed rows
        (NaN when the subset is empty)
      * ``scores_diff`` - ``max_score`` minus ``other_scores``; added last
    """
    n_rows = df.shape[0]
    hit1 = df["hit@1"]
    hit5 = df["hit@5"]

    # `.str` is used here as an element-wise indexer over the per-row sequences.
    best_score = df["score"].str.get(0)
    runner_up_scores = df["score"].str.slice(1, 5).tolist()

    averages = {
        "max_score": best_score.mean(),
        "other_scores": np.mean(runner_up_scores),
        "hit@1": hit1.mean(),
        "hit@5": hit5.mean(),
        "miss@1_sum": n_rows - hit1.sum(),
        "miss@5_sum": n_rows - hit5.sum(),
        "hit@1_gt": df.loc[hit1, "gt"].mean(),
        "hit@5_gt": df.loc[hit5, "gt"].mean(),
        "miss@1_gt": df.loc[~hit1, "gt"].mean(),
        "miss@5_gt": df.loc[~hit5, "gt"].mean(),
    }
    gap = averages["max_score"] - averages["other_scores"]
    averages["scores_diff"] = gap
    return averages


In [7]:
# Sanity-check the metric computation on the five-row sample.
get_metrics(df=sample)


{'max_score': 0.9440532922744751,
 'other_scores': 0.8222481,
 'hit@1': 1.0,
 'hit@5': 1.0,
 'miss@1_sum': 0,
 'miss@5_sum': 0,
 'hit@1_gt': 3.4,
 'hit@5_gt': 3.4,
 'miss@1_gt': nan,
 'miss@5_gt': nan,
 'scores_diff': 0.12180519104003906}

In [8]:
# Build one metrics row per model and concatenate once, instead of growing a
# frame inside the loop from an empty DataFrame.
metric_rows = []
for model_name, model_pairs in (("BERTa", pairs), ("multi-qa", pairs_QA)):
    metrics = get_metrics(add_info(model_pairs))
    metric_rows.append(pd.DataFrame(metrics, index=[model_name]))

final_df = pd.concat(metric_rows)

# get_metrics adds "scores_diff" last; move it to position 2 so it sits right
# after the two scores it is derived from.
cols = final_df.columns[:-1].insert(2, "scores_diff")
final_df = final_df[cols]


In [9]:
# First seven columns: scores, score gap, hit rates and miss counts.
final_df.iloc[:, 0:7]


Unnamed: 0,max_score,other_scores,scores_diff,hit@1,hit@5,miss@1_sum,miss@5_sum
BERTa,0.912,0.8,0.112,0.948,0.988,21,5
multi-qa,0.742,0.553,0.189,0.87,0.943,52,23


In [10]:
# Remaining columns: mean `gt` over hit and missed rows.
final_df.iloc[:, 7 : final_df.shape[1]]


Unnamed: 0,hit@1_gt,hit@5_gt,miss@1_gt,miss@5_gt
BERTa,2.677,2.653,2.044,1.884
multi-qa,2.722,2.666,2.122,2.279
