# Preparation

## Functions

### Load the model dataframe

In [1]:
import pandas as pd
import glob
import os
import sys

def load_model_df(model_rankings='monoT5', results_root='../indexing/results', plot_path='./plots'):
    """Load all ranking CSVs of one ranker into a single dataframe.

    Args:
        model_rankings: Name of the sub-directory of ``results_root`` that
            holds the ``*.csv`` files to load.
        results_root: Directory that contains the ranker sub-directories.
        plot_path: Directory that is created if it does not exist yet.

    Returns:
        The row-wise concatenation of all CSVs with these changes:
        ``model`` is the file name up to its first dot with ``"_rankings"``
        removed, ``is_natural_question`` flags queries ending in ``"?"``,
        ``weighted_position`` is replaced by ``1 - weighted_position`` and
        missing ``text`` values are replaced by the empty string.

    Raises:
        ValueError: If no CSV file matches the search pattern.
    """
    csv_pattern = os.path.join(results_root, model_rankings, '*.csv')
    # Sorted so the row order does not depend on the directory listing order.
    all_model_csvs = sorted(glob.glob(csv_pattern))

    os.makedirs(plot_path, exist_ok=True)

    if not all_model_csvs:
        # pd.concat([]) would fail with an unspecific "No objects to concatenate".
        raise ValueError(f'No ranking CSVs found for pattern {csv_pattern!r}')

    model_dfs = []
    for model_csv in all_model_csvs:
        df = pd.read_csv(model_csv)
        # basename() copes with any path separator, unlike splitting on "/".
        model_name = os.path.basename(model_csv).split(".")[0].replace("_rankings", "")
        df["model"] = model_name
        model_dfs.append(df)

    all_model_df = pd.concat(model_dfs, ignore_index=True)
    all_model_df['is_natural_question'] = all_model_df['query'].str.endswith('?')
    all_model_df['weighted_position'] = 1 - all_model_df['weighted_position']

    # replace na answers with empty string
    all_model_df['text'] = all_model_df['text'].fillna('')
    return all_model_df

### Load the expert annotations

In [2]:
import json

def get_annotations(path="expert-annotations.json"):
    """Read the annotation export and return the parsed JSON content.

    Args:
        path: Location of the JSON file; defaults to the file next to the
            notebook.

    Returns:
        Whatever ``json.load`` produces for the file.
    """
    # Explicit UTF-8: the platform default encoding may not be able to decode
    # non-ASCII characters in the annotations.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)

### Parse the annotations

#### Helper

In [3]:
def _get_answer_components(answer_id):
    splits = answer_id.split("_")
    model = "_".join(splits[:-3])
    qid = splits[-3]
    answer_number = splits[-1]
    return model, int(qid), int(answer_number)

def _clean_dict(result_dict):
    """Re-key a dict of answer ids by model name.

    Keys that cannot be parsed by ``_get_answer_components`` are stored under
    the key ``"document"``. If several keys map to the same model, the value
    of the last one wins.

    Args:
        result_dict: Mapping from answer id to an arbitrary value.

    Returns:
        A new dict mapping model name (or ``"document"``) to the value.
    """
    results = {}
    for k, v in result_dict.items():
        try:
            model, _, _ = _get_answer_components(k)
        except (AttributeError, IndexError, ValueError):
            # Only parsing failures mark a key as a document; a bare except
            # would also swallow KeyboardInterrupt and SystemExit.
            model = "document"
        results[model] = v
    return results

#### Get the expert ranks

In [4]:
def get_expert_ranks(rankings):
    """Turn an ordered list of answer ids into a model/rank dataframe.

    The position in ``rankings`` (starting at 1) becomes ``expert_rank``.

    Args:
        rankings: Answer ids ordered from best to worst.

    Returns:
        Dataframe with columns ``model`` and ``expert_rank``, sorted by
        ``expert_rank``. Empty input yields an empty dataframe with the same
        columns instead of a ``KeyError`` from ``sort_values``.
    """
    rank_dict = _clean_dict({answer_id: i + 1 for i, answer_id in enumerate(rankings)})
    records = [{"model": k, "expert_rank": v} for k, v in rank_dict.items()]
    # Explicit columns keep the frame sortable and mergeable when it has no rows.
    return pd.DataFrame(records, columns=["model", "expert_rank"]).sort_values("expert_rank")

#### Get the monoT5 scores

In [5]:
def get_monoT5_scores(items):
    """Collect the score of every item into a model/score dataframe.

    Args:
        items: Iterable of dicts that each provide ``"docno"`` (the answer id)
            and ``"score"``.

    Returns:
        Dataframe with columns ``model`` and ``monoT5_score``, sorted by
        descending score. Empty input yields an empty dataframe with the same
        columns instead of a ``KeyError`` from ``sort_values``.
    """
    score_dict = _clean_dict({item["docno"]: item["score"] for item in items})
    records = [{"model": k, "monoT5_score": v} for k, v in score_dict.items()]
    # Explicit columns keep the frame sortable and mergeable when it has no rows.
    return pd.DataFrame(records, columns=["model", "monoT5_score"]).sort_values(
        "monoT5_score", ascending=False)

### Get the information about query and answer

In [6]:
def get_ids(rankings, df):
    """Build a dataframe with model, qid and answer number per ranked id.

    Ids that ``_get_answer_components`` cannot parse are listed as model
    ``"document"`` without an answer number; their qid is copied from the
    neighbouring parsed ids of the same ranking.

    Args:
        rankings: Answer ids of one query.
        df: Unused; kept so existing calls keep working.

    Returns:
        Dataframe with columns ``model``, ``qid`` and ``answer_number``, one
        row per entry of ``rankings`` in the same order.
    """
    results = []
    for answer_id in rankings:
        try:
            model, qid, answer_number = _get_answer_components(answer_id)
        except (AttributeError, IndexError, ValueError):
            # Not an "<model>_<qid>_<x>_<n>" id: treat it as the document.
            model, qid, answer_number = "document", None, None
        results.append({
            "model": model,
            "qid": qid,
            "answer_number": answer_number,
        })
    ids_df = pd.DataFrame(results, columns=["model", "qid", "answer_number"])
    # ffill alone leaves the qid empty when the document is ranked first;
    # the additional bfill covers that leading position.
    ids_df["qid"] = ids_df["qid"].ffill().bfill()
    return ids_df

### Combined

In [7]:
def process_annotation_dict(q_dict, df):
    """Combine expert ranks, monoT5 scores and id details for one annotation.

    Reads the scored items from ``q_dict['data']['items']`` and the ranked
    answer ids from the first result of the first annotation, then joins the
    three derived frames on ``model``.

    Args:
        q_dict: One entry of the annotation export.
        df: Passed through to ``get_ids``.

    Returns:
        Dataframe with one row per model that is present in all three frames.
    """
    scored_items = q_dict['data']["items"]
    ranked_ids = q_dict["annotations"][0]["result"][0]["value"]["ranker"]["rank"]

    expert_df = get_expert_ranks(ranked_ids)
    score_df = get_monoT5_scores(scored_items)
    id_df = get_ids(ranked_ids, df)

    scores_with_ids = score_df.merge(id_df, on="model")
    return expert_df.merge(scores_with_ids, on="model")

In [8]:
def process_annotations(annotations, df):
    """Process every annotation and stack the per-query frames.

    Args:
        annotations: Iterable of annotation dicts as accepted by
            ``process_annotation_dict``.
        df: Passed through to ``process_annotation_dict``.

    Returns:
        One dataframe with a fresh 0..n-1 index; an empty dataframe if
        ``annotations`` is empty.
    """
    frames = [process_annotation_dict(q_dict, df) for q_dict in annotations]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # Single concat instead of growing the frame inside the loop.
    return pd.concat(frames, ignore_index=True)

# Application

In [9]:
# Load the model answers and the expert annotations once for the whole notebook.
df = load_model_df()
annotations = get_annotations()

## Example for one query

In [10]:
# Walk through the first annotation by hand: `items` carries the scored
# entries, `rankings` the ordered answer ids of the first annotation result.
q_dict = annotations[0]
items = q_dict['data']["items"]
rankings = q_dict["annotations"][0]["result"][0]["value"]["ranker"]["rank"]
rankings

['chatgpt_22_multimedqa_6',
 'meta-llama_Llama-2-13b-chat-hf_22_multimedqa_10',
 'gpt2-xl_22_multimedqa_10',
 '01f617ac-9919-44fe-8fc8-2db9ff72b5e7',
 'gpt2_22_multimedqa_9']

In [11]:
# Same three intermediate frames that process_annotation_dict builds.
df1 = get_expert_ranks(rankings)
df2 = get_monoT5_scores(items)
df3 = get_ids(rankings, df)

In [12]:
# Join ranks, scores and ids on the model name.
df1.merge(df2.merge(df3, on="model"), on="model").head()

Unnamed: 0,model,expert_rank,monoT5_score,qid,answer_number
0,chatgpt,1,-0.013656,22.0,6.0
1,meta-llama_Llama-2-13b-chat-hf,2,-0.010791,22.0,10.0
2,gpt2-xl,3,-0.094876,22.0,10.0
3,document,4,-0.024986,22.0,
4,gpt2,5,-5.417914,22.0,9.0


## Full annotations

In [13]:
# One row per (query, model) over all annotations.
result_df = process_annotations(annotations, df)
result_df.head()

Unnamed: 0,model,expert_rank,monoT5_score,qid,answer_number
0,chatgpt,1,-0.013656,22.0,6.0
1,meta-llama_Llama-2-13b-chat-hf,2,-0.010791,22.0,10.0
2,gpt2-xl,3,-0.094876,22.0,10.0
3,document,4,-0.024986,22.0,
4,gpt2,5,-5.417914,22.0,9.0


# Correlation

In [14]:
import os

import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import scipy.stats as st

In [15]:
# NOTE(review): these two globals do not appear to be read by any later cell
# shown here (get_correlation_df takes `measure` as a parameter and the loops
# below rebind `qid`) — confirm before removing.
qid = 95
measure = "monoT5_score"

In [16]:
def get_correlation_df(result_df, measure="monoT5_score", include_docs=True, inverse_ranking=True):
    """Compute Kendall's tau between expert rank and a measure for every query.

    Args:
        result_df: Dataframe with the columns ``qid``, ``model``,
            ``expert_rank`` and the column named by ``measure``.
        measure: Column that is correlated with ``expert_rank``.
        include_docs: If False, rows whose model is ``"document"`` are left out.
        inverse_ranking: If True the measure is negated before correlating, so
            that a higher measure value corresponds to a better (lower) rank.

    Returns:
        Dataframe with the columns ``qid``, ``tau``, ``p`` and ``measure``,
        one row per distinct non-missing qid in order of first appearance.
    """
    measure_mult = -1 if inverse_ranking else 1
    taus = []
    # groupby visits each query once and skips rows without a qid, which
    # previously produced an extra all-NaN result row.
    for qid, group in result_df.groupby("qid", sort=False):
        tmp = group.loc[group[measure].notna()]
        if not include_docs:
            tmp = tmp.loc[tmp["model"] != "document"]

        tau, p = st.kendalltau(tmp["expert_rank"], measure_mult * tmp[measure])
        taus.append({"qid": qid, "tau": tau, "p": p, "measure": measure})
    return pd.DataFrame(taus, columns=["qid", "tau", "p", "measure"])

In [17]:
# Per-query Kendall's tau with the default settings (documents included).
tau_df = get_correlation_df(result_df=result_df)
tau_df.head()

Unnamed: 0,qid,tau,p,measure
0,22.0,0.6,0.233333,monoT5_score
1,35.0,1.0,0.016667,monoT5_score
2,1.0,0.8,0.083333,monoT5_score
3,68.0,0.8,0.083333,monoT5_score
4,54.0,1.0,0.016667,monoT5_score


Tau values without documents

In [18]:
# Same correlation, but rows with model "document" are excluded.
tau_df_no_doc = get_correlation_df(result_df, include_docs=False)
tau_df_no_doc.head()

Unnamed: 0,qid,tau,p,measure
0,22.0,0.666667,0.333333,monoT5_score
1,35.0,1.0,0.083333,monoT5_score
2,1.0,1.0,0.083333,monoT5_score
3,68.0,1.0,0.083333,monoT5_score
4,54.0,1.0,0.083333,monoT5_score


# Visualization

## Determine confidence intervals for Kendall's tau

In [19]:
def _ci(agg_df, alpha, col_prefix="tau"):
    df_new = pd.DataFrame()
    df_new['mean'] = agg_df[col_prefix]['mean']
    df_new[f'{int(alpha*100)}_lo'], df_new[f'{int(alpha*100)}_hi'] = st.t.interval(
        alpha, 
        df=agg_df[col_prefix]['count'] - 1,
        loc=agg_df[col_prefix]['mean'],
        scale=agg_df[col_prefix]['sem'])
    df_new['measure'] = col_prefix
    df_new.set_index(['measure'], inplace=True, append=True)
    return df_new

# Mean, count and standard error of every numeric column per measure,
# then the 95% interval for the tau column.
agg_df = tau_df.groupby("measure").agg(["mean", "count", "sem"])
df_ci = _ci(agg_df, .95, 'tau')
df_ci

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,95_lo,95_hi
measure,measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
monoT5_score,tau,0.64,0.502165,0.777835


Confidence intervals without documents

In [20]:
# Same aggregation and 95% interval for the tau values computed without documents.
agg_df_no_doc = tau_df_no_doc.groupby("measure").agg(["mean", "count", "sem"])
df_ci_no_doc = _ci(agg_df_no_doc, .95, 'tau')
df_ci_no_doc

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,95_lo,95_hi
measure,measure,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
monoT5_score,tau,0.766667,0.6318,0.901534


### Correlation values per Query

In [21]:
# One row per qid, one column per measure; 'first' picks the first tau per cell.
tau_df.pivot_table(values='tau', index='qid', columns='measure', aggfunc='first')

measure,monoT5_score
qid,Unnamed: 1_level_1
1.0,0.8
22.0,0.6
35.0,1.0
52.0,1.0
54.0,1.0
55.0,0.6
57.0,0.2
68.0,0.8
81.0,0.8
83.0,0.4


### Position according to expert and monoT5

In [22]:
# Work on a copy so result_df keeps the raw model names.
pos_df = result_df.copy(deep=True)

# Display names used for table columns and file names; the insertion order of
# this dict also determines the model order further below.
model_map = {"chatgpt": "ChatGPT",
             "meta-llama_Llama-2-13b-chat-hf": "Lla 13B",
             "gpt2-xl": "GPT-2 XL",
             "gpt2": "GPT-2",
             "document": "Doc."}
# Models that are missing from model_map become NaN here.
pos_df["model"] = pos_df["model"].map(model_map)
# Dense rank within each query: the highest monoT5 score gets rank 1.
pos_df["monoT5_rank"] = pos_df.groupby("qid")["monoT5_score"].rank(method="dense", ascending=False)

In [23]:
# Show the frame with the added monoT5_rank column.
pos_df

Unnamed: 0,model,expert_rank,monoT5_score,qid,answer_number,monoT5_rank
0,ChatGPT,1,-0.013656,22.0,6.0,2.0
1,Lla 13B,2,-0.010791,22.0,10.0,1.0
2,GPT-2 XL,3,-0.094876,22.0,10.0,4.0
3,Doc.,4,-0.024986,22.0,,3.0
4,GPT-2,5,-5.417914,22.0,9.0,5.0
...,...,...,...,...,...,...
95,ChatGPT,1,-0.010407,85.0,5.0,2.0
96,Lla 13B,2,-0.009214,85.0,10.0,1.0
97,Doc.,3,-0.012032,85.0,,3.0
98,GPT-2 XL,4,-0.026931,85.0,3.0,4.0


In [24]:
# One row per qid with both rank types for every model.
pivot_df = pos_df.pivot_table(values=["expert_rank", "monoT5_rank"], 
                              index='qid', 
                              columns=['model'], 
                              aggfunc='first', 
                              dropna=False)
# Make the model the outer column level and order the models as in model_map.
pivot_df = pivot_df.reorder_levels(order=[1, 0], axis=1).reindex(columns=model_map.values(), level="model")
pivot_df.head()

model,ChatGPT,ChatGPT,Lla 13B,Lla 13B,GPT-2 XL,GPT-2 XL,GPT-2,GPT-2,Doc.,Doc.
Unnamed: 0_level_1,expert_rank,monoT5_rank,expert_rank,monoT5_rank,expert_rank,monoT5_rank,expert_rank,monoT5_rank,expert_rank,monoT5_rank
qid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1.0,1,1.0,2,2.0,3,3.0,4,5.0,5,4.0
22.0,1,2.0,2,1.0,3,4.0,5,5.0,4,3.0
35.0,1,1.0,2,2.0,3,3.0,5,5.0,4,4.0
52.0,1,1.0,2,2.0,3,3.0,5,5.0,4,4.0
54.0,1,1.0,2,2.0,4,4.0,5,5.0,3,3.0


#### Visualization for each model
- Five blocks for each query and model, one per possible rank (1–5)
- Each block shows one of four states
    - rank not set (neither the expert nor monoT5 assigned this rank)
    - expert = monoT5 (both assigned this rank)
    - only expert
    - only monoT5

In [25]:
def _label_model_ranks(model_row):
    positions = {idx: "not set" for idx in range(1, 6)}
    exp_rank, t5_rank = model_row["expert_rank"], model_row["monoT5_rank"]

    # Assign initial labels as if there were no concordant ranks
    positions[exp_rank] = "only expert"
    positions[t5_rank] = "only t5"
    
    # Correct for concordant ranks
    if exp_rank == t5_rank:
        positions[exp_rank] = "expert == t5"
    return positions

In [26]:
# Example input: both ranks of "Lla 13B" for the second row of pivot_df.
model_row=pivot_df.iloc[1]["Lla 13B"]
model_row

expert_rank    2.0
monoT5_rank    1.0
Name: 22.0, dtype: float64

In [27]:
# Slot labels for the example row above.
model_dict = _label_model_ranks(model_row)
model_dict

{1: 'only t5', 2: 'only expert', 3: 'not set', 4: 'not set', 5: 'not set'}

Apply labeling to each query and model

In [28]:
def label_query(row):
    """Label the rank slots of every model for one query.

    Args:
        row: One row of the pivoted rank table; ``row[model]`` must provide
            the ranks expected by ``_label_model_ranks``.

    Returns:
        Series indexed by the display names in ``model_map`` whose values are
        the per-model label dicts.
    """
    labels_by_model = {}
    for display_name in model_map.values():
        labels_by_model[display_name] = _label_model_ranks(row[display_name])
    return pd.Series(labels_by_model)

In [29]:
# Row-wise application: one label dict per (query, model) cell.
vis_df = pivot_df.apply(lambda row: label_query(row), axis=1)
vis_df.head()

Unnamed: 0_level_0,ChatGPT,Lla 13B,GPT-2 XL,GPT-2,Doc.
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,"{1: 'expert == t5', 2: 'not set', 3: 'not set'...","{1: 'not set', 2: 'expert == t5', 3: 'not set'...","{1: 'not set', 2: 'not set', 3: 'expert == t5'...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ..."
22.0,"{1: 'only expert', 2: 'only t5', 3: 'not set',...","{1: 'only t5', 2: 'only expert', 3: 'not set',...","{1: 'not set', 2: 'not set', 3: 'only expert',...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ...","{1: 'not set', 2: 'not set', 3: 'only t5', 4: ..."
35.0,"{1: 'expert == t5', 2: 'not set', 3: 'not set'...","{1: 'not set', 2: 'expert == t5', 3: 'not set'...","{1: 'not set', 2: 'not set', 3: 'expert == t5'...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ..."
52.0,"{1: 'expert == t5', 2: 'not set', 3: 'not set'...","{1: 'not set', 2: 'expert == t5', 3: 'not set'...","{1: 'not set', 2: 'not set', 3: 'expert == t5'...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ..."
54.0,"{1: 'expert == t5', 2: 'not set', 3: 'not set'...","{1: 'not set', 2: 'expert == t5', 3: 'not set'...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ...","{1: 'not set', 2: 'not set', 3: 'not set', 4: ...","{1: 'not set', 2: 'not set', 3: 'expert == t5'..."


### Create the visualizations

In [30]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from matplotlib.offsetbox import AnnotationBbox, AuxTransformBox

# Side length of one rank square and the horizontal gap between two squares,
# both in data coordinates of the plot.
box_size = 1
spacing = 0.15

def _rank_design(rank, letter, ax):
    """Draw the framed square of one rank slot onto ``ax``.

    Args:
        rank: 1-based slot number; slots are placed left to right.
        letter: Slot label. ``"expert == t5"``, ``"only expert"`` and
            ``"only t5"`` get a coloured, hatched square; any other value a
            plain white one.
        ax: Matplotlib axes to draw on.
    """
    frame_lw = 6.5

    # Lower-left corner of the slot; all slots sit on y = 0.
    left = (rank - 1) * (box_size + spacing)
    bottom = 0
    corner = (left, bottom)
    centre = (left + box_size/2., bottom + box_size/2.)

    # Grey frame: an invisible rectangle wrapped in an annotation box whose
    # own border is the visible outline.
    outline = patches.Rectangle(corner, box_size, box_size, facecolor='none', edgecolor='none', linewidth=0)
    holder = AuxTransformBox(ax.transData)
    holder.add_artist(outline)
    frame = AnnotationBbox(holder, centre, boxcoords="data", pad=0, fontsize=frame_lw/2,
                           bboxprops=dict(facecolor="none", edgecolor='grey', linewidth=frame_lw/2))

    # Fill colour, edge colour and hatch pattern per label.
    if letter == "expert == t5":
        look = ('#9ed977', '#349850', 'x')
    elif letter == "only expert":
        look = ('#d3f8f7', '#3a91a5', '\\')
    elif letter == "only t5":
        look = ('#ffd87d', '#c78824', '/')
    else:
        look = None

    if look is None:
        square = patches.Rectangle(corner, box_size, box_size, facecolor='white', edgecolor='none')
    else:
        fill, edge, pattern = look
        square = patches.Rectangle(corner, box_size, box_size, hatch_linewidth=frame_lw,
                                   facecolor=fill, edgecolor=edge, hatch=pattern)

    ax.add_patch(square)
    ax.add_artist(frame)

def _create_rank_visualization(model_dict, model, qid, out_dir='rank_visualization'):
    """Render the rank slots of one model and query into a PDF file.

    Args:
        model_dict: Mapping from rank to slot label as produced by
            ``_label_model_ranks``.
        model: Display name of the model; used for the file name.
        qid: Query id; used for the file name.
        out_dir: Target directory; created if it does not exist.
    """
    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)

        # One framed square per rank slot.
        for rank, letter in model_dict.items():
            _rank_design(rank, letter, ax)

        # Room for five squares and the four gaps between them.
        ax.set_xlim(0, 5 * box_size + 4 * spacing)
        ax.set_ylim(0, box_size)
        ax.set_aspect('equal', adjustable='box')
        ax.axis('off')

        filename = _create_filename(model, qid)
        fig.tight_layout()
        # Do not rely on another cell having created the directory.
        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(os.path.join(out_dir, f'{filename}.pdf'), dpi=500, transparent=True)
    finally:
        # Close this figure even if drawing or saving failed, so figures do
        # not pile up when the function is called in a loop.
        plt.close(fig)


def _create_filename(model, qid):
    model_name = model.lower().replace(" ", "_").replace(".", "")
    return f'{model_name}_{int(qid)}'

In [31]:
# Write one PDF per (query, model) cell of vis_df into rank_visualization/.
os.makedirs("rank_visualization", exist_ok=True)
for qid, row in vis_df.iterrows():
    for model in row.index:
        _create_rank_visualization(model_dict=row[model], model=model, qid=int(qid))

# LaTeX table
Create a joint table of Kendall's tau and the rank visualizations

In [32]:
def create_latex_df(vis_df, tau_df):
    """Join the per-model LaTeX commands with the per-query tau values.

    Args:
        vis_df: Frame indexed by qid with one column per model display name.
        tau_df: Frame with the columns ``qid``, ``tau`` and ``measure``.

    Returns:
        Dataframe with ``qid``, the five model columns and the tau column,
        the latter renamed to ``Kendall's $\\tau$``.
    """
    command_rows = []
    for qid, row in vis_df.iterrows():
        command_rows.append(_row_to_latex_cmd(row, qid))
    command_df = pd.DataFrame(command_rows)

    tau_by_qid = tau_df.pivot_table(values='tau', index='qid', columns='measure', aggfunc='first')
    tau_by_qid = tau_by_qid.reset_index()
    # The command frame stores qid as int, so align the key before merging.
    tau_by_qid["qid"] = tau_by_qid["qid"].astype(int)

    column_order = ['qid', 'ChatGPT', 'Lla 13B', 'GPT-2 XL', 'GPT-2', 'Doc.', 'monoT5_score']
    merged = tau_by_qid.merge(command_df, on="qid")
    return merged[column_order].rename(columns={"monoT5_score": r"Kendall's $\tau$"})
        
def _row_to_latex_cmd(row, qid):
    """Map every model in ``row.index`` to its LaTeX command for ``qid``.

    Returns:
        Dict from model name to command string plus the key ``"qid"`` holding
        the query id as ``int``.
    """
    commands = {}
    for model in row.index:
        commands[model] = _model_to_latex_cmd(model, qid)
    commands["qid"] = int(qid)
    return commands

def _model_to_latex_cmd(model, qid):
    """Return the ``\\provideranks{...}`` command for one model and query."""
    return f"\\provideranks{{{_create_filename(model, qid)}}}"
    

In [33]:
# Table body: one \provideranks command per model plus the tau per query.
latex_df = create_latex_df(vis_df, tau_df)
latex_df.head()

Unnamed: 0,qid,ChatGPT,Lla 13B,GPT-2 XL,GPT-2,Doc.,Kendall's $\tau$
0,1,\provideranks{chatgpt_1},\provideranks{lla_13b_1},\provideranks{gpt-2_xl_1},\provideranks{gpt-2_1},\provideranks{doc_1},0.8
1,22,\provideranks{chatgpt_22},\provideranks{lla_13b_22},\provideranks{gpt-2_xl_22},\provideranks{gpt-2_22},\provideranks{doc_22},0.6
2,35,\provideranks{chatgpt_35},\provideranks{lla_13b_35},\provideranks{gpt-2_xl_35},\provideranks{gpt-2_35},\provideranks{doc_35},1.0
3,52,\provideranks{chatgpt_52},\provideranks{lla_13b_52},\provideranks{gpt-2_xl_52},\provideranks{gpt-2_52},\provideranks{doc_52},1.0
4,54,\provideranks{chatgpt_54},\provideranks{lla_13b_54},\provideranks{gpt-2_xl_54},\provideranks{gpt-2_54},\provideranks{doc_54},1.0


In [34]:
# round(4) alone does not shorten the printed numbers (to_latex still prints
# six decimals), so the precision is set through float_format instead.
print(latex_df.to_latex(index=False, float_format="%.4f"))

\begin{tabular}{rlllllr}
\toprule
qid & ChatGPT & Lla 13B & GPT-2 XL & GPT-2 & Doc. & Kendall's $\tau$ \\
\midrule
1 & \provideranks{chatgpt_1} & \provideranks{lla_13b_1} & \provideranks{gpt-2_xl_1} & \provideranks{gpt-2_1} & \provideranks{doc_1} & 0.800000 \\
22 & \provideranks{chatgpt_22} & \provideranks{lla_13b_22} & \provideranks{gpt-2_xl_22} & \provideranks{gpt-2_22} & \provideranks{doc_22} & 0.600000 \\
35 & \provideranks{chatgpt_35} & \provideranks{lla_13b_35} & \provideranks{gpt-2_xl_35} & \provideranks{gpt-2_35} & \provideranks{doc_35} & 1.000000 \\
52 & \provideranks{chatgpt_52} & \provideranks{lla_13b_52} & \provideranks{gpt-2_xl_52} & \provideranks{gpt-2_52} & \provideranks{doc_52} & 1.000000 \\
54 & \provideranks{chatgpt_54} & \provideranks{lla_13b_54} & \provideranks{gpt-2_xl_54} & \provideranks{gpt-2_54} & \provideranks{doc_54} & 1.000000 \\
55 & \provideranks{chatgpt_55} & \provideranks{lla_13b_55} & \provideranks{gpt-2_xl_55} & \provideranks{gpt-2_55} & \provideranks{d

## Create examples for latex caption

In [35]:
# One legend image per non-empty slot state, drawn in the first slot of an
# otherwise empty five-slot strip. Expects the rank_visualization directory
# to exist already.
for letter in ["only t5", "only expert", "expert == t5"]:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    
    _rank_design(rank=1, letter=letter, ax=ax)
    ax.set_xlim(0, 5 * box_size + 4 * spacing)
    ax.set_ylim(0, box_size)
    ax.set_aspect('equal', adjustable='box')
    plt.axis('off') 

    # File suffix: "t5", "expert" or "expert-eq-t5".
    suffix = letter.replace(" == ", "-eq-").replace("only ", "")
    plt.tight_layout()
    plt.savefig(os.path.join('rank_visualization', f'example-{suffix}.pdf'), dpi=500, transparent=True)
    plt.close()