### This notebook automates comparisons between the numerical results of the MonoLLM and PEBOL methods

In [3]:
import pandas as pd

In [2]:
# Datasets that have an organized_results/<dataset>/ folder to read from.
datasets = [
    'recipes',
    'restaurants',
    'movies',
]

# The analysis cells use a method's position in this list as the row label
# when indexing the aggregated results, so the order here matters.
method_order = [
    'GPT MonoLLM',
    'Gemini MonoLLM',
    'ER',
    'Greedy',
    'Random',
    'Thompson',
    'UCB',
]


##### Get best PEBOL value vs MonoLLM value for each dataset at noise 0

In [4]:
# Compare each MonoLLM baseline's map@9 against the best PEBOL map@9, per dataset.
mono_methods = ['GPT MonoLLM', 'Gemini MonoLLM']
pebol_methods = ['ER', 'Greedy', 'Random', 'Thompson', 'UCB']

# A method's position in method_order is used as its row label in the results table.
# NOTE(review): assumes the rows of aggregated_results.csv follow method_order - confirm.
mono_rows = {mono: method_order.index(mono) for mono in mono_methods}
pebol_rows = [method_order.index(pebol) for pebol in pebol_methods]

for dataset in datasets:
    results_df = pd.read_csv(f"organized_results/{dataset}/best/noise0/aggregated_results.csv")

    # The best PEBOL score does not depend on the MonoLLM baseline, so it is
    # computed once per dataset. Series.max() avoids seeding a running maximum
    # with the int 0, which has no .round() and would give the wrong answer
    # whenever no score exceeds 0.
    biggest_pebol_val = results_df.loc[pebol_rows, 'map@9'].max()

    for mono, mono_idx in mono_rows.items():
        mono_val = results_df.loc[mono_idx, 'map@9']
        print(f"{dataset}: {mono} {mono_val.round(4)} vs PEBOL {biggest_pebol_val.round(4)}  (diff of {(biggest_pebol_val - mono_val).round(4)})")


recipes: GPT MonoLLM 0.1125 vs PEBOL 0.1745  (diff of 0.062)
recipes: Gemini MonoLLM 0.1552 vs PEBOL 0.1745  (diff of 0.0193)
restaurants: GPT MonoLLM 0.1167 vs PEBOL 0.2696  (diff of 0.1529)
restaurants: Gemini MonoLLM 0.1717 vs PEBOL 0.2696  (diff of 0.098)
movies: GPT MonoLLM 0.0937 vs PEBOL 0.1766  (diff of 0.0829)
movies: Gemini MonoLLM 0.1469 vs PEBOL 0.1766  (diff of 0.0297)


#### Get minimum improvement of PEBOL-P over PEBOL-B at each turn (movies dataset)

In [5]:
# For every turn, compare PEBOL-P (Thompson) on movies against the best PEBOL-B score.
pb_methods = ['PEBOL-B ER','PEBOL-B Greedy','PEBOL-B Random','PEBOL-B TS','PEBOL-B UCB']

# Plain string literals: these paths contain no placeholders.
b_results_df = pd.read_csv("organized_results/movies/mnli0/aggregated_results.csv")
results_df = pd.read_csv("organized_results/movies/best/noise0/aggregated_results.csv")

# Row labels are loop-invariant, so resolve them once up front.
pebol_p_idx = method_order.index("Thompson")
# PEBOL-B rows are addressed by their position in pb_methods.
# NOTE(review): assumes the mnli0 CSV rows follow pb_methods order - confirm.
pebol_b_rows = list(range(len(pb_methods)))

for turn in range(10):
    metric = f'map@{turn}'
    pebol_p_val = results_df.loc[pebol_p_idx, metric]

    # Series.max() replaces a running maximum seeded with the int 0, which has
    # no .round() and would be wrong whenever no PEBOL-B score exceeds 0.
    largest_b_val = b_results_df.loc[pebol_b_rows, metric].max()

    # NOTE(review): the label reads "Largest diff", but the value printed is the
    # margin over the *largest* PEBOL-B score, i.e. the smallest improvement.
    # The text is left unchanged so previously saved output still matches.
    print(f"Largest diff at turn {turn}: {pebol_p_val.round(4)} - {largest_b_val.round(4)} = {(pebol_p_val - largest_b_val).round(4)}")
        



Largest diff at turn 0: 0.0455 - 0.0348 = 0.0106
Largest diff at turn 1: 0.0788 - 0.0495 = 0.0293
Largest diff at turn 2: 0.1127 - 0.0577 = 0.055
Largest diff at turn 3: 0.1013 - 0.0707 = 0.0306
Largest diff at turn 4: 0.1174 - 0.0755 = 0.042
Largest diff at turn 5: 0.1537 - 0.0891 = 0.0646
Largest diff at turn 6: 0.1533 - 0.0978 = 0.0556
Largest diff at turn 7: 0.1629 - 0.1183 = 0.0447
Largest diff at turn 8: 0.1678 - 0.1247 = 0.0432
Largest diff at turn 9: 0.1766 - 0.1285 = 0.0481


#### Aspect history: Thompson sampling with vs. without history (map@9)

In [5]:
# Thompson sampling map@9 per dataset: best run with history vs. the no-history run.
nh_methods = ['Thompson', 'UCB']

# Row labels for Thompson sampling in each table (position in the respective method list).
ts_row_with_history = method_order.index('Thompson')
ts_row_no_history = nh_methods.index('Thompson')

for dataset in datasets:
    no_history_df = pd.read_csv(f"organized_results/{dataset}/no_history/aggregated_results.csv")
    history_df = pd.read_csv(f"organized_results/{dataset}/best/noise0/aggregated_results.csv")

    hist_val = history_df.loc[ts_row_with_history, 'map@9']
    no_hist_val = no_history_df.loc[ts_row_no_history, 'map@9']

    print(f"TS on {dataset} overall avg vals: History {(hist_val).round(4)} - No History {(no_hist_val).round(4)}")
            


TS on recipes overall avg vals: History 0.1315 - No History 0.0941
TS on restaurants overall avg vals: History 0.2696 - No History 0.1575
TS on movies overall avg vals: History 0.1766 - No History 0.1351
