In [25]:
import pandas as pd
import numpy as np

In [26]:
# Working frame of collected model responses; later cells index its columns by the ids in `llms`.
df = pd.read_csv(filepath_or_buffer='collected_outputs.csv')

In [27]:
# Model identifiers (also the answer-column names in `df`). The position in this list matters:
# later cells pick individual models by index, e.g. llms[0], llms[2], llms[5].
llms = [
    'openai/o1-preview-2024-09-12',
    'anthropic/claude-3-5-sonnet-20240620',
    'openai/gpt-3.5-turbo-0125',
    'openai/gpt-4o-2024-05-13',
    'meta/llama-3.1-70b-instruct-turbo',
    'google/gemini-1.5-pro-001',
]

In [28]:
import warnings
# Install "ignore" filters for both warning categories so they do not clutter cell output.
warnings.simplefilter(action="ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

def process_responses_gpt_3(text):
    """Map a raw response to the recipe number it names.

    Parameters
    ----------
    text : str or float
        Raw response. A value whose type is exactly ``float`` (presumably the
        NaN that marks a missing response) is returned unchanged.

    Returns
    -------
    int or float
        ``1`` or ``2`` when the response names exactly one of 'Recipe 1' /
        'Recipe 2' or is an integer literal; NaN when it names both recipes
        and is therefore ambiguous; the input itself when it is a float.

    Raises
    ------
    ValueError
        If the response names neither recipe and cannot be parsed by ``int``.
    """
    if type(text) == float:
        return text

    names_first = 'Recipe 1' in text
    names_second = 'Recipe 2' in text

    # The previous check used `~(... in text)`. Bitwise-not of a bool is -1 or -2,
    # both truthy, so a response naming both recipes was always scored as 1.
    # Such a response is ambiguous, so it is reported as missing (NaN) instead.
    if names_first and names_second:
        return float('nan')
    if names_first:
        return 1
    if names_second:
        return 2
    # Neither recipe is named: expect a bare number such as '1' or '2'.
    return int(text)

def process_responses_gemini(text):
    """Reduce a response to its first character.

    A value whose type is exactly ``float`` is handed back unchanged; anything
    else is indexed at position 0, so a string yields a one-character string
    (not a number).
    """
    return text if type(text) is float else text[0]

def process_responses_o1(text):
    """Extract the first character of the answer from a response.

    Parameters
    ----------
    text : str or float
        Raw response. A value whose type is exactly ``float`` is returned
        unchanged.

    Returns
    -------
    str or float
        If the response contains the marker ``'Answer: '``, the first
        non-blank character after its first occurrence; otherwise the first
        character of the response. Floats pass through untouched.

    Raises
    ------
    IndexError
        If nothing is left to index (e.g. the marker is the end of the text).
    """
    if type(text) == float:
        return text
    marker = 'Answer: '
    if marker in text:
        # str.strip(marker) treats its argument as a *set of characters* to trim from both
        # ends, so it only worked when the response started with the marker. Take what
        # follows the marker; lstrip keeps tolerance for extra spaces after the colon.
        text = text.partition(marker)[2].lstrip()
    return text[0]

In [29]:
# Run the model-specific parsers over the three answer columns that have one
# (llms[2], llms[5] and llms[0]); the functions are passed directly rather than via a lambda.
df[llms[2]] = df[llms[2]].apply(process_responses_gpt_3)
df[llms[5]] = df[llms[5]].apply(process_responses_gemini)
df[llms[0]] = df[llms[0]].apply(process_responses_o1)

In [30]:
# Bring every model's answer column to the same float-formatted string form (e.g. '1.0')
# that `ground_truth` uses, so the two can be compared with ==.
# Missing answers are kept as real NaN: a plain `.astype(str)` would turn them into the
# literal string 'nan', which `dropna(subset=[llm])` in the accuracy cells cannot detect.
for llm in llms:
    as_number = df[llm].astype(float)
    df[llm] = as_number.astype(str).where(as_number.notna())

In [31]:
# Reference labels from the pair metadata, converted to float-formatted strings and attached
# positionally (`.values` drops the metadata index, so rows are matched by order).
df['ground_truth'] = (
    pd.read_csv('pairs_metadata.csv')['ground_truth']
    .astype(float)
    .astype(str)
    .values
)

### 1. Accuracy overall and by rating difference

In [32]:
# Parse the pair metadata once instead of once per column, then copy both score columns
# over positionally (`.values` ignores the metadata index).
pairs_metadata = pd.read_csv('pairs_metadata.csv')
df['actual_score_1'] = pairs_metadata['actual_score_1'].values
df['actual_score_2'] = pairs_metadata['actual_score_2'].values

In [33]:
# Absolute gap between the two scores of a pair, cut into four equal-width intervals.
# `labels=False` stores the integer bin code; `bins` keeps the interval edges.
df['diff'] = df['actual_score_2'].sub(df['actual_score_1']).abs()
df['bin'], bins = pd.cut(x=df['diff'], bins=4, labels=False, retbins=True)

In [34]:
import numpy as np

In [35]:
from scipy.stats import chi2_contingency

In [36]:
# For every model, record accuracy (%) and a chi-square p-value on all rows that have an
# answer ('overall') and within each score-gap bin. One record per (model, slice) goes
# into `l_e`, which the next cell pivots into tables.
#
# NOTE(review): the second row [n/2, n/2] is passed to chi2_contingency as if it were a
# second observed sample. A goodness-of-fit test against chance (scipy.stats.chisquare)
# may be what was intended; the test is left as it was so the analysis itself is unchanged.
l_e = []
for llm in llms:
    answered = df.dropna(subset=[llm])
    # 'overall' and the per-bin groups go through one code path, so the accuracy and the
    # contingency table of a slice are always computed from the same set of rows.
    slices = [('overall', answered)] + list(answered.groupby('bin'))

    for label, gr in slices:
        n_correct = (gr['ground_truth'] == gr[llm]).sum()
        n_wrong = (gr['ground_truth'] != gr[llm]).sum()

        observed = np.array([[n_correct, n_wrong],
                             [len(gr) / 2, len(gr) / 2]])
        chi2_stat, p_value, dof, expected = chi2_contingency(observed)

        l_e.append({
            'llm': llm,
            'bin': label,
            'mean': 100 * n_correct / len(gr),
            'p': p_value,
        })

In [37]:
# Reshape the long records into two model-by-bin tables: accuracy ('mean') and p-value ('p').
table, significance = (
    pd.DataFrame(l_e).pivot(index='llm', columns='bin', values=field)
    for field in ('mean', 'p')
)

In [38]:
# Row order for the displayed result tables (applied with `.loc[sorting]`).
sorting = [
    'openai/gpt-4o-2024-05-13',
    'anthropic/claude-3-5-sonnet-20240620',
    'openai/o1-preview-2024-09-12',
    'openai/gpt-3.5-turbo-0125',
    'meta/llama-3.1-70b-instruct-turbo',
    'google/gemini-1.5-pro-001',
]

In [39]:
# Format each accuracy as a LaTeX-escaped percentage and append '*' where the p-value is
# below 0.05/5 (presumably a Bonferroni correction over the five columns — confirm).
# Series.map per column replaces DataFrame.applymap, which is deprecated in recent pandas.
markers = significance.apply(
    lambda column: column.map(lambda p: '\\%*' if p < 0.05/5 else '\\%'))
(table.round(2).astype(str) + markers).loc[sorting]

bin,0,1,2,3,overall
llm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
openai/gpt-4o-2024-05-13,62.32\%*,65.31\%,67.74\%,71.43\%,64.0\%*
anthropic/claude-3-5-sonnet-20240620,64.79\%*,61.22\%,58.06\%,85.71\%,63.2\%*
openai/o1-preview-2024-09-12,60.92\%,57.14\%,59.68\%,57.14\%,59.6\%*
openai/gpt-3.5-turbo-0125,55.99\%,55.78\%,58.06\%,85.71\%,56.6\%
meta/llama-3.1-70b-instruct-turbo,53.17\%,48.3\%,43.55\%,57.14\%,50.6\%
google/gemini-1.5-pro-001,50.0\%,50.34\%,33.87\%,28.57\%,47.8\%


### 2. Accuracy by vegetarian status

In [40]:
# Parse the pair metadata once instead of three times, then copy the vegetarian columns.
# Assignment is by index label (no `.values`), exactly as before.
pairs_metadata = pd.read_csv('pairs_metadata.csv')
df['vegetarian_condition'] = pairs_metadata['vegetarian_condition']
df['vegetarian_1'] = pairs_metadata['vegetarian_1']
df['vegetarian_2'] = pairs_metadata['vegetarian_2']

In [41]:
def check_veg_winner(row):
    """Return the vegetarian flag of the higher-scored recipe in a pair.

    ``row`` must provide 'actual_score_1', 'actual_score_2', 'vegetarian_1'
    and 'vegetarian_2'. Recipe 1's flag is returned only when its score is
    strictly greater; on a tie (or any comparison that is not true) recipe 2's
    flag is returned.
    """
    first_scores_higher = row['actual_score_1'] > row['actual_score_2']
    return row['vegetarian_1'] if first_scores_higher else row['vegetarian_2']
    
def check_predicted(row, llm='openai/gpt-4o-2024-05-13'):
    """Return the vegetarian flag of the recipe a model picked.

    Parameters
    ----------
    row : mapping
        Must provide the model's answer under ``llm`` plus 'vegetarian_1' and
        'vegetarian_2'.
    llm : str, optional
        Answer column to read. Defaults to the gpt-4o id as it appears in
        ``llms``; the previous hard-coded key lacked the 'openai/' prefix.

    Returns
    -------
    The value of 'vegetarian_2' when the answer is numerically 2, otherwise
    the value of 'vegetarian_1' (this includes a NaN answer).
    """
    # Answers are stored as float-formatted strings such as '2.0', so compare numerically;
    # `== 2` against the raw cell never matched those strings.
    if float(row[llm]) == 2:
        return row['vegetarian_2']
    return row['vegetarian_1']

In [42]:
# Row-wise: vegetarian flag of whichever recipe in the pair has the higher actual score.
df['veg winner'] = df.apply(check_veg_winner, axis=1)

In [43]:
# Not used in this cell; kept in case cells outside this view rely on it.
n_iter = 1000

# For every model, record accuracy (%) and a chi-square p-value on all rows that have an
# answer ('overall'), within each 'vegetarian_condition' group, and within each
# 'veg winner' group. One record per (model, slice) goes into `l_e`.
#
# NOTE(review): the second row [n/2, n/2] is passed to chi2_contingency as if it were a
# second observed sample. A goodness-of-fit test against chance (scipy.stats.chisquare)
# may be what was intended; the test is left as it was so the analysis itself is unchanged.
l_e = []
for llm in llms:
    answered = df.dropna(subset=[llm])
    # All slices go through one code path, so the accuracy and the contingency table of a
    # slice are always computed from the same set of rows.
    slices = [('overall', answered)]
    slices += list(answered.groupby('vegetarian_condition'))
    slices += list(answered.groupby('veg winner'))

    for label, gr in slices:
        n_correct = (gr['ground_truth'] == gr[llm]).sum()
        n_wrong = (gr['ground_truth'] != gr[llm]).sum()

        observed = np.array([[n_correct, n_wrong],
                             [len(gr) / 2, len(gr) / 2]])
        chi2_stat, p_value, dof, expected = chi2_contingency(observed)

        l_e.append({
            'llm': llm,
            'bin': label,
            'mean': 100 * n_correct / len(gr),
            'p': p_value,
        })
    


In [44]:
# Reshape the long records into two model-by-slice tables: accuracy ('mean') and p-value ('p').
types, significance = (
    pd.DataFrame(l_e).pivot(index='llm', columns='bin', values=field)
    for field in ('mean', 'p')
)

In [45]:
# Format each accuracy as a LaTeX-escaped percentage and append '*' where the p-value is
# below 0.05/5. Series.map per column replaces DataFrame.applymap, which is deprecated in
# recent pandas.
# NOTE(review): this table shows six columns per model but the threshold still divides by 5;
# confirm whether 0.05/6 was intended.
markers = significance.apply(
    lambda column: column.map(lambda p: '\\%*' if p < 0.05/5 else '\\%'))
column_order = ['overall', 'non-vegetarian vs non-vegetarian', 'vegetarian vs non-vegetarian',
                'vegetarian vs vegetarian', 'YES', 'NO']
(types.round(2).astype(str) + markers).loc[sorting][column_order]

bin,overall,non-vegetarian vs non-vegetarian,vegetarian vs non-vegetarian,vegetarian vs vegetarian,YES,NO
llm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
openai/gpt-4o-2024-05-13,64.0\%*,70.81\%*,50.77\%,37.5\%,53.66\%,66.03\%*
anthropic/claude-3-5-sonnet-20240620,63.2\%*,67.34\%*,56.15\%,41.67\%,43.9\%,66.99\%*
openai/o1-preview-2024-09-12,59.6\%*,65.32\%*,46.15\%,50.0\%,48.78\%,61.72\%*
openai/gpt-3.5-turbo-0125,56.6\%,63.87\%*,42.31\%,29.17\%,40.24\%,59.81\%*
meta/llama-3.1-70b-instruct-turbo,50.6\%,54.91\%,42.31\%,33.33\%,43.9\%,51.91\%
google/gemini-1.5-pro-001,47.8\%,50.58\%,43.08\%,33.33\%,45.12\%,48.33\%
