In [1]:
import pandas as pd
from itertools import combinations
from collections import Counter
import matplotlib.pyplot as plt 

In [2]:
# Load the recipe table; the cells below read its 'Recipe' and 'Rating' columns.
df = pd.read_csv('recent_recipes.csv')

### 1. Set up comparison pairs

In [3]:
# Enumerate every unordered pair of row labels from the recipe table
pairs = [*combinations(df.index, 2)]

# Hold the pairs in a two-column frame (columns 0 and 1) for row-wise iteration
pairs_df = pd.DataFrame(data=pairs)

In [4]:
list_pairs = []
for _, pair_ids in pairs_df.iterrows():
    first_id, second_id = pair_ids[0], pair_ids[1]
    first = df.loc[first_id]
    second = df.loc[second_id]

    # Only pairs whose two ratings differ are kept
    if first['Rating'] == second['Rating']:
        continue

    list_pairs.append({
        'id_1': first_id,
        'id_2': second_id,
        'text_1': first['Recipe'],
        'text_2': second['Recipe'],
        'actual_score_1': first['Rating'],
        'actual_score_2': second['Rating'],
    })

In [5]:
# Turn the collected pair records into a DataFrame, one row per kept pair.
df_pairs = pd.DataFrame(list_pairs)

In [6]:
# Absolute rating gap between the two recipes of each pair.
df_pairs['delta'] = (df_pairs['actual_score_1'] - df_pairs['actual_score_2']).abs()

In [7]:
# Keep only pairs whose rating gap is strictly greater than 0.5.
df_pairs = df_pairs.loc[df_pairs['delta']>0.5]

In [8]:
# Save the filtered pairs. reset_index() keeps the old index as an 'index' column and
# to_csv() also writes the new row index, so the file carries two extra index columns.
df_pairs.reset_index().to_csv('recipe_pairs_recent.csv')

### 2. Collect predictions

In [9]:
# Reload the saved pairs; this rebinds df from the recipe table to the pair table.
df = pd.read_csv('recipe_pairs_recent.csv')

In [11]:
import getpass
import helm

from helm.common.authentication import Authentication
from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
from helm.common.request import Request, RequestResult
from helm.common.tokenization_request import TokenizationRequest, TokenizationRequestResult
from helm.proxy.accounts import Account

#google, gpt3, gpt4, anthropic and together
from helm.proxy.services.remote_service import RemoteService

import numpy as np
import pandas as pd
from tqdm import tqdm
import time

In [12]:
# Ask for the API key interactively so it never has to be stored in the notebook.
api_key = getpass.getpass(prompt="Enter a valid API key: ")

# Authenticate with the key entered above; do not hard-code a key in this cell.
auth = Authentication(api_key=api_key)
service = RemoteService("https://crfm-models.stanford.edu")

# Access account and show my current quotas and usages
account: Account = service.get_account(auth)
print(account.usages.keys())

# One line per usage group; a dedicated loop name avoids clobbering other variables
for usage_group, usage in account.usages.items():
    print(usage_group, usage)

Enter a valid API key:  ········


dict_keys(['gpt3', 'gpt4', 'codex', 'jurassic', 'gooseai', 'cohere', 'dall_e', 'together_vision', 'simple', 'together', 'anthropic', 'google', 'o1'])
gpt3 {'daily': Usage(period='2024-12-22', used=301, quota=None), 'monthly': Usage(period='2024-12', used=301, quota=None), 'total': Usage(period='all', used=1325509, quota=10000000)}
gpt4 {'daily': Usage(period='2024-12-22', used=290, quota=None), 'monthly': Usage(period='2024-12', used=290, quota=None), 'total': Usage(period='all', used=1443087, quota=10000000)}
codex {'daily': Usage(period=None, used=0, quota=10000)}
jurassic {'daily': Usage(period=None, used=0, quota=10000)}
gooseai {'daily': Usage(period=None, used=0, quota=10000)}
cohere {'daily': Usage(period=None, used=0, quota=10000)}
dall_e {'daily': Usage(period=None, used=0, quota=5)}
together_vision {'daily': Usage(period=None, used=0, quota=30)}
simple {'daily': Usage(period=None, used=0, quota=10000)}
together {'daily': Usage(period='2024-12-22', used=300, quota=None), 'mont

In [13]:
# Model identifiers to query; each becomes one column of the collected answers
valid_llms = [
    'openai/o1-preview-2024-09-12',
    'anthropic/claude-3-5-sonnet-20240620',
    'openai/gpt-3.5-turbo-0125',
    'openai/gpt-4o-2024-05-13',
    'meta/llama-3.1-70b-instruct-turbo',
    'google/gemini-1.5-pro-001',
]

In [14]:
# Read the shared instruction text; it is prepended to every recipe pair before querying.
with open("prompt.txt", "r", encoding="utf-8") as file:
    prompt = file.read()

def prompt_llm(prompt, llm):
    """Send `prompt` to model `llm` and return the text of its first completion.

    The request goes through the notebook-level `service` and `auth` objects.
    The o1-preview model is given an explicit `max_tokens` of 32768; every
    other model is sent without a `max_tokens` argument.
    """
    request_kwargs = {"model": llm, "prompt": prompt, "echo_prompt": False}
    if llm == 'openai/o1-preview-2024-09-12':
        request_kwargs["max_tokens"] = 32768

    request = Request(**request_kwargs)
    response: RequestResult = service.make_request(auth, request)
    first_completion = response.completions[0]
    return first_completion.text


In [15]:
list_outputs = []
# total= lets tqdm show a real progress bar, since iterrows() has no length
for _, row in tqdm(df.iterrows(), total=len(df)):

    # The prompt depends only on the pair, so build it once per row, not once per model
    input_text = prompt + "Recipe 1:\n"+row['text_1'] + "\n\nRecipe 2:\n"+row['text_2']+ "\n\nAnswer:"

    entry = {}
    for llm in valid_llms:
        try:
            entry[llm] = prompt_llm(input_text, llm)
        except Exception as err:
            # Best effort: a failed request is stored as missing, but the reason is
            # reported. Catching Exception (not a bare except) lets Ctrl-C stop the run.
            print(f"{llm}: request failed ({err!r})")
            entry[llm] = np.nan
        else:
            # Pause only after a successful call, as before
            time.sleep(0.5)
    list_outputs.append(entry)

58it [09:36,  9.93s/it]


In [16]:
# Save the raw model answers to CSV, one column per model.
pd.DataFrame(list_outputs).to_csv('outputs_recent_updated.csv')

In [17]:
# Rebind df to the raw answers table; the analysis cells below operate on this frame.
df = pd.DataFrame(list_outputs)

### 3. Analyze performance

In [19]:
def process_responses_gpt_3(text):
    """Normalize a free-text answer to the number of the chosen recipe.

    Parameters
    ----------
    text : str or float
        Raw completion text, or a float (NaN marks a missing answer).

    Returns
    -------
    int or float
        1 or 2 when exactly one of 'Recipe 1' / 'Recipe 2' is named, or the
        integer value of the text when it is a bare number. A float input is
        returned unchanged. NaN is returned when the answer names both
        recipes or cannot be parsed.
    """
    if isinstance(text, float):
        return text

    names_first = 'Recipe 1' in text
    names_second = 'Recipe 2' in text

    # Use logical `not` here: bitwise `~` on a bool gives -1 or -2, which are
    # both truthy, so every answer naming both recipes used to count as 1.
    if names_first and not names_second:
        return 1
    if names_second and not names_first:
        return 2

    # Either both recipes are named (ambiguous) or neither is: accept a bare
    # integer, otherwise report the answer as missing instead of crashing.
    try:
        return int(text)
    except ValueError:
        return np.nan

def process_responses_gemini(text):
    """Extract the chosen recipe from an answer that starts with '1' or '2'.

    Parameters
    ----------
    text : str or float
        Raw completion text, or a float (NaN marks a missing answer).

    Returns
    -------
    str or float
        The leading character ('1' or '2') when the text starts with one of
        them. A float input is returned unchanged. NaN is returned for any
        other text, including an empty string.
    """
    if isinstance(text, float):
        return text

    # Slice instead of indexing so an empty completion gives NaN, not IndexError
    leading = text[:1]
    if leading in ('1', '2'):
        return leading
    return np.nan

In [20]:
# Answer columns to analyze. Order matters: later cells index into this list
# by position (for example llms[2] and llms[5]).
llms = [
    'openai/o1-preview-2024-09-12',
    'anthropic/claude-3-5-sonnet-20240620',
    'openai/gpt-3.5-turbo-0125',
    'openai/gpt-4o-2024-05-13',
    'meta/llama-3.1-70b-instruct-turbo',
    'google/gemini-1.5-pro-001',
]

In [21]:
# Run the two dedicated parsers over the answer columns at positions 2 and 5
gpt35_col, gemini_col = llms[2], llms[5]
df[gpt35_col] = df[gpt35_col].apply(process_responses_gpt_3)
df[gemini_col] = df[gemini_col].apply(process_responses_gemini)

In [40]:
# Coerce the first five answer columns to float and then to text, so answers
# read like '1.0' / '2.0'. Missing values become the literal string 'nan'.
for answer_col in llms[:5]:
    df[answer_col] = df[answer_col].astype(float).astype(str)

In [41]:
# Same float-then-string coercion for the sixth answer column (llms[5]);
# a missing value ends up as the literal string 'nan'.
df[llms[5]] = df[llms[5]].astype(float).astype(str)

In [42]:
# Ground truth is 1 when the first recipe has the strictly higher rating, otherwise 2
pair_scores = pd.read_csv('recipe_pairs_recent.csv')
first_rated_higher = pair_scores['actual_score_1'] > pair_scores['actual_score_2']
df['ground_truth'] = first_rated_higher.map({True: 1, False: 2})

# Float-then-string coercion gives labels of the form '1.0' / '2.0'
df['ground_truth'] = df['ground_truth'].astype(float).astype(str)

In [43]:
# Copy both rating columns over by position from the saved pairs file (read once)
saved_pairs = pd.read_csv('recipe_pairs_recent.csv')
for score_col in ('actual_score_1', 'actual_score_2'):
    df[score_col] = saved_pairs[score_col].values

In [44]:
# Absolute rating gap per pair, cut into four equal-width bins labelled 0-3;
# `bins` keeps the bin edges returned by pd.cut
rating_gap = (df['actual_score_2'] - df['actual_score_1']).abs()
df['diff'] = rating_gap
df['bin'], bins = pd.cut(df['diff'], bins=4, labels=False, retbins=True)

In [45]:
import numpy as np
from scipy.stats import chi2_contingency

In [46]:
def _score_subset(subset, llm, label):
    """Build one result row for `llm` on the rows in `subset`.

    Returns a dict with keys 'llm', 'bin' (set to `label`), 'mean' (share of
    rows where the model's answer equals 'ground_truth', in percent) and 'p'
    (p-value from chi2_contingency). 'mean' and 'p' are NaN for an empty subset.
    """
    entry = {'llm': llm, 'bin': label}

    n_total = len(subset)
    if n_total == 0:
        # Nothing to score; chi2_contingency cannot handle an all-zero table
        entry['mean'] = np.nan
        entry['p'] = np.nan
        return entry

    n_correct = int((subset[llm] == subset['ground_truth']).sum())
    entry['mean'] = 100 * n_correct / n_total

    # Observed correct/incorrect counts against an even 50/50 split of the same size.
    # NOTE(review): this treats the 50/50 row as a second observed sample; a
    # goodness-of-fit or binomial test may be what is intended -- confirm.
    observed = np.array([[n_correct, n_total - n_correct],
                         [n_total / 2, n_total / 2]])
    chi2_stat, p_value, dof, expected = chi2_contingency(observed)
    entry['p'] = p_value
    return entry


l_e = []
for llm in llms:
    # The answer columns were cast to str, so a missing answer is the literal
    # string 'nan' and dropna() alone never removes it. Exclude both forms so
    # unanswered pairs are left out of the denominator rather than counted as wrong.
    answered = df.loc[df[llm].notna() & (df[llm] != 'nan')]

    l_e.append(_score_subset(answered, llm, 'overall'))

    for name, gr in answered.groupby('bin'):
        l_e.append(_score_subset(gr, llm, name))

In [47]:
# Pivot the long-format results into two llm-by-bin tables: accuracy and p-value
results_long = pd.DataFrame(l_e)
table = results_long.pivot(index='llm', columns='bin', values='mean')
significance = results_long.pivot(index='llm', columns='bin', values='p')

In [48]:
# Row order used when displaying the results table
sorting = [
    'openai/o1-preview-2024-09-12',
    'openai/gpt-4o-2024-05-13',
    'anthropic/claude-3-5-sonnet-20240620',
    'openai/gpt-3.5-turbo-0125',
    'meta/llama-3.1-70b-instruct-turbo',
    'google/gemini-1.5-pro-001',
]

In [49]:
# Suffix every accuracy with an escaped percent sign, plus '*' where p < 0.05
# (the `\%` form is presumably meant for LaTeX export -- confirm).
# The backslash is written as '\\' because '\%' is an invalid escape sequence,
# and a per-column Series.map replaces the deprecated DataFrame.applymap.
markers = significance.apply(
    lambda col: col.map(lambda p: '\\%*' if p < 0.05 else '\\%')
)
(table.round(2).astype(str) + markers).loc[sorting]

bin,0,1,2,3,overall
llm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
openai/o1-preview-2024-09-12,62.5\%,73.08\%,57.14\%,77.78\%,68.97\%
openai/gpt-4o-2024-05-13,68.75\%,42.31\%,71.43\%,55.56\%,55.17\%
anthropic/claude-3-5-sonnet-20240620,56.25\%,50.0\%,85.71\%,55.56\%,56.9\%
openai/gpt-3.5-turbo-0125,62.5\%,38.46\%,100.0\%,55.56\%,55.17\%
meta/llama-3.1-70b-instruct-turbo,62.5\%,73.08\%,100.0\%,77.78\%,74.14\%*
google/gemini-1.5-pro-001,31.25\%,46.15\%,42.86\%,44.44\%,41.38\%
