In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from openai import OpenAI
import sys
sys.path.append('..')
from prompt_utils import assemble_prompt_chat_model, LABELS

In [3]:
# Never paste an API key into the notebook: it would be saved (and possibly committed)
# with the file. With no `api_key` argument the OpenAI client reads the key from the
# OPENAI_API_KEY environment variable and raises a clear error if it is not set.
client = OpenAI()

In [5]:
from pathlib import Path
import numpy as np
import pandas as pd
from run_common import load_reviews, load_dataset, init_config

# Select the 'gpt-eval' experiment configuration. Presumably this has to run before
# load_dataset so that the right config is in effect — TODO confirm in run_common.
init_config(experiment_name='gpt-eval')

# Review table, indexed by the CSV's 'index' column; rows are looked up by a paper's
# review_id when the prompt is assembled.
reviews = pd.read_csv('../data/covidence/reviews.csv', index_col='index')

# The 'eval' split, requested as a DataFrame (return_df=True).
eval_df = load_dataset('eval', return_df=True)


Using config from /home/tom/msc/diss/experiments/openai/../configs/gpt-eval.yaml
eval split has 455 items (98 positive)


In [6]:
def eval_with_gpt(eval_idx, num_shots=0, verbose=False, with_prob=True, model='gpt-4o'):
    """Ask a chat model to label one paper from ``eval_df`` and report its answer.

    The prompt is built by ``assemble_prompt_chat_model`` from the paper's review,
    title and abstract, and the model is asked for a single token at temperature 0.
    The correct label and the model's answer are printed.

    Args:
        eval_idx: Index label of the paper in ``eval_df``.
        num_shots: Number of examples passed to the prompt builder as ``num_examples``.
        verbose: If truthy, print every prompt message before sending the request.
        with_prob: If truthy, request token log-probabilities and also return the
            probability the model assigned to ``LABELS[1]``.
        model: Name of the chat-completion model to query.

    Returns:
        ``label_idx`` (position of the answer in ``LABELS``) when ``with_prob`` is
        falsy, otherwise the tuple ``(label_idx, yes_prob)``. ``yes_prob`` is 0.0
        when ``LABELS[1]`` is not among the returned top tokens.

    Raises:
        ValueError: If the model's answer is not one of ``LABELS``.
    """
    paper = eval_df.loc[eval_idx]
    messages = assemble_prompt_chat_model(
        None,
        reviews.loc[paper.review_id],
        paper.title,
        paper.abstract,
        num_examples=num_shots,
        prompt_format='system_message',
        return_messages=True,
        model_class='gpt',
    )
    if verbose:
        for message in messages:
            print(message['role'] + ':')
            print(message['content'])
            print()

    # `top_logprobs` is only valid together with `logprobs=True`, so both options
    # are sent only when probabilities were actually requested.
    logprob_options = {'logprobs': True, 'top_logprobs': 5} if with_prob else {}

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=1,  # the answer is expected to be a single label token
        **logprob_options,
    )

    pred_label = response.choices[0].message.content
    print('Correct label:', LABELS[paper['label']])
    print('GPT answer:', pred_label)

    if pred_label not in LABELS:
        raise ValueError(
            f'Unexpected model answer {pred_label!r}; expected one of {LABELS}'
        )
    label_idx = LABELS.index(pred_label)
    if not with_prob:
        return label_idx

    # Alternatives the model considered for the first (and only) generated token.
    top_logprobs = response.choices[0].logprobs.content[0].top_logprobs
    yes_logprob = next(
        (lp.logprob for lp in top_logprobs if lp.token == LABELS[1]),
        None,
    )
    # If LABELS[1] is not among the top candidates, treat its probability as zero.
    yes_prob = 0.0 if yes_logprob is None else np.exp(yes_logprob)

    return label_idx, yes_prob
        

In [22]:
from sklearn.metrics import average_precision_score
from run_common import calculate_threshold_scores 

def metrics(df):
    """Summarise classification quality for a frame of predictions.

    Reads the ``label``, ``predicted_label`` and ``yes_probability`` columns of
    ``df``; class 1 is treated as the positive class.

    Returns:
        dict with ``acc``, ``precision``, ``recall``, ``f1``, ``neg_recall``
        (recall of the non-positive class), ``pos_rate`` (mean of the predicted
        labels) and ``average_precision``, extended with whatever
        ``calculate_threshold_scores`` returns for the labels and probabilities.
        Any ratio whose denominator is not positive is reported as 0.
    """

    def ratio(numerator, denominator):
        # Guarded division: an empty denominator yields 0 instead of an error.
        return numerator / denominator if denominator > 0 else 0

    accuracy = (df.predicted_label == df.label).mean()
    avg_precision = average_precision_score(df.label, df.yes_probability)

    # Confusion-matrix cells, with label 1 as the positive class.
    true_pos = sum((df.label == 1) & (df.predicted_label == 1))
    false_pos = sum((df.label != 1) & (df.predicted_label == 1))
    true_neg = sum((df.label != 1) & (df.predicted_label != 1))
    false_neg = sum((df.label == 1) & (df.predicted_label != 1))

    precision = ratio(true_pos, true_pos + false_pos)
    recall = ratio(true_pos, true_pos + false_neg)
    f1_score = ratio(2 * precision * recall, precision + recall)
    negative_recall = ratio(true_neg, true_neg + false_pos)
    positive_rate = df.predicted_label.mean()
    per_threshold = calculate_threshold_scores(df.label, df.yes_probability)

    summary = dict(
        acc=accuracy,
        precision=precision,
        recall=recall,
        f1=f1_score,
        neg_recall=negative_recall,
        pos_rate=positive_rate,
        average_precision=avg_precision,
    )
    summary.update(per_threshold)
    return summary


To run a GPT evaluation, adjust the config lines in the cell below and then run that cell.

In [16]:
import time
from global_config import CONFIG

import wandb


# NOTE: this cell needs `eval_df`, `eval_with_gpt` and `metrics` from the cells above;
# run them first (the recorded NameError came from running this cell before `metrics`
# had been defined).

# Pause between consecutive API requests, in seconds.
REQUEST_PAUSE_SECONDS = 0.8

# --- Run configuration: adjust these lines before each evaluation run ---
CONFIG.model = 'gpt-4o'
CONFIG.run_config['num_shots'] = 4
CONFIG.run_config['trunc_dataset'] = None
CONFIG.run_config['prompt_format'] = 'system_message'
CONFIG.run_config['prompt_remind'] = False
CONFIG.run_config['random_sample_data'] = False
CONFIG.train_config = {}

# Evaluate on a random subset when `trunc_dataset` is set, otherwise on a copy of the
# whole split (a copy so that adding prediction columns leaves `eval_df` untouched).
trunc_size = CONFIG.run_config.get('trunc_dataset')
if trunc_size:
    # An optional 'seed' entry in the run config makes the subset reproducible; when it
    # is absent `random_state` is None and sampling is unseeded, exactly as before.
    eval_items = eval_df.sample(trunc_size, random_state=CONFIG.run_config.get('seed'))
else:
    eval_items = eval_df.copy()

# Using the run as a context manager guarantees it is finished (and marked as failed)
# even if a request or the metric computation raises part-way through.
with wandb.init(
    entity='tberm-org',
    project='zero-shot-final',
    config=CONFIG.__dict__,
) as run:
    # Collect predictions in lists and assign them once: filling a new column row by
    # row would create it as float (NaN-initialised) and is much slower.
    predicted_labels = []
    yes_probabilities = []
    for idx in eval_items.index:
        pred_label, prob = eval_with_gpt(idx, num_shots=CONFIG.run_config['num_shots'])
        predicted_labels.append(int(pred_label))
        yes_probabilities.append(prob)
        time.sleep(REQUEST_PAUSE_SECONDS)

    eval_items['predicted_label'] = predicted_labels
    eval_items['yes_probability'] = yes_probabilities

    # Expose the index as a regular column so it appears in the logged table.
    eval_items['index'] = eval_items.index
    res_table = wandb.Table(dataframe=eval_items[['index', 'label', 'predicted_label', 'yes_probability']])
    wandb.log({"Eval results": res_table})
    results = metrics(eval_items)
    wandb.log(results)

VBox(children=(Label(value='0.011 MB of 0.018 MB uploaded (0.004 MB deduped)\r'), FloatProgress(value=0.571879…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114599022807346, max=1.0…

Correct label: Yes
GPT answer: No
Correct label: No
GPT answer: No
Correct label: Yes
GPT answer: Yes
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: Yes
GPT answer: Yes
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: Yes
GPT answer: Yes
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: Yes
GPT answer: Yes
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: Yes
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No
Correct label: No
GPT answer: No


NameError: name 'metrics' is not defined