## Prototype Notebook

Prototype functions that:
- parse and collate the JSON result files
- query/aggregate the data into relevant tables

In [1]:
import os
import pandas as pd
import json

# Relative path of the directory that holds the result JSON files.
results_fp = '../../wordle-qa-2/results/'
# Model identifiers that parse_sheet_meta searches for inside result file names.
model_names = ['gpt-3.5-turbo', 'gpt-4', 'llama_13b_chat']

### Collation + Parsing section

In [2]:
def get_json_result_fns(results_fp):
    """List the result files in a directory, in a stable order.

    Args:
        results_fp: path of the directory to scan.

    Returns:
        Sorted list of entry names (not full paths) that start with
        'output' and end with '.json'. Empty list if nothing matches.
    """
    # os.listdir returns entries in arbitrary order; sort so positional
    # lookups such as result_fns[0] pick the same file on every run.
    return sorted(
        fn for fn in os.listdir(results_fp)
        if fn.startswith('output') and fn.endswith('.json')
    )

# Collect the result file names once; later cells index into this list.
result_fns = get_json_result_fns(results_fp)
result_fns

['output-json-state-1-gpt-3.5-turbo-5d4a.json',
 'output-json-state-1-gpt-4-8309.json',
 'output-json-state-1-llama_13b_chat-fe35.json',
 'output-rules-qa-1-gpt-3.5-turbo-2047.json',
 'output-rules-qa-1-gpt-4-09fa.json',
 'output-rules-qa-1-llama_13b_chat-5f82.json',
 'output-wsu-1-gpt-3.5-turbo-4bd8.json',
 'output-wsu-1-gpt-4-1063.json']

In [3]:
def parse_sheet_meta(result_fn, known_model_names=None):
    """Derive sheet metadata from a result file name.

    File names are expected to look like
    'output-<input_name>-<model_name>-<run_id>.json'.

    Args:
        result_fn: result file name (matching is done on its lowercase form).
        known_model_names: model identifiers to look for in the name.
            Defaults to the notebook-level `model_names` list.

    Returns:
        dict with 'input_name', 'model_name' and 'run_id'. 'model_name' is
        'unknown' when none of the known identifiers occurs in the name.
    """
    if known_model_names is None:
        known_model_names = model_names

    stem = result_fn.lower()
    # Only cut at '.json' when it is actually present: str.find returns -1
    # otherwise, and slicing with [:-1] would silently drop the last character.
    ext_at = stem.find('.json')
    if ext_at != -1:
        stem = stem[:ext_at]

    # The run id is the last dash-separated token; the rest is parsed below.
    stem, _, run_id = stem.rpartition('-')

    model_name = 'unknown'
    # Try the longest identifiers first and stop at the first hit, so a short
    # name that is a prefix of a longer one (e.g. 'gpt-4' vs 'gpt-4-turbo')
    # cannot win the match and mangle the remaining input name.
    for candidate in sorted(known_model_names, key=len, reverse=True):
        needle = candidate.lower()
        if needle in stem:
            model_name = candidate
            stem = stem.replace(needle, '')
            break

    # Strip 'output-' only as a leading prefix, not wherever it occurs.
    input_name = stem[len('output-'):] if stem.startswith('output-') else stem
    # Removing the model name leaves the dash that preceded it.
    if input_name.endswith('-'):
        input_name = input_name[:-1]

    return {
        'input_name': input_name,
        'model_name': model_name,
        'run_id': run_id,
    }

# Try the file-name parser on the first result file.
parse_sheet_meta(result_fns[0])

{'input_name': 'json-state-1', 'model_name': 'gpt-3.5-turbo', 'run_id': '5d4a'}

In [4]:
def question_table(result_fn, results_fp):
    """Load the per-question records of one result file into a DataFrame.

    Args:
        result_fn: file name of the result JSON inside results_fp.
        results_fp: directory that contains the result files.

    Returns:
        pd.DataFrame built from the file's top-level 'questions' entry.

    Raises:
        KeyError: if the file has no top-level 'questions' key.
    """
    # JSON text is UTF-8; pin the encoding instead of relying on the
    # platform's locale default, which differs between machines.
    with open(os.path.join(results_fp, result_fn), encoding='utf-8') as f:
        payload = json.load(f)
    return pd.DataFrame(payload['questions'])

# Preview the question-level table of the first result file.
q_tbl = question_table(result_fns[0], results_fp)
q_tbl.head(3)

Unnamed: 0,name,meta_data,ground_truth,question,completion,error,model_name,eval_time,grade
0,Reason-Win,"{'answer_type': 'mutliple-choice', 'answer_sug...",B) No,Below is the state of a wordle game. Use the o...,B) No,,gpt-3.5-turbo,1.054595,True
1,Reason-Win-2,"{'answer_type': 'mutliple-choice', 'answer_sug...",B) No,Below is the state of a wordle game. Use the o...,B) No,,gpt-3.5-turbo,0.639488,True
2,Reason-Win-3,"{'answer_type': 'mutliple-choice', 'answer_sug...",C) No - the player has not guessed all five le...,Below is the state of a wordle game. Use the o...,B) No,,gpt-3.5-turbo,1.957797,True


In [5]:
def sheet_table_info(result_fn, results_fp):
    """Read the sheet-level metadata stored inside one result file.

    Args:
        result_fn: file name of the result JSON inside results_fp.
        results_fp: directory that contains the result files.

    Returns:
        dict with 'input_name' (the sheet's 'name'), 'model_name' and
        'run_id'. A field missing from the sheet entry comes back as None.

    Raises:
        KeyError: if the file has no top-level 'sheet' key.
    """
    # JSON text is UTF-8; pin the encoding instead of relying on the
    # platform's locale default, which differs between machines.
    with open(os.path.join(results_fp, result_fn), encoding='utf-8') as f:
        payload = json.load(f)
    sheet_data = payload['sheet']
    return {
        'input_name':   sheet_data.get('name'),
        'model_name':   sheet_data.get('model_name'),
        'run_id':       sheet_data.get('run_id'),
    }

# Sheet metadata as recorded inside the file itself, as opposed to the
# values parse_sheet_meta derives from the file name.
tbl_info = sheet_table_info(result_fns[0], results_fp)
tbl_info

{'input_name': 'JSON-state-reasoning-1',
 'model_name': 'gpt-3.5-turbo',
 'run_id': '5d4a'}

In [6]:
def built_full_table(result_fp, result_fn):
    """Build the question table of one result file, tagged with sheet metadata.

    Args:
        result_fp: directory that contains the result files.
        result_fn: file name of the result JSON inside result_fp.

    Returns:
        The DataFrame from question_table() with one constant-valued column
        per entry of sheet_table_info() ('input_name', 'model_name',
        'run_id'). A question-level column of the same name is overwritten.
    """
    # Read from the directory passed in rather than the notebook-level
    # `results_fp`, so the function works for any results directory.
    q_tbl = question_table(result_fn, result_fp)
    tbl_info = sheet_table_info(result_fn, result_fp)
    for col_name, col_val in tbl_info.items():
        q_tbl[col_name] = col_val
    return q_tbl

In [7]:
def build_data(results_fp):
    """Stack the tagged question tables of every result file in a directory.

    Args:
        results_fp: directory that contains the result files.

    Returns:
        One DataFrame made by concatenating built_full_table() for each file
        returned by get_json_result_fns(). Row labels of the per-file tables
        are kept as they are (the index is not renumbered).
    """
    per_file_tables = [
        built_full_table(results_fp, fn)
        for fn in get_json_result_fns(results_fp)
    ]
    return pd.concat(per_file_tables)

In [8]:
# Question-level rows from every result file, tagged with sheet metadata;
# this is the table the queries in the next section read.
data = build_data(results_fp)

In [9]:
# (rows, columns) of the combined table.
data.shape

(55, 11)

### Aggregate Query Section

In [10]:
# Number of distinct runs recorded for each (input sheet, model) pair.
(
    data
    .groupby(['input_name', 'model_name'])['run_id']
    .nunique()
    .reset_index()
    .sort_values('run_id', ascending=False)
)

Unnamed: 0,input_name,model_name,run_id
0,JSON-state-reasoning-1,gpt-3.5-turbo,1
1,JSON-state-reasoning-1,gpt-4,1
2,JSON-state-reasoning-1,llama_13b_chat,1
3,Rule-QA-1,gpt-3.5-turbo,1
4,Rule-QA-1,gpt-4,1
5,Rule-QA-1,llama_13b_chat,1
6,What Shows Up,gpt-3.5-turbo,1
7,What Shows Up,gpt-4,1


In [11]:
# Number of distinct question names in each input sheet.
tmp = data.groupby('input_name')[['name']].nunique()
tmp

Unnamed: 0_level_0,name
input_name,Unnamed: 1_level_1
JSON-state-reasoning-1,8
Rule-QA-1,7
What Shows Up,1


In [65]:
# List every (input_name, name) pair present in the data.
# The count is computed only to get one row per pair; dropping the single
# 'name' column afterwards leaves a frame that consists of the index alone.
tmp = (
    data.groupby(['input_name', 'name'])
    .agg({'name': 'count'})
    .drop(columns=['name'])
)
tmp

input_name,name
JSON-state-reasoning-1,Reason-Current-Turn-Num
JSON-state-reasoning-1,Reason-Letters-Guessed
JSON-state-reasoning-1,Reason-Letters-Guessed-2
JSON-state-reasoning-1,Reason-Win
JSON-state-reasoning-1,Reason-Win-2
JSON-state-reasoning-1,Reason-Win-3
JSON-state-reasoning-1,Reason-Words-Guessed
JSON-state-reasoning-1,Reason-Words-Guessed-2
Rule-QA-1,Mechanics-Basic-Reasoning-1
Rule-QA-1,Mechanics-Guess-Valid-Word


In [62]:
# Row count and mean grade for each (input sheet, model) pair, best model
# first within each sheet. pct_correct is the mean of `grade` rounded to two
# decimals; values that cannot be read as numbers become NaN.
tmp = (
    data
    .groupby(['input_name', 'model_name'])
    .agg(num_questions=('name', 'count'), pct_correct=('grade', 'mean'))
    .sort_values(['input_name', 'pct_correct'], ascending=False)
    .assign(
        pct_correct=lambda scores: pd.to_numeric(
            scores['pct_correct'], errors='coerce'
        ).round(2)
    )
)
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,num_questions,pct_correct
input_name,model_name,Unnamed: 2_level_1,Unnamed: 3_level_1
What Shows Up,gpt-4,1,1.0
What Shows Up,gpt-3.5-turbo,1,
Rule-QA-1,gpt-4,8,1.0
Rule-QA-1,gpt-3.5-turbo,8,0.38
Rule-QA-1,llama_13b_chat,8,0.38
JSON-state-reasoning-1,gpt-4,10,0.7
JSON-state-reasoning-1,gpt-3.5-turbo,10,0.5
JSON-state-reasoning-1,llama_13b_chat,9,0.11


In [91]:
def model_input_results(
    model_name,
    input_name,
    run_id = None, # if None, use first run_id
    df = None, # if None, use the notebook-level `data` table
):
    """Summarise how one model did on one input sheet for a single run.

    Args:
        model_name: value to match in the 'model_name' column.
        input_name: value to match in the 'input_name' column.
        run_id: run to report on. When None, the first run_id found among
            the rows matching model_name and input_name is used.
        df: table to query; needs the 'model_name', 'input_name', 'run_id',
            'grade' and 'name' columns. Defaults to the notebook-level `data`.

    Returns:
        dict with 'num_questions' (rows in the run), 'num_wrong' (rows whose
        grade equals 0/False) and 'questions_wrong_name' (the 'name' of each
        such row, in table order). Zero / empty when nothing matches.
    """
    if df is None:
        df = data

    in_scope = (df['model_name'] == model_name) & (df['input_name'] == input_name)

    if run_id is None:
        run_ids = df.loc[in_scope, 'run_id'].unique()
        # No rows for this model/input: report an empty summary instead of
        # failing with IndexError on run_ids[0]. This matches what an explicit
        # run_id that does not exist already returns.
        if len(run_ids) == 0:
            return {
                'num_questions': 0,
                'num_wrong': 0,
                'questions_wrong_name': [],
            }
        run_id = run_ids[0]

    in_run = in_scope & (df['run_id'] == run_id)
    # `== 0` matches False/0 only, so rows with a missing grade are counted
    # as questions but not as wrong answers.
    slice_wrong = df[in_run & (df['grade'] == 0)]

    return {
        'num_questions': int(in_run.sum()),
        'num_wrong': len(slice_wrong),
        'questions_wrong_name': slice_wrong['name'].tolist(),
    }

In [94]:
# Example: summary for one model on one sheet; run_id=None selects the
# first run found for that pair.
results = model_input_results(
    model_name = 'gpt-3.5-turbo',
    input_name = 'JSON-state-reasoning-1',
    run_id = None
)

# Dump as indented JSON so the nested list of wrong questions is readable.
print(json.dumps(results, indent=2))

{
  "num_questions": 10,
  "num_wrong": 5,
  "questions_wrong_name": [
    "Reason-Current-Turn-Num",
    "Reason-Words-Guessed-2",
    "Reason-Letters-Guessed",
    "Reason-Letters-Guessed",
    "Reason-Letters-Guessed-2"
  ]
}


In [95]:
# Same summary for several models on one sheet, keyed by model name.
d_results = {}
models = ['gpt-3.5-turbo', 'gpt-4']
for model in models:
    d_results[model] = model_input_results(
        model_name = model,
        input_name = 'JSON-state-reasoning-1',
    )

print(json.dumps(d_results, indent=2))

{
  "gpt-3.5-turbo": {
    "num_questions": 10,
    "num_wrong": 5,
    "questions_wrong_name": [
      "Reason-Current-Turn-Num",
      "Reason-Words-Guessed-2",
      "Reason-Letters-Guessed",
      "Reason-Letters-Guessed",
      "Reason-Letters-Guessed-2"
    ]
  },
  "gpt-4": {
    "num_questions": 10,
    "num_wrong": 3,
    "questions_wrong_name": [
      "Reason-Win-3",
      "Reason-Current-Turn-Num",
      "Reason-Letters-Guessed"
    ]
  }
}
