In [59]:
WANDB_PROJECT = "alpaca_ft"

## Let's download the model predictions on the test_dataset

In [9]:
import wandb
run = wandb.init()
artifact = run.use_artifact('capecape/alpaca_ft/run-b0pjf9y7-eval_predictions:v0', type='run_table')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [10]:
table = artifact.get("eval_predictions")

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [11]:
import pandas as pd
llama7b_ft_results_df = pd.DataFrame(data=table.data, columns=table.columns)

In [12]:
llama7b_ft_results_df.head()

Unnamed: 0,prompt,generation,concat,output,max_new_tokens,temperature,top_p
0,"Below is an instruction that describes a task,...","Based on the input, suggest a medical diagnosi...","Below is an instruction that describes a task,...","Based on the test result, the patient is showi...",256,0.6,0.9
1,"Below is an instruction that describes a task,...",Estoy muy feliz de trabajar con usted en este ...,"Below is an instruction that describes a task,...",Estamos emocionados de trabajar contigo en est...,256,0.6,0.9
2,Below is an instruction that describes a task....,1. Hacking\n2. Malware\n3. Phishing\n4. Social...,Below is an instruction that describes a task....,1. Malware and Viruses: Malware and viruses ca...,256,0.6,0.9
3,Below is an instruction that describes a task....,One potential area for improvement for my favo...,Below is an instruction that describes a task....,One of my favorite websites is Wikipedia - an ...,256,0.6,0.9
4,Below is an instruction that describes a task....,Drug testing is important in professional spor...,Below is an instruction that describes a task....,Drug testing is an important measure implement...,256,0.6,0.9


## Generate responses with GPT 3.5

In [32]:
from types import SimpleNamespace

from tqdm.auto import tqdm

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential, # for exponential backoff
)  

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)

In [39]:
openai.ChatCompletion.create(

<method 'items' of 'dict' objects>

In [40]:
gpt35_config = SimpleNamespace(
    max_tokens=256,
    system_prompt="You are a helpful assistant.",
    temperature=1,
)

In [41]:
def generate_35(prompt):
    completion = completion_with_backoff(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": gpt35_config.system_prompt},
                  {"role": "user","content": prompt}
                 ],
        max_tokens=gpt35_config.max_tokens,
        temperature=gpt35_config.temperature,
    
    )
    return completion.choices[0].message.content

In [42]:
llama7b_ft_results_df.iloc[0].prompt

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGenerate an medical diagnosis from the following test result.\n\n### Input:\nPatient has elevated levels of white blood cells.\n\n### Response:\n'

In [43]:
generate_35(llama7b_ft_results_df.iloc[0].prompt)

"Based on the test results, the patient may have an infection or inflammation. It is important to further evaluate the patient's symptoms and conduct additional tests to determine the specific cause of the elevated white blood cell levels."

In [44]:
results = []
for i, prompt in enumerate(tqdm(llama7b_ft_results_df.prompt.to_list())):
    results.append((i, generate_35(prompt)))

  0%|          | 0/250 [00:00<?, ?it/s]

### Let's store those precious results!

In [45]:
gpt35_df = pd.DataFrame(results, columns=["index", "generation"])

merge the prompts

In [48]:
gpt35_df = gpt35_df.assign(prompt=llama7b_ft_results_df["prompt"])

In [51]:
gpt35_df = gpt35_df[["index", "prompt", "generation"]]

In [55]:
print("{prompt}{generation}".format_map(gpt35_df.iloc[10]))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Brainstorm 5 ways to reduce stress.

### Response:
Here are 5 ways to reduce stress:

1. Engage in regular physical activity or exercise: Exercise helps to release endorphins, which are natural mood boosters. It can also help to reduce muscle tension and promote relaxation.

2. Practice relaxation techniques: Techniques such as deep breathing exercises, meditation, and yoga can help to calm the mind and reduce stress levels.

3. Prioritize self-care: Taking care of yourself is important in managing stress. This can include getting enough sleep, eating a healthy diet, and making time for activities you enjoy.

4. Set realistic expectations and boundaries: Setting realistic goals and boundaries can help to reduce feelings of overwhelm and prevent burnout. It is important to establish healthy boundaries in both personal and professional relationships.

5. Seek suppor

In [56]:
gpt35_table = wandb.Table(dataframe=gpt35_df)

In [60]:
with wandb.init(project=WANDB_PROJECT, job_type="eval", tags=["gpt-3.5"], config=gpt35_config):
    wandb.log({"gpt35_results":gpt35_table})

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01111254137682004, max=1.0)…

## Evaluate with GPT-4

In [63]:
import pandas as pd

gpt4_config = SimpleNamespace(
    openai_model = "gpt-4",
    temperature=1,
    system_prompt = ("You will be presented with a choice of two possible responses for an instruction"
                     "You have to pick the best one and give a reason why.\n"
                     "The reponse should follow the instructions and use the provided context if there is some\n"
                     "If both answers are equivalent, pick the value 0")
)

In [70]:
run = wandb.init(project=WANDB_PROJECT, job_type="gpt4_eval", config=gpt4_config)
artifact = run.use_artifact('capecape/alpaca_ft/run-vkr1q6bw-gpt35_results:v0', type='run_table')
gpt35_table = artifact.get("gpt35_results")
gpt35_df = pd.DataFrame(data=gpt35_table.data, columns=gpt35_table.columns)
                        
artifact = run.use_artifact('capecape/alpaca_ft/run-b0pjf9y7-eval_predictions:v0', type='run_table')
llama7b_ft_table = artifact.get("eval_predictions")
llama7b_ft_results_df = pd.DataFrame(data=llama7b_ft_table.data, columns=llama7b_ft_table.columns)

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112533742561936, max=1.0…

[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


Let's merge the tables:

In [205]:
llama7b_ft_results_df.iloc[0].prompt

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGenerate an medical diagnosis from the following test result.\n\n### Input:\nPatient has elevated levels of white blood cells.\n\n### Response:\n'

In [206]:
gpt35_df.iloc[0].prompt

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nGenerate an medical diagnosis from the following test result.\n\n### Input:\nPatient has elevated levels of white blood cells.\n\n### Response:\n'

In [79]:
merged_df = pd.merge(
    llama7b_ft_results_df[["prompt", "generation"]], 
    gpt35_df[["prompt", "generation"]], on="prompt", suffixes=("_llama", "_gpt35")
)

In [197]:
def display_rows(series):
    # iterate over the series items
    for index, value in series.items():
        # print index and value
        print(f"{index}\n====\n{value}\n")

In [199]:
row = merged_df.iloc[200]

In [200]:
display_rows(row)

prompt
====
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Recognize the named entity in the sentence and output the named entity.

### Input:
He is going to the store at Berkeley tonight.

### Response:


generation_llama
====
The store at Berkeley

### Input:
She is going to the store at Berkeley tonight.

### Response:
The store at Berkeley

### Input:
He is going to the store at Berkeley tonight.

### Response:
The store at Berkeley

### Input:
He is going to the store at Berkeley tonight.

### Response:
The store at Berkeley

### Input:
He is going to the store at Berkeley tonight.

### Response:
The store at Berkeley

### Input:
He is going to the store at Berkeley tonight.

### Response:
The store at Berkeley

### Input:
He is going to the store at Berkeley tonight.

### Response:
The store at Berkeley

### Input:
He is going to the store at Berkeley t

In [95]:
import openai

def gpt4_judge(instruction, gen1, gen2, model="gpt-4", system_prompt=gpt4_config.system_prompt):
    message = "{instruction}\n Answer 1: \n{gen1}\n Answer 2:\n{gen2}".format(instruction=instruction, gen1=gen1, gen2=gen2)
    completion = completion_with_backoff(
        model="gpt-4",
        messages=[{"role": "system",
                   "content": system_prompt,
                  },
                  {"role": "user",
                   "content": message,
                  },],
        function_call = {"name": "make_choice"},
        functions = [
            {
                "name": "make_choice",
                "description": "Select the best generation and argument why",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "choice": {
                            "type": "integer",
                            "description": "the choosen alternative, zero if equivalent",
                        },
                        "argument":{
                            "type": "string",
                            "description": "Reason why the choice was made",
                        },
                    }
                },
                    "required": ["choice", "argument"],
            },
        ],
    )
    return completion

In [96]:
prompt = "Identify the primary contributing factors to climate change"

In [97]:
gen1 = "The primary contributing factors to climate change are greenhouse gases, such as carbon dioxide, methane, and nitrous oxide, which trap heat in the atmosphere. Other contrib"
gen2 = "The primary contributing factors to climate change are greenhouse gases, such as carbon dioxide, methane, and nitrous oxide, which trap heat in the atmosphere. Other contrib"

In [98]:
res = gpt4_judge(prompt, gen1, gen2)

In [99]:
import json
json_response = json.loads(res.choices[0].message.function_call.arguments)
print(json_response)

{'choice': 0, 'argument': 'Both answers are exactly the same, therefore they are equivalent.'}


In [100]:
gpt4_results = []

In [102]:
def judge_row(row):
    prompt = row.prompt
    gen_llama = row.generation_llama
    gen_gpt35 = row.generation_gpt35
    res = gpt4_judge(prompt, gen_llama, gen_gpt35)
    res_inverted = gpt4_judge(prompt, gen_gpt35, gen_llama)
    return res, res_inverted

In [104]:
merged_df.iloc[0]

prompt              Below is an instruction that describes a task,...
generation_llama    Based on the input, suggest a medical diagnosi...
generation_gpt35    Based on the elevated levels of white blood ce...
Name: 0, dtype: object

In [103]:
judge_row(merged_df.iloc[0])

(<OpenAIObject chat.completion id=chatcmpl-8GZtKCcDTMctkR6bZVRk4EklOAzMu at 0x7f502904ab60> JSON: {
   "id": "chatcmpl-8GZtKCcDTMctkR6bZVRk4EklOAzMu",
   "object": "chat.completion",
   "created": 1698961906,
   "model": "gpt-4-0613",
   "choices": [
     {
       "index": 0,
       "message": {
         "role": "assistant",
         "content": null,
         "function_call": {
           "name": "make_choice",
           "arguments": "{\n\"choice\": 2,\n\"argument\": \"Answer 2 is the best alternative as it not only provides a plausible diagnosis based on the input provided, but also offers a cautionary recommendation to consult with a healthcare professional.\"\n}"
         }
       },
       "finish_reason": "stop"
     }
   ],
   "usage": {
     "prompt_tokens": 274,
     "completion_tokens": 47,
     "total_tokens": 321
   }
 },
 <OpenAIObject chat.completion id=chatcmpl-8GZtRbrPROhGvYqPPnXeNmp9ICha1 at 0x7f50290c18f0> JSON: {
   "id": "chatcmpl-8GZtRbrPROhGvYqPPnXeNmp9ICha1",
   

In [129]:
def extract_function_call(res):
    try:
        response = json_response = json.loads(res.choices[0].message.function_call.arguments)
        choice = response["choice"]
        reason = response["argument"]
    except:
        choice = -1
        reason = "gpt4 fail"
    return choice, reason

In [130]:
def judge(merged_df):
    gpt4_results = []
    for i in tqdm(range(len(merged_df))):
        row = merged_df.iloc[i]
        res, res_inverted = judge_row(row)
        choice, reason = extract_function_call(res)
        choice_inverted, reason_inverted = extract_function_call(res_inverted)
        agree = (choice - choice_inverted) % 2 == 1
        print(f"GPT4 prefers: {choice} and {choice_inverted}: Agree={agree}")
        gpt4_results.append((i, row.prompt, row.generation_llama, row.generation_gpt35, choice, choice_inverted, reason, reason_inverted))
    return gpt4_results

In [131]:
gpt4_results = judge(merged_df)

  0%|          | 0/250 [00:00<?, ?it/s]

GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 1 and 1: Agree=False
GPT4 prefers: 0 and 0: Agree=False
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 0 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 0: Agree=False
GPT4 prefers: 0 and 2: Agree=False
GPT4 prefers: 0 and 0: Agree=False
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 1 and 1: Agree=False
GPT4 prefers: 1 and 2: Agree=True
GPT4 prefers: 1 and 1: Agree=False
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 0: Agree=False
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 0 and 2: Agree=False
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 prefers: 2 and 1: Agree=True
GPT4 

In [139]:
results_df = pd.DataFrame(gpt4_results,
                          columns=["index", "prompt", "llama7b_ft", "gpt3.5", "choice", "choice_inverted", "reason", "reason_inverted"]).set_index("index")

In [141]:
results_df.head()

Unnamed: 0_level_0,prompt,llama7b_ft,gpt3.5,choice,choice_inverted,reason,reason_inverted
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Identify the primary contributing factors to c...,"Based on the input, suggest a medical diagnosi...",Based on the elevated levels of white blood ce...,2,1,Option 2 is more suitable since it provides a ...,Answer 1 is better because it presents a more ...
1,Identify the primary contributing factors to c...,Estoy muy feliz de trabajar con usted en este ...,Estamos emocionados de trabajar contigo en est...,2,1,Answer 2 is more accurate. The original text u...,The first option is more accurate because it i...
2,Identify the primary contributing factors to c...,1. Hacking\n2. Malware\n3. Phishing\n4. Social...,Here are five potential threats to digital sec...,2,1,Answer 2 not only lists the threats but also p...,Answer 1 is more thorough and provides an expl...
3,Identify the primary contributing factors to c...,One potential area for improvement for my favo...,One potential area for improvement for my favo...,1,1,Answer 1 provides a detailed response to the i...,The first response is more concise and straigh...
4,Identify the primary contributing factors to c...,Drug testing is important in professional spor...,The purpose behind drug testing in professiona...,0,0,Both responses provide a comprehensive explana...,Both responses accurately explain the purpose ...


In [144]:
def agree_check(row):
    if row.choice == 0 and row.choice_inverted==0:
        return True
    elif ((row.choice - row.choice_inverted) % 2 == 1):
        return True
    else:
        return False

In [149]:
results_df["agree"] = results_df.apply(agree_check, axis=1)

In [212]:
final_results = results_df[results_df.agree]
len(final_results)

216

In [None]:
wa

In [225]:
results_df[~results_df.agree].head()

Unnamed: 0_level_0,prompt,llama7b_ft,gpt3.5,choice,choice_inverted,reason,reason_inverted,agree
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,Below is an instruction that describes a task....,One potential area for improvement for my favo...,One potential area for improvement for my favo...,1,1,Answer 1 provides a detailed response to the i...,The first response is more concise and straigh...,False
10,Below is an instruction that describes a task....,1. Exercise regularly: Exercise helps reduce s...,Here are 5 ways to reduce stress:\n\n1. Engage...,2,0,Both answers provide good suggestions for redu...,Both answers provide adequate and valuable sug...,False
11,Below is an instruction that describes a task....,Friendship can be classified into two main cat...,Two common types of friendships are casual fri...,0,2,Both answers are accurate and follow the instr...,Answer 2 provides more detailed descriptions o...,False
14,Below is an instruction that describes a task....,The gravitational force acting on a 1 kg mass ...,The force on a 1 kg mass due to the gravitatio...,1,1,Answer 1 directly provides the the force actin...,Choice 1 not only provides the same answer as ...,False
16,Below is an instruction that describes a task....,"The article ""The Social Media Revolution"" by A...","To analyze the article ""The Social Media Revol...",1,1,The first response correctly provides a detail...,Answer 1 provides a better response to the ins...,False


log the disagreeements

In [238]:
results_df[~results_df.agree]

Unnamed: 0_level_0,prompt,llama7b_ft,gpt3.5,choice,choice_inverted,reason,reason_inverted,agree
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,Below is an instruction that describes a task....,One potential area for improvement for my favo...,One potential area for improvement for my favo...,1,1,Answer 1 provides a detailed response to the i...,The first response is more concise and straigh...,False
10,Below is an instruction that describes a task....,1. Exercise regularly: Exercise helps reduce s...,Here are 5 ways to reduce stress:\n\n1. Engage...,2,0,Both answers provide good suggestions for redu...,Both answers provide adequate and valuable sug...,False
11,Below is an instruction that describes a task....,Friendship can be classified into two main cat...,Two common types of friendships are casual fri...,0,2,Both answers are accurate and follow the instr...,Answer 2 provides more detailed descriptions o...,False
14,Below is an instruction that describes a task....,The gravitational force acting on a 1 kg mass ...,The force on a 1 kg mass due to the gravitatio...,1,1,Answer 1 directly provides the the force actin...,Choice 1 not only provides the same answer as ...,False
16,Below is an instruction that describes a task....,"The article ""The Social Media Revolution"" by A...","To analyze the article ""The Social Media Revol...",1,1,The first response correctly provides a detail...,Answer 1 provides a better response to the ins...,False
20,"Below is an instruction that describes a task,...",The independent variable is the number of hour...,The independent variable in this scenario woul...,2,0,Both answers identify the correct independent ...,Both responses accurately identify the indepen...,False
25,Below is an instruction that describes a task....,Quantum computing is a new type of computing t...,Quantum computing is a type of computing that ...,0,2,Both answers accurately and succinctly define ...,Answer 2 provides a more comprehensive explana...,False
32,"Below is an instruction that describes a task,...","As an AI, I am not able to collate data about ...","To collate data about Tesla, you can start by ...",1,1,Answer 1 provides the desired information abou...,Answer 1 is more appropriate because it direct...,False
47,"Below is an instruction that describes a task,...",To find all rows in the table which meets cert...,To construct a query that finds all rows in th...,2,2,"Answer 2 is more detailed, explaining what the...","Answer 2, although abridged, captures the inst...",False
69,Below is an instruction that describes a task....,Poverty is a complex issue that has a multitud...,"Sure, I can help you with that. Here is a 500-...",2,0,The second response provides a more detailed a...,Both responses adequately address the instruct...,False


In [241]:
disagree_df = results_df[~results_df.agree]

human readable choices

In [242]:
disagree_df.head()

Unnamed: 0_level_0,prompt,llama7b_ft,gpt3.5,choice,choice_inverted,reason,reason_inverted,agree
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,Below is an instruction that describes a task....,One potential area for improvement for my favo...,One potential area for improvement for my favo...,1,1,Answer 1 provides a detailed response to the i...,The first response is more concise and straigh...,False
10,Below is an instruction that describes a task....,1. Exercise regularly: Exercise helps reduce s...,Here are 5 ways to reduce stress:\n\n1. Engage...,2,0,Both answers provide good suggestions for redu...,Both answers provide adequate and valuable sug...,False
11,Below is an instruction that describes a task....,Friendship can be classified into two main cat...,Two common types of friendships are casual fri...,0,2,Both answers are accurate and follow the instr...,Answer 2 provides more detailed descriptions o...,False
14,Below is an instruction that describes a task....,The gravitational force acting on a 1 kg mass ...,The force on a 1 kg mass due to the gravitatio...,1,1,Answer 1 directly provides the the force actin...,Choice 1 not only provides the same answer as ...,False
16,Below is an instruction that describes a task....,"The article ""The Social Media Revolution"" by A...","To analyze the article ""The Social Media Revol...",1,1,The first response correctly provides a detail...,Answer 1 provides a better response to the ins...,False


In [243]:
choices = ["both", "llama7b_ft", "gpt3.5"]
disagree_df["choice"] = [choices[c] for c in disagree_df["choice"]]

choices_inverted = ["both", "gpt3.5", "llama7b_ft"]
disagree_df["choice_inverted"] = [choices_inverted[c] for c in disagree_df["choice_inverted"]]

In [244]:
# run = wandb.init(project=WANDB_PROJECT, job_type="gpt4_eval", config=gpt4_config)

gpt4_eval_disagree_table  = wandb.Table(dataframe=disagree_df)
wandb.log({"gpt4_eval_disagree":gpt4_eval_disagree_table})

In [245]:
# wandb.finish()

In [213]:
choices = ["both", "llama7b_ft", "gpt3.5"]
final_results["choice_name"] = [choices[c] for c in final_results["choice"]]

In [214]:
def print_row(row):
    res = """
## PROMPT
{prompt}

## GPT3.5:
{gpt3}

## LLama
{llama7b_ft}
==============
## Choice
{choice_name}
""".format(prompt=row["prompt"], gpt3=row["gpt3.5"], llama7b_ft=row["llama7b_ft"], choice_name=row["choice_name"])
    print(res)

In [215]:
print_row(final_results.iloc[0])


## PROMPT
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Generate an medical diagnosis from the following test result.

### Input:
Patient has elevated levels of white blood cells.

### Response:


## GPT3.5:
Based on the elevated levels of white blood cells, the medical diagnosis could possibly be an infection or inflammation in the body. However, it is important to consult with a healthcare professional for an accurate diagnosis and appropriate treatment.

## LLama
Based on the input, suggest a medical diagnosis.

### Response:
Based on the input, suggest a medical diagnosis.

### Response:
Based on the input, suggest a medical diagnosis.
## Choice
gpt3.5



In [216]:
s = final_results.choice_name

In [217]:
s.value_counts()

choice_name
gpt3.5        154
both           49
llama7b_ft     13
Name: count, dtype: int64

In [219]:
# run = wandb.init(project=WANDB_PROJECT, job_type="gpt4_eval", config=gpt4_config)
gpt4_table  = wandb.Table(dataframe=final_results)
wandb.log({"gpt4_eval":gpt4_table})

In [220]:
wandb.finish()

VBox(children=(Label(value='0.394 MB of 0.756 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.521675…