In [1]:
import os
import pickle

import numpy as np
import openai
import pandas as pd

# Azure OpenAI client configuration.
# The API key is a secret and must not live in the notebook source, so it is
# read from the AZURE_OPENAI_API_KEY environment variable; a missing variable
# raises KeyError here instead of failing later on the first API call.
# NOTE(review): the key that used to be hardcoded on this line should be
# treated as leaked and rotated.
openai.api_key = os.environ["AZURE_OPENAI_API_KEY"]
openai.api_base = "https://text-and-code-1.openai.azure.com/"
openai.api_type = 'azure'
openai.api_version = "2023-05-15"
# Name of the Azure deployment passed as `engine=` on every chat completion call.
deployment_name = 'gpt-35-turbo-1'

In [3]:
# Load the dataset dictionary from disk. The context manager guarantees the
# file handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load pickles that come from a trusted source.
with open('data/data_dict.pkl', 'rb') as data_file:
    data_dict = pickle.load(data_file)

In [5]:
# Shuffle all dataset keys in place (global NumPy RNG) and split them 80/20:
# the first 80% become train_keys, the remainder test_keys.
index_keys = list(data_dict.keys())
np.random.shuffle(index_keys)
split_at = int(len(index_keys) * 0.8)
train_keys = index_keys[:split_at]
test_keys = index_keys[split_at:]

In [6]:
# Fixed conversation preamble reused by every prompt: a system message, a user
# message explaining the input format, the assistant's acknowledgement, and a
# short user message that opens the question/answer sequence.
system_content = (
    "You are WeatherBot, an AI expert in global weather patterns. "
    "You will be given a series of monthly average temperatures for some city and some year "
    "and asked to predict if the city is in North America or not."
)
user_content_1 = (
    "You will be given the average temperature for each month in Fahrenheit. "
    "The average temperatures will be given in list format. "
    "For example, if given the list [32, 45, 67, 89, 90, 87, 76, 65, 54, 43, 32, 21], "
    "the first number is the average temperature for January, "
    "the second number is the average temperature for February, and so on. "
    "You will be asked to predict if the city is in North America or not."
)
assistant_content_1 = (
    "Yes I understand. "
    "I am WeatherBot, and I will help identify if the city is in North America or not "
    "from its average monthly temperatures."
)
user_content_2 = "Great! Let's begin :)" + "\n"

In [7]:
def get_train_ts_label(sample_size, train_data):
    """Draw a random sample of training examples.

    The keys of `train_data` are shuffled with the global NumPy RNG and the
    first `sample_size` of them are kept (fewer if `train_data` is smaller).

    Args:
        sample_size: maximum number of examples to return.
        train_data: mapping whose values are records holding the
            'MonthlyAvgTemperature' and 'Label' entries.

    Returns:
        (ts_list, label_list): two parallel lists with the sampled
        temperature series and their labels.
    """
    shuffled_keys = list(train_data.keys())
    np.random.shuffle(shuffled_keys)
    chosen_records = [train_data[key] for key in shuffled_keys[:sample_size]]
    ts_list = [record['MonthlyAvgTemperature'] for record in chosen_records]
    label_list = [record['Label'] for record in chosen_records]
    return ts_list, label_list

def get_test_ts_label(test_data):
    """Return the (temperature series, label) pair stored in one record.

    Args:
        test_data: a single record holding the 'MonthlyAvgTemperature' and
            'Label' entries.
    """
    series = test_data['MonthlyAvgTemperature']
    label = test_data['Label']
    return series, label


In [8]:
def ts_to_string(ts):
    """Format a temperature series as the user question sent to the model.

    The text consists of the answer-template instruction followed, on a new
    line, by `str(ts)`.
    """
    instruction = "Please answer following this template: (A) This city is in North America OR (B) This city is not in North America."
    return f"{instruction}\nMonthly average temperatures: {ts!s}"

def label_to_string(label):
    """Render a label as the assistant's answer sentence.

    A label equal to 1 maps to answer (A); every other value maps to (B).
    """
    in_north_america = label == 1
    return "(A) This city is in North America." if in_north_america else "(B) This city is not in North America."

def parse_response(response_string):
    """Convert the model's reply text into a predicted label.

    Returns:
        1 when the reply contains '(A)' only, 0 when it contains '(B)' only,
        and -1 (invalid) when it contains both markers or neither.
    """
    has_a = '(A)' in response_string
    has_b = '(B)' in response_string
    # Both present or both absent: the reply does not pick exactly one option.
    if has_a == has_b:
        return -1
    return int(has_a)
    
def get_response(prompt):
    """Send a chat prompt to the configured deployment and return the reply text.

    Args:
        prompt: list of chat messages, passed through as `messages`.

    Returns:
        The 'content' of the first choice's message. The request uses
        `deployment_name` as the engine and temperature 0.0.
    """
    completion = openai.ChatCompletion.create(
        engine=deployment_name, messages=prompt, temperature=0.0
    )
    first_choice = completion.choices[0]
    return first_choice['message']['content']

In [9]:
def timed_get_response(prompt):
    """Query the model with `prompt` and return the parsed prediction.

    Despite the name, nothing is timed here: this simply chains
    get_response and parse_response.
    """
    raw_reply = get_response(prompt)
    return parse_response(raw_reply)

In [18]:
import time
# Remaining-sample counter: experiment() decrements it once per evaluated test
# sample, and the driver cell passes its current value as the next test_size
# (resetting it to 100 once it has reached 0).
test_size_global = 100

def create_prompt(train_data, test_ts, num_shots):
    """Build the chat message list for one few-shot query.

    The prompt is the fixed preamble (system message, format explanation,
    acknowledgement, kickoff message), followed by up to `num_shots`
    question/answer pairs sampled from `train_data`, and finally the question
    for `test_ts`, which the model is expected to answer.

    Args:
        train_data: mapping of training records to sample demonstrations from.
        test_ts: temperature series to classify.
        num_shots: number of demonstrations requested.

    Returns:
        A list of {"role", "content"} message dicts.
    """
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content_1},
        {"role": "assistant", "content": assistant_content_1},
        {"role": "user", "content": user_content_2},
    ]
    train_ts_list, train_label_list = get_train_ts_label(num_shots, train_data)
    # Iterate over what was actually returned: get_train_ts_label yields fewer
    # than num_shots samples when train_data is smaller, and indexing with
    # range(num_shots) would then raise IndexError.
    for shot_ts, shot_label in zip(train_ts_list, train_label_list):
        messages.append({"role": "user", "content": ts_to_string(shot_ts)})
        messages.append({"role": "assistant", "content": label_to_string(shot_label)})
    messages.append({"role": "user", "content": ts_to_string(test_ts)})
    return messages


def _confusion_metrics(responses, gts):
    """Compute binary confusion counts and the scores derived from them.

    Label 1 is the positive class. A response of -1 (unparseable model
    output) is neither a predicted positive nor a predicted negative, so it
    is left out of all four counts.
    NOTE(review): assumes ground-truth labels are numeric 0/1 -- confirm
    against the dataset.

    Returns:
        (tp, fp, tn, fn, precision, recall, f1); each ratio is 0 when its
        denominator is 0.
    """
    preds = np.array(responses)
    actual_pos = np.array(gts) == 1
    pred_pos = preds == 1
    pred_neg = preds == 0
    tp = int((pred_pos & actual_pos).sum())
    fp = int((pred_pos & ~actual_pos).sum())
    tn = int((pred_neg & ~actual_pos).sum())
    fn = int((pred_neg & actual_pos).sum())
    precision = tp / (tp + fp) if (tp + fp) else 0
    recall = tp / (tp + fn) if (tp + fn) else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    return tp, fp, tn, fn, precision, recall, f1


def experiment(data_dict, test_size=100, num_shots=4, verbose=True):
    """Run a few-shot classification experiment against the model.

    The keys of `data_dict` are shuffled (global NumPy RNG) and split 80/20
    into train and test. For each of the first `test_size` test keys a prompt
    with `num_shots` demonstrations sampled from the train split is sent to
    the model and the parsed answer is recorded.

    Side effects: decrements the module-level `test_size_global` once per
    evaluated sample, sleeps 0.1 s between requests, and (when `verbose`)
    prints the first prompt plus a running metrics line after every sample.

    Args:
        data_dict: mapping of sample key -> record.
        test_size: maximum number of test samples to evaluate.
        num_shots: demonstrations per prompt.
        verbose: whether to print the first prompt and running metrics.

    Returns:
        (responses, gts): parallel lists of parsed predictions
        (1, 0, or -1 for invalid) and ground-truth labels.
    """
    global test_size_global
    correct = incorrect = invalid = total = 0

    index_keys = list(data_dict.keys())
    np.random.shuffle(index_keys)
    split_at = int(len(index_keys) * 0.8)
    train_keys, test_keys = index_keys[:split_at], index_keys[split_at:]
    train_data = {key: data_dict[key] for key in train_keys}

    responses = []
    gts = []
    for key in test_keys[:test_size]:
        test_ts, test_label = get_test_ts_label(data_dict[key])
        prompt = create_prompt(train_data, test_ts, num_shots)
        response = parse_response(get_response(prompt))
        test_size_global -= 1
        time.sleep(.1)  # brief pause between consecutive API requests
        responses.append(response)
        gts.append(test_label)

        if response == -1:
            invalid += 1
        elif response == test_label:
            correct += 1
        else:
            incorrect += 1
        total += 1

        if not verbose:
            continue
        if total == 1:
            # Show one full prompt so the few-shot format can be inspected.
            print(prompt)
        tp, fp, tn, fn, precision, recall, f1 = _confusion_metrics(responses, gts)
        # Accuracy is over valid answers only; guard the all-invalid case
        # explicitly instead of relying on a bare except.
        answered = correct + incorrect
        accuracy = round(correct / answered, 3) if answered else 0
        print('Accuracy:', accuracy, 'Correct:', correct, 'Incorrect:', incorrect, 'Invalid:', invalid, 'Total', total, 'TP:', tp, 'FP:', fp, 'TN:', tn, 'FN:', fn, 'F1:', round(f1, 3), 'Recall:', round(recall, 3), 'Precision:', round(precision, 3))

    return responses, gts



In [None]:
# A counter of 0 means the previous run consumed its whole budget, so start
# again from 100; otherwise run only the samples that are still outstanding.
test_size_global = test_size_global or 100
new_test_size = int(test_size_global)
results = experiment(data_dict, test_size=new_test_size, num_shots=16, verbose=True)

In [None]:
# Build the few-shot demonstrations from the first `num_shots` training keys:
# one user question and one assistant answer per sample, kept in two parallel
# lists.
num_shots = 10
user_content_list, assistant_content_list = [], []
for i in range(num_shots):
    shot = data_dict[train_keys[i]]
    shot_label = shot['Label']
    shot_ts = shot['MonthlyAvgTemperature']
    user_content_list.append(
        f"Please answer following this template: (A) This city is in North America OR (B) This city is not in North America.\nMonthly average temperatures: {shot_ts!s}"
    )
    if shot_label == 1:
        assistant_content_list.append("(A) This city is in North America")
    else:
        assistant_content_list.append("(B) This city is not in North America")

In [None]:
# Shared conversation prefix: the fixed preamble followed by the `num_shots`
# question/answer demonstrations. Each test query is appended to a copy of it.
messages = [
    {"role": "system", "content": system_content},
    {"role": "user", "content": user_content_1},
    {"role": "assistant", "content": assistant_content_1},
    {"role": "user", "content": user_content_2},
]
for i in range(num_shots):
    messages.extend((
        {"role": "user", "content": user_content_list[i]},
        {"role": "assistant", "content": assistant_content_list[i]},
    ))


In [None]:
# Evaluate the fixed few-shot prefix on the first `test_size` test keys,
# printing the running (correct, incorrect, invalid) tallies after each query.
test_size = 100
correct = incorrect = invalid_response = 0

for i in range(test_size):
    sample_key = test_keys[i]
    sample = data_dict[sample_key]
    sample_label = sample['Label']
    sample_ts = sample['MonthlyAvgTemperature']
    user_content = "Please answer following this template: (A) This city is in North America OR (B) This city is not in North America.\nMonthly average temperatures: " + str(sample_ts)
    # New list per query so the shared `messages` prefix is never modified.
    prompt = messages + [{"role": "user", "content": user_content}]
    # ask the model to predict the label
    response = openai.ChatCompletion.create(engine=deployment_name, messages=prompt, temperature=0.0)
    answer_text = response.choices[0]['message']['content']
    # Marker expected for the true label, and the marker of the other option.
    look_for, wrong = ('(A)', '(B)') if sample_label == 1 else ('(B)', '(A)')
    has_expected = look_for in answer_text
    has_other = wrong in answer_text
    if has_expected == has_other:
        # Both markers or neither: the reply does not pick exactly one option.
        invalid_response += 1
    elif has_expected:
        correct += 1
    else:
        incorrect += 1
    print(correct, incorrect, invalid_response)
 

In [None]:
# Display the current (correct, incorrect, invalid_response) tallies.
correct, incorrect, invalid_response

(6, 4, 0)

In [None]:
# Display the first choice of whatever API response `response` currently holds.
response['choices'][0]

<OpenAIObject at 0x133016630> JSON: {
  "finish_reason": "stop",
  "index": 0,
  "message": {
    "content": "(A) This city is in North America",
    "role": "assistant"
  }
}

In [None]:
# test gpt-3-5-turbo deployment

