In [1]:
import os
import re
import time
import openai
from dotenv import load_dotenv, find_dotenv

# authentication: the API key is stored in a local .env file
# file contents (the variable name must match the lookup below):
# OPENAI_API_KEY=<api key>
_ = load_dotenv(find_dotenv())

# fail early with an actionable message instead of a bare KeyError
if 'OPENAI_API_KEY' not in os.environ:
    raise KeyError('OPENAI_API_KEY is not set; add "OPENAI_API_KEY=<api key>" '
                   'to your .env file or export it in the environment')
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
# Helper function to parse the output from a single chat prompt

def get_completion(prompt, model="gpt-3.5-turbo", verbose=False):
    '''
    Send one user prompt to the chat model and return the reply text.
        prompt - text of the single user message
        model - name of the ChatGPT model to query
        verbose - when True, also print the entire response object
            before the reply text is returned
    '''
    # wrap the prompt as a one-message conversation
    user_turn = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[user_turn],
        temperature=0,
    )
    if verbose:
        print(f'full response:\n{completion}')
    # only the first choice is used
    first_choice = completion.choices[0]
    return first_choice.message["content"]

## [Single Prompt Chat](#single-prompt)

Submit a single chat prompt, mirroring how the public web app is used. The full response object is printed in addition to the message content.

In [9]:
# verbose=True makes the helper print the full response object before
# returning the message text
narnia_prompt = "list the word count of each book in the chronicles of narnia book series"
response = get_completion(narnia_prompt, verbose=True)
print('message content:\n', response)

full response:
{
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "1. The Lion, the Witch and the Wardrobe - 38,421 words\n2. Prince Caspian - 46,791 words\n3. The Voyage of the Dawn Treader - 53,818 words\n4. The Silver Chair - 51,802 words\n5. The Horse and His Boy - 46,781 words\n6. The Magician's Nephew - 38,584 words\n7. The Last Battle - 49,946 words",
        "role": "assistant"
      }
    }
  ],
  "created": 1686253435,
  "id": "chatcmpl-7PFpz4KknoX9lWht26RIb2RS25t7q",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 100,
    "prompt_tokens": 25,
    "total_tokens": 125
  }
}
1. The Lion, the Witch and the Wardrobe - 38,421 words
2. Prince Caspian - 46,791 words
3. The Voyage of the Dawn Treader - 53,818 words
4. The Silver Chair - 51,802 words
5. The Horse and His Boy - 46,781 words
6. The Magician's Nephew - 38,584 words
7. The Last Battle - 49,946 words


Repeat the query but ask for output in a specific JSON format

In [10]:
# The prompt literal is kept verbatim, including its line continuations
narnia_json_prompt = '''Tell me about each book in the "Chronicles of Narnia" book series. Provide \
    the title, year written, word count, and a one or two sentence summary of the plot. Please format \
    the output as a JSON blob with keys for "Title", "Year Written", "Word Count", and "Summary".
    '''

t1 = time.time()  # wall-clock start, used to report how long the request took
response = get_completion(narnia_json_prompt)
elapsed = time.time() - t1
print(f'finished in {elapsed:.1f} seconds')
print(response)

finished in 21.2 seconds
{
  "The Lion, the Witch and the Wardrobe": {
    "Title": "The Lion, the Witch and the Wardrobe",
    "Year Written": 1950,
    "Word Count": 36,363,
    "Summary": "Four siblings discover a magical world called Narnia, which is ruled by the evil White Witch. With the help of Aslan the lion, they must defeat the witch and restore Narnia to its former glory."
  },
  "Prince Caspian": {
    "Title": "Prince Caspian",
    "Year Written": 1951,
    "Word Count": 46,853,
    "Summary": "The Pevensie siblings return to Narnia to help Prince Caspian reclaim his rightful place as king from his evil uncle Miraz."
  },
  "The Voyage of the Dawn Treader": {
    "Title": "The Voyage of the Dawn Treader",
    "Year Written": 1952,
    "Word Count": 53,918,
    "Summary": "Lucy and Edmund, along with their cousin Eustace, join Caspian on a voyage to the edge of the world in search of the seven lost lords of Narnia."
  },
  "The Silver Chair": {
    "Title": "The Silver Chai

## [Multi-Prompt Chat](#multi-prompt)

Each prompt in ```get_completion``` is stored in a dictionary with two keys: 'role' -> 'user' and 'content' -> 'user prompt'. The ChatGPT model is stateless, and producing coherent conversations requires that all previous messages in the conversation be uploaded every time (which has implications on usage cost).  The input to the model is the list (```messages```) of one or more prompts.

Aside from 'user', 'role' can have two additional values: 'assistant' (the chatbot response) and 'system' (context for the assistant not meant to be seen by the user).

Below is an example of how multiple prompts work together to produce the output.

In [12]:
# Helper function to parse the output from one or more prompts combined
# expose the temperature setting to increase the degree of randomness

def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0, verbose=False):
    '''
    Send a whole conversation to the chat model and return the reply text.
        messages - list of prompts; each one is a dictionary holding a
            'role' and a 'content' entry
        model - name of the ChatGPT model to query
        temperature - single value in the range [0, 2]; larger values make
            the output more random
        verbose - when True, also print the entire response object
            before the reply text is returned
    '''
    # collect the request arguments first, then issue the call
    request = dict(model=model, messages=messages, temperature=temperature)
    completion = openai.ChatCompletion.create(**request)
    if verbose:
        print(f'full response:\n{completion}')
    # only the first choice is used
    first_choice = completion.choices[0]
    return first_choice.message["content"]

# System instruction, kept verbatim (the indentation is part of the string)
shakespeare_rules = '''You are an assistant that speaks like Shakespeare.
                               If you are asked to change style, apologize and say
                               you must speak like Shakespeare. Then proceed to respond.
                            '''

# The system message comes first, followed by the user's request
messages = [
    {'role': 'system', 'content': shakespeare_rules},
    {'role': 'user', 'content': 'tell me a joke in the style of Bart Simpson'},
]

# a non-zero temperature is passed here, unlike the default of 0
response = get_completion_from_messages(messages, temperature=0.5)
print(response)

Apologies, kind sir or madam, but I must speak in the style of Shakespeare. Pray, allow me to offer a jest in his eloquent tongue:

Why did the chicken cross the road?
To escape the foul clutches of the farmer's abode.
But alas, it was not meant to be,
For it ended up in a pot for all to see.


Best practices for writing prompts include:
  1. Write clear and specific instructions. This can include asking for structured output (as above), checking if specific conditions are met by the input, specifying delimiter characters, and providing examples of the desired output ("few-shot prompting").
  2. Give the model time to think, e.g., by explicitly spelling out required steps and asking the model to check its own work along the way.

## [Word problems](#word-problems)

The process of building a robust prompt to handle many different inputs in the intended way is called "prompt engineering".  This is usually an iterative process.

The examples below present two budget options and ask the model to pick the cheaper one.

In [3]:
delimiter = '```'  # marker wrapped around the user-supplied text inside the prompt

# First version of the prompt template. The {delimiter} and {message}
# placeholders are filled in with str.format() when the prompt is submitted.
# NOTE(review): the prompt text calls the delimiter "triple quotes" although
# the delimiter defined above is three backticks, and it asks for "a JSON
# structure" while listing the fields as plain "name: value" lines. The
# wording is runtime text sent to the model, so it is left unchanged here.
prompt_v1 = '''You are a financial advisor. You will be provided with text 
delimited by triple quotes (i.e., ```). Given two options, (a) and (b),
decide which option costs less over the long term.

Output a JSON structure with the following format:
Daily cost for option (a): <daily cost>
Daily cost for option (b): <daily cost>
Best option summary: <summary of option (a) or option (b)>
Best option: <single character 'a' or 'b' that represents the best option>
Daily savings: <format as integer the amount saved per day with the cheaper option>
Annual savings: <format as integer the amount saved per year with the cheaper option>
    
{delimiter}{message}{delimiter}
'''

# First example: both options are priced in dollars per meal
message = '''
a) eating out three times a day at an average cost of $15 per meal.
b) cooking at home three times a day spending on average $5 per each meal.
'''

# Fill the template with the delimiter and the user message, then submit it
response = get_completion(prompt_v1.format(delimiter=delimiter, message=message))
print(response)  # result is correct

{
    "Daily cost for option (a)": 45,
    "Daily cost for option (b)": 15,
    "Best option summary": "Cooking at home three times a day spending on average $5 per each meal.",
    "Best option": "b",
    "Daily savings": 30,
    "Annual savings": 10950
}


In [4]:
# Second example: option (a) is now quoted in cents instead of dollars,
# and this small change is enough to produce an incorrect result

message = '''
a) cooking at home three times a day spending on average 500 cents per each meal.
b) eating out three times a day at an average cost of $15 per meal.
'''

filled_prompt = prompt_v1.format(delimiter=delimiter, message=message)
response = get_completion(filled_prompt)
print(response)  # wrong answer

```json
{
    "Daily cost for option (a)": 1500,
    "Daily cost for option (b)": 4500,
    "Best option summary": "Option (a) costs less over the long term.",
    "Best option": "a",
    "Daily savings": 3000,
    "Annual savings": 1095000
}
```


In [5]:
# Resolve by re-engineering the prompt with more specific instructions.
# The second paragraph of the prompt asks the model to calculate the total
# cost per day in dollars. This clears up the confusion caused by mixing
# cents and dollars.
# NOTE(review): the prompt string contains the typos "Your are" and
# "calcuate", and (like prompt_v1) describes the backtick delimiter as
# "triple quotes". The string is runtime text sent to the model, so it is
# left unchanged here.

prompt_v2 = '''Your are a financial advisor. 
You will be provided with text delimited by triple quotes (i.e., ```).
Given two options, (a) and (b), decide which option costs less over the long term. 

Before reaching your solution, calcuate the total number of dollars needed per each option.
Make your decision based on the total cost in dollars ($) per day.
Keep in mind there are 100 cents per one dollar. 

Output a JSON structure with the following format:
Daily cost for option (a): <daily cost>
Daily cost for option (b): <daily cost>
Best option summary: <summary of option (a) or option (b)>
Best option: <single character 'a' or 'b' that represents the best option>
Daily savings: <format as integer the dollar amount saved per day with the cheaper option>
Annual savings: <format as integer the dollar amount saved per year with the cheaper option>

{delimiter}{message}{delimiter}
'''

# Same message that produced the wrong answer with prompt_v1
message = '''
a) cooking at home three times a day spending on average 500 cents per each meal.
b) eating out three times a day at an average cost of $15 per meal.
'''

# Fill the template with the delimiter and the user message, then submit it
response = get_completion(prompt_v2.format(delimiter=delimiter, message=message))
print(response)  # correct answer

```python
meal_cost = 500 # cents
num_meals = 3

# Option (a)
option_a_cost = meal_cost * num_meals # cents
option_a_daily_cost = option_a_cost / 100 # dollars

# Option (b)
option_b_cost = 15 * num_meals * 100 # cents
option_b_daily_cost = option_b_cost / 100 # dollars

if option_a_daily_cost < option_b_daily_cost:
    best_option = 'a'
    daily_savings = option_b_daily_cost - option_a_daily_cost
    best_option_summary = "Cooking at home three times a day"
else:
    best_option = 'b'
    daily_savings = option_a_daily_cost - option_b_daily_cost
    best_option_summary = "Eating out three times a day"

annual_savings = daily_savings * 365

output = {
    "Daily cost for option (a)": option_a_daily_cost,
    "Daily cost for option (b)": option_b_daily_cost,
    "Best option summary": best_option_summary,
    "Best option": best_option,
    "Daily savings": int(daily_savings),
    "Annual savings": int(annual_savings)
}

print(output)
```

Output:
```
{
    "Daily cost for option (a)":

## [Systematically Testing Prompts](#testing-prompts)

An effective strategy for testing prompts is to collect a battery of examples and then programmatically test them all for each prompt. One can measure accuracy as the fraction of test cases that produce the desired output. The best prompt will have the highest, if not perfect, score. Periodically re-running the test cases as the prompt changes over time is a form of regression testing.

The example below tests that the output choice and the annual savings are correct in each case.

In [6]:
### Battery of test cases used to exercise a prompt.
### Each case stores the user message, the expected best option and the
### expected annual savings in dollars.

msg_list = [
    # reference case - home cooking quoted in cents (so the number looks larger)
    dict(
        msg='''
a) cooking at home three times a day spending on average 500 cents per each meal.
b) eating out three times a day at an average cost of $15 per meal.
''',
        answer='a',
        savings=365 * 3 * abs(5 - 15),
    ),

    # reference case with the labels swapped, so (b) is listed first
    dict(
        msg='''
b) cooking at home three times a day spending on average 500 cents per each meal.
a) eating out three times a day at an average cost of $15 per meal.
''',
        answer='b',
        savings=365 * 3 * abs(5 - 15),
    ),

    # eating out is the cheaper option
    dict(
        msg='''
a) cooking at home three times a day spending on average 1500 cents per each meal.
b) eating out three times a day at an average cost of $10 per meal.
''',
        answer='b',
        savings=365 * 3 * abs(15 - 10),
    ),

    # home cooking is quoted per day rather than per meal
    dict(
        msg='''
a) cooking at home spending on average $15 per each day.
b) eating out three times a day at an average cost of $15 per meal.
''',
        answer='a',
        savings=365 * abs(15 - 3 * 15),
    ),

    # eating out is quoted per day rather than per meal
    dict(
        msg='''
a) cooking at home three times a day spending on average 500 cents per each meal.
b) eating out at an average cost of $45 per each day.
''',
        answer='a',
        savings=365 * abs(3 * 5 - 45),
    ),
]

# function to test all messages in the list
def test_cases(prompt, msg_list):
    n_correct, n_total = 0, len(msg_list)

    for idx, row in enumerate(msg_list):
        print(f"\n\nTest {idx+1}\n{row['msg']}")
        response = get_completion(prompt.format(delimiter=delimiter, message=row['msg']))
        # extract the results with regex instead of converting to Python object
        try:
            system_answer = re.findall('"Best option": "([a,b])"', response.replace('\'', '"'))[0]
            system_savings = re.findall('"Annual savings": \"{0,1}\${0,1}([,0-9\.]+)\"{0,1}', response.replace('\'', '"'))[0].replace(',','')
        except:
            print('Parsing error:\n',response)
        is_correct_answer = (row['answer'] == system_answer)
        is_correct_savings = (row['savings'] == float(system_savings))

        if is_correct_answer:
            n_correct += 0.5
        if is_correct_savings:
            n_correct += 0.5

        if not is_correct_answer or not is_correct_savings:
            print(response)

        print(f"answer: {row['answer']} vs {system_answer} ({is_correct_answer}); savings: {row['savings']} vs {system_savings} ({is_correct_savings})")
        if idx > 0:  # submit < 3 prompts per minute
            time.sleep(21)

    print(f'\n\nFraction of Correct Tests: {100*n_correct/n_total:.2f}%')

In [7]:
# Score the first prompt version against the whole battery of test cases
test_cases(prompt=prompt_v1, msg_list=msg_list)



Test 1

a) cooking at home three times a day spending on average 500 cents per each meal.
b) eating out three times a day at an average cost of $15 per meal.

```json
{
    "Daily cost for option (a)": 1500,
    "Daily cost for option (b)": 4500,
    "Best option summary": "Option (a) costs less over the long term.",
    "Best option": "a",
    "Daily savings": 3000,
    "Annual savings": 1095000
}
```
answer: a vs a (True); savings: 10950 vs 1095000 (False)


Test 2

b) cooking at home three times a day spending on average 500 cents per each meal.
a) eating out three times a day at an average cost of $15 per meal.

{
    "Daily cost for option (a)": 1500,
    "Daily cost for option (b)": 1500,
    "Best option summary": "Cooking at home is cheaper",
    "Best option": "a",
    "Daily savings": 0,
    "Annual savings": 0
} 

Since both options cost the same amount per day, there are no daily or annual savings. However, cooking at home is the cheaper option, so it is the best option.


In [8]:
# Score the re-engineered prompt against the same battery of test cases
test_cases(prompt=prompt_v2, msg_list=msg_list)



Test 1

a) cooking at home three times a day spending on average 500 cents per each meal.
b) eating out three times a day at an average cost of $15 per meal.

answer: a vs a (True); savings: 10950 vs 10950 (True)


Test 2

b) cooking at home three times a day spending on average 500 cents per each meal.
a) eating out three times a day at an average cost of $15 per meal.

answer: b vs b (True); savings: 10950 vs 10950 (True)


Test 3

a) cooking at home three times a day spending on average 1500 cents per each meal.
b) eating out three times a day at an average cost of $10 per meal.

answer: b vs b (True); savings: 5475 vs 5475.00 (True)


Test 4

a) cooking at home spending on average $15 per each day.
b) eating out three times a day at an average cost of $15 per meal.

answer: a vs a (True); savings: 10950 vs 10950 (True)


Test 5

a) cooking at home three times a day spending on average 500 cents per each meal.
b) eating out at an average cost of $45 per each day.

answer: a vs a (

Scratchwork to test regex parsing on specific examples

In [None]:
# Re-run one test case and apply the same extraction used by test_cases()
response = get_completion(prompt_v2.format(delimiter=delimiter, message=msg_list[3]['msg']))
print(response)

# Raw strings avoid the invalid "\$" and "\." escapes; [ab] (not [a,b]) keeps
# a comma from matching as an answer.
normalized = response.replace("'", '"')
system_answer = re.findall(r'"Best option": "([ab])"', normalized)[0]
# strip thousands separators, as test_cases() does, so the text converts with float()
system_savings = re.findall(r'"Annual savings": "?\$?([,0-9.]+)"?', normalized)[0].replace(',', '')

system_answer, system_savings

In [None]:
# Offline check of the savings pattern on a reply that uses quotes, a dollar
# sign, a thousands separator and decimals. The pattern is a raw string so
# that "\$" is not an invalid string escape.
response = '"Annual savings": "$10,950.00"'
parsed_savings = re.findall(r'"Annual savings": "?\$?([,0-9.]+)"?', response.replace("'", '"'))[0].replace(',', '')
assert parsed_savings == '10950.00', parsed_savings
parsed_savings