In [7]:
import pandas as pd
import duckdb
import openai
import time 
import os

# Load in data

We use the [Chicago crime dataset](https://data.cityofchicago.org/Public-Safety/Crimes-2001-to-Present/ijzp-q8t2/about_data), restricted to the years 2021, 2022, and 2023.

In [8]:
# Directory holding the yearly Chicago crime CSV exports
path = "./data"
# os.listdir returns entries in arbitrary order and also lists non-data files
# (hidden files, checkpoints, ...). Keep only CSVs and sort them so the load
# order -- and therefore the row order after concatenation -- is reproducible.
files = sorted(x for x in os.listdir(path) if x.lower().endswith(".csv"))
print(files)

['chicago_crime_2021.csv', 'chicago_crime_2022.csv', 'chicago_crime_2023.csv']


In [9]:
# Build the full path of every yearly file, then read and stack them lazily
csv_paths = ["{}/{}".format(path, name) for name in files]
chicago_crime = pd.concat(map(pd.read_csv, csv_paths), ignore_index=True)

# Shape recorded for the combined frame: (709386, 22)
chicago_crime.head()

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,25953,JE240540,05/24/2021 03:06:00 PM,020XX N LARAMIE AVE,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,False,...,36.0,19,01A,1141387.0,1913179.0,2021,11/18/2023 03:39:49 PM,41.917838,-87.755969,"(41.917838056, -87.755968972)"
1,26038,JE279849,06/26/2021 09:24:00 AM,062XX N MC CORMICK RD,110,HOMICIDE,FIRST DEGREE MURDER,PARKING LOT,True,False,...,50.0,13,01A,1152781.0,1941458.0,2021,11/18/2023 03:39:49 PM,41.995219,-87.713355,"(41.995219444, -87.713354912)"
2,12342615,JE202211,04/17/2021 03:20:00 PM,081XX S PRAIRIE AVE,325,ROBBERY,VEHICULAR HIJACKING,RESIDENCE,True,False,...,6.0,44,03,1179448.0,1851073.0,2021,09/14/2023 03:41:59 PM,41.746626,-87.618032,"(41.746626309, -87.618031954)"
3,26262,JE366265,09/08/2021 04:45:00 PM,047XX W HARRISON ST,110,HOMICIDE,FIRST DEGREE MURDER,CAR WASH,True,False,...,24.0,25,01A,1144907.0,1896933.0,2021,09/14/2023 03:41:59 PM,41.873191,-87.743447,"(41.873191445, -87.743446563)"
4,13209581,JG422927,08/01/2021 12:00:00 AM,012XX E 78TH ST,1563,SEX OFFENSE,CRIMINAL SEXUAL ABUSE,APARTMENT,False,False,...,8.0,45,17,,,2021,09/14/2023 03:43:09 PM,,,


# Prompt Engineering

The OpenAI API documentation provides a [recommendation](https://platform.openai.com/examples/default-sql-translate) for how to set the `system` and `user` components of a prompt when asking the model to generate SQL code.  
In that example:
* `system` is the context, e.g. "Given the following SQL tables, your job is to write queries given a user’s request."  
```
CREATE TABLE Orders (
  OrderID int,
  CustomerID int,
  OrderDate datetime,
  OrderTime varchar(8),
  PRIMARY KEY (OrderID)
);
...
```
* `user` is the request, e.g. "Write a SQL query which computes the average total order value for all orders on 2023-04-01."

### Step 1: Create a function `prompt_generator(table_name, query)` that generates the system and user prompts like the example above.
* Parameters:
1. table name
2. user query request
* Output:
completed system and user prompt messages

Use the `DuckDB` library to query the pandas DataFrame as if it were a SQL table and extract its column names and types. 

In [13]:
# Ask DuckDB for the column names and types of the DataFrame
describe_sql = "DESCRIBE SELECT * FROM chicago_crime"
duckdb.sql(describe_sql)

┌──────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│     column_name      │ column_type │  null   │   key   │ default │  extra  │
│       varchar        │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ ID                   │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ Case Number          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Date                 │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Block                │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ IUCR                 │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Primary Type         │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Description          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Location Description │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ Arrest               │ BOOLEAN     │ YES     │ NUL

In [141]:
def prompt_generator(table_name, user_query):
    """Build the system and user prompt messages for a text-to-SQL request.

    Parameters
    ----------
    table_name : str
        Name of the table to describe. It is interpolated into a DuckDB
        ``DESCRIBE`` query, so DuckDB must be able to resolve it
        (presumably a pandas DataFrame variable picked up by DuckDB's
        replacement scan -- confirm for other table sources).
    user_query : str
        Plain-language description of what the SQL query should return.

    Returns
    -------
    message
        Object with two string attributes: ``context`` (the system prompt,
        containing a ``CREATE TABLE`` description of the table) and
        ``user`` (the user request).
    """

    class message:
        def __init__(self, context, user):
            self.context = context
            self.user = user

    # Describe the table the caller asked for (not a hard-coded one) and
    # materialise the result once instead of on every loop iteration.
    table_des = duckdb.sql("DESCRIBE SELECT * FROM {}".format(table_name)).df()

    # Keep only the first two DESCRIBE columns: "<column_name> <column_type>"
    column_name_type_list = [
        column_name + ' ' + column_type
        for column_name, column_type in zip(table_des["column_name"], table_des["column_type"])
    ]

    # Join explicitly rather than post-processing str(list): that approach
    # would mangle names or types that contain brackets or quotes (e.g. INTEGER[]).
    column_name_type_string = "(" + ", ".join(column_name_type_list) + ")"

    # create context and user string
    context = """
    Given the following SQL tables, your job is to write queries given a user’s request.
    CREATE TABLE {} {} \n
    """.format(table_name, column_name_type_string) 
    user = "Write a SQL query that returns: {}".format(user_query)
    
    prompt = message(context = context, user = user)

    return prompt

In [143]:
# Example request against the crime table
table_name = "chicago_crime"
user_query = "Returns the number of cases in each year, order by year in the ascending order."

# Build the prompt and show only the user-facing part
example_prompt = prompt_generator(table_name, user_query)
print(example_prompt.user)

Write a SQL query that returns: Returns the number of cases in each year, order by year in the ascending order.


## Step 2: Interact with the OpenAI API