This notebooks shows how to use [OpenAI assistant](https://platform.openai.com/docs/guides/function-calling) to do function calling.
- steps:
  - create an assistant (LLM + a lot of predefined func)
  - call LLM to see which predefined func to use and what are the input params
  - return the results to LLM
  - get the answer from LLM
- the input data are: [`air_passengers.csv`](../../data/air_passengers.csv), [`melbourne_temp.csv`](../../data/melbourne_temp.csv), [`nyc_taxi.csv`](../../data/nyc_taxi.csv)
- the question is: [`easy_precise_questions.csv`](../../data/easy_precise_questions.csv)

In [1]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os
import pandas as pd

from pathlib import Path
from tqdm.notebook import tqdm
import sys

module_path = os.path.abspath(os.path.join(".."))
if module_path not in sys.path:
    sys.path.append(module_path)

from utils.utils import convert_types, eval
from utils.vars import DATA_DIR, EXCEPT_FILES, QUESTION_FILES
from utils.assistants import AzureOpenAIAssistant
from utils.customized_func_tools import (
    TOOLS,
)

load_dotenv()

ASSISTANT_NAME_PREFIX = "customized_func"

In [2]:
# get the client object
client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-05-01-preview",  # only support this version
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)
assistant = AzureOpenAIAssistant(client=client)

In [3]:
display(assistant.list_all_assistants())

Unnamed: 0,id,created_at,description,instructions,metadata,model,name,object,tools,response_format,temperature,tool_resources,top_p
0,asst_zp1VbRMednIpC4Zl65nDt4Hk,1735007424,,You are a data scientist in univariate time se...,{},gpt-4o,customized_func_melbourne_temp,assistant,[{'function': {'name': 'get_time_col_and_targe...,auto,0.0,{},1.0
1,asst_5E83wPdW0Lg32VmhhJizKPr4,1735007326,,You are a data scientist in univariate time se...,{},gpt-4o,customized_func_air_passengers,assistant,[{'function': {'name': 'get_time_col_and_targe...,auto,0.0,{},1.0
2,asst_VGmOogPrbDPXHIJeUbVmWWzZ,1734137131,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_nyc_taxi,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
3,asst_0Qc1dyLAY7XD3dKLqIkNJ45l,1734137012,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_melbourne_temp,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
4,asst_hTacjFmXp0wJW7cTacgiIaVm,1734136870,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_air_passengers,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
5,asst_x4c0gSGzdmoljHa4DrmZjPZO,1734135165,,You are a python expert in univariate time ser...,{},gpt-4o,code_interpreter_nyc_taxi_plot,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0


In [4]:
# delete assistant
assistant.delete_assistant(name=f"{ASSISTANT_NAME_PREFIX}_nyc_taxi")
assistant.delete_assistant(name=f"{ASSISTANT_NAME_PREFIX}_melbourne_temp")
assistant.delete_assistant(name=f"{ASSISTANT_NAME_PREFIX}_air_passengers")

display(assistant.list_all_assistants())

Unnamed: 0,id,created_at,description,instructions,metadata,model,name,object,tools,response_format,temperature,tool_resources,top_p
0,asst_VGmOogPrbDPXHIJeUbVmWWzZ,1734137131,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_nyc_taxi,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
1,asst_0Qc1dyLAY7XD3dKLqIkNJ45l,1734137012,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_melbourne_temp,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
2,asst_hTacjFmXp0wJW7cTacgiIaVm,1734136870,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_air_passengers,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
3,asst_x4c0gSGzdmoljHa4DrmZjPZO,1734135165,,You are a python expert in univariate time ser...,{},gpt-4o,code_interpreter_nyc_taxi_plot,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0


## Run 3 files x all questions

In [5]:
prompt_path = "prompts/prompt.jinja2"

In [6]:
# cons
# cannot pass a lot of values to the prompt due to token limit
# really need to design the function properly

In [7]:
df_result = []

for question_path in QUESTION_FILES:
    df_questions = pd.read_csv(question_path)
    for file_path in Path(DATA_DIR).glob("*.csv"):
        if file_path.name in EXCEPT_FILES:
            continue
        print(f"file: {file_path.name}")
        assistant_id = assistant.create_or_retrieve(
            prompt_path=prompt_path,
            assistant_name=f"{ASSISTANT_NAME_PREFIX}_{file_path.stem}",
            tools=TOOLS,
            tool_resources=None,
            prompt_args={"file_path": file_path},
        )
        for _, row in tqdm(df_questions.iterrows(), total=len(df_questions)):
            question = row["question"]
            answer_true = row[Path(file_path).name]

            result = assistant.ask_a_question(
                question=question, assistant_id=assistant_id, tools=TOOLS
            )

            df_result.append(
                {
                    **result,
                    "file": file_path.name,
                    "question_file": question_path.name,
                    "answer_true": convert_types(answer_true),
                }
            )

file: air_passengers.csv


  0%|          | 0/16 [00:00<?, ?it/s]

file: melbourne_temp.csv


  0%|          | 0/16 [00:00<?, ?it/s]

file: nyc_taxi.csv


  0%|          | 0/16 [00:00<?, ?it/s]

file: air_passengers.csv


  0%|          | 0/16 [00:00<?, ?it/s]

JSONDecodeError: Yes.
JSONDecodeError: No


18:37:01 - cmdstanpy - INFO - Chain [1] start processing
18:37:01 - cmdstanpy - INFO - Chain [1] done processing


file: melbourne_temp.csv


  0%|          | 0/16 [00:00<?, ?it/s]

JSONDecodeError: No
JSONDecodeError: No


18:38:25 - cmdstanpy - INFO - Chain [1] start processing
18:38:26 - cmdstanpy - INFO - Chain [1] done processing


file: nyc_taxi.csv


  0%|          | 0/16 [00:00<?, ?it/s]

JSONDecodeError: Yes
JSONDecodeError: No
JSONDecodeError: Yes.
JSONDecodeError: No


18:39:58 - cmdstanpy - INFO - Chain [1] start processing
18:39:59 - cmdstanpy - INFO - Chain [1] done processing


In [8]:
# how about code interpreter + function calling?


In [9]:
# Thoughts: write function and define each function clearly is important
# return results are difficult e.g., filter_by_weekdays() -> call the other function. if it's just a number or a string, it's fine
# - pros: things are fixable either function or the prompt
# more and more function, more prompt tokens

In [10]:
# eval
df_result = pd.DataFrame(df_result)

# loop through each file
eval(df=df_result, details=True)

Question file: easy_questions.csv; File: air_passengers.csv; Accuracy: 1.0
Question file: easy_questions.csv; File: melbourne_temp.csv; Accuracy: 0.9375
question: Tell me the end date of the given time series data.
answer_pred: 1990-01-11 00:00:00
answer_true: 1990-12-31 00:00:00
messages: user: Tell me the end date of the given time series data.
assistant: {"output": "1990-01-11"}
steps: [{'name': 'get_time_col_and_target_col', 'input': None, 'output': "{'target_col': 'Daily minimum temperatures', 'time_col': 'Date'}", 'args': '{"file_path":"../../data/melbourne_temp.csv"}'}]
**************************************************
Question file: easy_questions.csv; File: nyc_taxi.csv; Accuracy: 1.0
Question file: medium_questions.csv; File: air_passengers.csv; Accuracy: 0.8125
question: Does the dataset show any autocorrelation?
answer_pred: Yes.
answer_true: Yes
messages: user: Does the dataset show any autocorrelation?
assistant: Yes.
steps: [{'name': 'get_index_by_calc_acf_or_pacf', 'in

In [15]:
df_result

Unnamed: 0,completion_tokens,prompt_tokens,total_tokens,question,answer_pred,messages,steps,attachments,execution_time_s,file,question_file,answer_true
0,46,4341,4387,What is the min of the target variable?,104.0,[user: What is the min of the target variable?...,"[{'name': 'get_descriptive_statistics', 'input...",[],9,air_passengers.csv,easy_questions.csv,104.0
1,46,4341,4387,What is the max of the target variable?,622.0,[user: What is the max of the target variable?...,"[{'name': 'get_descriptive_statistics', 'input...",[],13,air_passengers.csv,easy_questions.csv,622.0
2,46,4341,4387,What is the mean of the target variable?,280.3,[user: What is the mean of the target variable...,"[{'name': 'get_descriptive_statistics', 'input...",[],8,air_passengers.csv,easy_questions.csv,280.3
3,47,4342,4389,What is the medium of the target variable?,265.5,[user: What is the medium of the target variab...,"[{'name': 'get_descriptive_statistics', 'input...",[],5,air_passengers.csv,easy_questions.csv,265.5
4,46,4343,4389,What is the standard deviation of the target v...,119.97,[user: What is the standard deviation of the t...,"[{'name': 'get_descriptive_statistics', 'input...",[],4,air_passengers.csv,easy_questions.csv,119.97
...,...,...,...,...,...,...,...,...,...,...,...,...
91,70,4422,4492,Which lagging value has the weakest absolute c...,10.0,[user: Which lagging value has the weakest abs...,"[{'name': 'get_index_by_calc_acf_or_pacf', 'in...",[],3,nyc_taxi.csv,medium_questions.csv,6.0
92,58,4361,4419,What is the average in the first 10 data points?,4534.2,[user: What is the average in the first 10 dat...,"[{'name': 'get_descriptive_statistics', 'input...",[],6,nyc_taxi.csv,medium_questions.csv,4534.2
93,116,4452,4568,What is the total of the maximum of the first ...,34135.0,[user: What is the total of the maximum of the...,"[{'name': 'get_descriptive_statistics', 'input...",[],4,nyc_taxi.csv,medium_questions.csv,34135.0
94,69,4385,4454,Forecast the last 5 data points in the time co...,23719.0_23719.0_23719.0_23719.0_23719.0,[user: Forecast the last 5 data points in the ...,"[{'name': 'forecast_last_n_data', 'input': Non...",[],4,nyc_taxi.csv,medium_questions.csv,23719.0_23719.0_23719.0_23719.0_23719.0


In [12]:
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(df_result.groupby(["question_file", "file"]).describe())

Unnamed: 0_level_0,Unnamed: 1_level_0,completion_tokens,completion_tokens,completion_tokens,completion_tokens,completion_tokens,completion_tokens,completion_tokens,completion_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,execution_time_s,execution_time_s,execution_time_s,execution_time_s,execution_time_s,execution_time_s,execution_time_s,execution_time_s
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
question_file,file,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2
easy_questions.csv,air_passengers.csv,16.0,44.9375,9.412536,34.0,39.0,46.0,47.0,75.0,16.0,4482.3125,567.140396,4333.0,4337.75,4341.0,4343.25,6609.0,16.0,4527.25,575.208774,4369.0,4375.5,4387.0,4389.5,6684.0,16.0,5.125,2.801785,3.0,3.0,4.0,5.5,13.0
easy_questions.csv,melbourne_temp.csv,16.0,43.1875,5.623981,34.0,39.0,45.5,46.25,55.0,16.0,4341.5625,5.773142,4333.0,4338.0,4341.0,4343.25,4356.0,16.0,4384.75,10.109402,4368.0,4376.0,4387.0,4389.5,4411.0,16.0,4.375,1.78419,2.0,3.0,4.0,5.25,8.0
easy_questions.csv,nyc_taxi.csv,16.0,45.5,6.377042,35.0,40.0,47.5,48.25,56.0,16.0,4345.0,6.418723,4337.0,4341.0,4344.5,4347.25,4359.0,16.0,4390.5,12.345039,4375.0,4379.5,4392.0,4395.25,4415.0,16.0,3.8125,0.75,3.0,3.0,4.0,4.0,5.0
medium_questions.csv,air_passengers.csv,16.0,62.625,17.005391,43.0,53.5,63.0,69.0,113.0,16.0,4514.5625,557.136365,4338.0,4359.25,4372.5,4397.0,6601.0,16.0,4577.1875,562.503241,4381.0,4411.25,4435.0,4466.0,6680.0,16.0,4.25,1.770122,3.0,3.0,4.0,4.0,8.0
medium_questions.csv,melbourne_temp.csv,16.0,67.625,26.595426,40.0,53.5,63.0,69.0,140.0,16.0,4519.375,556.123353,4332.0,4362.25,4376.0,4402.5,6601.0,16.0,4587.0,561.172938,4372.0,4413.0,4439.0,4471.5,6680.0,16.0,4.0625,1.569235,3.0,3.0,3.0,5.0,8.0
medium_questions.csv,nyc_taxi.csv,16.0,66.5,17.196899,41.0,57.5,67.5,70.0,116.0,16.0,4658.8125,756.521356,4343.0,4365.5,4384.0,4405.5,6608.0,16.0,4725.3125,762.306825,4384.0,4426.5,4449.0,4475.5,6689.0,16.0,5.125,2.918333,3.0,3.0,4.0,6.0,14.0


## Run one question

In [13]:
# list all assistants
assistant.list_all_assistants()

Unnamed: 0,id,created_at,description,instructions,metadata,model,name,object,tools,response_format,temperature,tool_resources,top_p
0,asst_vOt2vULsZnzSZfqASNWCA1uv,1735007676,,You are a data scientist in univariate time se...,{},gpt-4o,customized_func_nyc_taxi,assistant,[{'function': {'name': 'get_time_col_and_targe...,auto,0.0,{},1.0
1,asst_qXfVisu4RHEIYa5Qs4L3I1Xa,1735007596,,You are a data scientist in univariate time se...,{},gpt-4o,customized_func_melbourne_temp,assistant,[{'function': {'name': 'get_time_col_and_targe...,auto,0.0,{},1.0
2,asst_kFyF79QuX27kMYRnguWb80n9,1735007502,,You are a data scientist in univariate time se...,{},gpt-4o,customized_func_air_passengers,assistant,[{'function': {'name': 'get_time_col_and_targe...,auto,0.0,{},1.0
3,asst_VGmOogPrbDPXHIJeUbVmWWzZ,1734137131,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_nyc_taxi,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
4,asst_0Qc1dyLAY7XD3dKLqIkNJ45l,1734137012,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_melbourne_temp,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
5,asst_hTacjFmXp0wJW7cTacgiIaVm,1734136870,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_air_passengers,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
6,asst_x4c0gSGzdmoljHa4DrmZjPZO,1734135165,,You are a python expert in univariate time ser...,{},gpt-4o,code_interpreter_nyc_taxi_plot,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0


In [14]:
question = "Forecast the last 5 data points in the time column using prophet model."
result = assistant.ask_a_question(
    question=question, assistant_id="asst_PprPG3wDRy0vhncihJgfGKbT", tools=TOOLS
)
result

NotFoundError: Error code: 404 - {'error': {'message': "No assistant found with id 'asst_PprPG3wDRy0vhncihJgfGKbT'.", 'type': 'invalid_request_error', 'param': None, 'code': None}}