This notebook shows how to use the [OpenAI assistant](https://platform.openai.com/docs/guides/function-calling) to do function calling.
- steps:
  - create an assistant (an LLM plus a set of predefined functions)
  - call the LLM to decide which predefined function to use and what its input parameters are
  - return the function results to the LLM
  - get the final answer from the LLM
- the input data are: [`air_passengers.csv`](../../data/air_passengers.csv), [`melbourne_temp.csv`](../../data/melbourne_temp.csv), [`nyc_taxi.csv`](../../data/nyc_taxi.csv)
- the questions are in: [`easy_precise_questions.csv`](../../data/easy_precise_questions.csv)

In [1]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os
import pandas as pd

from pathlib import Path
from tqdm.notebook import tqdm
import sys

# Put the parent directory on the import path (once) so the `utils.*` imports
# that follow can be resolved -- assumes the `utils` package lives one level up.
module_path = os.path.abspath("..")
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

from utils.utils import convert_types, eval
from utils.vars import DATA_DIR, EXCEPT_FILES, QUESTION_PATH
from utils.assistants import AzureOpenAIAssistant
from utils.customized_func_tools import (
    TOOLS,
)

# Load environment variables from a .env file; the Azure OpenAI key and endpoint
# are read from the environment when the client is created.
load_dotenv()

# Common prefix for the assistants managed by this notebook; the data file stem
# is appended to it to form each assistant's name.
ASSISTANT_NAME_PREFIX = "customized_func"

In [2]:
# Build the Azure OpenAI client from the credentials found in the environment,
# then wrap it in the project's assistant helper.
azure_settings = {
    "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
    "api_version": "2024-05-01-preview",  # only support this version
    "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
}
client = AzureOpenAI(**azure_settings)
assistant = AzureOpenAIAssistant(client=client)

In [3]:
# Show what the helper's listing call returns before any cleanup is done.
display(assistant.list_all_assistants())

Unnamed: 0,id,created_at,description,instructions,metadata,model,name,object,tools,response_format,temperature,tool_resources,top_p
0,asst_x4c0gSGzdmoljHa4DrmZjPZO,1734135165,,You are a python expert in univariate time ser...,{},gpt-4o,code_interpreter_nyc_taxi_plot,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
1,asst_3PTBym6cvEQGfOUYS2GauhSd,1734134708,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_nyc_taxi,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
2,asst_wgsQ9A5m5ucnSBRS5twZLEY9,1734134569,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_melbourne_temp,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
3,asst_oACu0AnZwnQubgMwDap6jcKI,1734134414,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_air_passengers,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0


In [4]:
# Remove this notebook's assistants (one per data set) so the batch run below
# starts from a clean slate, then list what is left.
for dataset_stem in ("nyc_taxi", "melbourne_temp", "air_passengers"):
    assistant.delete_assistant(name=f"{ASSISTANT_NAME_PREFIX}_{dataset_stem}")

display(assistant.list_all_assistants())

Unnamed: 0,id,created_at,description,instructions,metadata,model,name,object,tools,response_format,temperature,tool_resources,top_p
0,asst_x4c0gSGzdmoljHa4DrmZjPZO,1734135165,,You are a python expert in univariate time ser...,{},gpt-4o,code_interpreter_nyc_taxi_plot,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
1,asst_3PTBym6cvEQGfOUYS2GauhSd,1734134708,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_nyc_taxi,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
2,asst_wgsQ9A5m5ucnSBRS5twZLEY9,1734134569,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_melbourne_temp,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
3,asst_oACu0AnZwnQubgMwDap6jcKI,1734134414,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_air_passengers,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0


## Run 3 files x all questions

In [5]:
# Prompt file handed to `create_or_retrieve` together with `prompt_args`
# (presumably a Jinja2 template rendered per data file -- confirm in utils.assistants).
prompt_path = "prompts/prompt.jinja2"

In [6]:
# Read the questions table. The batch loop reads a "question" column and, for each
# data file, a column named after that file (e.g. "nyc_taxi.csv") holding the expected answer.
df_questions = pd.read_csv(QUESTION_PATH)

In [7]:
# Ask every question against every data file and collect one record per
# (file, question) pair. `df_result` is a plain list of dicts at this point;
# it is converted to a DataFrame in the evaluation cell.
df_result = []

# Path.glob() yields entries in an arbitrary, filesystem-dependent order. Sorting
# makes the run order reproducible -- and with it the `assistant_id` that is left
# over after the loop and reused by later cells.
data_files = sorted(
    path for path in Path(DATA_DIR).glob("*.csv") if path.name not in EXCEPT_FILES
)

for file_path in data_files:
    print(f"file: {file_path.name}")
    # One assistant per data file, named "<prefix>_<file stem>".
    assistant_id = assistant.create_or_retrieve(
        prompt_path=prompt_path,
        assistant_name=f"{ASSISTANT_NAME_PREFIX}_{file_path.stem}",
        tools=TOOLS,
        tool_resources=None,
        prompt_args={"file_path": file_path},
    )
    for _, row in tqdm(df_questions.iterrows(), total=len(df_questions)):
        question = row["question"]
        # The expected answer for this file is stored in the column named after the file.
        answer_true = row[file_path.name]

        result = assistant.ask_a_question(
            question=question, assistant_id=assistant_id, tools=TOOLS
        )

        df_result.append(
            {
                **result,
                "file": file_path.name,
                "answer_true": convert_types(answer_true),
            }
        )

file: air_passengers.csv


  0%|          | 0/16 [00:00<?, ?it/s]

file: melbourne_temp.csv


  0%|          | 0/16 [00:00<?, ?it/s]

file: nyc_taxi.csv


  0%|          | 0/16 [00:00<?, ?it/s]

In [8]:
# Thoughts: writing the functions and describing each one clearly is important

In [9]:
# Evaluate the predictions against the expected answers.
# NOTE: `df_result` is rebound here from the list of per-question records to a DataFrame.
df_result = pd.DataFrame(df_result)

# `eval` is the project helper imported from utils.utils (it shadows the builtin `eval`);
# in the recorded run this call prints one accuracy line per file.
eval(df=df_result, details=True)

File: air_passengers.csv; Accuracy: 1.0
File: melbourne_temp.csv; Accuracy: 1.0
File: nyc_taxi.csv; Accuracy: 1.0


In [10]:
# Summary statistics of the numeric result columns (token counts, execution time),
# one row per data file; lift the display limits so the wide table is shown in full.
per_file_stats = df_result.groupby(["file"]).describe()
show_everything = ("display.max_rows", None, "display.max_columns", None)
with pd.option_context(*show_everything):
    display(per_file_stats)

Unnamed: 0_level_0,completion_tokens,completion_tokens,completion_tokens,completion_tokens,completion_tokens,completion_tokens,completion_tokens,completion_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,prompt_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,total_tokens,execution_time_s,execution_time_s,execution_time_s,execution_time_s,execution_time_s,execution_time_s,execution_time_s,execution_time_s
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
file,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2
air_passengers.csv,16.0,44.0,9.549869,34.0,39.0,45.0,46.25,75.0,16.0,1920.0,256.838471,1849.0,1853.0,1857.0,1859.25,2883.0,16.0,1964.0,265.186475,1885.0,1889.5,1903.0,1905.0,2958.0,16.0,3.625,1.454877,2.0,2.75,3.0,4.25,6.0
melbourne_temp.csv,16.0,46.8125,13.039523,34.0,39.0,46.0,47.0,81.0,16.0,1984.375,352.365695,1849.0,1853.75,1857.0,1859.25,2887.0,16.0,2031.1875,364.598266,1885.0,1891.5,1903.0,1905.5,2968.0,16.0,3.0,1.36626,2.0,2.0,2.5,3.25,6.0
nyc_taxi.csv,16.0,48.625,14.333527,35.0,40.0,47.5,49.0,83.0,16.0,1987.4375,351.954158,1852.0,1856.75,1860.5,1863.25,2889.0,16.0,2036.0625,365.448121,1891.0,1893.5,1908.0,1911.5,2972.0,16.0,3.5625,2.189939,2.0,2.0,3.0,4.25,9.0


## Run one question

In [11]:
# List all assistants again; in the recorded run the customized_func_* assistants
# created by the batch loop now appear alongside the older code_interpreter_* ones.
assistant.list_all_assistants()

Unnamed: 0,id,created_at,description,instructions,metadata,model,name,object,tools,response_format,temperature,tool_resources,top_p
0,asst_Vlp2LN1jLT9DF3VaW0lBJ1tr,1734136705,,You are a data scientist in univariate time se...,{},gpt-4o,customized_func_nyc_taxi,assistant,[{'function': {'name': 'get_time_col_and_targe...,auto,0.0,{},1.0
1,asst_QiuowqUHhjLjLdZLHogxZ6Aq,1734136645,,You are a data scientist in univariate time se...,{},gpt-4o,customized_func_melbourne_temp,assistant,[{'function': {'name': 'get_time_col_and_targe...,auto,0.0,{},1.0
2,asst_BWLHvd0JyHSkCD1y2MaumwMQ,1734136577,,You are a data scientist in univariate time se...,{},gpt-4o,customized_func_air_passengers,assistant,[{'function': {'name': 'get_time_col_and_targe...,auto,0.0,{},1.0
3,asst_x4c0gSGzdmoljHa4DrmZjPZO,1734135165,,You are a python expert in univariate time ser...,{},gpt-4o,code_interpreter_nyc_taxi_plot,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
4,asst_3PTBym6cvEQGfOUYS2GauhSd,1734134708,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_nyc_taxi,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
5,asst_wgsQ9A5m5ucnSBRS5twZLEY9,1734134569,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_melbourne_temp,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0
6,asst_oACu0AnZwnQubgMwDap6jcKI,1734134414,,You are a data scientist in univariate time se...,{},gpt-4o,code_interpreter_air_passengers,assistant,[{'type': 'code_interpreter'}],auto,0.0,{'code_interpreter': {'file_ids': ['assistant-...,1.0


In [12]:
# Ask a single question against one explicitly chosen data file.
# The assistant is resolved by name with the same call the batch loop uses, rather
# than reusing the `assistant_id` left over from that loop, which silently points at
# whichever file happened to be processed last.
# NOTE(review): presumably `create_or_retrieve` returns the existing assistant when
# one with this name already exists -- confirm in utils.assistants.
single_file_path = Path(DATA_DIR) / "nyc_taxi.csv"
single_assistant_id = assistant.create_or_retrieve(
    prompt_path=prompt_path,
    assistant_name=f"{ASSISTANT_NAME_PREFIX}_{single_file_path.stem}",
    tools=TOOLS,
    tool_resources=None,
    prompt_args={"file_path": single_file_path},
)

question = "What is the Q1 of the target variable?"
result = assistant.ask_a_question(
    question=question, assistant_id=single_assistant_id, tools=TOOLS
)
result

{'completion_tokens': 49,
 'prompt_tokens': 1864,
 'total_tokens': 1913,
 'question': 'What is the Q1 of the target variable?',
 'answer_pred': 10262.0,
 'messages': ['user: What is the Q1 of the target variable?',
  'assistant: {"output": "10262.0"}'],
 'steps': [{'name': 'get_descriptive_statistics',
   'input': None,
   'output': '10262.0',
   'args': '{"file_path":"../../data/nyc_taxi.csv","statistic_name":"25%","col_name":"target_col"}'}],
 'attachments': [],
 'execution_time_s': 2}