# Part 15

# Using Code Interpreter

Setup code shared by the entire notebook.

In [1]:
# Uncomment the line below to make sure you have all the packages needed
# %pip install -r requirements.txt

In [2]:
# Import necessary libraries
from openai import OpenAI  # Used for interacting with OpenAI's API
from typing_extensions import override  # Used for overriding methods in subclasses
from openai import AssistantEventHandler  # Used for handling events related to OpenAI assistants
from IPython.display import display, Markdown, clear_output

import time
import threading

In [3]:
# Create the OpenAI client that the rest of the notebook uses for every API call.
# No key is passed explicitly, so this assumes you have set the
# OPENAI_API_KEY environment variable before starting the kernel.
client = OpenAI() 

In [4]:
class EventHandler(AssistantEventHandler):
    """Stream handler that echoes assistant events to stdout as they arrive.

    Every printed fragment (message text, Code Interpreter input, and Code
    Interpreter log output) is also appended to ``self.results``, so that the
    list only ever contains strings, in arrival order.
    """

    def __init__(self):
        super().__init__()
        self.results = []  # String fragments collected while streaming

    @override
    def on_text_created(self, text) -> None:
        """Print a prefix when the assistant starts a new text block.

        ``text`` is the handler's text object rather than a plain string, so it
        is not stored in ``self.results``; the actual characters are collected
        in ``on_text_delta``.
        """
        print("\nassistant text > ", end="", flush=True)

    @override
    def on_text_delta(self, delta, snapshot):
        """Print and record one partial piece of assistant text."""
        # A delta may carry no text value; skip it so "None" is never printed or stored.
        if delta.value:
            print(delta.value, end="", flush=True)
            self.results.append(delta.value)

    @override
    def on_tool_call_created(self, tool_call):
        """Announce which kind of tool the assistant has started calling."""
        print(f"\nassistant tool > {tool_call.type}\n", flush=True)

    @override
    def on_tool_call_delta(self, delta, snapshot):
        """Print and record Code Interpreter input and log output as it streams.

        Deltas for other tool types, and deltas without a Code Interpreter
        payload, are ignored.
        """
        if delta.type != 'code_interpreter' or delta.code_interpreter is None:
            return

        code_interpreter = delta.code_interpreter

        # The code the assistant is writing arrives in small pieces.
        if code_interpreter.input:
            print(code_interpreter.input, end="", flush=True)
            self.results.append(code_interpreter.input)

        # Outputs follow once the code has run; only textual logs are shown here.
        if code_interpreter.outputs:
            print("\n\noutput >", flush=True)
            for output in code_interpreter.outputs:
                if output.type == "logs" and output.logs:
                    print(f"\n{output.logs}", flush=True)
                    self.results.append(output.logs)


## Creating an Assistant with Code Interpreter Enabled

Our first step is to create an Assistant that can use Code Interpreter.

In [5]:
# Register a new assistant with Code Interpreter switched on.
assistant = client.beta.assistants.create(
    name="Code Interpreter Assistant",
    model="gpt-4o",
    # Same system prompt as before, written with explicit escapes so the
    # surrounding whitespace is visible.
    instructions=" \n        You are a helpful assistant.\n    ",
    # Code Interpreter is the only tool this assistant gets.
    tools=[dict(type="code_interpreter")],
    # Free-form key/value tags stored alongside the assistant.
    metadata=dict(
        can_be_used_for_code_analysis="True",
        can_do_python="True",
    ),
    # Sampling settings: response variability and nucleus sampling.
    temperature=1,
    top_p=1,
)

# Show the whole assistant object, a blank gap, then its name and metadata.
print(assistant, "\n\n", assistant.name, assistant.metadata, sep="\n")

Assistant(id='asst_NJXSq3fabZZsjUPd3VKQJ7Wr', created_at=1718456940, description=None, instructions=' \n        You are a helpful assistant.\n    ', metadata={'can_be_used_for_code_analysis': 'True', 'can_do_python': 'True'}, model='gpt-4o', name='Code Interpreter Assistant', object='assistant', tools=[CodeInterpreterTool(type='code_interpreter')], response_format='auto', temperature=1.0, tool_resources=ToolResources(code_interpreter=ToolResourcesCodeInterpreter(file_ids=[]), file_search=None), top_p=1.0)



Code Interpreter Assistant
{'can_be_used_for_code_analysis': 'True', 'can_do_python': 'True'}


## Passing Files to Code Interpreter

There are a variety of ways to get files for Code Interpreter to use. 
- Assistant files - viewable by all runs that use the assistant.
- Thread files - only viewable by runs that use the thread. 

Let's review the code for the two main approaches.

### Getting Files to the Assistant

First, we need to upload a file so that we can pass it to our assistant.

In [6]:
# Upload the penguins dataset with the "assistants" purpose.
# The context manager closes the local file handle as soon as the upload call
# returns, instead of leaving it open for the lifetime of the kernel.
with open("./artifacts/penguins_size.csv", "rb") as penguins_csv:
    assistant_file = client.files.create(
        file=penguins_csv,
        purpose='assistants'
    )

print(assistant_file)

FileObject(id='file-WgMygp342PBPHUaAkRhMQURl', bytes=13519, created_at=1718456940, filename='penguins_size.csv', object='file', purpose='assistants', status='processed', status_details=None)


Next, we need to modify our Assistant with the new file information. 

In [7]:
# Re-declare the Code Interpreter tool on the existing assistant and register
# the uploaded file as one of its Code Interpreter resources.
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources=dict(code_interpreter=dict(file_ids=[assistant_file.id])),
    tools=[dict(type="code_interpreter")],
)

print(assistant)

Assistant(id='asst_NJXSq3fabZZsjUPd3VKQJ7Wr', created_at=1718456940, description=None, instructions=' \n        You are a helpful assistant.\n    ', metadata={'can_be_used_for_code_analysis': 'True', 'can_do_python': 'True'}, model='gpt-4o', name='Code Interpreter Assistant', object='assistant', tools=[CodeInterpreterTool(type='code_interpreter')], response_format='auto', temperature=1.0, tool_resources=ToolResources(code_interpreter=ToolResourcesCodeInterpreter(file_ids=['file-WgMygp342PBPHUaAkRhMQURl']), file_search=None), top_p=1.0)


Finally, let's run a message and see if it is working.

In [8]:

# Open a thread that starts with a single user message asking for a summary.
assistant_thread = client.beta.threads.create(
    messages=[
        dict(role="user", content="Give me a summary of the file penguins_size.csv."),
    ]
)

# Run the assistant on that thread, sending streamed events to a new
# EventHandler, and block until the stream is finished.
with client.beta.threads.runs.stream(
    assistant_id=assistant.id,
    thread_id=assistant_thread.id,
    event_handler=EventHandler(),
    instructions="\n    You are a helpful assistant.\n    ",
) as run_stream:
    run_stream.until_done()



assistant tool > code_interpreter

import pandas as pd

# Load the CSV file
file_path = '/mnt/data/file-WgMygp342PBPHUaAkRhMQURl'
penguins_df = pd.read_csv(file_path)

# Get a summary of the DataFrame
summary = {
    'columns': penguins_df.columns.tolist(),
    'head': penguins_df.head(),
    'info': penguins_df.info(),
    'description': penguins_df.describe()
}

summary

output >

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


assistant text > The f

### Formatting the Output
What if we want to format the markdown output? There are two ways to do it. The "easy" way is to just let the output render without streaming and format it afterward.

In [16]:
# Need a thread to send message and get output
assistant_thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": "Give me a summary of the file penguins_size.csv.  With at least one small table of data."
        },
    ]
)

# No streaming here: wait for the run to finish, then format the result.
run = client.beta.threads.runs.create_and_poll(
    thread_id=assistant_thread.id, assistant_id=assistant.id
)

if run.status != "completed":
    # Without a completed run there may be no assistant message to render,
    # so report the status instead of indexing into an empty list.
    print(f"Run ended with status: {run.status}")
else:
    messages = list(client.beta.threads.messages.list(thread_id=assistant_thread.id, run_id=run.id))

    # Render every text block of the first message returned (the same message
    # the original cell used); non-text blocks such as images are skipped.
    text_blocks = [block.text for block in messages[0].content if block.type == "text"] if messages else []

    rendered_parts = []
    citations = []
    marker_index = 0
    for message_content in text_blocks:
        # Work on a copy of the text so the message object itself is left untouched.
        rendered_value = message_content.value
        for annotation in message_content.annotations:
            # Swap the raw annotation text for a short numbered marker.
            rendered_value = rendered_value.replace(annotation.text, f"[{marker_index}]")
            # An annotation may point at a cited file or at a file path; either
            # way the file name is looked up so the marker can be explained.
            file_reference = getattr(annotation, "file_citation", None) or getattr(annotation, "file_path", None)
            if file_reference is not None:
                cited_file = client.files.retrieve(file_reference.file_id)
                citations.append(f"[{marker_index}] {cited_file.filename}")
            marker_index += 1
        rendered_parts.append(rendered_value)

    # Show the collected citations below the text instead of discarding them.
    if citations:
        rendered_parts.append("\n".join(f"- {citation}" for citation in citations))

    display(Markdown("\n\n".join(rendered_parts)))


### Summary of the Dataset

The `penguins_size.csv` dataset contains measurements and characteristics of penguins. Here is a summary of the dataset:

- **Total Entries**: 344
- **Columns**: 7
- **Non-null Entries**: 
  - `species`: 344 non-null
  - `island`: 344 non-null
  - `culmen_length_mm`: 342 non-null
  - `culmen_depth_mm`: 342 non-null
  - `flipper_length_mm`: 342 non-null
  - `body_mass_g`: 342 non-null
  - `sex`: 334 non-null
- **Data Types**: 
  - 4 columns of type float64
  - 3 columns of type object (categorical data)

### Column Meanings

- **species**: Species of the penguin (e.g., Adelie, Chinstrap, Gentoo)
- **island**: Island where the penguin was observed (e.g., Biscoe, Dream, Torgersen)
- **culmen_length_mm**: Length of the culmen (mm)
- **culmen_depth_mm**: Depth of the culmen (mm)
- **flipper_length_mm**: Length of the flipper (mm)
- **body_mass_g**: Body mass (g)
- **sex**: Sex of the penguin (e.g., MALE, FEMALE)

### Data Sample

Here is a small sample of the dataset:

| species | island    | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex    |
|---------|-----------|------------------|-----------------|-------------------|-------------|--------|
| Adelie  | Torgersen | 39.1             | 18.7            | 181.0             | 3750.0      | MALE   |
| Adelie  | Torgersen | 39.5             | 17.4            | 186.0             | 3800.0      | FEMALE |
| Adelie  | Torgersen | 40.3             | 18.0            | 195.0             | 3250.0      | FEMALE |
| Adelie  | Torgersen | NaN              | NaN             | NaN               | NaN         | NaN    |
| Adelie  | Torgersen | 36.7             | 19.3            | 193.0             | 3450.0      | FEMALE |

### Statistical Summary

|                     |      Value   |
|---------------------|--------------|
| **Mean Culmen Length (mm)**       | 43.92  |
| **Mean Culmen Depth (mm)**        | 17.15  |
| **Mean Flipper Length (mm)**      | 200.92 |
| **Mean Body Mass (g)**            | 4201.75|
| **Most Frequent Species**         | Adelie |
| **Most Frequent Island**          | Biscoe |
| **Most Frequent Sex**             | MALE   |

If you need more detailed information or additional analysis, feel free to ask!

The "hard" way is to stream the output and update the display while streaming to show the formatted text. This is what ChatGPT does when you use it. It requires a more streamlined event handler that collects the streamed text and re-renders it as Markdown each time a new piece arrives.

In [15]:

class EventHandler(AssistantEventHandler):
    """Stream handler that re-renders the accumulated text as Markdown.

    NOTE: this reuses the name ``EventHandler``, so once this cell has run it
    replaces the printing handler defined earlier in the notebook. This class
    defines no tool-call handlers, so only message text is rendered.
    """

    def __init__(self):
        super().__init__()
        self.results = []  # Text fragments received so far, in arrival order

    @override
    def on_text_delta(self, delta, snapshot):
        """Record one partial piece of text and refresh the rendered output."""
        # A delta may carry no text value; storing None would break the join in
        # update_output, and there is nothing new to render anyway.
        if not delta.value:
            return
        self.results.append(delta.value)
        self.update_output()

    def update_output(self):
        """Update the Jupyter Notebook cell with the current markdown content."""
        # wait=True delays clearing until the new output is ready, which avoids flicker.
        clear_output(wait=True)
        markdown_content = "".join(self.results)
        display(Markdown(markdown_content))

# Open a fresh thread asking for a summary that includes a small table.
assistant_thread = client.beta.threads.create(
    messages=[
        dict(
            role="user",
            content="Give me a summary of the file penguins_size.csv. With at least one small table of data.",
        ),
    ]
)

# Stream the run through a new EventHandler and block until it is finished.
with client.beta.threads.runs.stream(
    assistant_id=assistant.id,
    thread_id=assistant_thread.id,
    event_handler=EventHandler(),
    instructions="\n    You are a helpful assistant.\n    ",
) as run_stream:
    run_stream.until_done()


### Summary of the penguins_size.csv File

The dataset contains the following columns:
1. **species**: The species of the penguin (Adelie, Chinstrap, Gentoo).
2. **island**: The island where the penguin was observed (Biscoe, Dream, Torgersen).
3. **culmen_length_mm**: The length of the culmen (bill) in millimeters.
4. **culmen_depth_mm**: The depth of the culmen (bill) in millimeters.
5. **flipper_length_mm**: The length of the flipper in millimeters.
6. **body_mass_g**: The body mass of the penguin in grams.
7. **sex**: The sex of the penguin (MALE, FEMALE).

#### Summary Statistics:
- **species**: 3 unique species with 344 entries.
- **island**: 3 unique islands with 344 entries.
- **culmen_length_mm**: 342 entries, mean of 43.92 mm, standard deviation of 5.46 mm, min of 32.1 mm, max of 59.6 mm.
- **culmen_depth_mm**: 342 entries, mean of 17.15 mm, standard deviation of 1.97 mm, min of 13.1 mm, max of 21.5 mm.
- **flipper_length_mm**: 342 entries, mean of 200.92 mm, standard deviation of 14.06 mm, min of 172 mm, max of 231 mm.
- **body_mass_g**: 342 entries, mean of 4201.75 g, standard deviation of 801.95 g, min of 2700 g, max of 6300 g.
- **sex**: 334 entries, 3 unique values with 'MALE' being the most frequent.

#### Sample Data:
| species | island    | culmen_length_mm | culmen_depth_mm | flipper_length_mm | body_mass_g | sex   |
|---------|-----------|------------------|-----------------|-------------------|-------------|-------|
| Adelie  | Torgersen | 39.1             | 18.7            | 181.0             | 3750.0      | MALE  |
| Adelie  | Torgersen | 39.5             | 17.4            | 186.0             | 3800.0      | FEMALE|
| Adelie  | Torgersen | 40.3             | 18.0            | 195.0             | 3250.0      | FEMALE|
| Adelie  | Torgersen | NaN              | NaN             | NaN               | NaN         | NaN   |
| Adelie  | Torgersen | 36.7             | 19.3            | 193.0             | 3450.0      | FEMALE|

This dataset provides measurements for different penguin species observed on various islands. The measurements include physical attributes such as culmen length, depth, flipper length, and body mass, along with the sex of the penguins.

### Getting Files to the Thread

First, we need a file uploaded.


In [None]:
# Upload the bike-share dataset with the "assistants" purpose.
# The context manager closes the local file handle as soon as the upload call
# returns, instead of leaving it open for the lifetime of the kernel.
with open("./artifacts/daily-bike-share.csv", "rb") as bike_share_csv:
    thread_file = client.files.create(
        file=bike_share_csv,
        purpose='assistants'
    )

print(thread_file)

Second, we need a thread to attach the file to.

In [None]:
# Open a thread that starts with the user's request about the bike-share data.
thread = client.beta.threads.create(
    messages=[
        dict(role="user", content="Give me a summary of the daily-bike-share.csv file."),
    ]
)

print(thread)

Third, we can update the thread with the file information.

In [None]:
# Register the uploaded file as a Code Interpreter resource on the thread.
updated_thread = client.beta.threads.update(
    thread_id=thread.id,
    tool_resources=dict(code_interpreter=dict(file_ids=[thread_file.id])),
)

print(updated_thread)

Finally, let's run it against a new assistant and see the results.

In [None]:
# Create a second assistant that has no files of its own; any data it sees
# must come from the thread it is run against.
thread_assistant = client.beta.assistants.create(
    model="gpt-4o",  # Specify the model to be used.

    # Same system prompt as the first assistant, written with explicit escapes.
    instructions=" \n        You are a helpful assistant.\n    ",

    name="Code Interpreter Assistant Using Thread Data",  # Give the assistant a name.

    tools=[{"type": "code_interpreter"}],  # Add the code interpreter capability to the assistant.

    metadata={  # Add metadata about the assistant's capabilities.
        "can_be_used_for_code_analysis": "True",
        "can_do_python": "True",
    },
    temperature=1,  # Set the temperature for response variability.
    top_p=1,  # Set the top_p for nucleus sampling.
)

# Print the details of the assistant created in this cell (thread_assistant,
# not the earlier `assistant`) to check its properties.
print(thread_assistant)  # Print the full assistant object.
print("\n\n")
print(thread_assistant.name)  # Print the name of the assistant.
print(thread_assistant.metadata)  # Print the metadata of the assistant.

In [None]:
# Run the thread-data assistant on the updated thread, sending streamed
# events to a new EventHandler, and block until the stream is finished.
with client.beta.threads.runs.stream(
    assistant_id=thread_assistant.id,
    thread_id=updated_thread.id,
    event_handler=EventHandler(),
) as run_stream:
    run_stream.until_done()

### Text Results from files in Assistants and Threads

Let's see what happens if we use an assistant that has a file together with a thread that has its own file.

In [None]:
# Ask about both datasets in one message. The file names in the prompt match
# the names of the files uploaded earlier (penguins_size.csv uses an
# underscore, not a hyphen).
super_thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": "Give me a summary of the penguins_size.csv and daily-bike-share.csv files."
        },
    ]
)

print(super_thread)

In [None]:
# Register the bike-share upload as a Code Interpreter resource on this thread.
super_updated_thread = client.beta.threads.update(
    thread_id=super_thread.id,
    tool_resources=dict(code_interpreter=dict(file_ids=[thread_file.id])),
)

print(super_updated_thread)

In [None]:
# Run the original assistant on the combined thread, sending streamed events
# to a new EventHandler, and block until the stream is finished.
with client.beta.threads.runs.stream(
    assistant_id=assistant.id,
    thread_id=super_updated_thread.id,
    event_handler=EventHandler(),
) as run_stream:
    run_stream.until_done()

### Full Results Output

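A run may produce more than text. For example, Code Interpreter can generate image files for charts, and we need to retrieve and display those as well.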

In [None]:
import openai
from IPython.display import Image, display
import json

# Initialize the OpenAI client
client = openai.OpenAI()

# Upload the file; the context manager closes the local handle once the
# upload call has returned.
with open("./artifacts/penguins_size.csv", "rb") as penguins_csv:
    file = client.files.create(
        file=penguins_csv,
        purpose='assistants'
    )

# Create the assistant with the Code Interpreter tool
assistant = client.beta.assistants.create(
    instructions="You are a data analyst. When provided with a file, analyze the data and generate visualizations.",
    model="gpt-4o",
    tools=[{"type": "code_interpreter"}],
    tool_resources={
        "code_interpreter": {
            "file_ids": [file.id]
        }
    }
)

# Create a thread to start the analysis
thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": "Please analyze the data in the uploaded file and generate relevant visualizations.",
            "attachments": [
                {
                    "file_id": file.id,
                    "tools": [{"type": "code_interpreter"}]
                }
            ]
        }
    ]
)

# Creating a thread only stores the user message. The assistant does not
# respond until a run is started, so start one and wait for it to finish.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id, assistant_id=assistant.id
)

if run.status != "completed":
    # Debug: dump the run to understand why it did not complete
    print(json.dumps(run.to_dict(), indent=2))
else:
    # Messages live in their own collection, not on the thread object.
    # Ask for oldest-first so the output reads in the order it was produced.
    messages = list(
        client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id, order="asc")
    )

    image_count = 0
    for message in messages:
        # Each message holds a list of content blocks; text and images are separate blocks.
        for block in message.content:
            if block.type == "text":
                print(block.text.value)
            elif block.type == "image_file":
                # Number the files so several images do not overwrite one another.
                image_count += 1
                image_path = f"analysis_image_{image_count}.png"
                image_data = client.files.content(block.image_file.file_id)
                with open(image_path, "wb") as image_file:
                    image_file.write(image_data.read())
                # Display the image in the notebook
                display(Image(filename=image_path))
