In [None]:
from openai import AzureOpenAI
import json
import os
import time

from dotenv import load_dotenv
load_dotenv()

azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT")
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
api_key=os.getenv("AZURE_OPENAI_KEY")

# Create an AzureOpenAI client
client = AzureOpenAI(
  azure_endpoint = azure_endpoint,
  api_key= api_key,
  api_version="2024-05-01-preview"
)
# Create a file
file = client.files.create(
  file=open("./data/bigfootsightings.csv", "rb"),
  purpose='assistants'
)

# Create an assistant
assistant = client.beta.assistants.create(
  model=azure_deployment, # replace with model deployment name.
  name="codeexecuter"
  instructions="""You are an assistant answering questions about bigfootsightings dataset.""",
  tools=[{"type":"code_interpreter"}],
  tool_resources={"code_interpreter":{"file_ids":[file.id]}},
  temperature=1,
  top_p=1
)

# Create a thread
thread = client.beta.threads.create()

In [3]:
PROMPT_SUFFIX = """
- **ALWAYS** before giving the Final Answer, try another method.
Then reflect on the answers of the two methods you did and ask yourself
if it answers correctly the original question.
If you are not sure, try another method.
- If the methods tried do not give the same result,reflect and
try again until you have two methods that have the same result.
- If you still cannot arrive to a consistent result, say that
you are not sure of the answer.
- If you are sure of the correct answer, create a beautiful
and thorough response using Markdown.
- **DO NOT MAKE UP AN ANSWER OR USE PRIOR KNOWLEDGE,
ONLY USE THE RESULTS OF THE CALCULATIONS YOU HAVE DONE**.
- **ALWAYS**, as part of your "Final Answer", explain how you got
to the answer on a section that starts with: "\n\nExplanation:\n".
In the explanation, mention the column names that you used to get
to the final answer and provide the python code you used.

"""

In [4]:
user_question ="""If 1 row in the dataset is 1 sighting, what's our total sightings
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )
if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)

# Print the assistant response
content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


SyncCursorPage[Message](data=[Message(id='msg_uwYaEzNojTAGSyvgqHludlHZ', assistant_id='asst_6Tn9P9XnJ6lvG5PBzDpOMgAO', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value="We have two consistent results:\n\n- Method 1 (using `shape`): 5021 sightings\n- Method 3 (using `len` function): 5021 sightings\n\nBoth methods correctly consider the total number of rows in the dataset, regardless of missing values in the columns, and both report 5021 sightings.\n\n## Explanation:\nThe total number of sightings in the dataset was calculated using the `shape` attribute and the `len()` function, both of which confirmed that there are 5021 rows in the dataset, meaning there are 5021 sightings. \n\nHere's the python code used:\n```python\nimport pandas as pd\n\n# Load the dataset\nfile_path = '/mnt/data/assistant-jTs5aSPMjwfDUPeK7IUvvbIi'\ndata = pd.read_csv(file_path)\n\n# Calculate the total number of sightings using the shape attribute\ntotal_sightings = data

In [5]:
user_question ="""Which state has the most sightings? Provide the number of sightings in that state
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)


content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


SyncCursorPage[Message](data=[Message(id='msg_3xRqQEeaaSQ9DZs4nXcJ8YX6', assistant_id='asst_6Tn9P9XnJ6lvG5PBzDpOMgAO', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value="Both methods confirm that Washington has the most sightings with a total of 601 sightings.\n\n## Explanation:\nThe state with the most sightings was identified using two different methods: the `value_counts` method and grouping the data by state and calculating the size of each group. Both methods resulted in Washington as the state with the most sightings, with a total of 601 sightings.\n\nHere's the python code used:\n```python\nimport pandas as pd\n\n# Load the dataset\nfile_path = '/mnt/data/assistant-jTs5aSPMjwfDUPeK7IUvvbIi'\ndata = pd.read_csv(file_path)\n\n# Method 1: Identify the state with the most sightings using value_counts\nstate_sightings = data['state'].value_counts()\nmost_sightings_state = state_sightings.idxmax()\nmost_sightings_count = state_sightings.max()

In [1]:
user_question ="""Which season has the most sightings?
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)


content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


NameError: name 'client' is not defined

In [45]:
user_question ="""What are the top 10 years with the most sightings? Use the date column and get the year there.
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)


content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


SyncCursorPage[Message](data=[Message(id='msg_7sNlYwUdNl4ywzfMis46K5L7', assistant_id='asst_2l1ustJwnB2SyY8PwkNB9W9L', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value="## Final Answer\n\nThe top 10 years with the most bigfoot sightings are as follows:\n\n1. 2004: 241 sightings\n2. 2005: 209 sightings\n3. 2006: 176 sightings\n4. 2003: 152 sightings\n5. 2000: 146 sightings\n6. 2007: 145 sightings\n7. 2008: 141 sightings\n8. 2012: 141 sightings\n9. 2001: 131 sightings\n10. 2011: 121 sightings\n\n---\n\n## Explanation\n\nTo get the years with the most sightings:\n\nWe first transformed the 'date' column into a datetime format. Then, we extracted the year and created a new 'year' column in the dataframe. \n\nSubsequently, we counted the number of sightings in each year. We used two methods to carry out these calculations:\n\n1. Method 1: Use the value_counts() function to tally the instances of each year, sort them in descending order, and pick t

In [5]:
user_question ="""How many sightings did we have for Washington state in the year 2000?
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)


content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


failed
### Mt. Mitchell Sighting in the Dataset

There is one recorded Bigfoot sighting in the dataset which mentions Mt. Mitchell. Here are the details of that sighting:

- **Observed:** "In early spring 1988, some friends of mine and I were hiking around Mt. Mitchell. We noticed deep impressions in the snow..."
- **County:** Yancey County
- **State:** North Carolina
- **Season:** Spring
- **Date:** March 15, 1988
- **Classification:** Class B

### Explanation:
To find out if there are any Bigfoot sightings observed at Mt. Mitchell, I checked for the occurrence of "Mt. Mitchell" in the "observed" column of the dataset. The steps taken are as follows:

1. Loaded the data and inspected the columns to understand its structure.
2. Filtered the dataset to locate rows where the "observed" column contains the text "Mt. Mitchell".
3. Verified the count and details of the sightings mentioning Mt. Mitchell.

Here is the Python code used:

```python
import pandas as pd

# Load the dataframe from

In [None]:
user_question ="""Using the "observed" column, are there any sightings observed at Mt. Mitchell?
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )
if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)

# Print the assistant response
content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


# Delete Assistant

In [7]:
response = client.beta.assistants.delete(assistant.id)