In [1]:
from openai import AzureOpenAI
import json
import os
import time

from dotenv import load_dotenv
load_dotenv()

azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT")
azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
api_key=os.getenv("AZURE_OPENAI_KEY")

# Create an AzureOpenAI client
client = AzureOpenAI(
  azure_endpoint = azure_endpoint,
  api_key= api_key,
  api_version="2024-05-01-preview"
)
# Create a file
file = client.files.create(
  file=open("./data/bigfootsightings.csv", "rb"),
  purpose='assistants'
)

# Create an assistant
assistant = client.beta.assistants.create(
  model=azure_deployment, # replace with model deployment name.
  instructions="""You are an assistant answering questions about bigfootsightings dataset.""",
  tools=[{"type":"code_interpreter"}],
  tool_resources={"code_interpreter":{"file_ids":[file.id]}},
  temperature=1,
  top_p=1
)

# Create a thread
thread = client.beta.threads.create()

In [2]:
PROMPT_SUFFIX = """
- **ALWAYS** before giving the Final Answer, try another method.
Then reflect on the answers of the two methods you did and ask yourself
if it answers correctly the original question.
If you are not sure, try another method.
- If the methods tried do not give the same result,reflect and
try again until you have two methods that have the same result.
- If you still cannot arrive to a consistent result, say that
you are not sure of the answer.
- If you are sure of the correct answer, create a beautiful
and thorough response using Markdown.
- **DO NOT MAKE UP AN ANSWER OR USE PRIOR KNOWLEDGE,
ONLY USE THE RESULTS OF THE CALCULATIONS YOU HAVE DONE**.
- **ALWAYS**, as part of your "Final Answer", explain how you got
to the answer on a section that starts with: "\n\nExplanation:\n".
In the explanation, mention the column names that you used to get
to the final answer and provide the python code you used.

"""

In [3]:
user_question ="""If 1 row in the dataset is 1 sighting, what's our total sightings
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )
if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)

# Print the assistant response
content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


SyncCursorPage[Message](data=[Message(id='msg_L6dl8dOYdx6oNVBeuQrMaWgC', assistant_id='asst_fEif7adqlsIR1KVg6yeYBHAN', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value="While the total number of rows in the dataset is 5021, there are 4983 non-null values in the 'observed' column. This discrepancy could be due to some rows not having an observation recorded, implying missing or incomplete data for some sightings.\n\n# Final Answer\n\nThere is a total of 5021 rows in the dataset representing 5021 Bigfoot sightings. This number, however, includes possibly incomplete data as there are only 4983 non-null records in the 'observed' column. This could mean that there are 5021 reported sightings, but only 4983 of these reports included recorded observations.\n\n## Explanation:\n\nThe total number of Bigfoot sightings was determined by counting the number of rows in the dataset and verifying it by counting the number of non-null observations recorded i

In [4]:
user_question ="""Which state has the most sightings? Provide the number of sightings in that state
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)


content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


failed
While the total number of rows in the dataset is 5021, there are 4983 non-null values in the 'observed' column. This discrepancy could be due to some rows not having an observation recorded, implying missing or incomplete data for some sightings.

# Final Answer

There is a total of 5021 rows in the dataset representing 5021 Bigfoot sightings. This number, however, includes possibly incomplete data as there are only 4983 non-null records in the 'observed' column. This could mean that there are 5021 reported sightings, but only 4983 of these reports included recorded observations.

## Explanation:

The total number of Bigfoot sightings was determined by counting the number of rows in the dataset and verifying it by counting the number of non-null observations recorded in the 'observed' column.

The Python code used:

```python
total_sightings = len(data)
total_sightings_observed = data['observed'].count()
```


In [5]:
user_question ="""Which season has the most sightings?
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)


content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


SyncCursorPage[Message](data=[Message(id='msg_qdIgpQzEIbRzH8UnmVZ7hSNw', assistant_id='asst_fEif7adqlsIR1KVg6yeYBHAN', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value="The second method confirmed the first result. The season with the most sightings is the Summer, with 1867 sightings.\n\n# Final Answer\n\nThe season with most Bigfoot sightings is Summer, with the sighting count being 1867.\n\n## Explanation:\n\nThe 'season' column of the dataset was used to determine the season with the most Bigfoot sightings. Two methods were used:\n\nIn the first method, we used the 'value_counts' function on the 'season' column which gives us a count of unique values. We then used 'idxmax' to find the season with maximum count and 'max' to get the count itself.\n\nIn the second method, we grouped the dataset by 'season' using 'groupby' function and then counted the number of sightings in each season. We then used 'idxmax' and 'max' functions to find the se

In [45]:
user_question ="""What are the top 10 years with the most sightings? Use the date column and get the year there.
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)


content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


SyncCursorPage[Message](data=[Message(id='msg_7sNlYwUdNl4ywzfMis46K5L7', assistant_id='asst_2l1ustJwnB2SyY8PwkNB9W9L', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value="## Final Answer\n\nThe top 10 years with the most bigfoot sightings are as follows:\n\n1. 2004: 241 sightings\n2. 2005: 209 sightings\n3. 2006: 176 sightings\n4. 2003: 152 sightings\n5. 2000: 146 sightings\n6. 2007: 145 sightings\n7. 2008: 141 sightings\n8. 2012: 141 sightings\n9. 2001: 131 sightings\n10. 2011: 121 sightings\n\n---\n\n## Explanation\n\nTo get the years with the most sightings:\n\nWe first transformed the 'date' column into a datetime format. Then, we extracted the year and created a new 'year' column in the dataframe. \n\nSubsequently, we counted the number of sightings in each year. We used two methods to carry out these calculations:\n\n1. Method 1: Use the value_counts() function to tally the instances of each year, sort them in descending order, and pick t

In [19]:
user_question ="""How many sightings did we have for Washington state in the year 2000?
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )

if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)


content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


incomplete
The state with the most Bigfoot sightings is Washington, with a total of 601 sightings.

Explanation:
We obtained this answer by counting the number of sightings in each state using the 'state' column of the dataset. This was done with the `value_counts()` function in Python, which returns a series with the counts of unique values. The state with the most sightings can then be found by identifying the maximum value in this series, done with the `idxmax()` function. As a validation, we also sorted the counts in descending order and picked the first one, and obtained the same result confirming our calculations. The Python code used is as follows:

```python
# Get the count of sightings by state
sightings_by_state = data['state'].value_counts()

# Method 1:
most_sightings_state = sightings_by_state.idxmax()
number_of_sightings = sightings_by_state.max()

# Method 2:
sightings_by_state_sorted = sightings_by_state.sort_values(ascending=False)
most_sightings_state_2 = sightings_by

In [21]:
user_question ="""Using the "observed" column, are there any sightings observed at Mt. Mitchell?
"""

# Add a user question to the thread
message = client.beta.threads.messages.create(
  thread_id=thread.id,
  role="user",
  content=PROMPT_SUFFIX + user_question
)

# Run the thread
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id
)

# Looping until the run completes or fails
while run.status in ['queued', 'in_progress', 'cancelling']:
  time.sleep(1)
  run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id
  )
if run.status == 'completed':
  messages = client.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
elif run.status == 'requires_action':
  # the assistant requires calling some functions
  # and submit the tool outputs back to the run
  pass
else:
  print(run.status)

# Print the assistant response
content_block = messages.data[0].content[0]
value = content_block.text.value
print(value)


SyncCursorPage[Message](data=[Message(id='msg_bX77TwolaLCUdOr82QU2qZZh', assistant_id='asst_BI59EOKcXzKgCfgCY6Xg07T0', attachments=[], completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='Final Answer:\n\nYes, there is one sighting noted in the "observed" column of the dataset that refers to Mt. Mitchell.\n\nExplanation:\n\nUsing the pandas str.contains method, I first performed a case-insensitive search for the term "Mt. Mitchell" in the "observed" column of the dataset.\n\n```python\nsightings_mt_mitchell = df[df[\'observed\'].str.contains(\'Mt. Mitchell\', na=False)]\nsightings_count = sightings_mt_mitchell.shape[0]\n```\n\nThe result of this code was 1, meaning one record in the observed column contained the term "Mt. Mitchell".\n\nFor additional confidence, I implemented a more sophisticated search using Regular Expressions (regex) that also considered possible variations of the term "Mt. Mitchell", such as "Mount Mitchell" or "Mt Mitchell". The code was:\