In [1]:
import pandas as pd
from langchain_experimental.openai_assistant import OpenAIAssistantRunnable

In [2]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; the bare call is
# left as the cell's last expression so its boolean result is displayed.
# NOTE(review): presumably this supplies the API key used by OpenAI() later — confirm.
load_dotenv()

True

### Data description
Reference: https://www.kaggle.com/datasets/prosperchuks/health-dataset/data  
#### Columns:
- **Age**: 13-level age category (_AGEG5YR, see codebook): 1 = 18-24, 9 = 60-64, 13 = 80 or older
- **Sex**: patient's gender (1: male; 0: female)
- **HighChol**: 0 = no high cholesterol 1 = high cholesterol
- **CholCheck**: 0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years
- **BMI**: Body Mass Index
- **Smoker**: Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes
- **HeartDiseaseorAttack**: coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes
- **PhysActivity**: physical activity in past 30 days - not including job 0 = no 1 = yes
- **Fruits**: Consume Fruit 1 or more times per day 0 = no 1 = yes
- **Veggies**: Consume Vegetables 1 or more times per day 0 = no 1 = yes
- **HvyAlcoholConsump**: (adult men >=14 drinks per week and adult women>=7 drinks per week) 0 = no 1 = yes
- **GenHlth**: Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor
- **MentHlth**: days of poor mental health, scale 1-30 days
- **PhysHlth**: days of physical illness or injury in the past 30 days, scale 1-30
- **DiffWalk**: Do you have serious difficulty walking or climbing stairs? 0 = no 1 = yes
- **Stroke**: Have you ever had a stroke? 0 = no, 1 = yes
- **HighBP**: 0 = no high BP, 1 = high BP  

### Research questions and hypothesis
- Hypothesis: older people are at higher risk of diabetes
- What kind of lifestyle contributes to or prevents diabetes?

In [13]:
# Read the raw diabetes CSV into a DataFrame, then show (rows, columns) as the
# cell output.
raw_csv_path = 'data/raw_data/diabetes_data.csv'
df = pd.read_csv(raw_csv_path)
df.shape

(70692, 18)

In [5]:
# Ref: https://python.langchain.com/v0.2/api_reference/langchain/agents/langchain.agents.openai_assistant.base.OpenAIAssistantRunnable.html#langchain.agents.openai_assistant.base.OpenAIAssistantRunnable

from openai import OpenAI

# The client is built with no explicit arguments.
# NOTE(review): presumably the API key comes from the environment — confirm.
client = OpenAI()
prompt = "You are a medical scientist great at interpreting clinical data through data visualization and story telling. You analyze data present in .csv files, understand patterns, and come up with data visualizations relevant to those patterns. You also share a brief story of the patterns observed from the data with references provided."

# Upload the dataset so the code interpreter tool can read it. The context
# manager closes the local file handle once the upload call returns.
with open("data/raw_data/diabetes_data.csv", "rb") as csv_file:
    file = client.files.create(
      file=csv_file,
      purpose='assistants'
    )

# Create the assistant with the code interpreter enabled and the uploaded CSV
# attached. The persona prompt is passed as `instructions` (the system
# instructions the model follows); `description` is only descriptive metadata
# and would not steer the model.
assistant = client.beta.assistants.create(
  name="Data visualizer",
  instructions=prompt,
  model="gpt-3.5-turbo",
  tools=[{"type": "code_interpreter"}],
  tool_resources={
    "code_interpreter": {
      "file_ids": [file.id]
    }
  }
)



In [23]:
# Text of the user question; it is used as the user message when the thread
# is created.
content = (
    "Is age an major factor of diabetes? "
    "Create a data visualizations based on the data in this file."
)

In [26]:
# Upload the dataset with purpose 'assistants' and display the resulting file id.
# The context manager closes the local file handle once the upload call returns.
# NOTE(review): the same CSV is also uploaded in the assistant-setup cell; this
# rebinds `file` to a new upload that is not attached to the assistant — confirm
# whether this second upload is needed.
with open("data/raw_data/diabetes_data.csv", "rb") as csv_file:
    file = client.files.create(
      file=csv_file,
      purpose='assistants'
    )
file.id

'file-407LwMXODdQ3A0DSV3IkQrcY'

In [46]:
# Start a new thread whose only initial message is the user's question.
initial_messages = [{"role": "user", "content": content}]
thread = client.beta.threads.create(messages=initial_messages)

In [47]:
# Create a run of the assistant on the thread and wait for the call to return.
# The per-run text is passed as `additional_instructions`, which is appended to
# the assistant's own instructions; a run-level `instructions` value would
# replace them and discard the assistant's persona prompt for this run.
run = client.beta.threads.runs.create_and_poll(
  thread_id=thread.id,
  assistant_id=assistant.id,
  additional_instructions="Please address the user as Jane Doe. The user has a premium account."
)

In [None]:
import time

# Statuses in which the run is still being processed and worth polling again.
# Every other status (completed, failed, cancelled, expired, incomplete,
# requires_action) will not change by waiting, so the loop must stop there.
PENDING_RUN_STATUSES = {'queued', 'in_progress', 'cancelling'}

while run.status in PENDING_RUN_STATUSES:
    print(run.status)
    time.sleep(3)
    # Re-fetch the run; without this the local `run.status` never changes and
    # the loop cannot terminate.
    run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

print(f'run {run.status}')
messages = client.beta.threads.messages.list(thread_id=thread.id)
print(messages)

In [48]:
# if run.status == 'completed': 
#   messages = client.beta.threads.messages.list(thread_id=thread.id)
#   print(messages)
# else:
#   print(run.status)


SyncCursorPage[Message](data=[Message(id='msg_8CVNAHWCn218LvJDBHg4vBLG', assistant_id='asst_EXBcbKGQqqYreARSiFQriwle', attachments=[], completed_at=None, content=[ImageFileContentBlock(image_file=ImageFile(file_id='file-eDBy7pWtG6ce1JZ0iQk5LSoG', detail=None), type='image_file'), TextContentBlock(text=Text(annotations=[], value='The bar chart above shows the distribution of diabetes cases across different age groups. From the visualization, we can observe how the proportion of individuals with diabetes varies with age. This can help us determine if age plays a significant role as a factor in the presence of diabetes.\n\nIf you would like to explore more visualizations or have any specific analysis in mind, feel free to let me know!'), type='text')], created_at=1729020579, incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='assistant', run_id='run_RJTA6QGKXZ5wOjTejkv1Y58p', status=None, thread_id='thread_Vre3bOl0mLXxMvRPNsouJFjQ'), Message(id='msg_3c

In [52]:
# Take the content blocks of the first listed message and display the first
# block (an image block in the recorded output of this cell).
first_message = messages.data[0]
response = first_message.content
response[0]

ImageFileContentBlock(image_file=ImageFile(file_id='file-eDBy7pWtG6ce1JZ0iQk5LSoG', detail=None), type='image_file')

In [63]:
# Show the text of the first content block of the second listed message.
second_message_blocks = messages.data[1].content
second_message_blocks[0].text.value

"The dataset contains several columns including 'Age' and 'Diabetes'. The 'Age' column represents the age of individuals, and the 'Diabetes' column indicates whether the individual has diabetes (1 for yes, 0 for no).\n\nLet's now create data visualizations to analyze the relationship between age and diabetes. We can start by creating a bar chart showing the distribution of diabetes cases across different age groups. This will help us understand if there is a significant correlation between age and diabetes. Let's go ahead and create the visualization."

In [58]:
# Download the image referenced by the first content block and save it to disk.
image_data = client.files.content(response[0].image_file.file_id)
image_data_bytes = image_data.read()

# Use a dedicated handle name: binding the handle to `file` would overwrite the
# uploaded-file object that earlier cells read `file.id` from.
with open("data/my-image.png", "wb") as image_file:
    image_file.write(image_data_bytes)