In [1]:
import pandas as pd

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

### Data description
Reference: https://www.kaggle.com/datasets/prosperchuks/health-dataset/data  
#### Columns:
- **Age**: 13-level age category (`_AGEG5YR`, see codebook): 1 = 18-24, 9 = 60-64, 13 = 80 or older
- **Sex**: patient's gender (1: male; 0: female)
- **HighChol**: 0 = no high cholesterol 1 = high cholesterol
- **CholCheck**: 0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years
- **BMI**: Body Mass Index
- **Smoker**: Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes
- **HeartDiseaseorAttack**: coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes
- **PhysActivity**: physical activity in past 30 days - not including job 0 = no 1 = yes
- **Fruits**: Consume Fruit 1 or more times per day 0 = no 1 = yes
- **Veggies**: Consume Vegetables 1 or more times per day 0 = no 1 = yes
- **HvyAlcoholConsump**: (adult men >=14 drinks per week and adult women>=7 drinks per week) 0 = no 1 = yes
- **GenHlth**: Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor
- **MentHlth**: days of poor mental health scale 1-30 days
- **PhysHlth**: physical illness or injury days in past 30 days scale 1-30
- **DiffWalk**: Do you have serious difficulty walking or climbing stairs? 0 = no 1 = yes
- **Stroke**: Have you ever had a stroke? 0 = no, 1 = yes
- **HighBP**: 0 = no high BP, 1 = high BP

### Research questions and hypothesis
- Hypothesis: older people are at higher risk of diabetes
- What kind of lifestyle contributes to or prevents diabetes?

In [None]:
# Load the raw diabetes CSV into a DataFrame and display its (rows, columns) shape.
df = pd.read_csv('data/raw_data/diabetes_data.csv')
df.shape

## **TODO**
- wrap assistant in langchain tool
- https://python.langchain.com/v0.2/api_reference/langchain/agents/langchain.agents.openai_assistant.base.OpenAIAssistantRunnable.html#langchain.agents.openai_assistant.base.OpenAIAssistantRunnable

In [2]:

from openai import OpenAI
client = OpenAI()
prompt = "You are a medical scientist great at interpreting clinical data through data visualization and story telling. You analyze data present in .csv files, understand patterns, and come up with data visualizations relevant to those patterns. You also share a brief story of the patterns observed from the data with references provided."

# Upload the CSV for the code interpreter. The context manager closes the
# local file handle as soon as the upload request has finished.
with open("data/raw_data/diabetes_data.csv", "rb") as csv_handle:
    file = client.files.create(
        file=csv_handle,
        purpose='assistants'
    )

# The prompt is the assistant's system prompt, so it is passed as
# `instructions` (as the LangChain cells below do) rather than as the
# descriptive `description` metadata field.
assistant = client.beta.assistants.create(
    name="Data visualizer",
    instructions=prompt,
    model="gpt-3.5-turbo",
    tools=[{"type": "code_interpreter"}],
    tool_resources={
        "code_interpreter": {
            "file_ids": [file.id]
        }
    }
)

In [3]:
content = "Is age an major factor of diabetes? Create a data visualizations based on the data in this file."

In [4]:
# Open a new conversation thread seeded with the user's question.
thread = client.beta.threads.create(
    messages=[
        {"role": "user", "content": content},
    ],
)

# Create a run of the assistant on that thread via `create_and_poll`.
run = client.beta.threads.runs.create_and_poll(
    assistant_id=assistant.id,
    thread_id=thread.id,
    instructions="Please address the user as Jane Doe. The user has a premium account.",
)

In [5]:
import time
# Statuses that mean the run is still progressing and is worth polling again.
pending_statuses = {"queued", "in_progress", "cancelling"}

# `run` is a local snapshot: it must be re-fetched on every iteration,
# otherwise the loop can never observe a status change and spins forever.
while run.status in pending_statuses:
    print(run.status)
    time.sleep(3)
    run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

print(f'run {run.status}')

# Any other final status (failed, expired, cancelled, ...) has no usable
# answer, so stop here instead of reading a partial thread.
if run.status != 'completed':
    raise RuntimeError(f"run ended with status {run.status!r} instead of 'completed'")

messages = client.beta.threads.messages.list(thread_id=thread.id)
print(messages)

run completed
SyncCursorPage[Message](data=[Message(id='msg_M2EPjm9NqKcgmwsVAo921kxg', assistant_id='asst_DaVEenaRDF50e0ThmFXe7PKC', attachments=[], completed_at=None, content=[ImageFileContentBlock(image_file=ImageFile(file_id='file-jp0qawrgaAtDljMx4CVqd6B1', detail=None), type='image_file'), TextContentBlock(text=Text(annotations=[], value='The bar plot above shows the distribution of diabetes across different age groups based on the data in the file. Each age group is represented on the x-axis, while the count of individuals with and without diabetes is shown on the y-axis.\n\nFrom the visualization, we can observe how diabetes is distributed across different age groups. If you have any specific questions or would like to explore the data further, feel free to let me know!'), type='text')], created_at=1729401822, incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='assistant', run_id='run_COYgwkeRWtP6NuOaaTau4mRI', status=None, thread_id='thread_R

In [6]:
len(messages.data)

4

In [9]:
messages.data[0].content

[ImageFileContentBlock(image_file=ImageFile(file_id='file-jp0qawrgaAtDljMx4CVqd6B1', detail=None), type='image_file'),
 TextContentBlock(text=Text(annotations=[], value='The bar plot above shows the distribution of diabetes across different age groups based on the data in the file. Each age group is represented on the x-axis, while the count of individuals with and without diabetes is shown on the y-axis.\n\nFrom the visualization, we can observe how diabetes is distributed across different age groups. If you have any specific questions or would like to explore the data further, feel free to let me know!'), type='text')]

In [11]:
messages.data[1].content

[TextContentBlock(text=Text(annotations=[], value="The data contains various columns including 'Age' and 'Diabetes' which will be of interest to us for this analysis. \n\nLet's create data visualizations to explore the relationship between age and diabetes. We can start by plotting a graph to see how diabetes is distributed across different age groups. Let's create this visualization now."), type='text')]

In [12]:
messages.data[2].content

[TextContentBlock(text=Text(annotations=[], value="Sure, let's start by loading the data from the uploaded file to understand its structure and contents. Once we have a clear understanding of the data, we can create data visualizations to explore the relationship between age and diabetes. Let's load the data first."), type='text')]

In [13]:
messages.data[3].content

[TextContentBlock(text=Text(annotations=[], value='Is age an major factor of diabetes? Create a data visualizations based on the data in this file.'), type='text')]

In [14]:
messages.data[1].content[0].text.value

"The data contains various columns including 'Age' and 'Diabetes' which will be of interest to us for this analysis. \n\nLet's create data visualizations to explore the relationship between age and diabetes. We can start by plotting a graph to see how diabetes is distributed across different age groups. Let's create this visualization now."

In [16]:
# Retrieve the generated plot from the newest message (index 0 of
# `messages.data`). The image block is selected by its `type` rather than by
# position, so an extra leading text block does not break the lookup.
image_block = next(
    block for block in messages.data[0].content if block.type == "image_file"
)
image_data = client.files.content(image_block.image_file.file_id)
image_data_bytes = image_data.read()

# A dedicated handle name keeps the uploaded-file object bound to `file`
# from being overwritten by a local file handle.
with open("data/plot.png", "wb") as plot_file:
    plot_file.write(image_data_bytes)

### Langchain assistant runnable

In [2]:
file_path = "data/raw_data/diabetes_data.csv"
DATA_ANALYSIS_PROMPT = "You are a medical scientist great at interpreting clinical data through data visualization and story telling. You analyze data present in .csv files, understand patterns, and come up with data visualizations relevant to those patterns. You also share a brief story of the patterns observed from the data with references provided."
content = "Is age an major factor of diabetes? Create a data visualizations based on the data in this file."

In [3]:
from langchain_experimental.openai_assistant import OpenAIAssistantRunnable
from openai import OpenAI


In [None]:
# Build the LangChain assistant runnable with this section's own prompt
# constant, so the cell does not depend on `prompt` from the raw-OpenAI
# section above having been executed.
# NOTE(review): `truncation_strategy` is forwarded as an extra keyword
# argument -- confirm that OpenAIAssistantRunnable.create_assistant accepts it.
interpreter_assistant = OpenAIAssistantRunnable.create_assistant(
                            name="visualization_assistant",
                            instructions=DATA_ANALYSIS_PROMPT,
                            tools=[{"type": "code_interpreter"}],
                            model="gpt-3.5-turbo",
                            truncation_strategy={
                                        "type": "last_messages",
                                        "last_messages": 1
                                    }
                            )

In [4]:
def create_thread(file_path, content):
    """Upload a file and open a thread seeded with one user message.

    Args:
        file_path: Path of the local file to upload with purpose 'assistants'.
        content: Text of the initial user message.

    Returns:
        The thread object returned by ``client.beta.threads.create``. Its
        single user message carries the uploaded file as an attachment for
        the code_interpreter tool.
    """
    client = OpenAI()

    # Close the local handle as soon as the upload has completed.
    with open(file_path, "rb") as upload_handle:
        uploaded = client.files.create(
            file=upload_handle,
            purpose='assistants'
        )

    thread = client.beta.threads.create(
        messages=[{
            "role": "user",
            "content": content,
            "attachments": [{
                "file_id": uploaded.id,
                "tools": [{"type": "code_interpreter"}]
            }]
        }],
    )
    return thread
    

# Create the thread explicitly so the run uses a thread that carries the
# uploaded file, instead of whatever `thread` an earlier cell left behind.
thread = create_thread(file_path, content)
output = interpreter_assistant.invoke(input={"content": content, 'thread_id': thread.id})[-1]


In [None]:
def plot_node(content, prompt, name='data_vis'):
    """Run the visualization assistant once and save the plot it produces.

    Creates a code-interpreter assistant, opens a thread for the file at the
    module-level ``file_path``, invokes the assistant, writes the returned
    image (if any) to ``data/plot.png`` and wraps the returned text in a
    message.

    Args:
        content: The user's request text.
        prompt: Instructions given to the assistant.
        name: Name attached to the returned message.

    Returns:
        A dict ``{"messages": [HumanMessage]}`` whose single message holds the
        assistant's text.

    Raises:
        ValueError: If the assistant's last message contains no text block.
    """
    assistant = OpenAIAssistantRunnable.create_assistant(
                            name="visualization_assistant",
                            instructions=prompt,
                            tools=[{"type": "code_interpreter"}],
                            model="gpt-3.5-turbo",
                            truncation_strategy={
                                        "type": "last_messages",
                                        "last_messages": 1
                                    })
    thread = create_thread(file_path, content)
    results = assistant.invoke(input={"content": content, 'thread_id': thread.id})[-1]

    # Select blocks by their `type` rather than by position: the reply is not
    # guaranteed to be exactly [image, text].
    image_blocks = [block for block in results.content if block.type == "image_file"]
    text_blocks = [block for block in results.content if block.type == "text"]
    if not text_blocks:
        raise ValueError("assistant response contains no text block")

    if image_blocks:
        # Use a local client (as create_thread does) so the function does not
        # depend on a global `client` defined in another cell.
        openai_client = OpenAI()
        image_bytes = openai_client.files.content(image_blocks[0].image_file.file_id).read()
        with open("data/plot.png", "wb") as plot_file:
            plot_file.write(image_bytes)

    # NOTE(review): HumanMessage is not imported anywhere in the visible
    # notebook; presumably `from langchain_core.messages import HumanMessage`
    # belongs in the imports cell -- confirm.
    return {"messages": [HumanMessage(content=text_blocks[0].text.value, name=name)]}


In [5]:
output.content

[ImageFileContentBlock(image_file=ImageFile(file_id='file-y9nyQvJAJkm1YLfDmHqqIPAQ', detail=None), type='image_file'),
 TextContentBlock(text=Text(annotations=[], value='The histograms above show the distribution of ages for individuals with and without diabetes. The red histogram represents individuals with diabetes (Diabetes: Yes), while the blue histogram represents individuals without diabetes (Diabetes: No).\n\nFrom the histograms, we can observe the following:\n- The distribution of ages for individuals without diabetes (blue) is relatively uniform across age groups.\n- The distribution of ages for individuals with diabetes (red) shows a slightly right-skewed pattern, with a slightly higher proportion of older individuals having diabetes.\n\nThis visualization provides a clearer comparison of the age distribution between individuals with and without diabetes. It suggests that there may be a slightly higher likelihood of having diabetes among older individuals, but further analysi

In [6]:
text_results = output.content[1].text.value
text_results

'The histograms above show the distribution of ages for individuals with and without diabetes. The red histogram represents individuals with diabetes (Diabetes: Yes), while the blue histogram represents individuals without diabetes (Diabetes: No).\n\nFrom the histograms, we can observe the following:\n- The distribution of ages for individuals without diabetes (blue) is relatively uniform across age groups.\n- The distribution of ages for individuals with diabetes (red) shows a slightly right-skewed pattern, with a slightly higher proportion of older individuals having diabetes.\n\nThis visualization provides a clearer comparison of the age distribution between individuals with and without diabetes. It suggests that there may be a slightly higher likelihood of having diabetes among older individuals, but further analysis would be needed to quantify and understand this relationship more comprehensively.\n\nIf you would like any additional analyses or visualizations, feel free to let me 

In [7]:
f_id = output.content[0].image_file.file_id
f_id

'file-y9nyQvJAJkm1YLfDmHqqIPAQ'

In [8]:
# Download the plot referenced by `f_id` and save it locally.
image_data = client.files.content(f_id)
image_data_bytes = image_data.read()

# A dedicated handle name avoids rebinding the global `file`, which other
# cells use for the uploaded-file object.
with open("data/plot.png", "wb") as plot_file:
    plot_file.write(image_data_bytes)

In [None]:
from utils.data_analysis import data_visualization_node
file_path = "data/raw_data/diabetes_data.csv"
content = "Is age an major factor of diabetes? Create a data visualizations based on the data in this file."

In [None]:
results = data_visualization_node(content, file_path)
results