In [11]:
import pandas as pd

In [None]:
from dotenv import load_dotenv
load_dotenv()

### Data description
Reference: https://www.kaggle.com/datasets/prosperchuks/health-dataset/data  
#### Columns:
- **Age**: 13-level age category (_AGEG5YR, see codebook): 1 = 18-24, 9 = 60-64, 13 = 80 or older
- **Sex**: patient's gender (1: male; 0: female)
- **HighChol**: 0 = no high cholesterol 1 = high cholesterol
- **CholCheck**: 0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years
- **BMI**: Body Mass Index
- **Smoker**: Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes
- **HeartDiseaseorAttack**: coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes
- **PhysActivity**: physical activity in past 30 days - not including job 0 = no 1 = yes
- **Fruits**: Consume Fruit 1 or more times per day 0 = no 1 = yes
- **Veggies**: Consume Vegetables 1 or more times per day 0 = no 1 = yes
- **HvyAlcoholConsump**: (adult men >=14 drinks per week and adult women>=7 drinks per week) 0 = no 1 = yes
- **GenHlth**: Would you say that in general your health is: scale 1-5 1 = excellent 2 = very good 3 = good 4 = fair 5 = poor
- **MentHlth**: number of days of poor mental health, on a scale of 1-30 days
- **PhysHlth**: physical illness or injury days in past 30 days scale 1-30
- **DiffWalk**: Do you have serious difficulty walking or climbing stairs? 0 = no 1 = yes
- **Stroke**: Have you ever had a stroke? 0 = no, 1 = yes
- **HighBP**: 0 = no high BP, 1 = high BP  

### Research questions and hypothesis
- Hypothesis: older people are at higher risk of diabetes
- What kind of lifestyle contributes to or prevents diabetes?

In [None]:
df = pd.read_csv('data/raw_data/diabetes_data.csv')
df.shape

In [None]:
df.head()

## **TODO**
- wrap assistant in langchain tool
- https://python.langchain.com/v0.2/api_reference/langchain/agents/langchain.agents.openai_assistant.base.OpenAIAssistantRunnable.html#langchain.agents.openai_assistant.base.OpenAIAssistantRunnable

In [2]:

from openai import OpenAI

# OpenAI() is constructed without arguments, so credentials must already be
# available to the SDK (load_dotenv() is called earlier in this notebook).
client = OpenAI()
prompt = "You are a medical scientist great at interpreting clinical data through data visualization and story telling. You analyze data present in .csv files, understand patterns, and come up with data visualizations relevant to those patterns. You also share a brief story of the patterns observed from the data with references provided."

# Upload the dataset for use by assistants. The context manager closes the
# local file handle as soon as the upload call returns instead of leaking it
# for the lifetime of the kernel.
with open("data/raw_data/diabetes_data.csv", "rb") as csv_handle:
    file = client.files.create(
        file=csv_handle,
        purpose='assistants'
    )

# Create an assistant with the code interpreter tool and attach the uploaded
# file to that tool via tool_resources.
assistant = client.beta.assistants.create(
  name="Data visualizer",
  description=prompt,
  model="gpt-3.5-turbo",
  tools=[{"type": "code_interpreter"}],
  tool_resources={
    "code_interpreter": {
      "file_ids": [file.id]
    }
  }
)

In [3]:
content = "Is age an major factor of diabetes? Create a data visualizations based on the data in this file."

In [4]:
# Open a conversation thread whose only message is the user's question.
thread = client.beta.threads.create(
    messages=[dict(role="user", content=content)],
)

# Start a run of the assistant on that thread via create_and_poll, passing
# per-run instructions alongside the assistant and thread ids.
run = client.beta.threads.runs.create_and_poll(
    assistant_id=assistant.id,
    thread_id=thread.id,
    instructions="Please address the user as Jane Doe. The user has a premium account.",
)

In [None]:
import time

# Statuses in which the run is still progressing on the server. Anything else
# (completed, failed, cancelled, expired, incomplete, requires_action) will
# not change by waiting, so the loop must stop there.
PENDING_RUN_STATUSES = {"queued", "in_progress", "cancelling"}

while run.status in PENDING_RUN_STATUSES:
    print(run.status)
    time.sleep(3)
    # Re-fetch the run: the local `run` object is a snapshot and its status
    # never changes unless it is retrieved again.
    run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

print(f'run {run.status}')
if run.status != 'completed':
    # Surface why the run stopped instead of spinning forever.
    print(f'run did not complete: {getattr(run, "last_error", None)}')

messages = client.beta.threads.messages.list(thread_id=thread.id)
print(messages)

In [None]:
len(messages.data)

In [None]:
messages.data[0].content

In [None]:
messages.data[1].content

In [None]:
messages.data[2].content

In [None]:
messages.data[3].content

In [None]:
messages.data[1].content[0].text.value

In [16]:
# Retrieve the generated image and save it locally.
# Assumes the first content part of messages.data[0] is an image_file part;
# a message whose first part is text would fail on `.image_file` -- TODO confirm
# ordering before relying on index 0.
image_data = client.files.content(messages.data[0].content[0].image_file.file_id)
image_data_bytes = image_data.read()

# Note: `as file` rebinds the global `file`, which an earlier cell assigned
# to the result of client.files.create.
with open("data/plot.png", "wb") as file:
    file.write(image_data_bytes)

### Langchain assistant runnable

In [2]:
file_path = "data/raw_data/diabetes_data.csv"
DATA_ANALYSIS_PROMPT = "You are a medical scientist great at interpreting clinical data through data visualization and story telling. You analyze data present in .csv files, understand patterns, and come up with data visualizations relevant to those patterns. You also share a brief story of the patterns observed from the data with references provided."
content = "What caused diabetes?"

In [3]:
from langchain_experimental.openai_assistant import OpenAIAssistantRunnable
from openai import OpenAI


In [None]:
# Build the LangChain assistant runnable with the code interpreter tool.
# Uses DATA_ANALYSIS_PROMPT, defined in this section, rather than the `prompt`
# variable from the earlier raw-SDK cell: the two strings are identical, but
# this removes the dependency on that cell having been executed.
interpreter_assistant = OpenAIAssistantRunnable.create_assistant(
                            name="visualization_assistant",
                            instructions=DATA_ANALYSIS_PROMPT,
                            tools=[{"type": "code_interpreter"}],
                            model="gpt-3.5-turbo",
                            truncation_strategy={
                                        "type": "last_messages",
                                        "last_messages": 1
                                    }
                            )

In [4]:
def create_thread(file_path, content):
    """Upload a file and open a thread whose first user message attaches it.

    Args:
        file_path: Path of the local file to upload with purpose 'assistants'.
        content: Text of the initial user message.

    Returns:
        The thread object returned by ``client.beta.threads.create``.
    """
    client = OpenAI()

    # The context manager closes the local handle once the upload returns,
    # so repeated calls do not accumulate open file descriptors.
    with open(file_path, "rb") as upload_handle:
        file = client.files.create(
                    file=upload_handle,
                    purpose='assistants'
                    )

    # Attach the uploaded file to the message for the code interpreter tool.
    thread = client.beta.threads.create(
                    messages=[{"role": "user",
                                "content": content,
                                "attachments": [{
                                    "file_id": file.id,
                                    "tools": [{"type": "code_interpreter"}]
                                    }]
                            }],
                    )
    return thread
    

# Create the thread with the helper defined above, so the dataset is attached
# and the cell does not rely on a `thread` left over from the raw-SDK section.
thread = create_thread(file_path, content)
# Keep the last element of what invoke returns.
output = interpreter_assistant.invoke(input={"content": content, 'thread_id': thread.id})[-1]


In [None]:
def plot_node(content, prompt, name='data_vis'):
    """Run the visualization assistant and save the plot it produces.

    Creates an assistant with the code interpreter tool, opens a thread via
    ``create_thread`` using the module-level ``file_path``, invokes the
    assistant, writes the returned image to ``data/plot.png`` and wraps the
    accompanying text in a ``HumanMessage``.

    Args:
        content: User question sent to the assistant.
        prompt: Instructions used when creating the assistant.
        name: Value for the ``name`` field of the returned ``HumanMessage``.

    Returns:
        ``{"messages": [HumanMessage(...)]}`` holding the assistant's text.
    """
    # Local import: in this notebook HumanMessage is only imported in a later
    # cell, so the function would otherwise raise NameError when run in order.
    from langchain_core.messages import HumanMessage

    assistant = OpenAIAssistantRunnable.create_assistant(
                            name="visualization_assistant",
                            instructions=prompt,
                            tools=[{"type": "code_interpreter"}],
                            model="gpt-3.5-turbo",
                            truncation_strategy={
                                        "type": "last_messages",
                                        "last_messages": 1
                                    })
    thread = create_thread(file_path, content)
    results = assistant.invoke(input={"content": content, 'thread_id': thread.id})[-1]

    # Assumes the first content part is the image and the second is the text
    # -- TODO confirm the ordering is stable for every response.
    f_id = results.content[0].image_file.file_id

    # Use a client owned by this function instead of a notebook global that
    # may not exist when the cells of this section run on a fresh kernel.
    client = OpenAI()
    image_data_bytes = client.files.content(f_id).read()
    with open("data/plot.png", "wb") as plot_file:
        plot_file.write(image_data_bytes)

    return {"messages": [HumanMessage(content=results.content[1].text.value, name=name)]}


In [None]:
output.content

In [None]:
text_results = output.content[1].text.value
text_results

In [None]:
f_id = output.content[0].image_file.file_id
f_id

In [8]:
# Download the image identified by f_id (taken from output.content[0] in the
# previous cell) and write its bytes to data/plot.png.
# NOTE(review): `client` is not created in this section; this cell appears to
# rely on the client from the earlier raw-SDK cell -- confirm.
image_data = client.files.content(f_id)
image_data_bytes = image_data.read()

with open("data/plot.png", "wb") as file:
    file.write(image_data_bytes)

In [1]:
from utils.data_analysis import data_visualization_node
file_path = "data/raw_data/diabetes_data.csv"
content = "Is age an major factor of diabetes? Create a data visualizations based on the data in this file."

In [None]:
results = data_visualization_node(content, file_path)
results

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from utils.graphs import compile_analysis_graph

  from tqdm.autonotebook import tqdm, trange

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langgraph.pregel import Channel, Pregel


In [3]:
graph = compile_analysis_graph()

  | llm.bind_functions(functions=[function_def], function_call="route")
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/assistants "HTTP/1.1 200 OK"


Processing data/literature/s41591-023-02278-8.pdf...
Processing data/literature/PIIS1550413121006318.pdf...
Processing data/text_books/Textbook-of-Diabetes-2024-shortened.pdf...


In [4]:
question = "What caused diabetes?"
file_path = "data/raw_data/diabetes_data.csv"

In [None]:
graph.invoke(question, file_path=file_path)

In [5]:
# Stream the graph and print every emitted item, followed by a separator,
# skipping any item that contains the "__end__" key.
for s in graph.stream(question, file_path=file_path):
    if "__end__" not in s:
        print(s)
        print("---")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
INFO:root:File uploaded successfully with ID: file-VUFduroUmDush7RbCFuXN6hN
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads "HTTP/1.1 200 OK"
INFO:root:Thread created successfully with ID: thread_J9BkEWJXED0yyImWIDCV3iQe
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads/thread_J9BkEWJXED0yyImWIDCV3iQe/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads/thread_J9BkEWJXED0yyImWIDCV3iQe/runs "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/threads/thread_J9BkEWJXED0yyImWIDCV3iQe/runs/run_VDtDGyjJhmCN0eOyGdykPz1s "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/threads/thread_J9BkEWJXED0yyImWIDCV3iQe/runs/run_VDtDGyjJhmCN0eOyGdykPz1s "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/threads/thread_J9BkEWJXED0yyImWIDCV3iQe/runs/run_VDtDGyjJhmCN0eOyGdykPz1s "HTTP/1.1 200 OK"


{'Visualisation': {'messages': [HumanMessage(content='### Observations from the Correlation Heatmap\n\nThe heatmap above illustrates the correlations of various variables with diabetes using the Pearson correlation coefficient. Below are some key observations:\n\n1. **Age**: There is a moderate positive correlation (0.28) between age and diabetes. This indicates that as age increases, the likelihood of having diabetes also tends to increase.\n  \n2. **High Cholesterol (HighChol)**: High cholesterol shows a moderate positive correlation (0.29) with diabetes, suggesting that individuals with high cholesterol levels are more likely to have diabetes.\n\n3. **Body Mass Index (BMI)**: BMI also presents a moderate positive correlation (0.29) with diabetes, indicating that higher BMI is associated with a higher prevalence of diabetes.\n\n4. **Physical Activity (PhysActivity)**: Physical activity has a small negative correlation (-0.16) with diabetes, suggesting that individuals who engage in p

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'Supervisor': {'next': 'LocalInformationRetriever'}}
---


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'Research': {'messages': [HumanMessage(content='The observations from the correlation heatmap indicate several factors that contribute to the risk of developing diabetes:\n\n1. **Age**: Moderate positive correlation (0.28) with diabetes, suggesting that older age increases the likelihood of diabetes.\n\n2. **High Cholesterol**: Moderate positive correlation (0.29) with diabetes, indicating that individuals with higher cholesterol levels are more likely to be diabetic.\n\n3. **Body Mass Index (BMI)**: Moderate positive correlation (0.29) with diabetes, showing that higher BMI is associated with a higher prevalence of diabetes.\n\n4. **Physical Activity**: Small negative correlation (-0.16) with diabetes, implying that engaging in physical activity lowers the likelihood of diabetes.\n\n5. **General Health**: Positive correlation (0.41) with lower prevalence of diabetes, indicating that individuals who perceive their health as good are less likely to have diabetes.\n\n6. **High Blood Pre

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


{'Supervisor': {'next': 'FINISH'}}
---


In [6]:
import asyncio
from langchain_core.messages import HumanMessage

In [12]:
def _message_texts(messages):
    """Return the text of each message; objects with ``.content`` use that attribute."""
    if not isinstance(messages, (list, tuple)):
        messages = [messages]
    return [str(getattr(message, "content", message)) for message in messages]


async def run_analysis(file_path, question, compiled_graph, analysis_node='DataVis'):
    """Stream the compiled graph and split its output into analysis and references.

    Args:
        file_path: Forwarded to ``compiled_graph.astream`` as ``file_path``.
        question: Forwarded to ``compiled_graph.astream`` as the first argument.
        compiled_graph: Object exposing ``astream(question, file_path=...)`` as
            an async iterable of mappings ``{node_key: payload}``.
        analysis_node: Label of the payload that carries the data analysis.
            A payload's label is its ``"name"`` entry when present, otherwise
            the node key it was emitted under.

    Returns:
        A tuple ``(analysis, references)``. ``analysis`` is the ``"messages"``
        value of the last payload labelled ``analysis_node`` (``None`` if no
        such payload was seen). ``references`` is the newline-joined text of
        the messages from every other payload that has ``"messages"``.
    """
    analysis = None
    # Accumulated across ALL chunks; resetting this per chunk would discard
    # everything except the final chunk's references.
    references = []

    async for chunk in compiled_graph.astream(question, file_path=file_path):
        for node_key, values in chunk.items():
            # Routing payloads such as {'next': ...} carry no messages.
            if not isinstance(values, dict) or "messages" not in values:
                continue

            # Payloads are not guaranteed to have a 'name' key; fall back to
            # the node key instead of raising KeyError.
            label = values.get("name", node_key)
            if label == analysis_node:
                analysis = values["messages"]
            else:
                # Flatten to plain strings so the final join cannot fail on
                # nested lists or message objects.
                references.extend(_message_texts(values["messages"]))

    return analysis, '\n'.join(references)

In [13]:


question = "What caused diabetes?"
file_path = "data/raw_data/diabetes_data.csv"


# Drive the run_analysis coroutine to completion and unpack its two results.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running, which is normally the case inside a Jupyter kernel; if this cell
# fails that way, `await run_analysis(...)` is the notebook form -- confirm.
analysis, references = asyncio.run(run_analysis(file_path, question, graph))

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
INFO:root:File uploaded successfully with ID: file-NHPCiuWMhd7YgKcylrLEyFhO
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads "HTTP/1.1 200 OK"
INFO:root:Thread created successfully with ID: thread_AGn6jqF5S5LbDY1tVM8TTUPL
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads/thread_AGn6jqF5S5LbDY1tVM8TTUPL/messages "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads/thread_AGn6jqF5S5LbDY1tVM8TTUPL/runs "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/threads/thread_AGn6jqF5S5LbDY1tVM8TTUPL/runs/run_gccQMhnEYtBtzwXhGFHOZDWI "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/threads/thread_AGn6jqF5S5LbDY1tVM8TTUPL/runs/run_gccQMhnEYtBtzwXhGFHOZDWI "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://api.openai.com/v1/threads/thread_AGn6jqF5S5LbDY1tVM8TTUPL/runs/run_gccQMhnEYtBtzwXhGFHOZDWI "HTTP/1.1 200 OK"


[HumanMessage(content='The bar plot illustrates the prevalence of diabetes across different age groups. However, the results show a significant issue: nearly 50% of individuals in the 0-18 age group have been classified as having diabetes, which seems anomalous given the typical disease profile of diabetes.\n\n### Insights from the Data Visualization:\n\n1. **Age Group 0-18**: A remarkably high percentage of diabetes prevalence in this group suggests potential data quality issues (e.g., reporting errors or misclassification of diabetes cases). This calls for further examination of the underlying data for this age group.\n\n2. **Other Age Groups**: The bars for older age groups (19-35, 36-50, 51-65, 66+) are not available, which indicates that a detailed breakdown for older populations may also be needed for a comprehensive understanding of diabetes trends.\n\n3. **Lifestyle and Health Variables**: Understanding how the variables such as BMI, physical activity, smoking, and cholesterol 

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[HumanMessage(content='Here are several documents that discuss various causes and factors associated with diabetes:\n\n1. **Incident type 2 diabetes attributable to suboptimal diet in 184 countries** (2023) by Meghan O’Hearn et al. - This paper explores how poor dietary choices contribute to the incidence of type 2 diabetes globally.\n\n2. **Reduced early insulin secretion in the etiology of type 2 diabetes mellitus in Pima Indians** (2002) by C. Bogardus and P.A. Tataranni - This study investigates the role of insulin secretion levels in the development of type 2 diabetes among the Pima Indian population.\n\n3. **Contribution of abnormal muscle and liver glucose metabolism to postprandial hyperglycemia in NIDDM** (1990) by A. Mitrakou et al. - The research focuses on how metabolic abnormalities in muscle and liver contribute to high blood sugar levels after meals in non-insulin-dependent diabetes mellitus (NIDDM).\n\n4. **Reversal of type 2 diabetes: normalization of beta cell functio

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Analysis Complete.


In [None]:
from IPython.display import Image, display

# NOTE(review): compile_analysis_chain is not imported anywhere in this
# notebook (only compile_analysis_graph is) -- confirm where it comes from.
chain = compile_analysis_chain()

# Rendering is best-effort, but catch only ordinary exceptions (a bare
# `except:` would also swallow KeyboardInterrupt) and report the reason.
try:
    display(Image(chain.get_graph(xray=True).draw_mermaid_png()))
except Exception as exc:
    print('Na...')
    print(f'graph rendering failed: {exc!r}')