### This dataset contains information about various health and lifestyle factors for 100,000 individuals

#### Columns:

id : Unique identifier <br>
age : Age of the person <br>
gender : Gender (Male/Female) <br>
bmi : Body Mass Index <br>
daily_steps : Number of steps per day <br>
sleep_hours : Average sleep duration (hours) <br>
water_intake_l : Daily water intake (liters) <br>
calories_consumed : Calories consumed per day <br>
smoker : Smoking status (0 = No, 1 = Yes) <br>
alcohol : Alcohol use (0 = No, 1 = Yes) <br>
resting_hr : Resting heart rate (bpm) <br>
systolic_bp : Systolic blood pressure<br>
diastolic_bp : Diastolic blood pressure<br>
cholesterol : Cholesterol level (mg/dL)<br>
family_history : Family history of disease (0 = No, 1 = Yes)<br>
disease_risk : Risk of disease (0 = Low, 1 = High)


In [None]:
%pip install pandas plotly
%pip install nbformat
%pip install streamlit pandas plotly

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

In [3]:
# Location of the dataset CSV. Set the HEALTH_DATA_PATH environment variable to
# point at your own copy; when it is unset, the original author's path is used
# so existing runs keep working unchanged.
path = os.environ.get(
    "HEALTH_DATA_PATH",
    "/Users/tanya/Desktop/Data Science Journey/ML/my_project/health_lifestyle_dataset.csv",
)
df = pd.read_csv(path)

In [4]:
# DataFrame.shape is a (rows, columns) pair, so both counts come from one unpack.
num_rows, num_cols = df.shape
print(f"Number of columns: {num_cols} \nNumber of rows: {num_rows}")

Number of columns: 16 
Number of rows: 100000


In [5]:
# Show the first five records; the bare last expression is rendered as a table.
print("a glimpse into the dataset:")
df.head(n=5)


a glimpse into the dataset:


Unnamed: 0,id,age,gender,bmi,daily_steps,sleep_hours,water_intake_l,calories_consumed,smoker,alcohol,resting_hr,systolic_bp,diastolic_bp,cholesterol,family_history,disease_risk
0,1,56,Male,20.5,4198,3.9,3.4,1602,0,0,97,161,111,240,0,0
1,2,69,Female,33.3,14359,9.0,4.7,2346,0,1,68,116,65,207,0,0
2,3,46,Male,31.6,1817,6.6,4.2,1643,0,1,90,123,99,296,0,0
3,4,32,Female,38.2,15772,3.6,2.0,2460,0,0,71,165,95,175,0,0
4,5,60,Female,33.6,6037,3.8,4.0,3756,0,1,98,139,61,294,0,0


Let's take a bird's-eye view of the data using the Plotly library.

In [44]:
# Pull each column into its own Series so the plotting cell can refer to it by
# a short name. The groups mirror the three rows of the histogram grid.

# Demographic and lifestyle measurements
age, bmi, steps, sleep, water = (
    df[name]
    for name in ("age", "bmi", "daily_steps", "sleep_hours", "water_intake_l")
)

# Diet and clinical measurements
calory, rest, cholesterol, systolic_bp, diastolic_bp = (
    df[name]
    for name in (
        "calories_consumed",
        "resting_hr",
        "cholesterol",
        "systolic_bp",
        "diastolic_bp",
    )
)

# Categorical and 0/1 flag columns
gender, smoke, alcohol, family_history, disease_risk = (
    df[name]
    for name in ("gender", "smoker", "alcohol", "family_history", "disease_risk")
)

In [None]:
def _bin_spec(values, size, start=None):
    """Build a plotly ``xbins`` dict made of whole bins that cover every value.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to be binned.
    size : int or float
        Width of each bin.
    start : int or float, optional
        Left edge of the first bin. Defaults to the column minimum rounded
        down to an integer.

    Returns
    -------
    dict
        ``start``, ``end`` and ``size``; ``end`` is the right edge of the bin
        that contains the column maximum.
    """
    if start is None:
        start = int(np.floor(values.min()))
    # Bins are half-open, so one extra bin is needed when the maximum lands
    # exactly on an edge. Rounding first guards against float error such as
    # 43.9999999 when the true ratio is 44.
    n_bins = int(round((float(values.max()) - start) / size, 9)) + 1
    return dict(start=start, end=start + n_bins * size, size=size)


# Grid layout: one subplot per column of interest.
N_ROWS, N_COLS = 3, 5

# Two unit-wide bins centred on 0 and 1. For the string-valued gender column
# plotly interprets these numbers as category serial numbers.
BINARY_BINS = dict(start=-0.5, end=1.5, size=1)

# Subplot title -> Series to plot (insertion order fixes the grid order).
data = {
    "Age": age,
    "BMI": bmi,
    "Daily steps": steps,
    "Sleep hours": sleep,
    "Water intake": water,
    "Calorie consumption": calory,
    "Resting heart rate": rest,
    "Cholesterol": cholesterol,
    "Systolic blood pressure": systolic_bp,
    "Diastolic blood pressure": diastolic_bp,
    "Gender": gender,
    "Smoker": smoke,
    "Alcohol": alcohol,
    "Family history": family_history,
    "Disease risk": disease_risk,
}

# Subplot title -> bin specification; keys must match `data`.
bin_specs = {
    "Age": _bin_spec(age, size=1),
    "BMI": _bin_spec(bmi, size=0.5, start=float(bmi.min())),
    "Daily steps": _bin_spec(steps, size=500),
    "Sleep hours": _bin_spec(sleep, size=1),
    "Water intake": _bin_spec(water, size=1),
    "Calorie consumption": _bin_spec(calory, size=300),
    "Resting heart rate": _bin_spec(rest, size=5),
    "Cholesterol": _bin_spec(cholesterol, size=30),
    "Systolic blood pressure": _bin_spec(systolic_bp, size=10),
    "Diastolic blood pressure": _bin_spec(diastolic_bp, size=10),
    "Gender": BINARY_BINS,
    "Smoker": BINARY_BINS,
    "Alcohol": BINARY_BINS,
    "Family history": BINARY_BINS,
    "Disease risk": BINARY_BINS,
}

fig = make_subplots(rows=N_ROWS, cols=N_COLS, subplot_titles=list(data.keys()))

# (row, col) of every grid cell in row-major order; plotly indices start at 1.
positions = [
    (row, col)
    for row in range(1, N_ROWS + 1)
    for col in range(1, N_COLS + 1)
]

for (title, s), (r, c) in zip(data.items(), positions):
    xbins = bin_specs[title]
    fig.add_trace(
        go.Histogram(
            x=s,
            xbins=xbins,
            histfunc="count",
            marker=dict(color="#1f77b4", line=dict(color="yellow", width=0.8)),
        ),
        row=r, col=c
    )
    # Pin the axis to the bin edges so the first and last bars are drawn whole.
    fig.update_xaxes(range=[xbins["start"], xbins["end"]], row=r, col=c)

fig.update_layout(
    height=800, width=1400,
    showlegend=False,
    title_text="Health Dataset Histograms",
    title_x=0.5,
    bargap=0.05
)

# One shared y-axis label, rotated and placed just left of the grid.
fig.add_annotation(
    text="Count",
    x=-0.05, y=0.5,
    xref="paper", yref="paper",
    showarrow=False,
    textangle=-90,
    font=dict(size=20)
)


fig.show()


In [67]:
# Export the histogram grid as an HTML page in the project's docs folder.
html_path = "/Users/tanya/Desktop/Data Science Journey/ML/my_project/docs/histograms.html"

# write_html does not create missing folders, so make sure docs/ exists first.
os.makedirs(os.path.dirname(html_path), exist_ok=True)

fig.write_html(
    html_path,
    include_plotlyjs="cdn",   # load plotly.js from the CDN: smaller file, but viewing it needs internet access
    full_html=True,           # write a complete <html> document rather than a bare <div>
    auto_open=False,          # do not open a browser after writing
)