# 🔬 Impact of Physical Activity on Metabolic Health

In [1]:
!pip freeze | grep -E "pandas|numpy|matplotlib|seaborn|ipywidgets"

ipywidgets==8.1.7
matplotlib==3.10.3
matplotlib-inline==0.1.7
numpy==2.2.5
pandas==2.2.3
seaborn==0.13.2


## Dataset Loading & Cleaning

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display
import os

In [3]:
# Directory that holds the raw data files
path = 'Datafiles'
fileNames = os.listdir(path)

# Keep only the entries whose name ends in .xpt and show them as the cell output
xpt_list = list(filter(lambda name: name.endswith('.xpt'), fileNames))
xpt_list

['HSCRP_L.xpt',
 'CBC_L (1).xpt',
 'INS_L (1).xpt',
 'GLU_L.xpt',
 'DEMO_L.xpt',
 'HDL_L.xpt',
 'TCHOL_L.xpt',
 'PAQ_L (1).xpt',
 'GHB_L.xpt']

## Step 1: Data Cleaning

In [None]:
# create a list of Pandas dataframes from the xpt files

def convert_to_df(filename, data_dir='Datafiles'):
    """
    Read one SAS data file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Name of the file inside `data_dir` (e.g. 'DEMO_L.xpt').
    data_dir : str, optional
        Directory that contains the file. Defaults to 'Datafiles', the
        location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The table returned by `pd.read_sas` for that file.
    """
    # os.path.join keeps the path portable instead of hard-coding '/' in a string
    return pd.read_sas(os.path.join(data_dir, filename))


# Friendly labels keyed by the canonical file name (no duplicate-download suffix)
file_descriptions = {
    "PAQ_L.xpt": "Physical Activity Data",
    "DEMO_L.xpt": "Demographics Data",
    "GHB_L.xpt":  "Glycohemoglobin Data",
    "GLU_L.xpt":  "Fasting Glucose Data",
    "HDL_L.xpt":  "HDL Cholesterol Data",
    "HSCRP_L.xpt":"High-Sensitivity CRP Data",
    "INS_L.xpt":  "Insulin Data",
    "TCHOL_L.xpt":"Total Cholesterol Data",
    "CBC_L.xpt":  "Complete Blood Count Data",
}


def _canonical_xpt_name(fname):
    """Return `fname` without a trailing ' (<digits>)' copy marker before the extension."""
    stem, ext = os.path.splitext(fname)
    if stem.endswith(')') and ' (' in stem:
        base, _, counter = stem[:-1].rpartition(' (')
        # Only strip when the parenthesised part is purely numeric, e.g. 'PAQ_L (1)'
        if counter.isdigit():
            stem = base
    return stem + ext


df_list = []

# Load each file, store in df_list, and print descriptive info
for i, fname in enumerate(xpt_list, start=1):
    df = convert_to_df(fname)
    df_list.append(df)
    # Look the label up by canonical name so that 'CBC_L (1).xpt' still resolves
    # to the 'CBC_L.xpt' entry; fall back to "Unknown file" when there is no entry.
    friendly_name = file_descriptions.get(_canonical_xpt_name(fname), "Unknown file")
    print(f"DataFrame {i} from '{friendly_name}' ({fname}): {df.shape}")

In [None]:
# Fold every dataframe into one wide table keyed on SEQN
master_df = df_list[0]

for df in df_list[1:]:
    # Columns other than the SEQN key that master_df already carries; they are
    # removed from the incoming frame before merging.
    overlap_cols = [c for c in df.columns if c != 'SEQN' and c in master_df.columns]
    df = df.drop(columns=overlap_cols)
    # Inner join: only SEQN values present in both frames are kept
    master_df = pd.merge(master_df, df, on='SEQN', how='inner')

# function to check for missing values
def summarize_missing(df):
    """
    Count the missing values in each column of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Number of missing (NaN/None) entries per column, indexed by column name.
        Nothing is printed; display or filter the returned Series as needed.
    """
    return df.isna().sum()

In [None]:
# Columns carried into the analysis, listed group by group
cols = (
    # Basic Lab Variables
    ['SEQN', 'LBXGLU', 'LBXGH', 'LBDHDD', 'LBXTC', 'RIDAGEYR', 'RIAGENDR']
    # Physical Activity Variables
    + ['PAD790Q', 'PAD790U', 'PAD800', 'PAD810Q', 'PAD810U', 'PAD820', 'PAD680']
    # Insulin
    + ['LBXIN']
    # High-sensitivity C-reactive protein
    + ['LBXHSCRP']
)

# Select those columns, discard every row with a missing value in any of them,
# and finish with an independent copy so later column assignments are safe.
analysis_df = master_df[cols].dropna().copy()

In [None]:
# Readable column labels for the selected variables
column_labels = {
    'SEQN': 'ID',
    'RIDAGEYR': 'Age',
    'RIAGENDR': 'Male',
    'LBXGLU': 'Fasting Glucose (mg/dl)',
    'LBXGH': 'Glycohemoglobin (%)',
    'LBDHDD': 'HDL Cholesterol (mg/dl)',
    'LBXTC': 'Total Cholesterol (mg/dl)',
    'LBXIN': 'Insulin (µU/mL)',
    'LBXHSCRP': 'HS C-Reactive Protein (mg/L)',

    # Physical Activity
    'PAD790Q': 'Moderate Activity Frequency',
    'PAD790U': 'Moderate Activity Frequency Unit',
    'PAD800': 'Moderate Activity Duration',
    'PAD810Q': 'Vigorous Activity Frequency',
    'PAD810U': 'Vigorous Activity Frequency Unit',
    'PAD820': 'Vigorous Activity Duration',
    'PAD680': 'Total Time Sedentary Activities'
}
analysis_df = analysis_df.rename(columns=column_labels)

# Recode the gender column as a 0/1 indicator: 1 stays 1 (male), 2 becomes 0 (female).
# .map() looks each current value up in the dict; values that are not keys become
# missing, which the nullable 'Int64' dtype can hold.
analysis_df['Male'] = analysis_df['Male'].map({1.0: 1, 2.0: 0}).astype('Int64')

# Store ID and Age as integers, then renumber the rows from 0
analysis_df = analysis_df.astype({'ID': 'int', 'Age': 'int'}).reset_index(drop=True)

In [None]:
def convert_to_weekly_activity(frequency, freq_unit, duration):
    """
    Convert raw physical activity data to total weekly minutes of activity.

    Parameters
    ----------
    frequency : float
        Number of activity sessions (e.g., 3.0) per `freq_unit`.
    freq_unit : bytes or str
        Unit of frequency, either as a byte string (e.g. b'D') or as an
        already-decoded string (e.g. 'D'). Surrounding whitespace is ignored.
        Recognised values:
        - 'D' for daily
        - 'W' for weekly
        - 'M' for monthly
        - 'Y' for yearly
    duration : float
        Duration of each session in minutes.

    Returns
    -------
    float or None
        Total weekly minutes of activity, rounded to 2 decimal places.
        Returns None if any input is missing, the unit is not recognised,
        the frequency or duration is negative, the duration exceeds
        300 minutes per session, or the weekly total exceeds 2000 minutes.

    Notes
    -----
    - Implausible values are discarded (None), not clipped to the limit.
    - Undecodable byte strings and unit values of any other type yield None.
    """
    MAX_SESSION_MINUTES = 300
    MAX_WEEKLY_MINUTES = 2000  # About 4.75 hrs/day

    # Return None if any required field is missing
    if pd.isna(frequency) or pd.isna(duration) or pd.isna(freq_unit):
        return None

    # Sanity check for logical bounds
    if frequency < 0 or duration < 0 or duration > MAX_SESSION_MINUTES:
        return None

    # Normalise the unit to a plain string. Both raw bytes and decoded strings
    # are accepted, so the function works whichever form the loader produced.
    if isinstance(freq_unit, (bytes, bytearray)):
        try:
            unit_str = freq_unit.decode('utf-8')
        except UnicodeDecodeError:
            return None
    elif isinstance(freq_unit, str):
        unit_str = freq_unit
    else:
        return None
    unit_str = unit_str.strip()

    # Convert to weekly frequency
    if unit_str == 'D':      # Daily
        freq_per_week = frequency * 7
    elif unit_str == 'W':    # Weekly
        freq_per_week = frequency
    elif unit_str == 'M':    # Monthly: 12 months spread over 52 weeks
        freq_per_week = (frequency * 12) / 52
    elif unit_str == 'Y':    # Yearly
        freq_per_week = frequency / 52
    else:
        return None  # Unexpected unit

    # Compute total weekly minutes
    total = freq_per_week * duration

    return round(total, 2) if total <= MAX_WEEKLY_MINUTES else None

In [None]:
# Apply the convert-to-weekly function to each of the activity type columns

def preprocess_activity_columns(df):
    """
    Add weekly-minute activity columns to `df` and remove the raw inputs.

    When all six raw moderate/vigorous frequency, unit and duration columns
    are present, the frame is modified in place:
      - 'Moderate_Weekly_Minutes' and 'Vigorous_Weekly_Minutes' are computed
        row by row with convert_to_weekly_activity,
      - 'Total_Weekly_Mins' is their sum,
      - 'Meets_Goal' flags rows whose total is at least 150,
      - the six raw columns are dropped.
    If any raw column is missing, `df` is returned untouched.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Output column -> (frequency, unit, duration) source columns
    activity_sources = {
        'Moderate_Weekly_Minutes': (
            'Moderate Activity Frequency',
            'Moderate Activity Frequency Unit',
            'Moderate Activity Duration',
        ),
        'Vigorous_Weekly_Minutes': (
            'Vigorous Activity Frequency',
            'Vigorous Activity Frequency Unit',
            'Vigorous Activity Duration',
        ),
    }
    raw_columns = [name for triple in activity_sources.values() for name in triple]

    # Nothing to do unless every raw column is available
    if any(name not in df.columns for name in raw_columns):
        return df

    for weekly_col, (freq_col, unit_col, dur_col) in activity_sources.items():
        df[weekly_col] = df.apply(
            lambda row: convert_to_weekly_activity(row[freq_col], row[unit_col], row[dur_col]),
            axis=1,
        )

    df['Total_Weekly_Mins'] = df['Moderate_Weekly_Minutes'] + df['Vigorous_Weekly_Minutes']
    df['Meets_Goal'] = df['Total_Weekly_Mins'] >= 150

    # The raw inputs are no longer needed once the weekly totals exist
    df.drop(columns=raw_columns, inplace=True)
    return df

analysis_df = preprocess_activity_columns(analysis_df)

# preprocess_activity_columns already adds the total weekly minutes and the boolean
# goal flag, so they are not recomputed here. The check keeps the failure loud if the
# function skipped its work because the raw activity columns were unavailable.
assert {'Total_Weekly_Mins', 'Meets_Goal'}.issubset(analysis_df.columns), \
    "preprocess_activity_columns did not add the weekly activity columns"

analysis_df.head()

In [None]:
# Columns whose upper tail is limited
columns_to_cap = [
    'Fasting Glucose (mg/dl)',
    'Glycohemoglobin (%)',
    'HDL Cholesterol (mg/dl)',
    'Total Cholesterol (mg/dl)',
    'Age',
    'Total Time Sedentary Activities',
    'Insulin (µU/mL)',
    'HS C-Reactive Protein (mg/L)',
    'Total_Weekly_Mins',
    'Moderate_Weekly_Minutes',
    'Vigorous_Weekly_Minutes',
]

# Replace every value above a column's own 99th percentile with that percentile;
# values at or below it (and missing values) are left as they are.
for col in columns_to_cap:
    current = analysis_df[col]
    upper_limit = current.quantile(0.99)
    analysis_df[col] = np.where(current > upper_limit, upper_limit, current)

In [None]:
def analyze_dataframe(df, include_corr=False, return_summary=False):
    """
    Summarize the float/int columns of a DataFrame and show the summary as a heatmap.

    The summary is `describe()` transposed (one row per column), extended with
    the IQR, the null count, the number of unique values and two boolean flags
    telling whether the column's max / min lies beyond 1.5 * IQR from the
    75% / 25% quantile. The heatmap shows the numeric summary columns only
    (the two boolean flags are part of the returned table, not of the plot).

    Parameters:
    - df: pandas DataFrame; only columns selected by `select_dtypes(include=[float, int])` are used
    - include_corr: if True, additionally displays a correlation heatmap of those columns
    - return_summary: if True, returns the summary DataFrame

    Returns:
    - the summary DataFrame when `return_summary` is True, otherwise None

    Raises:
    - ValueError: if `df` is not a DataFrame or has no float/int columns
    """
    # Guard clauses
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")

    numeric_df = df.select_dtypes(include=[float, int])
    if numeric_df.empty:
        raise ValueError("No numeric columns found in the DataFrame.")

    # One row per column, one column per statistic
    desc = numeric_df.describe().T
    lower_quartile = desc['25%']
    upper_quartile = desc['75%']
    spread = upper_quartile - lower_quartile

    desc['IQR'] = spread
    desc['nulls'] = numeric_df.isnull().sum()
    desc['n_unique'] = numeric_df.nunique()
    # Flags: does the extreme value fall outside the 1.5 * IQR fences?
    desc['max_outlier'] = numeric_df.max() > (upper_quartile + 1.5 * spread)
    desc['min_outlier'] = numeric_df.min() < (lower_quartile - 1.5 * spread)

    # Numeric statistics shown in the heatmap
    shown_stats = ['mean', 'std', 'min', '25%', '50%', '75%', 'max', 'IQR', 'nulls', 'n_unique']
    vis_df = desc[shown_stats].astype(float)

    # Figure height grows with the number of summarized columns
    plt.figure(figsize=(12, vis_df.shape[0] * 0.75))
    summary_style = dict(
        annot=True,
        cmap="Blues",
        fmt=".2f",
        linecolor="white",
        linewidths=1,
        cbar=False,
        annot_kws={"size": 10},
    )
    sns.heatmap(vis_df, **summary_style)

    plt.xticks(size=12)
    plt.yticks(size=12, rotation=0)
    plt.title("Descriptive Statistics & Outlier Detection", size=14)
    plt.show()

    # Optional second figure: pairwise correlations
    if include_corr:
        corr = numeric_df.corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
        plt.title("Correlation Matrix", size=14)
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.show()

    return desc if return_summary else None

analyze_dataframe(analysis_df)

### Inference Statistics

In [None]:
def summarize_activity_vs_insulin(df, activity_col='Total_Weekly_Mins', insulin_col='Insulin (µU/mL)', goal=150):
    """
    Summarize how insulin levels differ between rows that meet vs. do not meet a weekly activity goal.

    Rows whose activity value is missing are excluded from every statistic:
    a missing value says nothing about whether the goal was met, so it must
    not be counted as "below goal". The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The dataset containing activity and insulin data.
    activity_col : str
        Column name for total weekly minutes of activity.
    insulin_col : str
        Column name for insulin measurements.
    goal : int
        Activity goal threshold (default = 150 minutes). A row meets the goal
        when its activity value is greater than or equal to `goal`.

    Returns
    -------
    dict
        A dictionary with (all rounded to 2 decimals):
            - percent_above      : % of rows with known activity that meet the goal
            - percent_below      : 100 - percent_above
            - mean_insulin_above : mean insulin among rows meeting the goal
            - mean_insulin_below : mean insulin among rows not meeting the goal
            - delta_insulin      : mean_insulin_above - mean_insulin_below
        Values are NaN when the corresponding group is empty.

    Raises
    ------
    ValueError
        If either column is not present in `df`.
    """

    if activity_col not in df.columns or insulin_col not in df.columns:
        raise ValueError(f"Columns '{activity_col}' or '{insulin_col}' not found in dataframe.")

    # Restrict to rows with a known activity value; NaN >= goal would evaluate
    # to False and silently place those rows in the "below goal" group.
    has_activity = df[activity_col].notna()
    activity = df.loc[has_activity, activity_col]
    insulin = df.loc[has_activity, insulin_col]
    meets_goal = activity >= goal

    # Share of rows on each side of the goal
    percent_above = meets_goal.mean() * 100
    percent_below = 100 - percent_above

    # Mean insulin in each group (missing insulin values are skipped by .mean())
    mean_insulin_above = insulin[meets_goal].mean()
    mean_insulin_below = insulin[~meets_goal].mean()
    delta_insulin = mean_insulin_above - mean_insulin_below

    return {
        "percent_above": round(percent_above, 2),
        "percent_below": round(percent_below, 2),
        "mean_insulin_above": round(mean_insulin_above, 2),
        "mean_insulin_below": round(mean_insulin_below, 2),
        "delta_insulin": round(delta_insulin, 2)
    }

print(summarize_activity_vs_insulin(analysis_df))


## Step 2: Feature Engineering

### Creating a SPLOM to understand general trends in data

In [None]:
# sns.pairplot colours points through its hue parameter; the 'Male' column is
# turned into a categorical variable for that purpose, and 'Age' is stored as int64.
analysis_df = analysis_df.astype({'Male': 'category', 'Age': 'int64'})

# Variables shown in the scatter-plot matrix (plus the hue column)
selected_cols = [
    'Insulin (µU/mL)',
    'Fasting Glucose (mg/dl)',
    'Glycohemoglobin (%)',
    'Moderate_Weekly_Minutes',
    'Vigorous_Weekly_Minutes',
    'Male',
]

pairplot_data = analysis_df[selected_cols]
sns.pairplot(pairplot_data, hue='Male', diag_kind='kde', plot_kws={'alpha': 0.4, 's': 20})

plt.tight_layout()
plt.show()

### Heatmap to check correlations

In [None]:
def create_heatmap(corr_matrix, figsize=(10, 8), annot=True, cmap='RdBu'):
    """
    Draw and display a correlation heatmap on a new figure.

    Only the cells strictly below the diagonal are drawn; the diagonal and the
    upper triangle are masked because they repeat information. The colour
    scale is fixed to [-1, 1] and centred on 0. The figure is shown with
    `plt.show()`; nothing is returned.

    Parameters:
    -----------
    corr_matrix : pd.DataFrame
        The correlation matrix.
    figsize : tuple
        Size of the figure.
    annot : bool
        Whether to show values inside heatmap cells.
    cmap : str
        Color map
    """
    plt.figure(figsize=figsize)

    # True on and above the diagonal -> those cells are hidden
    hidden_cells = np.triu(np.ones_like(corr_matrix, dtype=bool))

    heatmap_options = dict(
        mask=hidden_cells,
        cmap=cmap,
        annot=annot,
        fmt=".2f",
        center=0,
        vmin=-1,
        vmax=1,
        linewidths=0.5,
        square=True,
    )
    sns.heatmap(corr_matrix, **heatmap_options)

    plt.title("Correlation Heatmap of Activity & Biomarkers", fontsize=14, pad=20)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

# Generate the correlation heatmap. The 'ID' column is a participant identifier,
# not a measurement, so it is left out of the correlation matrix.
create_heatmap(analysis_df.drop(columns=['ID'], errors='ignore').corr(), cmap='YlGnBu', annot=True)

### Overlay Goal Line & Annotate in Matplotlib

In [None]:
def plot_activity_insulin_goal_matplotlib(
    analysis_df,
    weekly_col: str = "Total_Weekly_Mins",
    insulin_col: str = "Insulin (µU/mL)",
    goal: int = 150,
    cap_pct: float = 0.97
):
    """
    Scatterplot of weekly activity vs. fasting insulin with:
      • A vertical goal line at `goal` minutes
      • A vertical cap at the `cap_pct` percentile
      • Two clean legends (activity and reference lines)

    Parameters
    ----------
    analysis_df : pd.DataFrame
        Data containing `weekly_col` and `insulin_col`. It is not modified.
    weekly_col : str
        Column with weekly activity minutes (x-axis).
    insulin_col : str
        Column with insulin values (y-axis); also used as the y-axis label.
    goal : int
        Activity threshold; a row meets the goal when `weekly_col >= goal`.
    cap_pct : float
        Quantile (0-1) of `weekly_col` at which plotted x-values are clipped.

    Returns
    -------
    fig, ax : matplotlib Figure and Axes objects

    Notes
    -----
    - Rows with a missing `weekly_col` value are left out, so the annotated
      percentages describe exactly the rows that have an activity value.
    - Goal membership is always derived from `weekly_col` and `goal`; an
      existing 'Meets_Goal' column in the input is ignored so that the point
      colours, the statistics and the goal line cannot disagree.
    """

    # 1) Copy the rows with a known activity value & cap activity outliers
    data = analysis_df.loc[analysis_df[weekly_col].notna()].copy()
    cap_value = data[weekly_col].quantile(cap_pct)
    data['Activity_Capped'] = data[weekly_col].clip(upper=cap_value)
    # round() rather than int(): e.g. 0.29 * 100 is 28.999..., which int() would turn into 28
    pct_label = round(cap_pct * 100)

    # 2) Goal membership for the requested threshold (based on uncapped activity)
    data['Meets_Goal'] = data[weekly_col] >= goal
    meets_goal = data['Meets_Goal']

    # 3) Summary stats
    pct_above = meets_goal.mean() * 100
    pct_below = 100 - pct_above
    mean_insulin_above = data.loc[meets_goal, insulin_col].mean()
    mean_insulin_below = data.loc[~meets_goal, insulin_col].mean()
    delta_insulin = mean_insulin_above - mean_insulin_below

    # 4) Plot setup
    fig, ax = plt.subplots(figsize=(10, 6))

    # 5) Scatter plot: below-goal points first, goal-meeting points on top
    ax.scatter(
        data.loc[~meets_goal, 'Activity_Capped'],
        data.loc[~meets_goal, insulin_col],
        color='deepskyblue', alpha=0.6, s=30
    )
    ax.scatter(
        data.loc[meets_goal, 'Activity_Capped'],
        data.loc[meets_goal, insulin_col],
        color='orange', alpha=0.6, s=30
    )

    # 6) Reference lines
    ax.axvline(x=goal, color='red', linestyle='--', linewidth=2)
    ax.axvline(x=cap_value, color='gray', linestyle=':', linewidth=2)

    # 7) Stats annotation
    stats_text = (
        f">= {goal} min: {pct_above:.1f}%\n"
        f"<  {goal} min: {pct_below:.1f}%\n"
        f"Δ Insulin: {delta_insulin:.2f}"
    )
    ax.annotate(
        stats_text,
        xy=(0.95, 0.95), xycoords='axes fraction',
        ha='right', va='top',
        bbox=dict(boxstyle='round,pad=0.5', fc='white', alpha=0.85)
    )

    # 8) Activity legend (left)
    scatter_handles = [
        Line2D([0], [0], marker='o', linestyle='None', color='w',
               label=f">= {goal} min", markerfacecolor='orange', markersize=8),
        Line2D([0], [0], marker='o', linestyle='None', color='w',
               label=f"< {goal} min", markerfacecolor='deepskyblue', markersize=8)
    ]
    leg1 = ax.legend(
        handles=scatter_handles,
        title='Activity Goal',
        loc='upper left',
        frameon=True,
        facecolor='white'
    )
    # Re-attach the first legend; the second ax.legend() call would otherwise replace it
    ax.add_artist(leg1)

    # 9) Reference lines legend (upper centre)
    line_handles = [
        Line2D([0], [0], color='red', linestyle='--', linewidth=2,
               label=f"{goal} min goal"),
        Line2D([0], [0], color='gray', linestyle=':', linewidth=2,
               label=f"{pct_label}th percentile ≈ {int(cap_value)} min")
    ]
    leg2 = ax.legend(
        handles=line_handles,
        title='Reference Lines',
        loc='upper center',
        frameon=True,
        facecolor='white'
    )
    leg2.get_frame().set_alpha(1.0)
    leg2.get_frame().set_linewidth(0.5)

    # 10) Titles and labels (built from the arguments so they match what is drawn)
    ax.set_title(
        f"Weekly Activity vs. Fasting Insulin\nGoal = {goal} min/week, {pct_label}th Percentile Cap"
    )
    ax.set_xlabel(f"Weekly Activity (capped at {int(cap_value)} min)")
    ax.set_ylabel(insulin_col)

    plt.tight_layout()
    return fig, ax

fig, ax = plot_activity_insulin_goal_matplotlib(analysis_df)
plt.show()


## Step 3: Dashboard Setup & Interactive Widgets

In [None]:
# --- Dropdowns ---
# Choices offered for the activity measure
activity_options = [
    'Moderate_Weekly_Minutes',
    'Vigorous_Weekly_Minutes',
    'Total Time Sedentary Activities',
]
# Choices offered for the biomarker
biomarker_options = [
    'Insulin (µU/mL)',
    'Fasting Glucose (mg/dl)',
    'Glycohemoglobin (%)',
    'HDL Cholesterol (mg/dl)',
    'Total Cholesterol (mg/dl)',
    'HS C-Reactive Protein (mg/L)',
]

# One dropdown per choice list, each starting on its first entry
activity_var = widgets.Dropdown(
    description='Activity:',
    options=activity_options,
    value='Moderate_Weekly_Minutes',
)
biomarker_var = widgets.Dropdown(
    description='Biomarker:',
    options=biomarker_options,
    value='Insulin (µU/mL)',
)

In [None]:
# --- Dashboard Function ---
def update_dashboard(activity, biomarker):
    """
    Draw the 2x2 dashboard for one activity measure and one biomarker.

    Reads the notebook-level `analysis_df`, removes rows with extreme activity
    or biomarker values, and shows four panels:
      - scatter of `biomarker` vs `activity` coloured by gender, with a fitted line
      - box plot of `activity` by gender
      - violin plot of `biomarker` by gender
      - jittered dot plot of `biomarker` by gender

    Parameters
    ----------
    activity : str
        Column of `analysis_df` used as the activity measure.
    biomarker : str
        Column of `analysis_df` used as the biomarker.

    Notes
    -----
    Gender is plotted through a labelled 'Gender' column with a fixed
    Female/Male order, so tick and legend labels stay correct even when the
    filtering leaves only one gender in the data.
    """
    # Upper bounds for the biomarker axis; biomarkers not listed are not trimmed
    biomarker_limits = {
        'Insulin (µU/mL)': 40,
        'Fasting Glucose (mg/dl)': 180,
        'Glycohemoglobin (%)': 8,
        'HS C-Reactive Protein (mg/L)': 15,
    }
    gender_order = ['Female', 'Male']

    # Filter extreme activity values
    within_activity_limits = (
        (analysis_df['Moderate_Weekly_Minutes'] <= 1000) &
        (analysis_df['Vigorous_Weekly_Minutes'] <= 500) &
        (analysis_df['Total Time Sedentary Activities'] <= 1000)
    )
    # .copy() so that adding a column below works on an independent frame
    # instead of a slice of analysis_df
    filtered_df = analysis_df.loc[within_activity_limits].copy()

    # Remove Y-axis outliers based on biomarker
    if biomarker in biomarker_limits:
        filtered_df = filtered_df[filtered_df[biomarker] <= biomarker_limits[biomarker]].copy()

    # Readable gender labels (0 -> Female, 1 -> Male) stored as plain strings
    filtered_df['Gender'] = filtered_df['Male'].map({0: 'Female', 1: 'Male'}).astype(object)

    # --- Subplots ---
    fig, axs = plt.subplots(2, 2, figsize=(14, 10))
    plt.subplots_adjust(hspace=0.4)

    # --- Scatter Plot ---
    sns.scatterplot(data=filtered_df, x=activity, y=biomarker,
                    hue='Gender', hue_order=gender_order, ax=axs[0, 0])
    sns.regplot(data=filtered_df, x=activity, y=biomarker, scatter=False, ax=axs[0, 0], color='black', line_kws={"linewidth": 1})
    axs[0, 0].set_title(f'Scatter: {biomarker} vs {activity}')
    axs[0, 0].grid(True)

    # --- Box Plot ---
    sns.boxplot(data=filtered_df, x='Gender', y=activity, order=gender_order, ax=axs[0, 1])
    axs[0, 1].set_title(f'Box Plot: {activity} by Gender')
    axs[0, 1].set_xlabel('')
    axs[0, 1].grid(True)

    # --- Violin Plot ---
    sns.violinplot(data=filtered_df, x='Gender', y=biomarker, order=gender_order, ax=axs[1, 0])
    axs[1, 0].set_title(f'Violin: {biomarker} by Gender')
    axs[1, 0].set_xlabel('')
    axs[1, 0].grid(True)

    # --- Jittered Dot Plot ---
    sns.stripplot(data=filtered_df, x='Gender', y=biomarker, order=gender_order, jitter=True, ax=axs[1, 1])
    axs[1, 1].set_title(f'Jittered Dot: {biomarker} by Gender')
    axs[1, 1].set_xlabel('')
    axs[1, 1].grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
# --- Layout ---
# Map each update_dashboard argument to the widget that supplies its value
controls = {'activity': activity_var, 'biomarker': biomarker_var}

# Stack the dropdowns vertically and redraw the dashboard whenever one changes
ui = widgets.VBox(list(controls.values()))
out = widgets.interactive_output(update_dashboard, controls)

display(ui, out)

# Descriptions Section


## 🧠 Relevance to Human Health

**Understanding how physical activity influences key biomarkers is critical for promoting long‑term health.**
These markers are directly linked to chronic diseases such as:
- **Type 2 Diabetes**
- **Cardiovascular Disease**
- **Metabolic Syndrome**
- **Obesity‑related Inflammation**

---

## 🔬 Impact of Physical Activity on Biomarkers

1. **Insulin (µU/mL)**
   - _Effect:_ Improves insulin sensitivity
   - _Outcome:_ Lower circulating insulin levels; reduced risk of insulin resistance and type 2 diabetes

2. **Fasting Glucose (mg/dL) & HbA1c (%)**
   - _Effect:_ Enhances muscle glucose uptake
   - _Outcome:_ Better blood glucose control; lower HbA1c values

3. **Cholesterol (HDL & Total)**
   - _Effect:_ Increases HDL (“good” cholesterol); may reduce LDL and total cholesterol
   - _Outcome:_ Improved heart health; reduced cardiovascular risk

4. **C‑Reactive Protein (CRP)**
   - _Effect:_ Decreases systemic inflammation
   - _Outcome:_ Lower CRP levels; protection against chronic diseases and improved immune function

5. **Sedentary Time**
   - _Effect:_ Excessive sedentary behavior correlates with elevated insulin, worse lipid profiles, and higher CRP
   - _Outcome:_ Provides a clear contrast between active and inactive lifestyles



## 📊 Market Analysis Layer

- **96 million U.S. adults** are prediabetic
  → Source: [CDC 2022 National Diabetes Statistics Report](https://www.cdc.gov/diabetes/data/statistics-report/index.html)

- **30% adoption** of activity nudges (e.g., walking prompts, step goals)
  → From digital health app engagement studies (JAMA, Nature Digital Medicine)

- **~5% reduction** in fasting insulin is associated with reduced diabetes risk
  → Clinical studies suggest even modest increases in physical activity improve insulin sensitivity

- **Assumption:** Over the long term, activity nudges prevent roughly 1 million cases of diabetes

- **Annual direct medical cost of diabetes:** ~\$9,600 per person
  → Source: [ADA, 2022 Cost of Diabetes Report](https://diabetesjournals.org/care/article/46/4/454/148746)

- **Estimated cost savings:**
  - 1 million cases × \$8,000–\$10,000 per person = \$8–\$10 billion/year
  → Source: [ADA, 2022 Cost of Diabetes Report](https://diabetesjournals.org/care/article/46/4/454/148746)




