In [1]:
# Import libraries
# Load the dataset
# Display first few rows to confirm it loaded

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('healthcare-dataset-stroke-data.csv')

print("First 5 rows of the dataset:")
print(df.head())

First 5 rows of the dataset:
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


Part 1

# Step 1: Shape

In [3]:
print("Dataset Shape:")
print(f"Rows (observations): {df.shape[0]}")
print(f"Columns (features): {df.shape[1]}")
print(f"\nColumn names: {list(df.columns)}")

Dataset Shape:
Rows (observations): 5110
Columns (features): 12

Column names: ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']


Findings:
Rows (observations): 5110
Columns (features): 12
Each row represents a patient with 12 features assessed

# Step 2: Column Names

In [4]:
print("Column Names:")
print(df.columns.tolist())
print(f"\nTotal number of columns: {len(df.columns)}")


Column Names:
['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']

Total number of columns: 12


Findings:

List all column names:
id - Unique patient identifier
gender - Patient's biological sex
age - Patient's age in years
hypertension - History of high blood pressure
heart_disease - History of heart disease
ever_married - Marital status
work_type - Type of employment/occupation
Residence_type - Urban or rural residence
avg_glucose_level - Average blood glucose level
bmi - Body Mass Index
smoking_status - Smoking history
stroke - Whether patient had a stroke (TARGET VARIABLE)

avg_glucose_level - Need to understand what units (mg/dL vs mmol/L) and whether this is fasting glucose, random glucose, or HbA1c equivalent
bmi - Need to confirm the standard BMI categories for clinical interpretation
hypertension - Need to understand the diagnostic threshold used (140/90 vs 130/80 mmHg)
smoking_status - Need to clarify what "Unknown" means and the definitions of the other categories
work_type - Need to understand how "children" and "Never_worked" are classified

# Step 3: Data Types

In [5]:
print("Data Types for Each Column:")
print(df.dtypes)
print("\n" + "="*50)

print("\nNumeric Columns:")
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(numeric_cols)

print("\nCategorical Columns (object/string):")
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_cols)

print("\n" + "="*50)
print("Data Type Summary:")
print(df.info())

Data Types for Each Column:
id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


Numeric Columns:
['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']

Categorical Columns (object/string):
['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

Data Type Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5

Findings:

Numeric Columns:
id (int64) - Integer identifier
age (float64) - Continuous/decimal number
hypertension (int64) - Integer (0 or 1)
heart_disease (int64) - Integer (0 or 1)
avg_glucose_level (float64) - Continuous measurement
bmi (float64) - Continuous measurement
stroke (int64) - Integer (0 or 1)

Categorical Columns:
gender (object) - Text categories
ever_married (object) - Text categories
work_type (object) - Text categories
Residence_type (object) - Text categories
smoking_status (object) - Text categories

No incorrect data types detected

# Step 4: First Look

In [6]:
print("First 10 rows of the dataset:")
print(df.head(10))

print("\n" + "="*80)
print("Sample of data (random 5 rows):")
print(df.sample(5, random_state=42))

print("\n" + "="*80)
print("Checking for potential placeholder values:")
print("\nUnique values in categorical columns:")
for col in ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']:
    print(f"{col}: {df[col].unique()}")

First 10 rows of the dataset:
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   
5  56669    Male  81.0             0              0          Yes   
6  53882    Male  74.0             1              1          Yes   
7  10434  Female  69.0             0              0           No   
8  27419  Female  59.0             0              0          Yes   
9  60491  Female  78.0             0              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never 

Findings:

What do the actual values look like:
- Numeric values appear reasonable:
 -Ages range from infants (0.08 years ≈ 1 month old) to elderly (82 years)
 -Glucose levels look realistic (55-271 mg/dL)
 -BMI values are in expected ranges (10.3-97.6)

- Categorical values are clear and descriptive:
 -Gender: "Male", "Female", "Other"
 -Work type: "Private", "Self-employed", "Govt_job", "children", "Never_worked"
 -Smoking status: "never smoked", "formerly smoked", "smokes", "Unknown"

- Binary values are coded as 0/1 (hypertension, heart_disease, stroke)

Anything unusual or unexpected:
- Very young patients: There's an infant aged 0.08 years (about 1 month old) in the dataset. Pediatric stroke is rare but possible.
- "Unknown" smoking status: 30.2% of records have "Unknown" smoking status - this is a significant amount of missing information disguised as a category.
- "Other" gender: Only 1 patient classified as "Other" - this might need special handling in analysis.
- "Never_worked": 22 patients in this category - might overlap with "children" category.

Values that look like they might be placeholders for missing data:
- "Unknown" in smoking_status: This appears to be a placeholder for missing data (1,544 records). Rather than showing as NaN, the absence of smoking information is coded as "Unknown".

# Step 5: Last Look

In [7]:
print("Last 10 rows of the dataset:")
print(df.tail(10))

print("\n" + "="*80)
print("Comparison of first and last rows:")
print("\nFirst row:")
print(df.iloc[0])
print("\nLast row:")
print(df.iloc[-1])

# Check if data ends cleanly
print("\n" + "="*80)
print(f"Total rows in dataset: {len(df)}")
print(f"Last index: {df.index[-1]}")
print(f"Are indices continuous? {df.index.is_monotonic_increasing and len(df) == df.index[-1] + 1}")

Last 10 rows of the dataset:
         id  gender   age  hypertension  heart_disease ever_married  \
5100  68398    Male  82.0             1              0          Yes   
5101  36901  Female  45.0             0              0          Yes   
5102  45010  Female  57.0             0              0          Yes   
5103  22127  Female  18.0             0              0           No   
5104  14180  Female  13.0             0              0           No   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
5100  Self-employed          Rural              71.97  28.3     never smoked   
5101        Private          

Findings:

The dataset ends cleanly at row 5,109
- No incomplete rows or truncated data
- No error messages or corruption visible
- All columns are present in the last rows
- Indices run continuously from 0 to 5,109 with no gaps

The last rows consistent with the first rows
- Same 12 columns present throughout
- Same data types maintained (no type changes mid-dataset)
- Same categorical values (no new categories appearing at the end)
- Mix of stroke outcomes (both 0 and 1 present)
- Similar patterns of missing BMI values (NaN appears in both sections)
- Value ranges appear consistent (ages, glucose, BMI all in similar ranges)

# Step 6: Memory Usage

In [8]:
print("Memory Usage Report:")
print("="*80)
print(df.info(memory_usage='deep'))

print("\n" + "="*80)
print("Memory usage by column:")
print(df.memory_usage(deep=True))

print("\n" + "="*80)
# Calculate total memory in MB
memory_bytes = df.memory_usage(deep=True).sum()
memory_mb = memory_bytes / (1024**2)
print(f"Total memory usage: {memory_mb:.2f} MB")
print(f"Total memory usage: {memory_bytes:,} bytes")

# Memory per row
memory_per_row = memory_bytes / len(df)
print(f"\nMemory per row: {memory_per_row:.2f} bytes")

Memory Usage Report:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 1.6 MB
None

Memory usage by column:
Index                   132
id                    40880
gender               276819
age                   408

Findings:

Total dataset memory usage is 1.62 MB

This is a small dataset by data science standards

# Step 7: Missing Values

In [9]:
print("Missing Values Analysis:")
print("="*80)

# Count missing values per column
print("\nMissing values per column:")
print(df.isnull().sum())

print("\n" + "="*80)
# Calculate percentage of missing values
print("Percentage of missing values per column:")
missing_pct = (df.isnull().sum() / len(df) * 100).round(2)
print(missing_pct)

print("\n" + "="*80)
# Total missing values
total_missing = df.isnull().sum().sum()
total_cells = df.shape[0] * df.shape[1]
print(f"Total missing values: {total_missing}")
print(f"Total cells in dataset: {total_cells}")
print(f"Overall percentage missing: {(total_missing/total_cells)*100:.2f}%")

print("\n" + "="*80)
# Check for hidden missing values
print("Checking for HIDDEN missing values (disguised as data):")
print("\nSmoking status value counts:")
print(df['smoking_status'].value_counts())
print("\nGender value counts:")
print(df['gender'].value_counts())
print("\nWork type value counts:")
print(df['work_type'].value_counts())

Missing Values Analysis:

Missing values per column:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Percentage of missing values per column:
id                   0.00
gender               0.00
age                  0.00
hypertension         0.00
heart_disease        0.00
ever_married         0.00
work_type            0.00
Residence_type       0.00
avg_glucose_level    0.00
bmi                  3.93
smoking_status       0.00
stroke               0.00
dtype: float64

Total missing values: 201
Total cells in dataset: 61320
Overall percentage missing: 0.33%

Checking for HIDDEN missing values (disguised as data):

Smoking status value counts:
smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885


Findings:

Only the bmi column has missing values 

The bmi column is missing 3.93% (201 out of 5,110)
All other columns are missing 0.00%

There ARE values that look like data but actually represent missing information:
- smoking_status - "Unknown" category:
 -1,544 records (30.2% of dataset) are marked as "Unknown"
 -This is NOT detected by .isnull() because it's stored as a text string
 -This represents missing information about smoking history
 -Impact: Nearly 1/3 of smoking data is actually missing
 
- Potential issue with work_type - "Never_worked":
 -22 records classified as "Never_worked"
 -Could be legitimate data OR could indicate missing employment information
 -Needs clinical context to determine

- gender - "Other" category:
 -Only 1 record with "Other"
 -Too small for meaningful analysis, but is legitimate data

# Step 8: Duplicates

In [10]:
print("Duplicate Analysis:")
print("="*80)

# Check for completely duplicate rows
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates > 0:
    print("\nDuplicate rows found:")
    print(df[df.duplicated(keep=False)])

print("\n" + "="*80)
# Check for duplicate IDs
duplicate_ids = df['id'].duplicated().sum()
print(f"Number of duplicate patient IDs: {duplicate_ids}")

if duplicate_ids > 0:
    print("\nDuplicate IDs found:")
    print(df[df['id'].duplicated(keep=False)].sort_values('id'))

print("\n" + "="*80)
# Verify uniqueness
print(f"Total rows in dataset: {len(df)}")
print(f"Unique patient IDs: {df['id'].nunique()}")
print(f"Are all IDs unique? {len(df) == df['id'].nunique()}")

Duplicate Analysis:
Number of duplicate rows: 0

Number of duplicate patient IDs: 0

Total rows in dataset: 5110
Unique patient IDs: 5110
Are all IDs unique? True


Findings:

There are NO duplicate rows in the dataset

All 5,110 patient IDs are unique

# Step 9: Basic Statistics

In [11]:
print("Descriptive Statistics for Numerical Variables:")
print("="*80)
print(df.describe())

print("\n" + "="*80)
print("Key Variable Ranges:")
print(f"\nAge:")
print(f"  Minimum: {df['age'].min():.2f} years")
print(f"  Maximum: {df['age'].max():.2f} years")
print(f"  Mean: {df['age'].mean():.2f} years")
print(f"  Median: {df['age'].median():.2f} years")

print(f"\nAverage Glucose Level:")
print(f"  Minimum: {df['avg_glucose_level'].min():.2f} mg/dL")
print(f"  Maximum: {df['avg_glucose_level'].max():.2f} mg/dL")
print(f"  Mean: {df['avg_glucose_level'].mean():.2f} mg/dL")
print(f"  Median: {df['avg_glucose_level'].median():.2f} mg/dL")

print(f"\nBMI:")
print(f"  Minimum: {df['bmi'].min():.2f}")
print(f"  Maximum: {df['bmi'].max():.2f}")
print(f"  Mean: {df['bmi'].mean():.2f}")
print(f"  Median: {df['bmi'].median():.2f}")

print("\n" + "="*80)
print("Checking for clinically unlikely values:")
print(f"Ages below 0: {(df['age'] < 0).sum()}")
print(f"Ages above 120: {(df['age'] > 120).sum()}")
print(f"Glucose below 30: {(df['avg_glucose_level'] < 30).sum()}")
print(f"Glucose above 500: {(df['avg_glucose_level'] > 500).sum()}")
print(f"BMI below 10: {(df['bmi'] < 10).sum()}")
print(f"BMI above 80: {(df['bmi'] > 80).sum()}")

Descriptive Statistics for Numerical Variables:
                 id          age  hypertension  heart_disease  \
count   5110.000000  5110.000000   5110.000000    5110.000000   
mean   36517.829354    43.226614      0.097456       0.054012   
std    21161.721625    22.612647      0.296607       0.226063   
min       67.000000     0.080000      0.000000       0.000000   
25%    17741.250000    25.000000      0.000000       0.000000   
50%    36932.000000    45.000000      0.000000       0.000000   
75%    54682.000000    61.000000      0.000000       0.000000   
max    72940.000000    82.000000      1.000000       1.000000   

       avg_glucose_level          bmi       stroke  
count        5110.000000  4909.000000  5110.000000  
mean          106.147677    28.893237     0.048728  
std            45.283560     7.854067     0.215320  
min            55.120000    10.300000     0.000000  
25%            77.245000    23.500000     0.000000  
50%            91.885000    28.100000     0.0000

Findings:

Age range in the dataset:
- Minimum: 0.08 years (approximately 1 month old)
- Maximum: 82.00 years

Range of average glucose levels:
- Minimum: 55.12 mg/dL
- Maximum: 271.74 mg/dL

Range of BMI values:
- Minimum: 10.30
- Maximum: 97.60

None of the min/max values seem impossible. There are two bmi values above 80, which are possible but seem clinically unlikely

# Step 10: Unique Counts

In [12]:
print("Unique Value Counts:")
print("="*80)
for col in df.columns:
    unique_count = df[col].nunique()
    print(f"{col:20} : {unique_count:5} unique values")

print("\n" + "="*80)
print("\nColumns with FEW unique values (likely categorical/binary):")
few_unique = df.columns[df.nunique() <= 10]
for col in few_unique:
    print(f"  {col}: {df[col].nunique()} unique values - {list(df[col].unique()[:5])}")

print("\n" + "="*80)
print("Columns with MANY unique values (likely continuous or IDs):")
many_unique = df.columns[df.nunique() > 100]
for col in many_unique:
    print(f"  {col}: {df[col].nunique()} unique values")

print("\n" + "="*80)
print("ID Verification:")
print(f"Total rows: {len(df)}")
print(f"Unique IDs: {df['id'].nunique()}")
print(f"Do unique IDs match number of rows? {len(df) == df['id'].nunique()}")

Unique Value Counts:
id                   :  5110 unique values
gender               :     3 unique values
age                  :   104 unique values
hypertension         :     2 unique values
heart_disease        :     2 unique values
ever_married         :     2 unique values
work_type            :     5 unique values
Residence_type       :     2 unique values
avg_glucose_level    :  3979 unique values
bmi                  :   418 unique values
smoking_status       :     4 unique values
stroke               :     2 unique values


Columns with FEW unique values (likely categorical/binary):
  gender: 3 unique values - ['Male', 'Female', 'Other']
  hypertension: 2 unique values - [np.int64(0), np.int64(1)]
  heart_disease: 2 unique values - [np.int64(1), np.int64(0)]
  ever_married: 2 unique values - ['Yes', 'No']
  work_type: 5 unique values - ['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked']
  Residence_type: 2 unique values - ['Urban', 'Rural']
  smoking_status: 4

Findings:

Columns with very few unique values:
- Binary variables (2 unique values):
 -hypertension: 2 values - [0, 1]
 -heart_disease: 2 values - [0, 1]
 -ever_married: 2 values - ['Yes', 'No']
 -Residence_type: 2 values - ['Urban', 'Rural']
 -stroke: 2 values - [0, 1] (TARGET variable)
- Low cardinality categorical (3-5 unique values):
 -gender: 3 values - ['Male', 'Female', 'Other']
 -smoking_status: 4 values - ['formerly smoked', 'never smoked', 'smokes', 'Unknown']
 -work_type: 5 values - ['Private', 'Self-employed', 'children', 'Govt_job', 'Never_worked']

Columns that have many unique values: 
id: 5,110 unique values (patient identifier - each unique)
avg_glucose_level: 3,979 unique values (continuous measurement)
bmi: 418 unique values (continuous measurement, but fewer due to rounding)
age: 104 unique values (discrete but treated as continuous)

The number of unique IDs does match the number of rows

Part 2

# Data Dictionary

In [13]:
data_dict = {
    'Column': [
        'id', 'gender', 'age', 'hypertension', 'heart_disease', 
        'ever_married', 'work_type', 'Residence_type', 
        'avg_glucose_level', 'bmi', 'smoking_status', 'stroke'
    ],
    'Description': [
        'Unique patient identifier assigned to each individual in the dataset',
        'Biological sex of the patient',
        'Patient\'s age in years at time of data collection',
        'Binary indicator of hypertension diagnosis (high blood pressure ≥140/90 or ≥130/80 mmHg)',
        'Binary indicator of cardiovascular disease (CAD, heart failure, MI, AFib, valvular disease)',
        'Marital status indicator - whether the patient has ever been married',
        'Patient\'s employment category or occupational status',
        'Type of residential area where patient lives',
        'Average blood glucose level in mg/dL (likely fasting glucose or HbA1c equivalent)',
        'Body Mass Index calculated as weight(kg) / height(m)²',
        'Patient\'s smoking history and current status',
        'Target variable indicating whether the patient experienced a stroke'
    ],
    'Feature Type': [
        'Identifier',
        'Categorical - Nominal',
        'Continuous',
        'Binary',
        'Binary',
        'Categorical - Nominal (Binary)',
        'Categorical - Nominal',
        'Categorical - Nominal (Binary)',
        'Continuous',
        'Continuous',
        'Categorical - Nominal',
        'Binary (TARGET)'
    ],
    'Valid Values/Range': [
        '67 to 72,940 (5,110 unique)',
        'Male (2,115), Female (2,994), Other (1)',
        '0.08 to 82.00 years (Mean: 43.23)',
        '0 = No (4,612), 1 = Yes (498)',
        '0 = No (4,834), 1 = Yes (276)',
        'Yes (3,353), No (1,757)',
        'Private (2,925), Self-employed (819), children (687), Govt_job (657), Never_worked (22)',
        'Urban (2,596), Rural (2,514)',
        '55.12 to 271.74 mg/dL (Mean: 106.15)',
        '10.30 to 97.60 (Mean: 28.89)',
        'never smoked (1,892), Unknown (1,544), formerly smoked (885), smokes (789)',
        '0 = No stroke (4,861), 1 = Stroke (249)'
    ],
    'Notes/Issues': [
        'All unique, no duplicates. Should not be used as feature in modeling.',
        'Only 1 "Other" - insufficient for analysis. May need to exclude.',
        'Includes infants to elderly. Strongest non-modifiable risk factor.',
        '9.75% prevalence. Most important modifiable risk factor (3-4x risk increase).',
        '5.40% prevalence. Heart disease increases stroke risk 5-fold.',
        'Clinical relevance unclear. May proxy for age, SES, social support.',
        'Proxies for SES, stress, physical activity, healthcare access.',
        'Nearly balanced. Relevant for healthcare access and lifestyle factors.',
        'Units and timing unclear. Diabetes doubles stroke risk.',
        '201 missing (3.93%). Extreme values (10.30, 97.60) are rare but possible.',
        'MAJOR ISSUE: 30.2% "Unknown" = hidden missing data. Critical risk factor.',
        'SEVERE CLASS IMBALANCE: Only 4.87% positive cases. Needs special handling.'
    ]
}

data_dictionary = pd.DataFrame(data_dict)

pd.set_option('display.max_colwidth', None)  # Show full text
pd.set_option('display.width', None)  # Don't wrap
print("="*100)
print("DATA DICTIONARY - STROKE PREDICTION DATASET")
print("="*100)
display(data_dictionary)


DATA DICTIONARY - STROKE PREDICTION DATASET


Unnamed: 0,Column,Description,Feature Type,Valid Values/Range,Notes/Issues
0,id,Unique patient identifier assigned to each individual in the dataset,Identifier,"67 to 72,940 (5,110 unique)","All unique, no duplicates. Should not be used as feature in modeling."
1,gender,Biological sex of the patient,Categorical - Nominal,"Male (2,115), Female (2,994), Other (1)","Only 1 ""Other"" - insufficient for analysis. May need to exclude."
2,age,Patient's age in years at time of data collection,Continuous,0.08 to 82.00 years (Mean: 43.23),Includes infants to elderly. Strongest non-modifiable risk factor.
3,hypertension,Binary indicator of hypertension diagnosis (high blood pressure ≥140/90 or ≥130/80 mmHg),Binary,"0 = No (4,612), 1 = Yes (498)",9.75% prevalence. Most important modifiable risk factor (3-4x risk increase).
4,heart_disease,"Binary indicator of cardiovascular disease (CAD, heart failure, MI, AFib, valvular disease)",Binary,"0 = No (4,834), 1 = Yes (276)",5.40% prevalence. Heart disease increases stroke risk 5-fold.
5,ever_married,Marital status indicator - whether the patient has ever been married,Categorical - Nominal (Binary),"Yes (3,353), No (1,757)","Clinical relevance unclear. May proxy for age, SES, social support."
6,work_type,Patient's employment category or occupational status,Categorical - Nominal,"Private (2,925), Self-employed (819), children (687), Govt_job (657), Never_worked (22)","Proxies for SES, stress, physical activity, healthcare access."
7,Residence_type,Type of residential area where patient lives,Categorical - Nominal (Binary),"Urban (2,596), Rural (2,514)",Nearly balanced. Relevant for healthcare access and lifestyle factors.
8,avg_glucose_level,Average blood glucose level in mg/dL (likely fasting glucose or HbA1c equivalent),Continuous,55.12 to 271.74 mg/dL (Mean: 106.15),Units and timing unclear. Diabetes doubles stroke risk.
9,bmi,Body Mass Index calculated as weight(kg) / height(m)²,Continuous,10.30 to 97.60 (Mean: 28.89),"201 missing (3.93%). Extreme values (10.30, 97.60) are rare but possible."


# Clinical Research Questions for Version A

1. What is a normal average glucose level? What level indicates pre-diabetes? What level indicates diabetes?
According to the American Diabetes Association (ADA) guidelines for fasting plasma glucose:
- Normal: Less than 100 mg/dL (5.6 mmol/L).
- Pre-diabetes: 100-125 mg/dL (5.6-6.9 mmol/L).
- Diabetes: ≥126 mg/dL (≥7.0 mmol/L).

2. What are the major risk factors for stroke according to the American Heart Association? Which of these are captured in our dataset?
According to the American Heart Association, major stroke risk factors include:
- CAPTURED in our dataset:
 1. Age - Most important non-modifiable risk factor
 2. Hypertension (High Blood Pressure) - #1 modifiable risk factor
 3. Heart Disease - Especially atrial fibrillation
 4. Diabetes (elevated blood glucose) - Doubles stroke risk
 5. Smoking - Doubles stroke risk
 6. Obesity - Increases risk through multiple mechanisms
 7. Gender - Men higher risk when young, women when older
 - NOT CAPTURED in our dataset:
 8. Race/Ethnicity - African Americans have 2x risk
 9. Prior Stroke/TIA - Highest predictor of future stroke
 10. Atrial Fibrillation - Specific heart rhythm disorder
 11. High Cholesterol - Promotes atherosclerosis
 12. Physical Inactivity - Sedentary lifestyle increases risk
 13. Family History - Genetic predisposition
 14. Diet - Poor diet increases risk

3. Why might work_type be relevant to stroke risk? Think about stress, physical activity, and socioeconomic factors.
Work_type is relevant to stroke risk through multiple interconnected pathways:
 1. SOCIOECONOMIC STATUS (SES) Proxy:
 - Government jobs often indicate stable employment, benefits, healthcare access.
 - Private sector varies widely in compensation and benefits
 - Self-employed may lack consistent healthcare coverage
 - Never worked may indicate poverty, disability, or chronic unemployment
 - Lower SES is associated with:
    -50% higher stroke risk
    -Delayed healthcare seeking
    -Poor medication adherence
    -Limited health literacy
 2. OCCUPATIONAL STRESS:
 - High-stress jobs (certain private sector, self-employed) associated with:
    -Chronic elevation of cortisol and inflammatory markers
    -Higher blood pressure
    -Increased risk of hypertension
    -30-50% increased stroke risk in high-stress occupations
 - Job strain (high demand, low control) particularly harmful
 - Government jobs often have more structure and job security (lower stress)
 3. PHYSICAL ACTIVITY LEVELS:
 - Children = likely high physical activity (protective)
 - Govt_job/Private office work = often sedentary (risk factor)
 - Sedentary work increases stroke risk by 20-30%
 - Manual labor (protective) vs desk work (risk)
 - Physical inactivity is an independent stroke risk factor
 4. HEALTHCARE ACCESS:
 - Government employees often have comprehensive health insurance
 - Self-employed may lack insurance or have high deductibles
 - Never worked may rely on emergency care only
 5. LIFESTYLE FACTORS CLUSTERING:
 - Work type correlates with:
    -Diet quality (SES-related)
    -Smoking rates (higher in lower SES jobs)
    -Alcohol consumption patterns
    -Sleep quality (shift work vs regular hours)
    -Access to healthy food (food deserts in lower SES areas)

4. What is the relationship between hypertension (high blood pressure) and stroke risk? Why is this considered one of the most important risk factors?
Hypertension is the SINGLE MOST IMPORTANT modifiable risk factor for stroke because of the following:
- Increases stroke risk 3-4 fold compared to normotensive individuals
- Responsible for ~50% of all strokes globally
- Both ischemic (clot) and hemorrhagic (bleeding) strokes
- Risk is continuous and graded: higher BP = higher risk, even within "normal" range

Part 3: Data Validation

# 3.1 Age Validation

In [14]:
print("\n1. CHECKING FOR IMPOSSIBLE VALUES:")
print(f"   Ages below 0: {(df['age'] < 0).sum()}")
print(f"   Ages above 120: {(df['age'] > 120).sum()}")

print("\n2. ACTUAL AGE RANGE:")
print(f"   Minimum age: {df['age'].min():.2f} years")
print(f"   Maximum age: {df['age'].max():.2f} years")
print(f"   Mean age: {df['age'].mean():.2f} years")
print(f"   Median age: {df['age'].median():.2f} years")

print("\n3. PEDIATRIC PATIENT COUNTS:")
under_18 = (df['age'] < 18).sum()
under_1 = (df['age'] < 1).sum()
print(f"   Patients under 18 years: {under_18} ({under_18/len(df)*100:.2f}%)")
print(f"   Patients under 1 year (infants): {under_1} ({under_1/len(df)*100:.2f}%)")

print("\n4. YOUNGEST PATIENTS IN DATASET:")
youngest = df.nsmallest(10, 'age')[['id', 'age', 'gender', 'stroke']]
print(youngest)

print("\n5. INTERPRETING DECIMAL AGES (for youngest patients):")
for age in df.nsmallest(5, 'age')['age']:
    months = age * 12
    days = age * 365
    print(f"   Age {age:.2f} years = {months:.1f} months = {days:.0f} days")

print("\n6. AGE DISTRIBUTION:")
age_groups = pd.cut(df['age'], 
                    bins=[0, 1, 5, 12, 18, 30, 45, 60, 75, 120],
                    labels=['<1 year', '1-5', '5-12', '12-18', '18-30', 
                            '30-45', '45-60', '60-75', '75+'])
print(age_groups.value_counts().sort_index())

print("\n7. STROKE RATE BY AGE GROUP:")
age_stroke = df.groupby(age_groups, observed=True)['stroke'].agg(['count', 'sum', 'mean'])
age_stroke.columns = ['Total Patients', 'Stroke Cases', 'Stroke Rate']
age_stroke['Stroke Rate'] = age_stroke['Stroke Rate'] * 100
print(age_stroke)

print("\n8. AGE STATISTICS SUMMARY:")
print(df['age'].describe())


1. CHECKING FOR IMPOSSIBLE VALUES:
   Ages below 0: 0
   Ages above 120: 0

2. ACTUAL AGE RANGE:
   Minimum age: 0.08 years
   Maximum age: 82.00 years
   Mean age: 43.23 years
   Median age: 45.00 years

3. PEDIATRIC PATIENT COUNTS:
   Patients under 18 years: 856 (16.75%)
   Patients under 1 year (infants): 43 (0.84%)

4. YOUNGEST PATIENTS IN DATASET:
         id   age  gender  stroke
1614  47350  0.08  Female       0
3295  29955  0.08    Male       0
3618  22877  0.16    Male       0
3968  41500  0.16    Male       0
4021   8247  0.16    Male       0
996   53279  0.24    Male       0
1999  42500  0.24    Male       0
2898  64974  0.24    Male       0
3392  11371  0.24    Male       0
4293  69222  0.24    Male       0

5. INTERPRETING DECIMAL AGES (for youngest patients):
   Age 0.08 years = 1.0 months = 29 days
   Age 0.08 years = 1.0 months = 29 days
   Age 0.16 years = 1.9 months = 58 days
   Age 0.16 years = 1.9 months = 58 days
   Age 0.16 years = 1.9 months = 58 days

6. AGE D

Findings:

No impossible age values found

It does make clinical sense to have very young patients (infants) in a stroke dataset because of the following:
- Pediatric stroke is real but rare: Occurs in approximately 2-13 per 100,000 children per year
- Perinatal stroke: Most common in newborns (birth to 28 days)
 -Affects ~1 in 2,300 live births
 -Often related to birth complications, blood clotting disorders, heart defects
- Causes in infants differ from adults
- Important for pediatric hospitals and comprehensive stroke centers, however, risk factors differ (hypertension, diabetes less relevant in infants)

Decimal ages are used for percision (matters especially for infants and young children).
- Age 0.72 years = 0.72 × 12 = 8.6 months old
- Age 1.24 years = 1.24 × 12 = 14.9 months old (Roughly 1 year, 3 months old)

# 3.2 BMI Validation

In [15]:
print("\n1. BMI DATA TYPE:")
print(f"   Data type: {df['bmi'].dtype}")

print("\n2. MISSING VALUES CHECK:")
print(f"   Missing values (.isnull()): {df['bmi'].isnull().sum()}")
print(f"   Percentage missing: {(df['bmi'].isnull().sum() / len(df) * 100):.2f}%")

print("\n3. CHECKING FOR NON-NUMERIC VALUES:")
print(f"   Total non-null BMI values: {df['bmi'].notna().sum()}")

print("\n4. SAMPLE OF BMI VALUES:")
print("   First 20 BMI values:")
print(df['bmi'].head(20).values)

print("\n5. CHECKING FOR UNUSUAL BMI VALUES:")
print(f"   BMI = 0: {(df['bmi'] == 0).sum()}")
print(f"   BMI < 10: {(df['bmi'] < 10).sum()}")
print(f"   BMI > 80: {(df['bmi'] > 80).sum()}")
print(f"   BMI > 100: {(df['bmi'] > 100).sum()}")

print("\n6. BMI STATISTICS (excluding missing):")
print(df['bmi'].describe())

print("\n7. EXTREME BMI VALUES:")
print("   5 Lowest BMI values:")
print(df.nsmallest(5, 'bmi')[['id', 'age', 'bmi', 'stroke']])
print("\n   5 Highest BMI values:")
print(df.nlargest(5, 'bmi')[['id', 'age', 'bmi', 'stroke']])

print("\n8. BMI DISTRIBUTION BY CATEGORY:")
bmi_categories = pd.cut(df['bmi'], 
                        bins=[0, 18.5, 25, 30, 35, 40, 100],
                        labels=['Underweight (<18.5)', 
                                'Normal (18.5-25)', 
                                'Overweight (25-30)', 
                                'Obese I (30-35)', 
                                'Obese II (35-40)', 
                                'Obese III (40+)'])
print(bmi_categories.value_counts().sort_index())

print("\n9. ATTEMPTING CALCULATIONS:")
try:
    mean_bmi = df['bmi'].mean()
    print(f"   ✓ Mean BMI calculation successful: {mean_bmi:.2f}")
    print(f"   ✓ This automatically excludes NaN values")
except Exception as e:
    print(f"   ✗ Error calculating mean: {e}")

print(f"\n   Mean BMI (with skipna=True): {df['bmi'].mean(skipna=True):.2f}")
print(f"   Mean BMI (with skipna=False): {df['bmi'].mean(skipna=False)}")

print("\n10. CHECKING IF ANY VALUES ARE ACTUALLY STRINGS:")
print(f"    Are all non-null BMI values numeric? {df['bmi'].dropna().apply(lambda x: isinstance(x, (int, float))).all()}")


1. BMI DATA TYPE:
   Data type: float64

2. MISSING VALUES CHECK:
   Missing values (.isnull()): 201
   Percentage missing: 3.93%

3. CHECKING FOR NON-NUMERIC VALUES:
   Total non-null BMI values: 4909

4. SAMPLE OF BMI VALUES:
   First 20 BMI values:
[36.6  nan 32.5 34.4 24.  29.  27.4 22.8  nan 24.2 29.7 36.8 27.3  nan
 28.2 30.9 37.5 25.8 37.8  nan]

5. CHECKING FOR UNUSUAL BMI VALUES:
   BMI = 0: 0
   BMI < 10: 0
   BMI > 80: 2
   BMI > 100: 0

6. BMI STATISTICS (excluding missing):
count    4909.000000
mean       28.893237
std         7.854067
min        10.300000
25%        23.500000
50%        28.100000
75%        33.100000
max        97.600000
Name: bmi, dtype: float64

7. EXTREME BMI VALUES:
   5 Lowest BMI values:
         id    age   bmi  stroke
1609  38043   1.24  10.3       0
3307   3205  79.00  11.3       0
2187  59993  40.00  11.5       0
657   20364   4.00  12.0       0
922   45893   8.00  12.3       0

   5 Highest BMI values:
         id   age   bmi  stroke
2128  564

Findings:

Did NOT find any BMI values that aren't numbers.

0 non-numeric values.

It IS detected by .isnull() because NaN is Python/NumPy's standard way to represent missing numeric data, therefore, .isnull() specifically checks for NaN values.

If you tried to calculate the mean BMI without handling this, Pandas handles it for you automatically. By default, pandas uses skipna=True for aggregation functions, therefore, NaN values are automatically excluded from calculations.

# 3.3 Smoking Status Validation

In [16]:
print("\n1. ALL UNIQUE SMOKING STATUS VALUES:")
print(df['smoking_status'].unique())
print(f"\n   Number of unique values: {df['smoking_status'].nunique()}")

print("\n2. VALUE COUNTS:")
print(df['smoking_status'].value_counts())

print("\n3. PERCENTAGE DISTRIBUTION:")
print(df['smoking_status'].value_counts(normalize=True) * 100)

# Check unknown
unknown_count = (df['smoking_status'] == 'Unknown').sum()
print(f"\n4. 'UNKNOWN' SMOKING STATUS:")
print(f"   Records with 'Unknown': {unknown_count}")
print(f"   Percentage 'Unknown': {(unknown_count/len(df)*100):.2f}%")

print("\n5. IS 'UNKNOWN' DETECTED AS MISSING?")
print(f"   Null values (.isnull()): {df['smoking_status'].isnull().sum()}")
print(f"   'Unknown' values: {unknown_count}")
print(f"   Are they the same? {df['smoking_status'].isnull().sum() == unknown_count}")

print("\n6. COMPARISON WITH OTHER MISSING DATA:")
print(f"   BMI missing (NaN): {df['bmi'].isnull().sum()} ({(df['bmi'].isnull().sum()/len(df)*100):.2f}%)")
print(f"   Smoking 'Unknown': {unknown_count} ({(unknown_count/len(df)*100):.2f}%)")
print(f"   TOTAL missing info: {df['bmi'].isnull().sum() + unknown_count} patients")

print("\n7. STROKE RATE BY SMOKING STATUS:")
smoking_stroke = df.groupby('smoking_status')['stroke'].agg(['count', 'sum', 'mean'])
smoking_stroke.columns = ['Total', 'Strokes', 'Stroke Rate']
smoking_stroke['Stroke Rate'] = smoking_stroke['Stroke Rate'] * 100
print(smoking_stroke.sort_values('Stroke Rate', ascending=False))

print("\n8. CROSS-TABULATION (Smoking Status vs Stroke):")
print(pd.crosstab(df['smoking_status'], df['stroke'], 
                  margins=True, normalize='index') * 100)

print("\n9. DO 'UNKNOWN' SMOKERS ALSO HAVE MISSING BMI?")
unknown_smokers = df[df['smoking_status'] == 'Unknown']
print(f"   'Unknown' smokers with missing BMI: {unknown_smokers['bmi'].isnull().sum()}")
print(f"   Percentage: {(unknown_smokers['bmi'].isnull().sum()/len(unknown_smokers)*100):.2f}%")
print(f"   Overall BMI missing rate: {(df['bmi'].isnull().sum()/len(df)*100):.2f}%")

print("\n10. AGE DISTRIBUTION OF 'UNKNOWN' SMOKING STATUS:")
print(f"    Mean age (Unknown): {unknown_smokers['age'].mean():.2f} years")
print(f"    Mean age (Overall): {df['age'].mean():.2f} years")
print(f"    Children (<18) with 'Unknown': {(unknown_smokers['age'] < 18).sum()}")


1. ALL UNIQUE SMOKING STATUS VALUES:
['formerly smoked' 'never smoked' 'smokes' 'Unknown']

   Number of unique values: 4

2. VALUE COUNTS:
smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: count, dtype: int64

3. PERCENTAGE DISTRIBUTION:
smoking_status
never smoked       37.025440
Unknown            30.215264
formerly smoked    17.318982
smokes             15.440313
Name: proportion, dtype: float64

4. 'UNKNOWN' SMOKING STATUS:
   Records with 'Unknown': 1544
   Percentage 'Unknown': 30.22%

5. IS 'UNKNOWN' DETECTED AS MISSING?
   Null values (.isnull()): 0
   'Unknown' values: 1544
   Are they the same? False

6. COMPARISON WITH OTHER MISSING DATA:
   BMI missing (NaN): 201 (3.93%)
   Smoking 'Unknown': 1544 (30.22%)
   TOTAL missing info: 1745 patients

7. STROKE RATE BY SMOKING STATUS:
                 Total  Strokes  Stroke Rate
smoking_status                              
formerly smoked    885       70     7.909

Findings:

There are 4 unique values for smoking status:
- "never smoked" - 1,892 patients (37.0%)
- "Unknown" - 1,544 patients (30.2%)
- "formerly smoked" - 885 patients (17.3%)
- "smokes" - 789 patients (15.4%)

1,544 records have "Unknown" smoking status

"Unknown" should be treated as MISSING DATA, not as a valid category because:
- It represents lack of information, not a smoking behavior
- It's hidden missing data disguised as a category
- Similar stroke rate to overall population (not a distinct risk group)
- Clinical decision-making requires known smoking status
- Likely a data collection issue

Part 4 Create Clinical Age Groups

Findings:

1. How many patients in each age group:
Child (0-12)                     588
Adolescent (13-19)               378
Young Adult (20-39)              1204
Middle Adult (40-59)             1564
Older Adult (60-74)              858
Elderly (75+)                    518

2. Stroke rate (percentage) for each age group:
age_group                                           
Child (0-12)                     0.17%
Adolescent (13-19)               0.26%
Young Adult (20-39)              0.50%
Middle Adult (40-59)             3.84%
Older Adult (60-74)              9.21%
Elderly (75+)                    19.69%

3. At what life stage does stroke risk appear to increase most dramatically? Does this match what you learned in your clinical research?
The most dramatic increase in stroke risk occurs during the transition from YOUNG ADULT to MIDDLE ADULT (nearly 8x higher).
This is strongly validated by what I learned in my clinical research as ages 40-59 represent the "accumulation period" where:
- Years of hypertension begin causing arterial damage
- Atherosclerotic plaques become clinically significant
- Diabetes and metabolic syndrome manifest 
- Smoking effects accumulate (20+ pack-years)

In [23]:
df['age_group'] = pd.cut(df['age'],
                         bins=[0, 13, 20, 40, 60, 75, 120],
                         labels=['Child (0-12)', 
                                 'Adolescent (13-19)', 
                                 'Young Adult (20-39)', 
                                 'Middle Adult (40-59)', 
                                 'Older Adult (60-74)', 
                                 'Elderly (75+)'],
                         right=False, 
                         include_lowest=True)

print("\n" + "="*80)
print("VERIFICATION: AGE GROUP DISTRIBUTION")
print("\n" + "="*80)

print("\n1. COUNTS PER AGE GROUP:")
age_counts = df['age_group'].value_counts().sort_index()
print(age_counts)

print("\n2. PERCENTAGE DISTRIBUTION:")
age_percentages = df['age_group'].value_counts(normalize=True).sort_index() * 100
print(age_percentages.round(2))

print("\n3. VERIFICATION CHECKS:")
print(f"   Total patients in dataset: {len(df)}")
print(f"   Total patients categorized: {df['age_group'].notna().sum()}")
print(f"   Missing age groups: {df['age_group'].isna().sum()}")
print(f"   ✓ All patients categorized: {len(df) == df['age_group'].notna().sum()}")

print("\n4. ACTUAL AGE RANGES WITHIN EACH GROUP:")
age_range_summary = df.groupby('age_group', observed=True)['age'].agg(['min', 'max', 'mean', 'count'])
age_range_summary.columns = ['Min Age', 'Max Age', 'Mean Age', 'Count']
print(age_range_summary)

print("\n5. SAMPLE PATIENTS FROM EACH GROUP:")
for group in df['age_group'].cat.categories:
    sample = df[df['age_group'] == group].head(3)[['id', 'age', 'age_group', 'stroke']]
    print(f"\n{group}:")
    print(sample)

    print("\n" + "="*80)
print("STROKE RATE ANALYSIS BY AGE GROUP")
print("="*80)

# Calculate the percentage of patients who had a stroke in each age group
print("\n1. DETAILED STROKE STATISTICS BY AGE GROUP:")
stroke_by_age = df.groupby('age_group', observed=True)['stroke'].agg([
    ('Total Patients', 'count'),
    ('Stroke Cases', 'sum'),
    ('Stroke Rate (%)', lambda x: (x.sum() / len(x)) * 100)
])
print(stroke_by_age.round(2))

print("\n2. COMPREHENSIVE AGE GROUP ANALYSIS:")
age_summary = df.groupby('age_group', observed=True).agg({
    'stroke': ['count', 'sum', 'mean'],
    'age': ['min', 'max', 'mean'],
    'hypertension': 'mean',
    'heart_disease': 'mean'
})

age_summary.columns = ['Total_Patients', 'Stroke_Cases', 'Stroke_Rate', 
                       'Min_Age', 'Max_Age', 'Mean_Age',
                       'Hypertension_Rate', 'Heart_Disease_Rate' ]

# Convert rates to percentages
age_summary['Stroke_Rate'] = age_summary['Stroke_Rate'] * 100
age_summary['Hypertension_Rate'] = age_summary['Hypertension_Rate'] * 100
age_summary['Heart_Disease_Rate'] = age_summary['Heart_Disease_Rate'] * 100

print(age_summary.round(2))

print("\n3. RELATIVE STROKE RISK (compared to Children):")
baseline_rate = stroke_by_age.loc['Child (0-12)', 'Stroke Rate (%)']
stroke_by_age['Relative Risk'] = stroke_by_age['Stroke Rate (%)'] / baseline_rate
print(stroke_by_age[['Stroke Rate (%)', 'Relative Risk']].round(2))

print("\n4. VISUAL REPRESENTATION:")
print("\nStroke Rate by Age Group:")
for idx, row in stroke_by_age.iterrows():
    bar_length = int(row['Stroke Rate (%)'] * 2)  # Scale for visibility
    bar = '█' * bar_length
    print(f"{idx:25} {row['Stroke Rate (%)']:5.2f}% {bar}")


VERIFICATION: AGE GROUP DISTRIBUTION


1. COUNTS PER AGE GROUP:
age_group
Child (0-12)             588
Adolescent (13-19)       378
Young Adult (20-39)     1204
Middle Adult (40-59)    1564
Older Adult (60-74)      858
Elderly (75+)            518
Name: count, dtype: int64

2. PERCENTAGE DISTRIBUTION:
age_group
Child (0-12)            11.51
Adolescent (13-19)       7.40
Young Adult (20-39)     23.56
Middle Adult (40-59)    30.61
Older Adult (60-74)     16.79
Elderly (75+)           10.14
Name: proportion, dtype: float64

3. VERIFICATION CHECKS:
   Total patients in dataset: 5110
   Total patients categorized: 5110
   Missing age groups: 0
   ✓ All patients categorized: True

4. ACTUAL AGE RANGES WITHIN EACH GROUP:
                      Min Age  Max Age   Mean Age  Count
age_group                                               
Child (0-12)             0.08     12.0   5.619048    588
Adolescent (13-19)      13.00     19.0  16.015873    378
Young Adult (20-39)     20.00     39.0  29.9875

Part 5: Research Questions

# 5.1 Write Three Answerable Questions

1. Is there a difference in average glucose levels between patients who experienced a stroke and those who did not?

2. Do stroke rates differ between patients who have been married and those who have never been married?

3. Are patients who have both hypertension and heart disease more likely to have had a stroke compared to patients with neither or only one of these conditions?

# 5.2 One Question the Data CANNOT Answer

Does taking antihypertensive medication reduce the risk of stroke?

# 5.3 Grouping Analysis: Average BMI by Work Type

In [18]:
# Handle BMI data quality issue
df['bmi'] = df['bmi'].replace('N/A', pd.NA)
df['bmi'] = df['bmi'].astype(float)

# Groupby analysis
bmi_by_work = df.groupby('work_type')['bmi'].mean()

print(bmi_by_work)


work_type
Govt_job         30.522063
Never_worked     25.545455
Private          30.304625
Self-employed    30.211871
children         20.038003
Name: bmi, dtype: float64



My Interpretation:

Work type with the highest average BMI: Government job
Work type with the lowest average BMI: Children
This can be explained by the fact that children naturally have lower BMI values, as well as in how government jobs are usually sedentary, thus promoting higher BMI levels.


Part 6: Target Variable Analysis

In [19]:
# Show the count and percentage of stroke vs no-stroke
print("\n1. STROKE DISTRIBUTION (Raw Counts):")
stroke_counts = df['stroke'].value_counts().sort_index()
print(stroke_counts)

print("\n2. STROKE DISTRIBUTION (Percentages):")
stroke_percentages = df['stroke'].value_counts(normalize=True).sort_index() * 100
print(stroke_percentages.round(2))



1. STROKE DISTRIBUTION (Raw Counts):
stroke
0    4861
1     249
Name: count, dtype: int64

2. STROKE DISTRIBUTION (Percentages):
stroke
0    95.13
1     4.87
Name: proportion, dtype: float64


### Bonus Questions:

1. What percentage of patients in this dataset had a stroke?

In [20]:
stroke_pct = df["stroke"].mean() * 100
print(f"{stroke_pct:.2f}% of patients had a stroke ({df['stroke'].sum()} out of {len(df)})")

4.87% of patients had a stroke (249 out of 5110)


4.87%

2. Is this dataset balanced or imbalanced? (A balanced dataset has roughly equal classes)
This dataset is severely imbalanced.

3. Why does class imbalance matter for machine learning?
Class imbalance matters tremendously for machine learning because it fundamentally breaks the assumptions and optimization objectives of most standard algorithms, introducing bias into the agorithm that will cause it to favor the majority class.

4. 4. In the real world, would you expect stroke rates to be this high or this low? What might this tell you about how this dataset was collected?
The current rate of prevalence (living with stroke history) is 2.5-3%. Our rate is 1.6-2x higher than this, therefore, in the real world I would expect stroke rates to be lower.
The typical stroke rate for cardiology clinic patients, however, is 5-10%, which is closely in line with our rate. This suggests this data was collected from a clinical/hospital database.
    

Dr. Ammar Feedback: Awesome work Tyler, you can get the fomatting better on the markdown.  See the Bonus Question.. 

Correlation analysis of top three risk factors

In [33]:
"""
Stroke Risk Factors - Statistical Correlation Analysis
This script analyzes correlations between age, heart disease, hypertension and stroke occurrence.
"""
from scipy import stats

print(f"Total records: {len(df)}")
print(f"Stroke cases: {df['stroke'].sum()} ({df['stroke'].mean()*100:.2f}%)")
print("\n" + "="*80)

# Calculate correlations and statistical tests
results = []

# 1. Age and Stroke - Pearson correlation
age_stroke_corr, age_stroke_pval = stats.pearsonr(df['age'], df['stroke'])
results.append({
    'Variables': 'Age vs Stroke',
    'Correlation Coefficient': f"{age_stroke_corr:.4f}",
    'P-value': f"{age_stroke_pval:.4e}",
    'Statistical Significance': 'Yes (p < 0.05)' if age_stroke_pval < 0.05 else 'No (p ≥ 0.05)'
})

# 2. Heart Disease and Stroke - Point-biserial correlation (both binary)
heart_stroke_corr, heart_stroke_pval = stats.pointbiserialr(df['heart_disease'], df['stroke'])
results.append({
    'Variables': 'Heart Disease vs Stroke',
    'Correlation Coefficient': f"{heart_stroke_corr:.4f}",
    'P-value': f"{heart_stroke_pval:.4e}",
    'Statistical Significance': 'Yes (p < 0.05)' if heart_stroke_pval < 0.05 else 'No (p ≥ 0.05)'
})

# 3. Hypertension and Stroke - Point-biserial correlation (both binary)
hyper_stroke_corr, hyper_stroke_pval = stats.pointbiserialr(df['hypertension'], df['stroke'])
results.append({
    'Variables': 'Hypertension vs Stroke',
    'Correlation Coefficient': f"{hyper_stroke_corr:.4f}",
    'P-value': f"{hyper_stroke_pval:.4e}",
    'Statistical Significance': 'Yes (p < 0.05)' if hyper_stroke_pval < 0.05 else 'No (p ≥ 0.05)'
})

# Create results dataframe
results_df = pd.DataFrame(results)

# Display the correlation table
print("\nSTATISTICAL ANALYSIS: CORRELATIONS WITH STROKE")
print("="*80)
print(results_df.to_string(index=False))
print("="*80)

# Additional descriptive statistics
print("\n\nADDITIONAL DESCRIPTIVE STATISTICS")
print("="*80)

print("\n1. Age comparison:")
print(f"   Mean age (Stroke = Yes): {df[df['stroke']==1]['age'].mean():.2f} years")
print(f"   Mean age (Stroke = No):  {df[df['stroke']==0]['age'].mean():.2f} years")
print(f"   Difference: {df[df['stroke']==1]['age'].mean() - df[df['stroke']==0]['age'].mean():.2f} years")

print("\n2. Heart Disease prevalence:")
print(f"   Among stroke patients:     {df[df['stroke']==1]['heart_disease'].mean()*100:.2f}%")
print(f"   Among non-stroke patients: {df[df['stroke']==0]['heart_disease'].mean()*100:.2f}%")
print(f"   Risk ratio: {(df[df['stroke']==1]['heart_disease'].mean() / df[df['stroke']==0]['heart_disease'].mean()):.2f}x")

print("\n3. Hypertension prevalence:")
print(f"   Among stroke patients:     {df[df['stroke']==1]['hypertension'].mean()*100:.2f}%")
print(f"   Among non-stroke patients: {df[df['stroke']==0]['hypertension'].mean()*100:.2f}%")
print(f"   Risk ratio: {(df[df['stroke']==1]['hypertension'].mean() / df[df['stroke']==0]['hypertension'].mean()):.2f}x")

print("\n" + "="*80)

# Optional: Create a more detailed table with confidence intervals
print("\n\nDETAILED STATISTICAL TABLE")
print("="*80)

detailed_results = []

# Age vs Stroke
n = len(df)
age_ci = 1.96 * np.sqrt((1 - age_stroke_corr**2) / (n - 2))
detailed_results.append({
    'Variable Pair': 'Age vs Stroke',
    'Correlation (r)': f"{age_stroke_corr:.4f}",
    '95% CI': f"[{age_stroke_corr - age_ci:.4f}, {age_stroke_corr + age_ci:.4f}]",
    'P-value': f"{age_stroke_pval:.2e}",
    'N': n
})

# Heart Disease vs Stroke
heart_ci = 1.96 * np.sqrt((1 - heart_stroke_corr**2) / (n - 2))
detailed_results.append({
    'Variable Pair': 'Heart Disease vs Stroke',
    'Correlation (r)': f"{heart_stroke_corr:.4f}",
    '95% CI': f"[{heart_stroke_corr - heart_ci:.4f}, {heart_stroke_corr + heart_ci:.4f}]",
    'P-value': f"{heart_stroke_pval:.2e}",
    'N': n
})

# Hypertension vs Stroke
hyper_ci = 1.96 * np.sqrt((1 - hyper_stroke_corr**2) / (n - 2))
detailed_results.append({
    'Variable Pair': 'Hypertension vs Stroke',
    'Correlation (r)': f"{hyper_stroke_corr:.4f}",
    '95% CI': f"[{hyper_stroke_corr - hyper_ci:.4f}, {hyper_stroke_corr + hyper_ci:.4f}]",
    'P-value': f"{hyper_stroke_pval:.2e}",
    'N': n
})

detailed_df = pd.DataFrame(detailed_results)
print(detailed_df.to_string(index=False))
print("="*80)


Total records: 5110
Stroke cases: 249 (4.87%)


STATISTICAL ANALYSIS: CORRELATIONS WITH STROKE
              Variables Correlation Coefficient    P-value Statistical Significance
          Age vs Stroke                  0.2453 7.0308e-71           Yes (p < 0.05)
Heart Disease vs Stroke                  0.1349 3.4519e-22           Yes (p < 0.05)
 Hypertension vs Stroke                  0.1279 4.3676e-20           Yes (p < 0.05)


ADDITIONAL DESCRIPTIVE STATISTICS

1. Age comparison:
   Mean age (Stroke = Yes): 67.73 years
   Mean age (Stroke = No):  41.97 years
   Difference: 25.76 years

2. Heart Disease prevalence:
   Among stroke patients:     18.88%
   Among non-stroke patients: 4.71%
   Risk ratio: 4.01x

3. Hypertension prevalence:
   Among stroke patients:     26.51%
   Among non-stroke patients: 8.89%
   Risk ratio: 2.98x


Results saved to 'stroke_correlation_results.csv'


DETAILED STATISTICAL TABLE
          Variable Pair Correlation (r)           95% CI  P-value    N
       