# Libraries and Dataset

In [15]:
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
print('data loaded')

data loaded


In [3]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


# 10-Point Inspection

In [4]:
#1 Shape
#shape = df.shape
#print(shape[0])
df.shape

(5110, 12)

**Findings**

* 5110 Rows
* 12 Columns/Features
* Each row represents a patient, while each column is a feature of that patient

In [5]:
#2 Column Names
columns = df.columns
n = 1
for i in columns:
    print(f'Column {n}: {i}')
    n += 1 

Column 1: id
Column 2: gender
Column 3: age
Column 4: hypertension
Column 5: heart_disease
Column 6: ever_married
Column 7: work_type
Column 8: Residence_type
Column 9: avg_glucose_level
Column 10: bmi
Column 11: smoking_status
Column 12: stroke


For columns like work and residence type I will need to look at the values to better understand how they have been used. I will also need to research the medical value columns (such as 'bmi' and 'avg_glucose_level') so that I can know what impossible/illogical values look like.

In [6]:
# 3 Data Types
df.dtypes

numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

print('Sorting Values...\n')

print('Numerical Columns:')
for col in numerical_cols:
    print(f"{col}: {df.dtypes[col]}")

print()
print('Categorical Columns:')
for col in categorical_cols:
    print(f"{col}: {df.dtypes[col]}")


Sorting Values...

Numerical Columns:
id: int64
age: float64
hypertension: int64
heart_disease: int64
avg_glucose_level: float64
bmi: float64
stroke: int64

Categorical Columns:
gender: object
ever_married: object
work_type: object
Residence_type: object
smoking_status: object


Some of the data types in the numrical section cannot be considered truly numerical. the 'heart_disease' column, while using integers, is actually categorical with 0 meaning they do not have heart disease and 1 meaning they do. As such, going forward I shouldn't rely on the dtypes to determine whether or not they are numerical or categorical.

In [7]:
#4 First look
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


* I was curious what the dataset meant by work and residence type, but now it is a little  more clear what that means. I do wonder why they did not consider sub-urban as a residence type, though perhap they considered it close enough to urban.
* Nothing unexpected or unusual jumps out, except for the potentially missing values which will be discussed next.
* There do seem to be some obvious missing values immediately, with the second line having 'NaN' for bmi. Furthermore, not seen here but in the tail on the next block, there is an 'Unknown' value for smoking, which may be considered a placeholder.

In [8]:
#5 Last Look
df.tail()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


* The data seems to have a clean end
* All the rows remain consistent with the beginning

In [9]:
#6 Memory Usage
df.memory_usage()

Index                  132
id                   40880
gender               40880
age                  40880
hypertension         40880
heart_disease        40880
ever_married         40880
work_type            40880
Residence_type       40880
avg_glucose_level    40880
bmi                  40880
smoking_status       40880
stroke               40880
dtype: int64

In [10]:
print(f"Total memory usage: {df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB")

Total memory usage: 1.62 MB


This is a very small data set by data science standards.

In [11]:
#7 Missing Values
df.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [12]:
#8 Duplicates
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5105    False
5106    False
5107    False
5108    False
5109    False
Length: 5110, dtype: bool

In [13]:
#9 Basic Statistics
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [14]:
#10 Unique Counts
df.nunique()

id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

# Data Dictionary

# Data Validation

In [17]:
# 1. How many ages are below 0?
below_zero = (df["age"] < 0).sum()

# 2. How many ages are above 120?
above_120 = (df["age"] > 120).sum()

# 3. What is the actual age range?
min_age = df["age"].min()
max_age = df["age"].max()

# 4. How many patients are under 18?
under_18 = (df["age"] < 18).sum()

# 5. How many patients are under 1 year old?
under_1 = (df["age"] < 1).sum()

# --- Print Results ---
print("Age Validation Results:")
print(f"Ages below 0: {below_zero}")
print(f"Ages above 120: {above_120}")
print(f"Actual age range: {min_age} to {max_age}")
print(f"Patients under 18: {under_18}")
print(f"Patients under 1 year old: {under_1}")

Age Validation Results:
Ages below 0: 0
Ages above 120: 0
Actual age range: 0.08 to 82.0
Patients under 18: 856
Patients under 1 year old: 43


In [18]:
# Basic summary statistics
print("Summary Statistics for BMI:")
print(df["bmi"].describe())

print("\n")

# Count missing values
print("Missing BMI values:", df["bmi"].isna().sum())

print("\n")

# Value counts (sorted)
print("BMI Value Counts (Top 20 Most Frequent):")
print(df["bmi"].value_counts().head(20))

print("\n")

# Check for impossible / unusual values
print("BMI values below 10:", (df["bmi"] < 10).sum())
print("BMI values above 60:", (df["bmi"] > 60).sum())

print("\n")

# Show extreme values
print("Lowest BMI values:")
print(df["bmi"].sort_values().head(10))

print("\n")

print("Highest BMI values:")
print(df["bmi"].sort_values(ascending=False).head(10))


Summary Statistics for BMI:
count    4909.000000
mean       28.893237
std         7.854067
min        10.300000
25%        23.500000
50%        28.100000
75%        33.100000
max        97.600000
Name: bmi, dtype: float64


Missing BMI values: 201


BMI Value Counts (Top 20 Most Frequent):
bmi
28.7    41
28.4    38
27.7    37
27.6    37
26.7    37
26.1    37
27.3    36
23.4    36
27.0    35
25.1    34
26.4    34
26.9    34
25.5    33
24.8    31
28.9    31
23.5    31
30.3    30
28.3    30
26.5    30
22.2    30
Name: count, dtype: int64


BMI values below 10: 0
BMI values above 60: 13


Lowest BMI values:
1609    10.3
3307    11.3
2187    11.5
657     12.0
922     12.3
3319    12.8
3968    13.0
3619    13.2
4694    13.3
1701    13.4
Name: bmi, dtype: float64


Highest BMI values:
2128    97.6
4209    92.0
928     78.0
544     71.9
1559    66.8
358     64.8
4188    64.4
2764    63.3
3825    61.6
2840    61.2
Name: bmi, dtype: float64


In [19]:
print("Smoking Status Distribution (Counts and Percentages):")

counts = df["smoking_status"].value_counts(dropna=False)
percentages = df["smoking_status"].value_counts(normalize=True, dropna=False) * 100

smoking_summary = pd.DataFrame({
    "Count": counts,
    "Percentage (%)": percentages.round(2)
})

print(smoking_summary)


Smoking Status Distribution (Counts and Percentages):
                 Count  Percentage (%)
smoking_status                        
never smoked      1892           37.03
Unknown           1544           30.22
formerly smoked    885           17.32
smokes             789           15.44


In [20]:
df["smoking_status"] = df["smoking_status"].replace("Unknown", pd.NA)


# Create Age Groups

In [21]:
# Define bins and labels
bins = [0, 5, 15, 25, 45, 65, 75, 85, float("inf")]

labels = [
    "Infant/Toddler (0-4)",
    "Child (5-14)",
    "Youth (15-24)",
    "Adult (25-44)",
    "Middle Age (45-64)",
    "Young-Old (65-74)",
    "Old-Old (75-84)",
    "Oldest-Old (85+)"
]

# Create new column
df["age_group"] = pd.cut(
    df["age"],
    bins=bins,
    labels=labels,
    right=False  # includes left value, excludes right (e.g., 5 goes into 5-14)
)

# Verify results
print(df["age_group"].value_counts())


age_group
Middle Age (45-64)      1550
Adult (25-44)           1297
Youth (15-24)            537
Old-Old (75-84)          518
Young-Old (65-74)        509
Child (5-14)             444
Infant/Toddler (0-4)     255
Oldest-Old (85+)           0
Name: count, dtype: int64


In [23]:
# Define the correct order
age_order = [
    "Infant/Toddler (0-4)",
    "Child (5-14)",
    "Youth (15-24)",
    "Adult (25-44)",
    "Middle Age (45-64)",
    "Young-Old (65-74)",
    "Old-Old (75-84)",
    "Oldest-Old (85+)"
]

# Convert to ordered categorical
df["age_group"] = pd.Categorical(
    df["age_group"],
    categories=age_order,
    ordered=True
)

# Verify ordering
print(df["age_group"].value_counts().sort_index())


age_group
Infant/Toddler (0-4)     255
Child (5-14)             444
Youth (15-24)            537
Adult (25-44)           1297
Middle Age (45-64)      1550
Young-Old (65-74)        509
Old-Old (75-84)          518
Oldest-Old (85+)           0
Name: count, dtype: int64


In [22]:
# Calculate stroke rate per age group
stroke_percentage = (
    df.groupby("age_group")["stroke"]
      .mean() * 100
).round(2)

print("Percentage of Patients Who Had a Stroke by Age Group:")
print(stroke_percentage)


Percentage of Patients Who Had a Stroke by Age Group:
age_group
Infant/Toddler (0-4)     0.39
Child (5-14)             0.23
Youth (15-24)            0.00
Adult (25-44)            0.62
Middle Age (45-64)       5.16
Young-Old (65-74)       11.20
Old-Old (75-84)         19.69
Oldest-Old (85+)          NaN
Name: stroke, dtype: float64


  df.groupby("age_group")["stroke"]


# Research Questions

# Target Variable Analysis 