In [23]:
# Import libraries
# Load the dataset
# Display first few rows to confirm it loaded

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Location of the stroke dataset CSV (a relative path, so it is resolved
# against the notebook's working directory).
DATA_PATH = 'healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)

# Preview the top of the frame to confirm that the load worked.
print("First 5 rows of the dataset:", df.head(), sep="\n")

First 5 rows of the dataset:
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  


In [25]:
# Step 1: Shape

In [26]:
# Summarise the frame's dimensions and list its column labels.
n_rows, n_cols = df.shape
column_labels = list(df.columns)

print("Dataset Shape:")
print(f"Rows (observations): {n_rows}")
print(f"Columns (features): {n_cols}")
print(f"\nColumn names: {column_labels}")




Dataset Shape:
Rows (observations): 5110
Columns (features): 12

Column names: ['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']


In [27]:
'''
Findings:
Rows (observations): 5110
Columns (features): 12
Each row represents a patient with 12 features assessed
'''

'\nFindings:\nRows (observations): 5110\nColumns (features): 12\nEach row represents a patient with 12 features assessed\n'

In [28]:
# Step 2: Column Names

In [29]:
# Materialise the column labels once, then show them together with their count.
column_names = df.columns.tolist()

print("Column Names:")
print(column_names)
print(f"\nTotal number of columns: {len(column_names)}")


Column Names:
['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']

Total number of columns: 12


In [30]:
'''
Findings:

List all column names:
id - Unique patient identifier
gender - Patient's biological sex
age - Patient's age in years
hypertension - History of high blood pressure
heart_disease - History of heart disease
ever_married - Marital status
work_type - Type of employment/occupation
Residence_type - Urban or rural residence
avg_glucose_level - Average blood glucose level
bmi - Body Mass Index
smoking_status - Smoking history
stroke - Whether patient had a stroke (TARGET VARIABLE)

avg_glucose_level - Need to understand what units (mg/dL vs mmol/L) and whether this is fasting glucose, random glucose, or HbA1c equivalent
bmi - Need to confirm the standard BMI categories for clinical interpretation
hypertension - Need to understand the diagnostic threshold used (140/90 vs 130/80 mmHg)
smoking_status - Need to clarify what "Unknown" means and the definitions of the other categories
work_type - Need to understand how "children" and "Never_worked" are classified
'''

'\nFindings:\n\nList all column names:\nid - Unique patient identifier\ngender - Patient\'s biological sex\nage - Patient\'s age in years\nhypertension - History of high blood pressure\nheart_disease - History of heart disease\never_married - Marital status\nwork_type - Type of employment/occupation\nResidence_type - Urban or rural residence\navg_glucose_level - Average blood glucose level\nbmi - Body Mass Index\nsmoking_status - Smoking history\nstroke - Whether patient had a stroke (TARGET VARIABLE)\n\navg_glucose_level - Need to understand what units (mg/dL vs mmol/L) and whether this is fasting glucose, random glucose, or HbA1c equivalent\nbmi - Need to confirm the standard BMI categories for clinical interpretation\nhypertension - Need to understand the diagnostic threshold used (140/90 vs 130/80 mmHg)\nsmoking_status - Need to clarify what "Unknown" means and the definitions of the other categories\nwork_type - Need to understand how "children" and "Never_worked" are classified\n

In [31]:
# Step 3: Data Types

In [35]:
# Report each column's dtype, then split the columns into numeric and text groups.
print("Data Types for Each Column:")
print(df.dtypes)
print("\n" + "="*50)

# Columns stored as 64-bit integers or floats.
print("\nNumeric Columns:")
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(numeric_cols)

# Columns stored with the generic object dtype.
print("\nCategorical Columns (object/string):")
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(categorical_cols)

print("\n" + "="*50)
print("Data Type Summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

Data Types for Each Column:
id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


Numeric Columns:
['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']

Categorical Columns (object/string):
['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

Data Type Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5

In [36]:
'''
Findings:

Numeric Columns:
id (int64) - Integer identifier
age (float64) - Continuous/decimal number
hypertension (int64) - Integer (0 or 1)
heart_disease (int64) - Integer (0 or 1)
avg_glucose_level (float64) - Continuous measurement
bmi (float64) - Continuous measurement
stroke (int64) - Integer (0 or 1)

Categorical Columns:
gender (object) - Text categories
ever_married (object) - Text categories
work_type (object) - Text categories
Residence_type (object) - Text categories
smoking_status (object) - Text categories

No incorrect data types detected
'''

'\nFindings:\n\nNumeric Columns:\nid (int64) - Integer identifier\nage (float64) - Continuous/decimal number\nhypertension (int64) - Integer (0 or 1)\nheart_disease (int64) - Integer (0 or 1)\navg_glucose_level (float64) - Continuous measurement\nbmi (float64) - Continuous measurement\nstroke (int64) - Integer (0 or 1)\n\nCategorical Columns:\ngender (object) - Text categories\never_married (object) - Text categories\nwork_type (object) - Text categories\nResidence_type (object) - Text categories\nsmoking_status (object) - Text categories\n\nNo incorrect data types detected\n'

In [37]:
# Step 4: First Look

In [39]:
# Visual divider reused between the sections of this cell's output.
section_break = "\n" + "="*80

# Text columns whose distinct values are listed to spot placeholder entries.
text_columns = (
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
)

# Top of the dataset.
print("First 10 rows of the dataset:")
print(df.head(10))

# A fixed-seed random sample, so the same rows are shown on every run.
print(section_break)
print("Sample of data (random 5 rows):")
print(df.sample(n=5, random_state=42))

# Distinct values per text column.
print(section_break)
print("Checking for potential placeholder values:")
print("\nUnique values in categorical columns:")
for col in text_columns:
    distinct_values = df[col].unique()
    print(f"{col}: {distinct_values}")

First 10 rows of the dataset:
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   
5  56669    Male  81.0             0              0          Yes   
6  53882    Male  74.0             1              1          Yes   
7  10434  Female  69.0             0              0           No   
8  27419  Female  59.0             0              0          Yes   
9  60491  Female  78.0             0              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never 

In [40]:
'''
Findings:

What do the actual values look like:
- Numeric values appear reasonable:
 -Ages range from infants (0.08 years ≈ 1 month old) to elderly (82 years)
 -Glucose levels look realistic (55-271 mg/dL)
 -BMI values range from 10.3 to 97.6; the upper extreme is far above typical adult values and should be reviewed as a possible outlier

- Categorical values are clear and descriptive:
 -Gender: "Male", "Female", "Other"
 -Work type: "Private", "Self-employed", "Govt_job", "children", "Never_worked"
 -Smoking status: "never smoked", "formerly smoked", "smokes", "Unknown"

- Binary values are coded as 0/1 (hypertension, heart_disease, stroke)

Anything unusual or unexpected:
- Very young patients: There's an infant aged 0.08 years (about 1 month old) in the dataset. Pediatric stroke is rare but possible.
- "Unknown" smoking status: 30.2% of records have "Unknown" smoking status - this is a significant amount of missing information disguised as a category.
- "Other" gender: Only 1 patient classified as "Other" - this might need special handling in analysis.
- "Never_worked": 22 patients in this category - might overlap with "children" category.

Values that look like they might be placeholders for missing data:
- "Unknown" in smoking_status: This appears to be a placeholder for missing data (1,544 records). Rather than showing as NaN, the absence of smoking information is coded as "Unknown".
'''

'\nFindings:\n\nWhat do the actual values look like:\n- Numeric values appear reasonable:\n -Ages range from infants (0.08 years ≈ 1 month old) to elderly (82 years)\n -Glucose levels look realistic (55-271 mg/dL)\n -BMI values are in expected ranges (10.3-97.6)\n\n- Categorical values are clear and descriptive:\n -Gender: "Male", "Female", "Other"\n -Work type: "Private", "Self-employed", "Govt_job", "children", "Never_worked"\n -Smoking status: "never smoked", "formerly smoked", "smokes", "Unknown"\n\n- Binary values are coded as 0/1 (hypertension, heart_disease, stroke)\n\nAnything unusual or unexpected:\n- Very young patients: There\'s an infant aged 0.08 years (about 1 month old) in the dataset. Pediatric stroke is rare but possible.\n- "Unknown" smoking status: 30.2% of records have "Unknown" smoking status - this is a significant amount of missing information disguised as a category.\n- "Other" gender: Only 1 patient classified as "Other" - this might need special handling in an

In [41]:
# Step 5: Last Look

In [42]:
# Inspect the end of the dataset and compare it with the beginning.
print("Last 10 rows of the dataset:")
print(df.tail(10))

print("\n" + "="*80)
print("Comparison of first and last rows:")
print("\nFirst row:")
print(df.iloc[0])
print("\nLast row:")
print(df.iloc[-1])

# Check if data ends cleanly
print("\n" + "="*80)
print(f"Total rows in dataset: {len(df)}")
print(f"Last index: {df.index[-1]}")

# The index is continuous only if it is exactly 0, 1, ..., len(df) - 1.
# Comparing against a RangeIndex checks every label; the previous test
# (non-strict monotonicity plus "last label == len - 1") also accepted
# indexes with a duplicate and a gap, such as [0, 0, 2].
index_is_continuous = df.index.equals(pd.RangeIndex(len(df)))
print(f"Are indices continuous? {index_is_continuous}")

Last 10 rows of the dataset:
         id  gender   age  hypertension  heart_disease ever_married  \
5100  68398    Male  82.0             1              0          Yes   
5101  36901  Female  45.0             0              0          Yes   
5102  45010  Female  57.0             0              0          Yes   
5103  22127  Female  18.0             0              0           No   
5104  14180  Female  13.0             0              0           No   
5105  18234  Female  80.0             1              0          Yes   
5106  44873  Female  81.0             0              0          Yes   
5107  19723  Female  35.0             0              0          Yes   
5108  37544    Male  51.0             0              0          Yes   
5109  44679  Female  44.0             0              0          Yes   

          work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
5100  Self-employed          Rural              71.97  28.3     never smoked   
5101        Private          

In [43]:
'''
Findings:

The dataset ends cleanly at row 5,109
- No incomplete rows or truncated data
- No error messages or corruption visible
- All columns are present in the last rows
- Indices run continuously from 0 to 5,109 with no gaps

The last rows are consistent with the first rows
- Same 12 columns present throughout
- Same data types maintained (no type changes mid-dataset)
- Same categorical values (no new categories appearing at the end)
- Mix of stroke outcomes (both 0 and 1 present)
- Similar patterns of missing BMI values (NaN appears in both sections)
- Value ranges appear consistent (ages, glucose, BMI all in similar ranges)
'''

'\nFindings:\n\nThe dataset ends cleanly at row 5,109\n- No incomplete rows or truncated data\n- No error messages or corruption visible\n- All columns are present in the last rows\n- Indices run continuously from 0 to 5,109 with no gaps\n\nThe last rows consistent with the first rows\n- Same 12 columns present throughout\n- Same data types maintained (no type changes mid-dataset)\n- Same categorical values (no new categories appearing at the end)\n- Mix of stroke outcomes (both 0 and 1 present)\n- Similar patterns of missing BMI values (NaN appears in both sections)\n- Value ranges appear consistent (ages, glucose, BMI all in similar ranges)\n'

In [44]:
# Step 6: Memory Usage

In [46]:
# Report how much memory the DataFrame occupies, overall and per column.
print("Memory Usage Report:")
print("="*80)
# DataFrame.info() prints its report directly and returns None, so it is called
# bare; wrapping it in print() would emit a stray "None" line after the report.
df.info(memory_usage='deep')

print("\n" + "="*80)
print("Memory usage by column:")
# deep=True asks pandas to measure object columns by their contents.
# The result is computed once and reused for the total below.
memory_by_column = df.memory_usage(deep=True)
print(memory_by_column)

print("\n" + "="*80)
# Calculate total memory in MB
memory_bytes = memory_by_column.sum()
memory_mb = memory_bytes / (1024**2)
print(f"Total memory usage: {memory_mb:.2f} MB")
print(f"Total memory usage: {memory_bytes:,} bytes")

# Memory per row
memory_per_row = memory_bytes / len(df)
print(f"\nMemory per row: {memory_per_row:.2f} bytes")

Memory Usage Report:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 1.6 MB
None

Memory usage by column:
Index                   132
id                    40880
gender               276819
age                   408

In [47]:
'''
Findings:

Total dataset memory usage is 1.62 MB

This is a small dataset by data science standards
'''

'\nFindings:\n\nTotal dataset memory usage is 1.62 MB\n\nThis is a small dataset by data science standards\n'

In [48]:
# Step 7: Missing Values

In [49]:
# Divider printed between the sections of the missing-value report.
report_divider = "\n" + "="*80

print("Missing Values Analysis:")
print("="*80)

# Null count per column, computed once and reused by the sections below.
missing_counts = df.isnull().sum()
print("\nMissing values per column:")
print(missing_counts)

# The same counts expressed as a percentage of the row count.
print(report_divider)
print("Percentage of missing values per column:")
missing_pct = (missing_counts / len(df) * 100).round(2)
print(missing_pct)

# Dataset-wide totals: nulls versus the number of cells (rows x columns).
print(report_divider)
total_missing = missing_counts.sum()
total_cells = df.size
print(f"Total missing values: {total_missing}")
print(f"Total cells in dataset: {total_cells}")
print(f"Overall percentage missing: {(total_missing/total_cells)*100:.2f}%")

# isnull() only sees real NaN values, so list the category frequencies of the
# text columns inspected here to reveal placeholder labels stored as strings.
print(report_divider)
print("Checking for HIDDEN missing values (disguised as data):")
columns_to_inspect = (
    ("Smoking status", 'smoking_status'),
    ("Gender", 'gender'),
    ("Work type", 'work_type'),
)
for heading, column_name in columns_to_inspect:
    print(f"\n{heading} value counts:")
    print(df[column_name].value_counts())

Missing Values Analysis:

Missing values per column:
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Percentage of missing values per column:
id                   0.00
gender               0.00
age                  0.00
hypertension         0.00
heart_disease        0.00
ever_married         0.00
work_type            0.00
Residence_type       0.00
avg_glucose_level    0.00
bmi                  3.93
smoking_status       0.00
stroke               0.00
dtype: float64

Total missing values: 201
Total cells in dataset: 61320
Overall percentage missing: 0.33%

Checking for HIDDEN missing values (disguised as data):

Smoking status value counts:
smoking_status
never smoked       1892
Unknown            1544
formerly smoked     885


In [53]:
'''
Findings:

Only the bmi column has missing values 

The bmi column is missing 3.93% (201 out of 5,110)
All other columns are missing 0.00%

There ARE values that look like data but actually represent missing information:
- smoking_status - "Unknown" category:
 -1,544 records (30.2% of dataset) are marked as "Unknown"
 -This is NOT detected by .isnull() because it's stored as a text string
 -This represents missing information about smoking history
 -Impact: Nearly 1/3 of smoking data is actually missing
 
- Potential issue with work_type - "Never_worked":
 -22 records classified as "Never_worked"
 -Could be legitimate data OR could indicate missing employment information
 -Needs clinical context to determine

- gender - "Other" category:
 -Only 1 record with "Other"
 -Too small for meaningful analysis, but is legitimate data
'''

'\nFindings:\n\nOnly the bmi column has missing values \n\nThe bmi column is missing 3.93% (201 out of 5,110)\nAll other columns are missing 0.00%\n\nThere ARE values that look like data but actually represent missing information:\n- smoking_status - "Unknown" category:\n -1,544 records (30.2% of dataset) are marked as "Unknown"\n -This is NOT detected by .isnull() because it\'s stored as a text string\n -This represents missing information about smoking history\n -Impact: Nearly 1/3 of smoking data is actually missing\n\n- Potential issue with work_type - "Never_worked":\n -22 records classified as "Never_worked"\n -Could be legitimate data OR could indicate missing employment information\n -Needs clinical context to determine\n\n- gender - "Other" category:\n -Only 1 record with "Other"\n -Too small for meaningful analysis, but is legitimate data\n'

In [54]:
# Step 8: Duplicates

In [55]:
# Divider printed between the sections of the duplicate report.
block_divider = "\n" + "="*80

print("Duplicate Analysis:")
print("="*80)

# Rows that are identical to an earlier row across every column.
repeated_row_mask = df.duplicated()
duplicates = repeated_row_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates:
    # keep=False marks every member of a duplicate group, not just the repeats.
    print("\nDuplicate rows found:")
    print(df.loc[df.duplicated(keep=False)])

print(block_divider)
# Patient IDs that occur more than once, regardless of the other columns.
patient_ids = df['id']
duplicate_ids = patient_ids.duplicated().sum()
print(f"Number of duplicate patient IDs: {duplicate_ids}")

if duplicate_ids:
    print("\nDuplicate IDs found:")
    print(df.loc[patient_ids.duplicated(keep=False)].sort_values(by='id'))

print(block_divider)
# Cross-check: the number of distinct IDs should equal the number of rows.
row_count = len(df)
distinct_id_count = patient_ids.nunique()
print(f"Total rows in dataset: {row_count}")
print(f"Unique patient IDs: {distinct_id_count}")
print(f"Are all IDs unique? {row_count == distinct_id_count}")

Duplicate Analysis:
Number of duplicate rows: 0

Number of duplicate patient IDs: 0

Total rows in dataset: 5110
Unique patient IDs: 5110
Are all IDs unique? True


In [56]:
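'''
Findings:

No fully duplicated rows were found (0 duplicate rows)
No duplicate patient IDs were found (0 duplicate IDs)
All 5,110 rows have a unique patient ID (5,110 unique IDs out of 5,110 rows)
'''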

'\nFindings:\n\n'