In [1]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv('full_data.csv')

# Review the DataFrame
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [3]:
#Find missing values (NaN or None) in the DataFrame
missing_values = df.isna()

# Display the DataFrame with missing value indicators
print(missing_values)

      gender    age  hypertension  heart_disease  ever_married  work_type  \
0      False  False         False          False         False      False   
1      False  False         False          False         False      False   
2      False  False         False          False         False      False   
3      False  False         False          False         False      False   
4      False  False         False          False         False      False   
...      ...    ...           ...            ...           ...        ...   
4976   False  False         False          False         False      False   
4977   False  False         False          False         False      False   
4978   False  False         False          False         False      False   
4979   False  False         False          False         False      False   
4980   False  False         False          False         False      False   

      Residence_type  avg_glucose_level    bmi  smoking_status  stroke  
0 

In [4]:
# Drop rows containing missing values (axis=0)
data_cleaned = df.dropna(axis=0)

data_cleaned

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


In [5]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on your data
scaler.fit(df[['age', 'avg_glucose_level', 'bmi']])

# Transform the numeric columns to scale the data
scaled_data = scaler.transform(df[['age', 'avg_glucose_level', 'bmi']])

# Replace the original numeric columns with scaled values
df[['age', 'avg_glucose_level', 'bmi']] = scaled_data

print(df)


      gender       age  hypertension  heart_disease ever_married  \
0       Male  1.040584             0              1          Yes   
1       Male  1.614270             0              1          Yes   
2     Female  0.246250             0              0          Yes   
3     Female  1.570141             1              0          Yes   
4       Male  1.658400             0              0          Yes   
...      ...       ...           ...            ...          ...   
4976    Male -0.106788             0              0           No   
4977    Male -0.150917             0              0          Yes   
4978  Female  0.069731             1              0          Yes   
4979    Male -0.150917             0              0          Yes   
4980  Female  1.614270             1              0          Yes   

          work_type Residence_type  avg_glucose_level       bmi  \
0           Private          Urban           2.723411  1.193238   
1           Private          Rural          -0.00

In [6]:
# Use get_dummies to one-hot encode categorical columns
encoded_data = pd.get_dummies(df, columns=['gender','ever_married', 'work_type', 'Residence_type', 'smoking_status'])

encoded_data.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.040584,0,1,2.723411,1.193238,1,0,1,0,1,0,1,0,0,0,1,0,1,0,0
1,1.61427,0,1,-0.000523,0.58939,1,0,1,0,1,0,1,0,0,1,0,0,0,1,0
2,0.24625,0,0,1.448529,0.869222,1,1,0,0,1,0,1,0,0,0,1,0,0,0,1
3,1.570141,1,0,1.51265,-0.662492,1,1,0,0,1,0,0,1,0,1,0,0,0,1,0
4,1.6584,0,0,1.780895,0.073909,1,0,1,0,1,0,1,0,0,0,1,0,1,0,0


In [7]:
#split off target variable and preserve clean data
target_df = encoded_data['stroke']
final_df = encoded_data.drop('stroke', axis=1)
final_df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.040584,0,1,2.723411,1.193238,0,1,0,1,0,1,0,0,0,1,0,1,0,0
1,1.61427,0,1,-0.000523,0.58939,0,1,0,1,0,1,0,0,1,0,0,0,1,0
2,0.24625,0,0,1.448529,0.869222,1,0,0,1,0,1,0,0,0,1,0,0,0,1
3,1.570141,1,0,1.51265,-0.662492,1,0,0,1,0,0,1,0,1,0,0,0,1,0
4,1.6584,0,0,1.780895,0.073909,0,1,0,1,0,1,0,0,0,1,0,1,0,0


In [8]:
#check d_types - look into uint8
final_df.dtypes

age                               float64
hypertension                        int64
heart_disease                       int64
avg_glucose_level                 float64
bmi                               float64
gender_Female                       uint8
gender_Male                         uint8
ever_married_No                     uint8
ever_married_Yes                    uint8
work_type_Govt_job                  uint8
work_type_Private                   uint8
work_type_Self-employed             uint8
work_type_children                  uint8
Residence_type_Rural                uint8
Residence_type_Urban                uint8
smoking_status_Unknown              uint8
smoking_status_formerly smoked      uint8
smoking_status_never smoked         uint8
smoking_status_smokes               uint8
dtype: object

In [9]:
#random under sample majority (non-stroke) + oversample minority (stroke) groups for bias tuning
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
#init sampler
rus = RandomUnderSampler(sampling_strategy='majority', random_state=0)
ros = RandomOverSampler(sampling_strategy='minority',random_state=0)

In [10]:
#fit and undersample majority (non-stroke) category - default is only undersampling - to use both over + under sampling comment out cell and use next cell
resampled_final, resampled_target = rus.fit_resample(final_df, target_df)

In [11]:
                                                          # use this cell for both over + under sampling
#random oversample minority category
#ros_final, ros_target = ros.fit_resample(final_df, target_df)
#random under sample majority category
#resampled_final, resampled_target = rus.fit_resample(ros_final, ros_target)

In [12]:
#split data into train test
X_train, X_test, y_train, y_test = train_test_split(resampled_final, resampled_target, random_state=10)
X_train.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
404,0.687547,0,0,0.268165,1.03123,0,1,0,1,0,0,1,0,1,0,0,0,0,1
240,-1.607197,0,0,-0.062869,-1.501988,1,0,1,0,0,0,0,1,0,1,1,0,0,0
197,0.466898,0,0,-0.265883,0.707214,1,0,0,1,0,1,0,0,0,1,0,0,0,1
419,0.864066,0,0,0.014121,-0.67722,1,0,0,1,1,0,0,0,1,0,1,0,0,0
467,1.526011,0,0,2.924428,0.118093,0,1,0,1,0,1,0,0,0,1,0,1,0,0


In [13]:
#import RandomForestClassifer + metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
#init rf model and fit to training data
rf_model = RandomForestClassifier(n_estimators=4000, random_state=42) #model max acc at n_estimators = 4000, acc loss beginning at 8000
rf_model.fit(X_train, y_train)

In [15]:
# Calculating the confusion matrix
predictions = rf_model.predict(X_test)
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [16]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,47,18
Actual 1,13,46


Accuracy Score : 0.75
Classification Report
              precision    recall  f1-score   support

           0       0.78      0.72      0.75        65
           1       0.72      0.78      0.75        59

    accuracy                           0.75       124
   macro avg       0.75      0.75      0.75       124
weighted avg       0.75      0.75      0.75       124

