In [1]:
#importing library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score,f1_score
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
#loading the dataset
df=pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
#getting preliminary information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [4]:

def onehot_encode(df,column):
    df=df.copy()

    dummies=pd.get_dummies(df[column],prefix=column)
 
    df=pd.concat([df,dummies],axis=1)
    df=df.drop(column,axis=1)
    return df

In [5]:
def preprocess_input(df):
    df=df.copy()

    #dropping the id column
    df=df.drop('id',axis=1)
    #encoding ever_married columns

    df['ever_married']=df['ever_married'].replace({'No':0,'Yes':1})
    df['Residence_type']=df['Residence_type'].replace({'Urban':1,'Rural':0})

    
    for column in ['gender','work_type','smoking_status']:
    
        df=onehot_encode(df,column)
    for column in df.columns:
        if df[column].dtypes=='bool':
            df[column]=df[column].astype(int)

    #splitting data into target and features

    y=df['stroke']
    x=df.drop('stroke',axis=1)

    #train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=123,shuffle=True)
    

    #knn imputation of missing values

    imputer=KNNImputer()

    imputer.fit(x_train)

    x_train=pd.DataFrame(imputer.transform(x_train),index=x_train.index,columns=x_train.columns)
    x_test=pd.DataFrame(imputer.transform(x_test),index=x_test.index,columns=x_test.columns)

    #scaling the dataset
    scaler=StandardScaler()

    scaler.fit(x_train)
    
    x_train=pd.DataFrame(scaler.transform(x_train),index=x_train.index,columns=x_train.columns)
    x_test=pd.DataFrame(scaler.transform(x_test),index=x_test.index,columns=x_test.columns)

    







        
    return x_train,x_test,y_train,y_test

In [6]:
x_train,x_test,y_train,y_test=preprocess_input(df)
x_train

Unnamed: 0,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
2909,-0.906324,-0.316665,-0.244004,-1.392550,0.959715,-0.452067,-0.866446,0.850797,-0.850309,-0.016723,-0.378387,-0.074985,0.845438,-0.429918,-0.386551,-0.659843,-0.457006,1.313124,-0.431286
1237,-0.460009,-0.316665,-0.244004,0.718107,-1.041976,-0.657675,-0.538141,0.850797,-0.850309,-0.016723,2.642797,-0.074985,-1.182819,-0.429918,-0.386551,-0.659843,-0.457006,1.313124,-0.431286
2335,0.209463,-0.316665,-0.244004,0.718107,-1.041976,-0.003211,0.630625,-1.175369,1.176043,-0.016723,-0.378387,-0.074985,0.845438,-0.429918,-0.386551,-0.659843,2.188156,-0.761543,-0.431286
2750,0.700410,-0.316665,-0.244004,0.718107,0.959715,-0.366560,-0.025985,0.850797,-0.850309,-0.016723,-0.378387,-0.074985,-1.182819,2.326025,-0.386551,-0.659843,2.188156,-0.761543,-0.431286
607,-1.852512,-0.316665,-0.244004,-1.392550,0.959715,-0.098944,-1.562453,0.850797,-0.850309,-0.016723,-0.378387,-0.074985,-1.182819,-0.429918,2.586981,1.515511,-0.457006,-0.761543,-0.431286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1593,-0.549272,-0.316665,-0.244004,0.718107,0.959715,-0.774078,-0.406819,-1.175369,1.176043,-0.016723,-0.378387,-0.074985,0.845438,-0.429918,-0.386551,-0.659843,-0.457006,1.313124,-0.431286
4060,1.146725,-0.316665,-0.244004,0.718107,-1.041976,0.088606,-0.393687,0.850797,-0.850309,-0.016723,-0.378387,-0.074985,-1.182819,2.326025,-0.386551,-0.659843,-0.457006,1.313124,-0.431286
1346,0.254095,-0.316665,-0.244004,0.718107,0.959715,2.025021,-0.039118,-1.175369,1.176043,-0.016723,-0.378387,-0.074985,0.845438,-0.429918,-0.386551,-0.659843,-0.457006,1.313124,-0.431286
3454,0.834305,-0.316665,-0.244004,0.718107,-1.041976,-1.011669,-0.262365,-1.175369,1.176043,-0.016723,-0.378387,-0.074985,0.845438,-0.429918,-0.386551,1.515511,-0.457006,-0.761543,-0.431286


# Training the Models

In [7]:
models={
        'Logistic Regression':LogisticRegression(),
        'K-Nearest Neighbors':KNeighborsClassifier(),
        'Decision Tree':DecisionTreeClassifier(),
        'Support Vector Machine(Linear Kernel)':LinearSVC(),
        'Support Vector Machine(Non Linear Kernel':SVC(),
        'Neural Network':MLPClassifier(),
        'Random Forest':RandomForestClassifier(),
        'Gradient Boosting':GradientBoostingClassifier(),
        'XGBoost Classifier':XGBClassifier(eval_metric='mlogloss'),
        
        'CatBoost':CatBoostClassifier(verbose=0)
    
}

In [8]:
models

{'Logistic Regression': LogisticRegression(),
 'K-Nearest Neighbors': KNeighborsClassifier(),
 'Decision Tree': DecisionTreeClassifier(),
 'Support Vector Machine(Linear Kernel)': LinearSVC(),
 'Support Vector Machine(Non Linear Kernel': SVC(),
 'Neural Network': MLPClassifier(),
 'Random Forest': RandomForestClassifier(),
 'Gradient Boosting': GradientBoostingClassifier(),
 'XGBoost Classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='mlogloss',
               feature_types=None, gamma=None, grow_policy=None,
               importance_type=None, interaction_constraints=None,
               learning_rate=None, max_bin=None, max_cat_threshold=None,
               max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
               max_leaves=None, min_child_weight=Non

In [9]:
for name,model in models.items():
    model.fit(x_train,y_train)
    
    print(name+'_trained')
    print(model.score(x_test,y_test))

Logistic Regression_trained
0.9562948467058056
K-Nearest Neighbors_trained
0.954337899543379
Decision Tree_trained
0.9099804305283757
Support Vector Machine(Linear Kernel)_trained
0.9556425309849967
Support Vector Machine(Non Linear Kernel_trained
0.9556425309849967
Neural Network_trained
0.9504240052185258
Random Forest_trained
0.9536855838225701
Gradient Boosting_trained
0.9517286366601435
XGBoost Classifier_trained
0.9478147423352903
CatBoost_trained
0.9536855838225701


In [10]:
y_train.value_counts()

stroke
0    3396
1     181
Name: count, dtype: int64