In [1]:
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsRegressor

# Load the stroke dataset from a CSV file addressed by a relative path.
data = pd.read_csv('../data/stroke.csv')
# Show the column labels as a first look at the schema.
data.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [2]:
# (rows, columns) of the frame as loaded, before any cleaning.
data.shape

(5109, 12)

In [3]:
# NOTE(review): this repeats the `data.columns` display at the end of the first cell; the cell is redundant.
data.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [4]:
# Discard every row that has at least one missing value.
# NOTE(review): this rebinds `data`, so the frame as loaded is no longer available without re-reading the CSV.
data = data.dropna()

In [5]:
# Target is the `stroke` column; predictors are every other column except the `id` identifier.
y = data['stroke']
X = data.drop(columns=['stroke', 'id'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is taken from `data` as it stands at this point, so any
# filtering applied to `data` in later cells is not reflected in these four objects.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)
X_test.shape

(982, 10)

In [6]:
# Per-column count of missing values; used to confirm the earlier dropna() left none behind.
data.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [7]:
# Keep only the rows whose smoking status is known.
# NOTE(review): X_train / X_test / y_train / y_test were split from `data` in an
# earlier cell, before this filter. They still contain the 'Unknown' rows, so this
# only affects the exploratory displays that follow. Confirm whether the filter was
# meant to apply to the modelling data as well.
known_smoking = data['smoking_status'] != 'Unknown'
data = data[known_smoking].copy()

In [8]:
# List the distinct smoking_status values still present in `data`.
data['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes'], dtype=object)

In [9]:
# Distinct BMI values in `data`.
# NOTE(review): for a continuous column this prints a very long array; a summary
# such as .describe() would be easier to read.
data['bmi'].unique()

array([36.6, 32.5, 34.4, 24. , 29. , 27.4, 22.8, 29.7, 36.8, 27.3, 28.2,
       30.9, 37.5, 25.8, 37.8, 22.4, 48.9, 26.6, 27.2, 23.5, 28.3, 44.2,
       22.2, 30.5, 26.5, 33.7, 23.1, 29.9, 23.9, 28.5, 26.4, 20.2, 33.6,
       38.6, 39.2, 27.7, 31.4, 36.5, 33.2, 32.8, 40.4, 25.3, 30.2, 47.5,
       30. , 28.9, 28.1, 31.1, 21.7, 27. , 24.1, 22.9, 29.1, 32.3, 41.1,
       29.8, 26.3, 29.4, 24.4, 28. , 34.6, 30.3, 24.2, 41.5, 22.6, 56.6,
       31.3, 31. , 31.7, 35.8, 28.4, 20.1, 26.7, 38.7, 25. , 23.8, 21.8,
       27.5, 24.6, 32.9, 26.1, 31.9, 34.1, 25.6, 36.9, 37.3, 45.7, 34.2,
       23.6, 22.3, 37.1, 45. , 25.5, 30.8, 32. , 37.4, 34.5, 27.9, 29.5,
       46. , 42.5, 35.5, 26.9, 45.5, 31.5, 33. , 23.4, 30.7, 20.5, 21.5,
       27.1, 40. , 28.6, 42.2, 29.6, 35.4, 16.9, 32.6, 35.9, 21.2, 42.4,
       40.5, 29.3, 17.7, 54.6, 22. , 39.4, 19.7, 22.5, 25.2, 60.9, 23.7,
       24.5, 31.2, 25.1, 36. , 26.8, 34.9, 35.3, 36.7, 34.3, 27.6, 24.3,
       40.1, 21.9, 38.4, 25.9, 54.7, 24.9, 19.4, 48

In [10]:
# Per-column count of missing values.
# NOTE(review): same check as the earlier isnull() cell; only rows were removed in between.
data.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [11]:
# Column dtypes: used to tell the string-valued (object) columns from the numeric ones.
data.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [12]:
# Distinct gender labels present in `data`.
data['gender'].unique()

array(['Male', 'Female'], dtype=object)

In [13]:
# Row count per gender label in `data`.
data['gender'].value_counts()

gender
Female    2086
Male      1339
Name: count, dtype: int64

In [14]:
# Column groups: string-valued columns to one-hot encode, and numeric columns to scale.
cat_var = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
num_var = [
    'age',
    'hypertension',
    'heart_disease',
    'avg_glucose_level',
    'bmi',
]

# Slice each split into its categorical and numeric parts.
# NOTE(review): this cell needs X_train / X_test to still be DataFrames; a later cell
# rebinds both names to NumPy arrays, so re-running this cell after that one fails.
X_train_cat, X_train_num = X_train[cat_var], X_train[num_var]
X_test_cat, X_test_num = X_test[cat_var], X_test[num_var]
X_train_cat.shape

(3926, 5)

In [15]:
# Distinct work_type labels present in `data`.
data['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [16]:
# One-hot encode the categorical columns as a dense array.
# drop='first' removes the first category of every feature to serve as the baseline.
# With handle_unknown='ignore', a category that was not seen during fit is encoded
# as all zeros, which is the same row the dropped baseline produces. scikit-learn warns
# when that happens, so warnings are deliberately NOT blanket-suppressed here.
dummy_e = OneHotEncoder(
    categories='auto',
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)

# Categorical feature encoding: learn the categories from the training split only,
# then apply the same mapping to the test split.
X_train_dummy = dummy_e.fit_transform(X_train_cat)
X_test_dummy = dummy_e.transform(X_test_cat)


# Check the shape
X_train_dummy.shape, X_test_dummy.shape

((3926, 10), (982, 10))

In [17]:
# Standardise the numeric columns. The scaler is fitted on the training split only
# and then applied unchanged to both splits.
norm_e = StandardScaler().fit(X_train_num)
X_train_norm = norm_e.transform(X_train_num)
X_test_norm = norm_e.transform(X_test_num)

In [18]:
# Join the encoded categorical block and the scaled numeric block column-wise.
# The resulting column order is: one-hot columns first, numeric columns last.
# NOTE(review): this rebinds X_train / X_test from DataFrames to NumPy arrays.
X_train = np.concatenate([X_train_dummy, X_train_norm], axis=1)
X_test = np.concatenate([X_test_dummy, X_test_norm], axis=1)

X_train.shape, X_test.shape

((3926, 15), (982, 15))

In [19]:
# Print the shape of each assembled feature matrix, train first and then test.
for split in (X_train, X_test):
    print(split.shape)

(3926, 15)
(982, 15)


In [20]:
# Names of the one-hot columns produced by the fitted encoder, in output order.
dummy_e.get_feature_names_out()

array(['gender_Male', 'ever_married_Yes', 'work_type_Never_worked',
       'work_type_Private', 'work_type_Self-employed',
       'work_type_children', 'Residence_type_Urban',
       'smoking_status_formerly smoked', 'smoking_status_never smoked',
       'smoking_status_smokes'], dtype=object)

In [44]:
# Label the assembled feature matrices and write the four splits to CSV.
# NOTE(review): this cell's execution count is not sequential with the cell before it;
# confirm the notebook still runs top to bottom on a fresh kernel.

# Column names are derived from the fitted encoder and the numeric column list rather
# than typed out by hand, so they cannot drift from what the encoder actually produced.
# The order matches the column-wise stacking done earlier: one-hot columns first,
# then the scaled numeric columns.
feature_names = list(dummy_e.get_feature_names_out()) + list(num_var)

X_train = pd.DataFrame(X_train)
X_train.columns = feature_names  # raises ValueError if the column count does not match

X_test = pd.DataFrame(X_test)
X_test.columns = feature_names

# Store the target as a single-column frame with the header 'Stroke'.
y_train = pd.DataFrame(y_train)
y_train.columns = ['Stroke']

y_test = pd.DataFrame(y_test)
y_test.columns = ['Stroke']

# index=False: the row index is not written, so features and targets line up by row position.
outputs = {
    '../data/stroke_X_train.csv': X_train,
    '../data/stroke_X_test.csv': X_test,
    '../data/stroke_y_train.csv': y_train,
    '../data/stroke_y_test.csv': y_test,
}
for csv_path, frame in outputs.items():
    frame.to_csv(csv_path, index=False)