In [1]:
#IMPORTING THE LIBRARIES

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
#READING AND CLEANING THE DATASET

dataset = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [3]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
dataset.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [5]:
#THEREFORE THE DATASET CONTAINS NULL VALUES IN THE BMI COLUMN WHICH NEEDS TO BE AMMENDED

bmi = round(dataset['bmi'].mean(),1)
dataset['bmi'].fillna(bmi, inplace=True)

In [6]:
dataset.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [7]:
#THE AGE COLUMN NEEDS TO BE AMMENDED

bins = [0,18,36,54,72,90]
labels = ['0-18','18-36','36-54','54-72','72-90']
dataset['age'] = pd.cut(dataset['age'], bins=bins, labels=labels)

In [8]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,54-72,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,54-72,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,31112,Male,72-90,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,36-54,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,72-90,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [9]:
#SOME COLUMNS NEED TO BE ENCODED

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
column = ['gender', 'age', 'hypertension', 'ever_married','work_type', 'Residence_type', 'smoking_status']
for i in column:
    dataset[i] = encoder.fit_transform(dataset[i])

In [10]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,3,0,1,1,2,1,228.69,36.6,1,1
1,51676,0,3,0,0,1,3,0,202.21,28.9,2,1
2,31112,1,4,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,2,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,4,1,0,1,3,0,174.12,24.0,2,1


In [11]:
noStroke = dataset[dataset['stroke'] == 0]
stroke = dataset[dataset['stroke'] == 1]
print(noStroke.shape[0], noStroke.shape[1])
print(stroke.shape[0], stroke.shape[1])

4861 12
249 12


In [12]:
#THE DATASET IS IMBALANCED. HENCE BALANCING IS NEEDED TO PERFORM BETTER CLASSIFICATION

from sklearn.utils import resample
upsampled = resample(stroke, replace=True, n_samples=len(noStroke))
final_dataset = pd.concat([noStroke,upsampled])
final_dataset = final_dataset.sample(frac=1).reset_index(drop=True)

In [13]:
final_dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,17013,1,4,1,0,0,2,1,113.01,24.0,2,1
1,47472,0,3,0,0,1,2,1,107.26,38.6,1,1
2,71379,0,2,0,0,1,0,1,113.63,27.5,3,0
3,38047,0,3,0,0,1,2,0,100.98,28.2,1,1
4,66435,0,1,0,0,1,2,0,71.97,27.2,2,0


In [14]:
#BUILDING THE MODEL

X = final_dataset.iloc[:, :-1].values
y = final_dataset.iloc[:, -1].values

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [16]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [18]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [1 1]
 [0 0]
 ...
 [1 1]
 [1 1]
 [1 0]]


In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[858 137]
 [  0 950]]


0.9295629820051414

In [20]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 92.37 %
Standard Deviation: 0.65 %
