In [42]:
#IMPORTING THE LIBRARIES

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [43]:
#READING AND CLEANING THE DATASET

dataset = pd.read_csv('healthcare-dataset-stroke-data.csv')

In [44]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [45]:
dataset.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [46]:
#THEREFORE THE DATASET CONTAINS NULL VALUES IN THE BMI COLUMN WHICH NEEDS TO BE AMMENDED

bmi = round(dataset['bmi'].mean(),1)
dataset['bmi'].fillna(bmi, inplace=True)

In [47]:
dataset.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [48]:
#THE AGE COLUMN NEEDS TO BE AMMENDED

bins = [0,18,36,54,72,90]
labels = ['0-18','18-36','36-54','54-72','72-90']
dataset['age'] = pd.cut(dataset['age'], bins=bins, labels=labels)

In [49]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,54-72,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,54-72,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,31112,Male,72-90,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,36-54,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,72-90,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [50]:
#SOME COLUMNS NEED TO BE ENCODED

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
column = ['gender', 'age', 'hypertension', 'ever_married','work_type', 'Residence_type', 'smoking_status']
for i in column:
    dataset[i] = encoder.fit_transform(dataset[i])

In [51]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,3,0,1,1,2,1,228.69,36.6,1,1
1,51676,0,3,0,0,1,3,0,202.21,28.9,2,1
2,31112,1,4,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,2,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,4,1,0,1,3,0,174.12,24.0,2,1


In [53]:
noStroke = dataset[dataset['stroke'] == 0]
stroke = dataset[dataset['stroke'] == 1]
print(noStroke.shape[0], noStroke.shape[1])
print(stroke.shape[0], stroke.shape[1])

4861 12
249 12


In [54]:
#THE DATASET IS IMBALANCED. HENCE BALANCING IS NEEDED TO PERFORM BETTER CLASSIFICATION

from sklearn.utils import resample
upsampled = resample(stroke, replace=True, n_samples=len(noStroke))
final_dataset = pd.concat([noStroke,upsampled])
final_dataset = final_dataset.sample(frac=1).reset_index(drop=True)

In [55]:
final_dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,11312,0,4,0,0,1,3,0,208.99,31.4,1,0
1,31189,1,2,0,0,1,0,1,72.96,37.7,3,0
2,64373,1,3,0,0,1,2,1,200.62,35.8,1,1
3,18687,1,3,0,0,1,3,1,93.67,29.3,0,0
4,14499,1,2,0,0,1,2,1,86.94,41.1,1,1


In [56]:
#BUILDING THE MODEL

X = final_dataset.iloc[:, :-1].values
y = final_dataset.iloc[:, -1].values

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [58]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [59]:
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)

SVC(random_state=0)

In [60]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 1]
 [1 1]
 [1 1]
 ...
 [1 1]
 [1 1]
 [1 1]]


In [61]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[756 220]
 [ 92 877]]


0.8395886889460155

In [62]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 83.93 %
Standard Deviation: 1.23 %
