In [61]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [62]:
# getting the dataset
dataset = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

In [63]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [64]:
# getting the amount of data
dataset.shape

(5110, 12)

In [65]:
# trying to see which columns have null values
# as observed only bmi has null values, and there are only 201. Hence we can just remove these 201 columns.
dataset.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [66]:
# Removing the columns which have null values
dataset = dataset.dropna().reset_index(drop=True)

In [67]:
# We don't need the ids as they are of no use to us
dataset = dataset.drop(columns=['id'])

In [68]:
# Getting all the unique values of the the smoking status
dataset['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [69]:
# removed all the columnes which had smoking status as 'Unknown'. Did not know how to create another model to get those values
dataset = dataset[(dataset.smoking_status != 'Unknown')].reset_index(drop=True)

In [70]:
# removing the columns that have gender as 'Other'
dataset = dataset[(dataset.gender != 'Other')].reset_index(drop=True)

In [71]:
dataset.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [72]:
# Splitting the data 
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(columns=['stroke']), dataset['stroke'], test_size = 0.2)

In [73]:
X_train

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
1651,Female,80.0,0,0,Yes,Private,Urban,79.57,26.9,never smoked
2146,Female,82.0,1,0,No,Private,Rural,61.47,22.9,never smoked
3369,Female,61.0,0,0,Yes,Self-employed,Urban,69.88,27.1,never smoked
2249,Female,58.0,1,0,Yes,Private,Urban,194.53,39.5,never smoked
14,Female,60.0,0,0,No,Private,Urban,89.22,37.8,never smoked
...,...,...,...,...,...,...,...,...,...,...
1350,Female,38.0,0,0,Yes,Private,Rural,84.79,24.2,formerly smoked
1687,Male,49.0,0,0,Yes,Private,Rural,96.35,35.9,never smoked
2561,Male,23.0,0,0,No,Private,Urban,62.00,24.8,formerly smoked
2442,Male,78.0,0,0,Yes,Govt_job,Urban,59.74,27.0,formerly smoked


COLUMN Transformer

In [74]:
from sklearn.compose import ColumnTransformer

In [75]:
# very effective method to perform all the transformating in one go. I have used the 'Column Transformer'.
# In the dataset, smoking_status has been tranformed using the Ordinal Encoder
# gender, ever_marrind, work_type and residence_type has been transformed using the one hot encoder
transformer = ColumnTransformer(transformers=[
    ('tnf1', OrdinalEncoder(categories=[['never smoked', 'formerly smoked', 'smokes']]),['smoking_status']),
    ('tnf2', OneHotEncoder(sparse=False, drop='first'),['gender', 'ever_married', 'work_type', 'Residence_type'])
], remainder='passthrough')

In [76]:
# transforming X_train
X_train = transformer.fit_transform(X_train)



In [77]:
# transforming X_test
X_test = transformer.transform(X_test)

In [78]:
scaler = StandardScaler()

In [79]:
scaler.fit(X_train)

In [80]:
X_train = scaler.transform(X_train)

In [81]:
print(X_train)

[[-0.82768395 -0.80536707  0.57678837 ... -0.25231844 -0.59766476
  -0.46126194]
 [-0.82768395 -0.80536707 -1.73373814 ... -0.25231844 -0.97639194
  -1.01241276]
 [-0.82768395 -0.80536707  0.57678837 ... -0.25231844 -0.80041981
  -0.4337044 ]
 ...
 [ 0.42251121  1.24166983 -1.73373814 ... -0.25231844 -0.96530214
  -0.75061612]
 [ 0.42251121  1.24166983  0.57678837 ... -0.25231844 -1.01259072
  -0.44748317]
 [ 0.42251121 -0.80536707  0.57678837 ... -0.25231844  2.33904016
   0.44813691]]


In [82]:
scaler.fit(X_test)
X_test= scaler.transform(X_test)
print(X_test)

[[ 0.3327929   1.27456701  0.51138061 ... -0.25557546 -0.1194914
  -0.48007965]
 [ 0.3327929   1.27456701  0.51138061 ... -0.25557546  0.59748385
   0.16518131]
 [ 0.3327929   1.27456701  0.51138061 ... -0.25557546  0.23255127
  -0.829596  ]
 ...
 [-0.87336126  1.27456701  0.51138061 ... -0.25557546 -0.64649885
  -1.56895751]
 [ 0.3327929   1.27456701  0.51138061 ... -0.25557546  2.01748789
   0.62224116]
 [-0.87336126  1.27456701  0.51138061 ... -0.25557546 -0.5786683
   1.03897219]]


In [83]:
# creating the SVC classifier
classifier = svm.SVC(kernel='linear')

In [84]:
# providing the data to the classifier
classifier.fit(X_train, y_train)

In [85]:
# getting the accuracy score
X_test_prediction = classifier.predict(X_test)
testing_data_accuracy = accuracy_score(X_test_prediction, y_test)
print('Accuracy score of the testing data: ', testing_data_accuracy)

Accuracy score of the testing data:  0.9386861313868613


THE END
