# Data and libraries import 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline

In [3]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import tree

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('/content/drive/MyDrive/PRML_Dataset/PRML Course Project Files/healthcare-dataset-stroke-data.csv')

In [6]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


# Preprocessing

In [7]:
#droping "id" column
df = df.drop("id",axis=1)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [8]:
#Handeling missing values
print(df.isnull().sum())

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [9]:
#Handeling missing values

df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

In [10]:
#Finding non-numerical columns

col = df.select_dtypes(include=['object']).columns.tolist()
col

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [11]:
#Encoding non-numerical columns
df.gender.unique()
for i in col:
  print("Unique values for", i, "is",df[i].unique())

Unique values for gender is ['Male' 'Female' 'Other']
Unique values for ever_married is ['Yes' 'No']
Unique values for work_type is ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Unique values for Residence_type is ['Urban' 'Rural']
Unique values for smoking_status is ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [12]:
for i in range(len(df['gender'])):
  if df['gender'][i] == 'Male':
    df['gender'][i] = 0
  elif (df['gender'][i] == 'Female'):
    df['gender'][i] = 1
  else:
    df['gender'][i] = 2 

In [13]:
for i in range(len(df['ever_married'])):
  if df['ever_married'][i] == 'Yes':
    df['ever_married'][i] = 0
  else:
    df['ever_married'][i] = 1

In [14]:
for i in range(len(df['work_type'])):
  if df['work_type'][i] == 'Private':
    df['work_type'][i] = 0
  elif (df['work_type'][i] == 'Self-employed'):
    df['work_type'][i] = 1
  elif (df['work_type'][i] == 'Govt_job'):
    df['work_type'][i] = 2
  elif (df['work_type'][i] == 'children'):
    df['work_type'][i] = 3
  elif (df['work_type'][i] == 'Never_worked'):
    df['work_type'][i] = 4    

In [15]:
for i in range(len(df['Residence_type'])):
  if df['Residence_type'][i] == 'Urban':
    df['Residence_type'][i] = 0
  else:
    df['Residence_type'][i] = 1

In [16]:
for i in range(len(df['smoking_status'])):
  if df['smoking_status'][i] == 'formerly smoked':
    df['smoking_status'][i] = 0
  elif (df['smoking_status'][i] == 'never smoked'):
    df['smoking_status'][i] = 1
  elif (df['smoking_status'][i] == 'smokes'):
    df['smoking_status'][i] = 2
  else:
    df['smoking_status'][i] = 3         

In [17]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,0,0,0,228.69,36.600000,0,1
1,1,61.0,0,0,0,1,1,202.21,28.893237,1,1
2,0,80.0,0,1,0,0,1,105.92,32.500000,1,1
3,1,49.0,0,0,0,0,0,171.23,34.400000,2,1
4,1,79.0,1,0,0,1,1,174.12,24.000000,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,1,80.0,1,0,0,0,0,83.75,28.893237,1,0
5106,1,81.0,0,0,0,1,0,125.20,40.000000,1,0
5107,1,35.0,0,0,0,1,1,82.99,30.600000,1,0
5108,0,51.0,0,0,0,0,1,166.29,25.600000,0,0


In [18]:
#Defining y as target and x as features
x = df.iloc[:,0:10]
y = df.iloc[:,10]

#splitting data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [19]:
# As different features has different scaling or range, we need to do scaling for better accuracy, hence scaled testing as well as training dataset using standard scalar

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [20]:
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)

In [21]:
X_train = x_train
Y_train = y_train
X_test = x_test
Y_test = y_test

In [22]:
np.unique(df['stroke'], return_counts=True)

(array([0, 1]), array([4861,  249]))

# XGBoost

In [23]:
from xgboost import XGBClassifier

xgbc = XGBClassifier()
xgbc.fit(x_train, y_train)

XGBClassifier()

In [35]:
from sklearn.ensemble import RandomForestClassifier
rfc =  RandomForestClassifier()
rfc.fit(x_train,y_train)

RandomForestClassifier()

In [32]:
Booster.save_model

NameError: ignored

In [28]:
# save to JSON
xgbc.save_model("model.json")
# save to text format
xgbc.save_model("model.txt")

In [36]:
import joblib
joblib.dump(rfc,"PRML Course Project Deploy.pkl")

['PRML Course Project Deploy.pkl']

In [26]:
model = joblib.load("B20CS055_optional.pkl")

FileNotFoundError: ignored