# Naive Classifier

In [174]:
#Add libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation warnings emitted by the pandas / NumPy calls in the
# cells below. Consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [175]:
# Get data: read the stroke dataset from the local Data directory.
# A forward slash is used as the separator: the previous backslash form relied
# on an invalid string escape sequence and only resolved on Windows, whereas
# a forward slash works on every platform.
data = 'Data/stroke-data.csv'
df = pd.read_csv(data)

### Exploratory Data Analysis

In [176]:
# View dimensions of the dataset. Printed explicitly because a notebook cell
# only echoes its last expression, so a bare `df.shape` here was never shown.
print(df.shape)

# Preview the first rows of the dataset
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [177]:
# View summary of the dataset: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [178]:
# Drop the unique ids column.
# errors='ignore' keeps this cell idempotent: re-running it after 'id' has
# already been removed no longer raises a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [179]:
# Collect the names of the object-dtype (categorical) columns of df
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'object']

n_categorical = len(categorical)
print('There are {} categorical variables\n'.format(n_categorical))
print('The categorical variables are:\n\n', categorical)

There are 5 categorical variables

The categorical variables are:

 ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


In [180]:
# Preview the first rows of the categorical columns only
df.loc[:, categorical].head()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
0,Male,Yes,Private,Urban,formerly smoked
1,Female,Yes,Self-employed,Rural,never smoked
2,Male,Yes,Private,Rural,never smoked
3,Female,Yes,Private,Urban,smokes
4,Female,Yes,Self-employed,Rural,never smoked


In [181]:
# Count missing values in each categorical variable
df[categorical].isna().sum()

gender            0
ever_married      0
work_type         0
Residence_type    0
smoking_status    0
dtype: int64

In [182]:
# Print how many times each label occurs, one categorical variable at a time
for column in categorical:
    label_counts = df[column].value_counts()
    print(label_counts)

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64
Yes    3353
No     1757
Name: ever_married, dtype: int64
Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64
Urban    2596
Rural    2514
Name: Residence_type, dtype: int64
never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64


In [183]:
# View frequency distribution (share of all rows) of the categorical variables.
# The np.float alias was removed in NumPy 1.24 and raises AttributeError there;
# Python 3 true division by len(df) already yields floats, so no cast is needed.
for var in categorical:
    print(df[var].value_counts() / len(df))

Female    0.585910
Male      0.413894
Other     0.000196
Name: gender, dtype: float64
Yes    0.656164
No     0.343836
Name: ever_married, dtype: float64
Private          0.572407
Self-employed    0.160274
children         0.134442
Govt_job         0.128571
Never_worked     0.004305
Name: work_type, dtype: float64
Urban    0.508023
Rural    0.491977
Name: Residence_type, dtype: float64
never smoked       0.370254
Unknown            0.302153
formerly smoked    0.173190
smokes             0.154403
Name: smoking_status, dtype: float64


#### Explore smoking variables

In [184]:
# Check labels in the smoking_status variable. Printed explicitly because a
# notebook cell only echoes its last expression.
print(df['smoking_status'].unique())

# Replace 'Unknown' values in the smoking_status variable with NaN.
# The result is assigned back to the column: calling replace(inplace=True) on
# a selected column is chained assignment, which pandas deprecates and which
# leaves df unchanged under Copy-on-Write. np.nan is used because the np.NaN
# alias was removed in NumPy 2.0.
df['smoking_status'] = df['smoking_status'].replace('Unknown', np.nan)

# Again check the frequency distribution of values in the smoking_status variable
df['smoking_status'].value_counts()

never smoked       1892
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64

In [185]:
# Count missing values per categorical variable
df[categorical].isna().sum()

gender               0
ever_married         0
work_type            0
Residence_type       0
smoking_status    1544
dtype: int64

In [186]:
# Report the cardinality of each categorical variable.
# unique() keeps NaN as its own entry, so a missing value counts as a label here.
for column in categorical:
    distinct_labels = df[column].unique()
    print(column, 'contains', len(distinct_labels), 'labels')

gender contains 3 labels
ever_married contains 2 labels
work_type contains 5 labels
Residence_type contains 2 labels
smoking_status contains 4 labels


In [187]:
# Collect the names of the non-object (numerical) columns of df
numerical = [name for name, dtype in df.dtypes.items() if dtype != 'object']
n_numerical = len(numerical)
print('There are {} numerical variables\n'.format(n_numerical))
print('The numerical variables are :', numerical)

There are 6 numerical variables

The numerical variables are : ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'stroke']


In [188]:
# Preview the first rows of the numerical columns only
df.loc[:, numerical].head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,67.0,0,1,228.69,36.6,1
1,61.0,0,0,202.21,,1
2,80.0,0,1,105.92,32.5,1
3,49.0,0,0,171.23,34.4,1
4,79.0,1,0,174.12,24.0,1


In [189]:
# Count missing values in each numerical variable
df[numerical].isna().sum()

age                    0
hypertension           0
heart_disease          0
avg_glucose_level      0
bmi                  201
stroke                 0
dtype: int64

### Declaring Feature Vector and Target Variable

In [190]:
# Preview the first rows of the numerical columns only
df.loc[:, numerical].head()

In [191]:
# Count missing values in each numerical variable
df[numerical].isna().sum()

### Declaring Feature Vector and Target Variable

### Feature Engineering

In [192]:
# Check data types in X_train.
# NOTE(review): X_train is created by the train_test_split cell that appears
# further down in this notebook; on a fresh kernel that cell must run first.
X_train.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
dtype: object

In [193]:
# List the object-dtype (categorical) feature columns of the training set
categorical = [name for name, dtype in X_train.dtypes.items() if dtype == 'object']
categorical

['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [194]:
# List the non-object (numerical) feature columns of the training set
numerical = [name for name, dtype in X_train.dtypes.items() if dtype != 'object']
numerical

['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

In [195]:
# Fraction (0-1) of missing values in each categorical feature of the training set
X_train[categorical].isna().mean()

gender            0.000000
ever_married      0.000000
work_type         0.000000
Residence_type    0.000000
smoking_status    0.303327
dtype: float64

In [196]:
# Fraction (0-1) of missing values in each numerical feature of the training set
X_train[numerical].isna().mean()

age                  0.000000
hypertension         0.000000
heart_disease        0.000000
avg_glucose_level    0.000000
bmi                  0.040257
dtype: float64

In [197]:
# Print each categorical feature that has missing data, with its missing fraction
for col in categorical:
    missing_fraction = X_train[col].isnull().mean()
    if missing_fraction > 0:
        print(col, missing_fraction)

smoking_status 0.30332681017612523


In [198]:
# Print each numerical feature that has missing data, with its missing fraction
for col in numerical:
    missing_fraction = X_train[col].isnull().mean()
    if missing_fraction > 0:
        print(col, missing_fraction)

bmi 0.04025719876991893


In [199]:
# Impute missing values with the most frequent value of each column.
# The fill values are computed once, from the training set only, and then
# applied to both splits so the test set does not influence them.
# NOTE(review): bmi is a continuous variable but is filled with its mode;
# the median may be a better choice. Confirm before changing, since that
# would alter the imputed data used by every later cell.
fill_values = {
    'smoking_status': X_train['smoking_status'].mode()[0],
    'bmi': X_train['bmi'].mode()[0],
}

# Fill on the frame itself. Calling fillna(inplace=True) on a selected column
# is chained assignment: pandas deprecates it, and under Copy-on-Write it
# modifies a temporary and leaves the frame unchanged.
for df2 in [X_train, X_test]:
    df2.fillna(value=fill_values, inplace=True)

# Check missing values in categorical variables in X_train
X_train[categorical].isnull().sum()

gender            0
ever_married      0
work_type         0
Residence_type    0
smoking_status    0
dtype: int64

In [200]:
# Count missing values in the categorical features of X_test
X_test[categorical].isna().sum()

gender            0
ever_married      0
work_type         0
Residence_type    0
smoking_status    0
dtype: int64

In [201]:
# Count missing values in the numerical features of X_train
X_train[numerical].isna().sum()

age                  0
hypertension         0
heart_disease        0
avg_glucose_level    0
bmi                  0
dtype: int64

In [202]:
# Count missing values in the numerical features of X_test
X_test[numerical].isna().sum()

age                  0
hypertension         0
heart_disease        0
avg_glucose_level    0
bmi                  0
dtype: int64

In [203]:
# Count missing values in every column of X_train
X_train.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

In [204]:
# Count missing values in every column of X_test
X_test.isna().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

In [18]:
# Split data into separate training and test sets
from sklearn.model_selection import train_test_split

# Declare the feature matrix and target vector here: no visible cell of this
# notebook defines X or Y, so the split otherwise fails on a fresh kernel.
# 'stroke' is the target; every remaining column of df is a feature.
X = df.drop(columns=['stroke'])
Y = df['stroke']

# 70/30 split with a fixed seed so the partition is reproducible
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [19]:
# Check the shape (rows, columns) of X_train and X_test after the split
X_train.shape, X_test.shape

((3577, 10), (1533, 10))

### Feature Engineering