# Imports

In [978]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


___
# Data exploration

In [979]:
df = pd.read_csv("healthcare-dataset-stroke-data.csv")

df = df.drop(["id"], axis=1)

print(df.head())

   gender   age  hypertension  heart_disease ever_married      work_type  \
0    Male  67.0             0              1          Yes        Private   
1  Female  61.0             0              0          Yes  Self-employed   
2    Male  80.0             0              1          Yes        Private   
3  Female  49.0             0              0          Yes        Private   
4  Female  79.0             1              0          Yes  Self-employed   

  Residence_type  avg_glucose_level   bmi   smoking_status  stroke  
0          Urban             228.69  36.6  formerly smoked       1  
1          Rural             202.21   NaN     never smoked       1  
2          Rural             105.92  32.5     never smoked       1  
3          Urban             171.23  34.4           smokes       1  
4          Rural             174.12  24.0     never smoked       1  


___
# Data prepocessing

## Transformation data to numerical values

First of all, we need to transform every non-numerical data into numerical values. It will concern the columns *gender*, *ever_married*, *work_type*, *Residence_type*, *smoking_status*.

In [980]:
print(df["gender"].value_counts())

gender_type_map = {"Female": 2, "Male": 1, "Other": 0}
df["gender"] = df["gender"].map(gender_type_map)

print(df["gender"].value_counts())

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64
2    2994
1    2115
0       1
Name: gender, dtype: int64


In [981]:
print(df["work_type"].value_counts())

work_type_map = {"Private": 4, "Self-employed": 3, "children": 2, "Govt_job": 1, "Never_worked": 0}
df["work_type"] = df["work_type"].map(work_type_map)

print(df["work_type"].value_counts())

Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64
4    2925
3     819
2     687
1     657
0      22
Name: work_type, dtype: int64


In [982]:
print(df["Residence_type"].value_counts())

residence_type_map = {"Urban":1, "Rural":0}
df["Residence_type"] = df["Residence_type"].map(residence_type_map)

print(df["Residence_type"].value_counts())

Urban    2596
Rural    2514
Name: Residence_type, dtype: int64
1    2596
0    2514
Name: Residence_type, dtype: int64


In [983]:
print(df["ever_married"].value_counts())

married_type_map = {"Yes":1, "No":0}
df["ever_married"] = df["ever_married"].map(married_type_map)

print(df["ever_married"].value_counts())

Yes    3353
No     1757
Name: ever_married, dtype: int64
1    3353
0    1757
Name: ever_married, dtype: int64


In [984]:
print(df["smoking_status"].value_counts())

smoking_type_map = {"smokes": 3, "formerly smoked": 2, "never smoked": 1, "Unknown": 0}
df["smoking_status"] = df["smoking_status"].map(smoking_type_map)

print(df["smoking_status"].value_counts())


never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64
1    1892
0    1544
2     885
3     789
Name: smoking_status, dtype: int64


## Dropping NaN values

We saw ealier that the column *bmi* contains NaN values. So we will simply remove the rows containing this value.

In [985]:
df = df.dropna()

## Imbalanced data

In [986]:
print(df["stroke"].value_counts())
print(df["stroke"].value_counts(normalize=True))

0    4700
1     209
Name: stroke, dtype: int64
0    0.957425
1    0.042575
Name: stroke, dtype: float64


As we can see above, the dataset is imbalanced, with only 4% of of data representing a stroke. The risk of having an imbalanced dataset is that the model will probably overfit on the domimant class, here the class 0 (not a stroke).
The model can still have a high accuracy but still be completely incorrect.
Since the goal of our model is to predict if a stroke will happen, the most important metric to look at isn't the accuracy but the recall of the class 1. We want our model to predict the most stroke possible.

If we try to use our current data in a logistic regression :

In [987]:
df_copy = copy.deepcopy(df)

labels = df_copy["stroke"]
# features = df.drop(["stroke", "Residence_type", "gender"], axis=1)
features = df_copy.drop("stroke", axis=1)

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

logistic_regression = LogisticRegression(max_iter=400)

logistic_regression.fit(X_train, y_train)

y_pred = logistic_regression.predict(X_test)


print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, zero_division=1))
print("Recall: ", recall_score(y_test, y_pred, zero_division=1))
print("F1 Score: ", f1_score(y_test, y_pred))

Confusion matrix:
 [[929   0]
 [ 53   0]]
Accuracy:  0.9460285132382892
Precision:  1.0
Recall:  0.0
F1 Score:  0.0


As predicted, even tought the accuracy score is high, the model is completely false because it only predicted data to be of class 0, which means it detected 0 strokes. 
This is a critical error that we need to fix by balancing the data.

Here, we will use up-sampling to even the number of features in each class.

In [988]:
df_majority = df[df["stroke"]==0]
df_minority = df[df["stroke"]==1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,
                                 n_samples=4700,
                                 random_state=123)

df = pd.concat([df_majority, df_minority_upsampled])

In [989]:
labels = df["stroke"]
# features = df.drop(["stroke", "Residence_type", "gender"], axis=1)
features = df.drop("stroke", axis=1)

print(labels.value_counts())
print(labels.value_counts(normalize=True))


0    4700
1    4700
Name: stroke, dtype: int64
0    0.5
1    0.5
Name: stroke, dtype: float64


## Splitting the data for training

In [990]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

In [991]:
print(features.head())

     gender   age  hypertension  heart_disease  ever_married  work_type  \
249       1   3.0             0              0             0          2   
250       1  58.0             1              0             1          4   
251       2   8.0             0              0             0          4   
252       2  70.0             0              0             1          4   
253       1  14.0             0              0             0          0   

     Residence_type  avg_glucose_level   bmi  smoking_status  
249               0              95.12  18.0               0  
250               1              87.96  39.2               1  
251               1             110.89  17.6               0  
252               0              69.04  35.9               2  
253               0             161.28  19.1               0  


In [992]:
# corr_matrix = df.corr()
# sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r')
# plt.show()

___
# Model evaluation

In [993]:
def print_metrics(y_test, y_pred):
    print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
    print("Accuracy: ", round(accuracy_score(y_test, y_pred), 3))
    print("Precision: ", round(precision_score(y_test, y_pred, zero_division=1), 3))
    print("Recall: ", round(recall_score(y_test, y_pred, zero_division=1), 3))
    print("F1 Score: ", round(f1_score(y_test, y_pred), 3))


## Logistic Regression

In [994]:
logistic_regression = LogisticRegression(max_iter=400)

logistic_regression.fit(X_train, y_train)

y_pred = logistic_regression.predict(X_test)

acc = logistic_regression.score(X_test, y_test)

print_metrics(y_test, y_pred)

Confusion matrix:
 [[691 235]
 [181 773]]
Accuracy:  0.779
Precision:  0.767
Recall:  0.81
F1 Score:  0.788


## KNN Classifier

In [995]:
knn_classifier = KNeighborsClassifier(n_neighbors = 7)

knn_classifier.fit(X_train, y_train)

y_pred = knn_classifier.predict(X_test)

acc = knn_classifier.score(X_test, y_test)

print_metrics(y_test, y_pred)

Confusion matrix:
 [[795 131]
 [  0 954]]
Accuracy:  0.93
Precision:  0.879
Recall:  1.0
F1 Score:  0.936
