# Imports

In [None]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample


___
# Data exploration

In [None]:
# Load the stroke dataset and discard its "id" column in one step.
df = pd.read_csv("healthcare-dataset-stroke-data.csv").drop(columns=["id"])

# Quick look at the first rows of the remaining columns.
print(df.head())

___
# Data preprocessing

## Transforming data to numerical values

First, we need to convert every non-numerical column into numerical values. This concerns the columns *gender*, *ever_married*, *work_type*, *Residence_type* and *smoking_status*.

In [None]:
# Label frequencies before encoding.
print(df["gender"].value_counts())

# Integer code for each gender label. Series.map turns any label that is
# missing from this dict into NaN.
gender_type_map = {
    "Female": 2,
    "Male": 1,
    "Other": 0,
}
df["gender"] = df["gender"].map(gender_type_map)

# Same frequencies, now keyed by the integer codes.
print(df["gender"].value_counts())

In [None]:
# Label frequencies before encoding.
print(df["work_type"].value_counts())

# Integer code for each work type. Series.map turns any label that is
# missing from this dict into NaN.
work_type_map = {
    "Private": 4,
    "Self-employed": 3,
    "children": 2,
    "Govt_job": 1,
    "Never_worked": 0,
}
df["work_type"] = df["work_type"].map(work_type_map)

# Same frequencies, now keyed by the integer codes.
print(df["work_type"].value_counts())

In [None]:
# Label frequencies before encoding.
print(df["Residence_type"].value_counts())

# Binary code for the residence type. Series.map turns any label that is
# missing from this dict into NaN.
residence_type_map = {
    "Urban": 1,
    "Rural": 0,
}
df["Residence_type"] = df["Residence_type"].map(residence_type_map)

# Same frequencies, now keyed by the integer codes.
print(df["Residence_type"].value_counts())

In [None]:
# Label frequencies before encoding.
print(df["ever_married"].value_counts())

# Binary code for the marital history. Series.map turns any label that is
# missing from this dict into NaN.
married_type_map = {
    "Yes": 1,
    "No": 0,
}
df["ever_married"] = df["ever_married"].map(married_type_map)

# Same frequencies, now keyed by the integer codes.
print(df["ever_married"].value_counts())

In [None]:
# Label frequencies before encoding.
print(df["smoking_status"].value_counts())

# Integer code for each smoking status. Series.map turns any label that is
# missing from this dict into NaN.
smoking_type_map = {
    "smokes": 3,
    "formerly smoked": 2,
    "never smoked": 1,
    "Unknown": 0,
}
df["smoking_status"] = df["smoking_status"].map(smoking_type_map)

# Same frequencies, now keyed by the integer codes.
print(df["smoking_status"].value_counts())


## Dropping NaN values

We saw earlier that the column *bmi* contains NaN values, so we will simply remove the rows containing them.

In [None]:
# Remove every row that has a NaN in at least one column (all columns are
# checked, not a specific one).
df = df.dropna(axis=0, how="any")

## Imbalanced data

In [None]:
# Distribution of the target column: absolute counts first, then proportions.
for as_fraction in (False, True):
    print(df["stroke"].value_counts(normalize=as_fraction))

As we can see above, the dataset is imbalanced, with only about 4% of the data representing a stroke. The risk with an imbalanced dataset is that the model will probably overfit on the dominant class, here class 0 (no stroke).
The model can then have a high accuracy and still be completely wrong about the minority class.
Since the goal of our model is to predict whether a stroke will happen, the most important metric to look at isn't the accuracy but the recall of class 1. We want our model to detect as many strokes as possible.

If we try to use our current data in a logistic regression:

In [None]:
# Baseline experiment: logistic regression on the data as it currently is.
# A deep copy is used so this experiment works on its own frame.
df_copy = df.copy(deep=True)

# Predictors are every column except the target.
features = df_copy.drop(columns="stroke")
labels = df_copy["stroke"]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

logistic_regression = LogisticRegression(max_iter=400).fit(X_train, y_train)
y_pred = logistic_regression.predict(X_test)

# Precision and recall are computed with zero_division=1; F1 uses the
# scikit-learn default.
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred, zero_division=1))
print("Recall: ", recall_score(y_test, y_pred, zero_division=1))
print("F1 Score: ", f1_score(y_test, y_pred))

As predicted, even though the accuracy score is high, the model is useless: it predicted class 0 for every sample, which means it detected no strokes.
This is a critical error that we need to fix by balancing the data.

Here, we will use up-sampling to even out the number of samples in each class.

In [None]:
# Separate the two classes so the minority class can be resampled on its own.
df_majority = df[df["stroke"] == 0]
df_minority = df[df["stroke"] == 1]

# Draw minority rows with replacement until both classes have the same size.
# The target size is read from the majority class instead of being a
# hard-coded row count, so the classes stay balanced if the input changes.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=123,
)

# NOTE(review): because rows are sampled with replacement, the result holds
# duplicated minority rows. Any train/test split done on this frame should
# keep copies of the same row on the same side to avoid leakage.
df = pd.concat([df_majority, df_minority_upsampled])

In [None]:
# Separate the target column from the predictors.
features = df.drop(columns="stroke")
labels = df["stroke"]

# Class distribution of the target: counts first, then proportions.
for as_fraction in (False, True):
    print(labels.value_counts(normalize=as_fraction))


## Splitting the data for training

In [None]:
# Group-aware hold-out split.
# `features` contains minority rows duplicated by the upsampling step. A plain
# row-wise train_test_split would put copies of the same row into both the
# training and the test set, so the model would be scored on rows it has
# already seen. Duplicates are assumed to share the index label of the row
# they were copied from (sklearn.utils.resample and pd.concat without
# ignore_index keep the labels), so the index is used as the group key and
# every copy of a row ends up on the same side of the split.
# Note: test_size is the fraction of distinct original rows (groups) held out.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_pos, test_pos = next(
    splitter.split(features, labels, groups=features.index.to_numpy())
)

X_train, X_test = features.iloc[train_pos], features.iloc[test_pos]
y_train, y_test = labels.iloc[train_pos], labels.iloc[test_pos]

In [None]:
# Preview the first five rows of the predictor columns.
print(features.head(n=5))

In [None]:
# Optional correlation heatmap of the columns of `df`; currently disabled.
# corr_matrix = df.corr()
# sns.heatmap(corr_matrix, annot=True, cmap='RdBu_r')
# plt.show()

___
# Model evaluation

In [None]:
def print_metrics(y_test, y_pred):
    """Print the confusion matrix and the main classification scores.

    Args:
        y_test: True labels of the evaluated samples.
        y_pred: Labels predicted for the same samples.

    Precision and recall are computed with ``zero_division=1``; the F1 score
    uses the scikit-learn default.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", matrix)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: ", accuracy)

    precision = precision_score(y_test, y_pred, zero_division=1)
    print("Precision: ", precision)

    recall = recall_score(y_test, y_pred, zero_division=1)
    print("Recall: ", recall)

    f1 = f1_score(y_test, y_pred)
    print("F1 Score: ", f1)


## Logistic Regression

In [None]:
# Fit a logistic regression (at most 400 solver iterations) on the training split.
logistic_regression = LogisticRegression(max_iter=400).fit(X_train, y_train)

# Mean accuracy on the test split, stored in `acc`.
acc = logistic_regression.score(X_test, y_test)

# Predictions on the test split, reported through print_metrics.
y_pred = logistic_regression.predict(X_test)
print_metrics(y_test, y_pred)

## KNN Classifier

In [None]:
# Fit a 7-nearest-neighbours classifier on the training split.
# NOTE(review): the features appear to be used unscaled; KNN is
# distance-based, so standardising them may be worth checking.
knn_classifier = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Mean accuracy on the test split, stored in `acc`.
acc = knn_classifier.score(X_test, y_test)

# Predictions on the test split, reported through print_metrics.
y_pred = knn_classifier.predict(X_test)
print_metrics(y_test, y_pred)