# Do you know your stroke risk?

Source of the data: https://www.sciencedirect.com/science/article/pii/S0933365719302295?via%3Dihub
Liu, Tianyu; Fan, Wenhui; Wu, Cheng (2019), “Data for: A hybrid machine learning approach to cerebral stroke prediction based on imbalanced medical-datasets”, Mendeley Data, V1, doi: 10.17632/x8ygrw87jw.1

The medical dataset contains 43,400 records of potential patients, including 783 occurrences of stroke.

Cerebral stroke has become a significant global public health issue. The ideal solution is to prevent strokes in advance by controlling the related metabolic factors. However, it is difficult for medical staff to decide whether special precautions are needed for a potential patient based only on the monitoring of physiological indicators, unless those indicators are obviously abnormal. This project builds a machine learning model to predict whether someone is at risk of having a stroke.

Each row of the data includes numerical factors, such as age and average glucose level, and categorical factors, such as "has heart disease" (yes or no), work type, and smoking status. This is not an exhaustive list. We use this data to determine which factors contribute to having a stroke and, among those, which carry the most weight.

## Basic analysis of the input dataset

In [None]:
# Dependencies and Setup
import pandas as pd
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sts
#import seaborn as sns
%matplotlib inline
#sns.set_style('whitegrid')
import tensorflow
#tensorflow.keras.__version__
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

# Read the stroke dataset; skipinitialspace drops blanks that follow each comma
input_csv = pd.read_csv("data/stroke_dataset.csv", delimiter=',', skipinitialspace=True)
# Show (rows, columns) of the raw table
input_csv.shape

### Neural Network Code Below

In [None]:
def neural_net(X, y, filename, epochs=100, random_state=50):
    """Train a dense softmax classifier on ``X``/``y``, evaluate it and save it.

    The data is split with ``train_test_split``, features are min-max scaled
    using statistics from the training split only, and labels are
    label-encoded then one-hot encoded. The network has three hidden layers
    of 55 ReLU units and a softmax output with one unit per class found in
    the training labels (two for the binary stroke target).

    NOTE: the fitted scaler is neither returned nor saved, so code that
    reloads the model has to rescale its inputs on its own.

    Args:
        X: Feature table; ``X.shape[1]`` sets the network's input width.
        y: Target labels aligned with ``X``.
        filename: Path stem; the model is written to ``filename + ".h5"``.
        epochs: Number of training epochs (default 100).
        random_state: Seed passed to ``train_test_split`` (default 50).

    Returns:
        None. Metrics and class counts are printed; the model is saved to disk.
    """
    dim = X.shape[1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)
    print(X_test.shape)
    print(y_test.shape)

    # Fit the scaler on the training split only so no test statistics leak in
    X_scaler = MinMaxScaler().fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)

    # Step 1: Label-encode data set
    label_encoder = LabelEncoder()
    label_encoder.fit(y_train)
    encoded_y_train = label_encoder.transform(y_train)
    encoded_y_test = label_encoder.transform(y_test)
    n_classes = len(label_encoder.classes_)

    # Step 2: Convert encoded labels to one-hot-encoding.
    # Pin the width so a split that happens to lack the highest class still
    # produces arrays that match the output layer.
    y_train_categorical = to_categorical(encoded_y_train, num_classes=n_classes)
    y_test_categorical = to_categorical(encoded_y_test, num_classes=n_classes)

    # Create model and add layers
    model = Sequential()
    model.add(Dense(units=55, activation='relu', input_dim=dim))
    model.add(Dense(units=55, activation='relu'))
    model.add(Dense(units=55, activation='relu'))
    model.add(Dense(units=n_classes, activation='softmax'))

    # Compile and fit the model
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()
    model.fit(
        X_train_scaled,
        y_train_categorical,
        epochs=epochs,
        shuffle=True,
        verbose=1,
    )
    model_loss, model_accuracy = model.evaluate(
        X_test_scaled, y_test_categorical, verbose=2)

    # Blank separator before the results (the original printed a literal "/n")
    print("\n")
    print("Test Results")
    print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

    # Report the class balance of each split to put the accuracy in context
    uniqueValues_train, occurCount_train = np.unique(y_train, return_counts=True)
    uniqueValues_test, occurCount_test = np.unique(y_test, return_counts=True)
    print(f"train unique values {uniqueValues_train}")
    print(f"train occur count {occurCount_train}")

    print(f"test unique values {uniqueValues_test}")
    print(f"test occur count {occurCount_test}")

    model.save(filename + ".h5")

In [None]:
# Display the raw input table as loaded from the CSV, for a quick preview
input_csv

In [None]:
# Plot a 20-bin histogram of the age column

x = input_csv['age']
num_bins = 20
# Without density=True, plt.hist draws the raw number of records per bin
n, bins, patches = plt.hist(x, num_bins, facecolor='blue', alpha=0.5)
# NOTE(review): the y-axis is clipped to 1000-3000, so bar heights are not
# proportional to counts and any bin below 1000 is hidden - confirm intended
plt.ylim([1000, 3000])

plt.xlabel('Age')
# The bars are counts, not probabilities, so label them accordingly
plt.ylabel('Count')
plt.title(r'Histogram of Age')

# Tweak spacing to prevent clipping of ylabel
plt.subplots_adjust(left=0.15)
plt.show()

### Looking into balanced datasets

In [None]:
# Work on an independent (deep) copy so the raw table stays untouched
stroke_data_df = input_csv.copy()

In [None]:
# Discard every row that has at least one missing value
stroke_data_df = stroke_data_df.dropna(axis=0, how="any")


In [None]:
# Preview the first rows after dropping incomplete records
stroke_data_df.head()

In [None]:
# (rows, columns) remaining after dropna
stroke_data_df.shape

### Isolating Positive Stroke Data

In [None]:
# Keep only the records whose stroke flag is 1
stroke_positive = stroke_data_df.loc[stroke_data_df['stroke'].eq(1)]

In [None]:
# Preview the first stroke-positive rows
stroke_positive.head()

In [None]:
# (rows, columns) of the stroke-positive subset
stroke_positive.shape

### Isolating Negative Stroke Data

In [None]:
# Keep only the records whose stroke flag is 0
stroke_negative = stroke_data_df.loc[stroke_data_df['stroke'].eq(0)]

In [None]:
# Preview the first stroke-negative rows
stroke_negative.head()

In [None]:
# (rows, columns) of the stroke-negative subset
stroke_negative.shape

In [None]:
# Report how many complete records fall in each class
print(
    f"Total Number of Stroke Positive {len(stroke_positive)}",
    f"Total Number of Stroke Negative {len(stroke_negative)}",
    sep="\n",
)

## Mapping String Data to Numeric



In [None]:
# Integer codes for each categorical column; a label's position in its tuple
# is the code it is mapped to.
Gender = {label: code for code, label in enumerate(("Female", "Male", "Other"))}
Married = {label: code for code, label in enumerate(("No", "Yes"))}
Work_Type = {
    label: code
    for code, label in enumerate(
        ("Private", "Self-employed", "children", "Govt_job", "Never_worked")
    )
}
Residence = {label: code for code, label in enumerate(("Urban", "Rural"))}
Smoking = {
    label: code
    for code, label in enumerate(
        ("never smoked", "formerly smoked", "smokes", "unknown")
    )
}

In [None]:
# Count how often each smoking_status label occurs among stroke-negative rows
stroke_negative["smoking_status"].value_counts()

In [None]:
# Swap the string categories of the negative set for their integer codes,
# column by column
stroke_negative = stroke_negative.replace(
    to_replace={
        "gender": Gender,
        "ever_married": Married,
        "work_type": Work_Type,
        "Residence_type": Residence,
        "smoking_status": Smoking,
    }
)

In [None]:
# Swap the string categories of the positive set for their integer codes,
# column by column
stroke_positive = stroke_positive.replace(
    to_replace={
        "gender": Gender,
        "ever_married": Married,
        "work_type": Work_Type,
        "Residence_type": Residence,
        "smoking_status": Smoking,
    }
)

In [None]:
# Preview the negative set after the categorical columns were replaced by codes
stroke_negative.head()

In [None]:
# Check the distribution of the encoded ever_married values
stroke_negative["ever_married"].value_counts()

In [None]:
# Inspect the column dtypes after the replacement
stroke_negative.dtypes

## Creating various Datasets

### Creating Data Set for Analysis - 2:1 Negative-to-Positive Sample

In [None]:
# Draw the rows used for training: 1096 negatives and 548 positives.
# A fixed seed makes the sample, and therefore the trained model, reproducible
# across notebook runs.
# NOTE(review): both sizes are hard-coded; .sample raises ValueError if a frame
# has fewer rows than requested.
stroke_negative_sample = stroke_negative.sample(1096, random_state=50)
stroke_positive_sample = stroke_positive.sample(548, random_state=50)

In [None]:
# Stack the two samples row-wise. pd.concat is a plain union of rows, whereas
# an outer merge joins on every shared column, which re-sorts the rows and
# would collapse any row that appears identically in both frames.
stroke_sample = pd.concat(
    [stroke_negative_sample, stroke_positive_sample], ignore_index=True
)

In [None]:
# Sanity-check the sizes of the two samples and of the combined table
print(
    f"Negative data set {len(stroke_negative_sample)}",
    f"Positive data set {len(stroke_positive_sample)}",
    f"Combined data set {len(stroke_sample)}",
    f"Shape of combined {stroke_sample.shape}",
    sep="\n",
)

In [None]:
# Separate the target column from the features.
# NOTE(review): every column other than "stroke" becomes a feature; if the CSV
# carries a record identifier it is still in X - confirm.
y = stroke_sample["stroke"]
X = stroke_sample.drop(columns="stroke")
print(X.shape, y.shape)

In [None]:
# Train on the sampled set; the fitted model is written to NN_3.h5
neural_net(X,y, "NN_3")

## Loading a model to test performance

In [None]:
# Reload the trained network from its .h5 file
from tensorflow.keras.models import load_model

filename = "NN_3"
stroke_model = load_model(f"{filename}.h5")

In [None]:
# Build a larger, heavily imbalanced sample to score the reloaded model on.
# A fixed seed keeps the evaluation set reproducible.
# NOTE(review): these rows come from the same frames the training sample was
# drawn from, so some of them may already have been seen in training - confirm
# this is intended.
stroke_negative_sample = stroke_negative.sample(20000, random_state=50)
stroke_positive_sample = stroke_positive.sample(548, random_state=50)
# Row-wise union; an outer merge would join on every shared column instead
stroke_sample = pd.concat(
    [stroke_negative_sample, stroke_positive_sample], ignore_index=True
)

In [None]:
# Separate the target column from the features of the evaluation sample
y = stroke_sample["stroke"]
X = stroke_sample.drop(columns="stroke")
print(X.shape, y.shape)

In [None]:
# Min-max scale the evaluation features.
# NOTE(review): this fits a fresh scaler on the evaluation data; neural_net
# fitted its own scaler on the training split and does not return or save it,
# so the scaling here may differ from what the model was trained on.
X_scaler = MinMaxScaler()
X_scaled = X_scaler.fit_transform(X)

In [None]:
# Turn the true labels into integer codes, then into one-hot rows
label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)
y_categorical = to_categorical(encoded_y)


In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6; for a softmax
# output the predicted class is the index of the largest probability.
encoded_predictions = np.argmax(stroke_model.predict(X_scaled), axis=-1)

In [None]:
# Spot-check the predicted class at position 1499
# NOTE(review): the true label inspected in the next cell is at position 0,
# so the two cells do not compare the same record
encoded_predictions[1499]

In [None]:
# Spot-check the encoded true label of the first record
encoded_y[0]

In [None]:
# Tally the four confusion-matrix cells by walking predictions and true
# labels in lockstep.
TN = TP = FN = FP = 0
for predicted, actual in zip(encoded_predictions, encoded_y):
    if predicted == 0:
        # Predicted "no stroke": correct only when the label is also 0
        if actual == 0:
            TN += 1
        else:
            FN += 1
    elif predicted == 1:
        # Predicted "stroke": correct only when the label is also 1
        if actual == 1:
            TP += 1
        else:
            FP += 1

total = TP + TN + FP + FN
print(f"Total samples: {total}")
print(f"True Positive: {TP}")
print(f"True Negative: {TN}")
print(f"False Positive: {FP}")
print(f"False Negative: {FN}")