## Import Dependencies

In [1]:
# import dependencies

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine 
from config import postgres_address
import pickle

## Connect to Database

In [2]:
# estblish engine
engine = create_engine(postgres_address)

In [3]:
# connect to database
conn = engine.connect()

In [4]:
# read in data
pd.read_sql('SELECT * FROM stroke_data', conn)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4904,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
4905,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
4906,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
4907,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [5]:
# join datasets and establish dataframe
df = pd.read_sql('SELECT * FROM health_data JOIN demographics ON health_data.id = demographics.id', conn)

In [6]:
df

Unnamed: 0,id,hypertension,heart_disease,avg_glucose_level,bmi,stroke,id.1,gender,age,ever_married,work_type,residence_type,smoking_status
0,9046,0,1,228.69,36.6,1,9046,Male,67.0,Yes,Private,Urban,formerly smoked
1,31112,0,1,105.92,32.5,1,31112,Male,80.0,Yes,Private,Rural,never smoked
2,60182,0,0,171.23,34.4,1,60182,Female,49.0,Yes,Private,Urban,smokes
3,1665,1,0,174.12,24.0,1,1665,Female,79.0,Yes,Self-employed,Rural,never smoked
4,56669,0,0,186.21,29.0,1,56669,Male,81.0,Yes,Private,Urban,formerly smoked
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4904,14180,0,0,103.08,18.6,0,14180,Female,13.0,No,children,Rural,Unknown
4905,44873,0,0,125.20,40.0,0,44873,Female,81.0,Yes,Self-employed,Urban,never smoked
4906,19723,0,0,82.99,30.6,0,19723,Female,35.0,Yes,Self-employed,Rural,never smoked
4907,37544,0,0,166.29,25.6,0,37544,Male,51.0,Yes,Private,Rural,formerly smoked


## Clean Data to Prepare for Machine Learning

In [7]:
# check the count and percentages of cases of target column
print(df['stroke'].value_counts(normalize=True))
print(df['stroke'].value_counts())

0    0.957425
1    0.042575
Name: stroke, dtype: float64
0    4700
1     209
Name: stroke, dtype: int64


In [8]:
# drop null values
df.dropna(inplace=True)

In [9]:
# check if there are null values after dropping
df.isnull().sum()

id                   0
hypertension         0
heart_disease        0
avg_glucose_level    0
bmi                  0
stroke               0
id                   0
gender               0
age                  0
ever_married         0
work_type            0
residence_type       0
smoking_status       0
dtype: int64

In [10]:
# estblish the label encoder
enc=LabelEncoder()

In [11]:
# transform text data into numerical data
gender=enc.fit_transform(df['gender'])
smoking_status=enc.fit_transform(df['smoking_status'])
work_type=enc.fit_transform(df['work_type'])
Residence_type=enc.fit_transform(df['residence_type'])
ever_married=enc.fit_transform(df['ever_married'])

In [12]:
# replace categorical columns with the transformed numerical columns
df['ever_married']=ever_married
df['residence_type']=Residence_type
df['smoking_status']=smoking_status
df['gender']=gender
df['work_type']=work_type

In [13]:
# check that the categorical columns were changed to numerical columns
df[['ever_married', 'residence_type', 'smoking_status', 'gender', 'work_type']]

Unnamed: 0,ever_married,residence_type,smoking_status,gender,work_type
0,1,1,1,1,2
1,1,0,2,1,2
2,1,1,3,0,2
3,1,0,2,0,3
4,1,1,1,1,2
...,...,...,...,...,...
4904,0,0,0,0,4
4905,1,1,2,0,3
4906,1,0,2,0,3
4907,1,0,1,1,2


In [14]:
# drop the "id" column and prepare the data for machine learning
X = df.drop(columns = ['stroke', 'id'])
y = df['stroke']

## Oversample the Data

In [15]:
# import dependencies
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

In [16]:
# resample the data so there are the same number of stoke occurences as the number of non stroke occurences
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [17]:
# check that the count and percentages of cases of target column match
print(y_resampled.value_counts(normalize=True))
print(y_resampled.value_counts())

1    0.5
0    0.5
Name: stroke, dtype: float64
1    4700
0    4700
Name: stroke, dtype: int64


In [18]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=78)

In [19]:
X_train.shape

(7050, 10)

## Machine Learing

In [20]:
# import dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

### Logistic Regression

In [21]:
# create a logistic regression classifier.
lr_model = LogisticRegression(solver='lbfgs', random_state=1)
lr_model

LogisticRegression(random_state=1)

In [22]:
# fit the model
lr_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [23]:
# predict outcomes for test data set
lr_predictions = lr_model.predict(X_test)
pd.DataFrame({"Prediction": lr_predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
4481,0,0
4181,0,0
6209,0,1
2331,0,0
7915,1,1
...,...,...
1172,0,0
1056,1,0
2670,1,0
2058,0,0


In [24]:
# create a summary of the predictions
print(confusion_matrix(y_test, lr_predictions))
print(classification_report(y_test, lr_predictions))

[[903 305]
 [219 923]]
              precision    recall  f1-score   support

           0       0.80      0.75      0.78      1208
           1       0.75      0.81      0.78      1142

    accuracy                           0.78      2350
   macro avg       0.78      0.78      0.78      2350
weighted avg       0.78      0.78      0.78      2350



### Random Forest

In [25]:
# create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [26]:
# fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [27]:
# predict outcomes for test data set
rf_predictions = rf_model.predict(X_test)
pd.DataFrame({"Prediction": rf_predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
4481,0,0
4181,0,0
6209,1,1
2331,0,0
7915,1,1
...,...,...
1172,0,0
1056,0,0
2670,0,0
2058,0,0


In [28]:
# create a summary of the predictions
print(confusion_matrix(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))

[[1193   15]
 [   0 1142]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1208
           1       0.99      1.00      0.99      1142

    accuracy                           0.99      2350
   macro avg       0.99      0.99      0.99      2350
weighted avg       0.99      0.99      0.99      2350



### Support Vector Machines

In [29]:
# create a support vector machines classifier.
svm_model = SVC(kernel='poly')

In [30]:
# fit the model
svm_model.fit(X_train, y_train)

SVC(kernel='poly')

In [31]:
# predict outcomes for test data set
svm_predictions = svm_model.predict(X_test)
pd.DataFrame({"Prediction": svm_predictions, "Actual": y_test}).join(X_test)

Unnamed: 0,Prediction,Actual,hypertension,heart_disease,avg_glucose_level,bmi,gender,age,ever_married,work_type,residence_type,smoking_status
4481,0,0,0,0,73.07,26.8,1,39.00,1,0,0,3
4181,0,0,0,0,73.08,20.4,1,1.72,0,4,1,0
6209,0,1,0,0,162.23,27.3,0,56.00,1,0,0,0
2331,0,0,0,0,70.32,20.5,0,24.00,0,2,1,0
7915,1,1,1,0,190.14,36.5,0,73.00,1,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1172,0,0,0,0,76.66,24.8,0,33.00,1,0,0,2
1056,1,0,0,0,80.85,29.3,0,69.00,1,3,1,1
2670,0,0,0,1,100.96,33.4,0,51.00,1,3,1,2
2058,0,0,0,0,80.25,30.3,1,40.00,1,2,0,1


In [32]:
# create a summary of the predictions
print(confusion_matrix(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))

[[948 260]
 [267 875]]
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1208
           1       0.77      0.77      0.77      1142

    accuracy                           0.78      2350
   macro avg       0.78      0.78      0.78      2350
weighted avg       0.78      0.78      0.78      2350



## Test New Data with the Machine Learning Models

In [33]:
new_data = np.array([[0, 0, 162.23, 27.3, 0, 56, 1, 0, 0, 0]])

In [34]:
print(lr_model.predict(new_data))
print(rf_model.predict(new_data))
print(svm_model.predict(new_data))

[0]
[1]
[0]


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


## Save Model

In [36]:
# save the model to disk
filename = 'flask/finalized_model.sav'
pickle.dump(rf_model, open(filename, 'wb'))