In [1]:
import random # for seeding the stdlib RNG behind randint (reproducible Age/Income jitter)
from random import randint

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTENC # for imbalanced dataset oversampling
from imblearn.pipeline import Pipeline # for sequential oversampling then undersampling
from imblearn.under_sampling import RandomUnderSampler # for imbalanced dataset undersampling to complement SMOTE
from sklearn.ensemble import RandomForestClassifier # for random forest classifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, mean_squared_error # for model evaluation metrics
from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.neural_network import MLPClassifier # for neural network
from sklearn.preprocessing import StandardScaler, OneHotEncoder # for normalisation and one-hot encoding in Neural Network

## Coronary Artery Disease (CAD)

In [2]:
# Load the raw heart-disease records from the CSV in the working directory.
heart = pd.read_csv(filepath_or_buffer='heart.csv')

In [3]:
# Turn the numeric target into labels: 0 -> 'Normal', anything else -> 'Disease'.
heart['HeartDisease'] = np.where(heart['HeartDisease'] == 0, 'Normal', 'Disease')

In [4]:
def chestpaintype(x):
    """Expand a chest-pain code into a readable label.

    'TA', 'ATA' and 'NAP' map to their named labels; every other value
    falls back to "Asymptomatic".
    """
    labels = {
        'TA': "Typical Angina",
        'ATA': "Atypical Angina",
        'NAP': "Non-anginal Pain",
    }
    return labels.get(x, "Asymptomatic")

# Relabel every chest-pain code element-wise with chestpaintype().
heart['ChestPainType'] = heart['ChestPainType'].map(chestpaintype)

In [5]:
# Replace the coded FastingBS (0 / non-0) and Sex ('M' / other) values with readable labels.
heart['FastingBS'] = np.where(
    heart['FastingBS'] == 0,
    'Less than or equal to 120 mg/dl',
    'More than 120 mg/dl',
)
heart['Sex'] = np.where(heart['Sex'] == 'M', 'Male', 'Female')

In [6]:
# Columns kept for modelling, in the order the downstream feature code expects.
# Selecting by name (rather than by position) keeps this cell re-runnable.
HEART_COLUMNS = [
    'Age', 'Gender', 'ChestPainType', 'RestingBloodPressure', 'Cholesterol',
    'FastingBloodSugar', 'RestingECG', 'MaxHR', 'ExerciseInducedAngina',
    'CoronaryArteryDisease',
]

# rename() without inplace returns a new frame, so the cell stays chainable and idempotent
heart = heart.rename(columns={
    'Sex': 'Gender',
    'RestingBP': 'RestingBloodPressure',
    'FastingBS': 'FastingBloodSugar',
    'ExerciseAngina': 'ExerciseInducedAngina',
    'HeartDisease': 'CoronaryArteryDisease',
})
# drop the outlier rows with a resting blood pressure of 0;
# .copy() makes the result an independent frame so the assignment below
# does not raise SettingWithCopyWarning
heart = heart[heart['RestingBloodPressure'] > 0].copy()
# Cholesterol values of 0 are treated as missing: convert them to NaN here and
# impute them after the train-test split, using training data only
heart['Cholesterol'] = heart['Cholesterol'].replace({0: np.nan})
heart = heart[HEART_COLUMNS].reset_index(drop=True)
heart

Unnamed: 0,Age,Gender,ChestPainType,RestingBloodPressure,Cholesterol,FastingBloodSugar,RestingECG,MaxHR,ExerciseInducedAngina,CoronaryArteryDisease
0,40,Male,Atypical Angina,140,289.0,Less than or equal to 120 mg/dl,Normal,172,N,Normal
1,49,Female,Non-anginal Pain,160,180.0,Less than or equal to 120 mg/dl,Normal,156,N,Disease
2,37,Male,Atypical Angina,130,283.0,Less than or equal to 120 mg/dl,ST,98,N,Normal
3,48,Female,Asymptomatic,138,214.0,Less than or equal to 120 mg/dl,Normal,108,Y,Disease
4,54,Male,Non-anginal Pain,150,195.0,Less than or equal to 120 mg/dl,Normal,122,N,Normal
...,...,...,...,...,...,...,...,...,...,...
912,45,Male,Typical Angina,110,264.0,Less than or equal to 120 mg/dl,Normal,132,N,Disease
913,68,Male,Asymptomatic,144,193.0,More than 120 mg/dl,Normal,141,N,Disease
914,57,Male,Asymptomatic,130,131.0,Less than or equal to 120 mg/dl,Normal,115,Y,Disease
915,57,Female,Atypical Angina,130,236.0,Less than or equal to 120 mg/dl,LVH,174,N,Disease


In [7]:
# Placeholder estimator; the grid-search cell rebinds heart_rf to the best fitted candidate.
heart_rf = RandomForestClassifier(random_state=1)

# Feature groups selected by name instead of by position.
# The continuous features are passed through unscaled.
HEART_CONTINUOUS = ['Age', 'RestingBloodPressure', 'Cholesterol', 'MaxHR']
HEART_CATEGORICAL = ['Gender', 'ChestPainType', 'FastingBloodSugar', 'RestingECG', 'ExerciseInducedAngina']

X_cont = heart[HEART_CONTINUOUS]
X_cat = heart[HEART_CATEGORICAL]
Y = heart['CoronaryArteryDisease']

# one-hot encode the categorical variables; the fitted encoder is exported later
ohe_heart = OneHotEncoder().fit(X_cat)
columns = ohe_heart.get_feature_names_out(X_cat.columns)
# .toarray() gives a plain ndarray (todense() would return the legacy np.matrix type)
X_cat = pd.DataFrame(ohe_heart.transform(X_cat).toarray(), columns=columns)

# continuous columns first, then the one-hot columns
X = pd.concat([X_cont.reset_index(drop=True), X_cat.reset_index(drop=True)], axis=1)
# binary target: 1 for 'Disease', 0 otherwise
Y = (Y == 'Disease').astype(int)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

In [8]:
# now we get the average of the value for cholesterol for X_train
avg = int((np.round(X_train['Cholesterol'].mean(),1) + np.round(X_train['Cholesterol'].mean(),1))/2)
X_train['Cholesterol'] = X_train['Cholesterol'].fillna(avg)
X_test['Cholesterol'] = X_test['Cholesterol'].fillna(avg)

In [9]:
# grid search for best configuration
best_config = {}
best_rmse = float('inf')
best_accuracy = 0
for i in range(3,9):
    for j in range(5,16):
            print(f'{i},{j}', end=' -')
            temp = RandomForestClassifier(max_depth=i, min_samples_split=j, random_state=1)
            temp = temp.fit(X_train, Y_train)
            rmse = mean_squared_error(Y_test, temp.predict(X_test)) ** 0.5
            accuracy = accuracy_score(Y_test, temp.predict(X_test))
            if accuracy>best_accuracy:
                best_config = {'max_depth': i, 'min_samples_split': j}
                best_rmse = rmse
                best_accuracy = accuracy
                heart_rf = temp
            elif accuracy==best_accuracy and rmse<best_rmse:
                best_config = {'max_depth': i, 'min_samples_split': j}
                best_accuracy = accuracy
                heart_rf = temp
            print(f' Done! Accuracy is {accuracy}')
print(best_config)

3,5 - Done! Accuracy is 0.8586956521739131
3,6 - Done! Accuracy is 0.8586956521739131
3,7 - Done! Accuracy is 0.8586956521739131
3,8 - Done! Accuracy is 0.8586956521739131
3,9 - Done! Accuracy is 0.8586956521739131
3,10 - Done! Accuracy is 0.8586956521739131
3,11 - Done! Accuracy is 0.8586956521739131
3,12 - Done! Accuracy is 0.8586956521739131
3,13 - Done! Accuracy is 0.8586956521739131
3,14 - Done! Accuracy is 0.8695652173913043
3,15 - Done! Accuracy is 0.8695652173913043
4,5 - Done! Accuracy is 0.8804347826086957
4,6 - Done! Accuracy is 0.8804347826086957
4,7 - Done! Accuracy is 0.8695652173913043
4,8 - Done! Accuracy is 0.8913043478260869
4,9 - Done! Accuracy is 0.8586956521739131
4,10 - Done! Accuracy is 0.8695652173913043
4,11 - Done! Accuracy is 0.8695652173913043
4,12 - Done! Accuracy is 0.8695652173913043
4,13 - Done! Accuracy is 0.8695652173913043
4,14 - Done! Accuracy is 0.8695652173913043
4,15 - Done! Accuracy is 0.8695652173913043
5,5 - Done! Accuracy is 0.8586956521739131

In [10]:
# Confusion matrix of the selected random forest on the held-out split.
heart_test_pred = heart_rf.predict(X_test)
print(confusion_matrix(Y_test, heart_test_pred))

[[36  5]
 [ 5 46]]


In [11]:
# eventual random forest model after grid search
print(f'Train set accuracy: {accuracy_score(Y_train, heart_rf.predict(X_train))}')
print(f'Test set accuracy: {accuracy_score(Y_test, heart_rf.predict(X_test))}')

Train set accuracy: 0.8290909090909091
Test set accuracy: 0.8913043478260869


## Stroke

In [16]:
# Load the raw stroke records from the CSV in the working directory.
stroke = pd.read_csv(filepath_or_buffer='stroke.csv')

In [17]:
# Drop the columns not used for modelling, then every row with a missing value.
stroke = (
    stroke
    .drop(columns=['id', 'ever_married', 'work_type', 'Residence_type'])
    .dropna()
)
# Exclude the 'Other' gender and 'Unknown' smoking-status rows.
# .copy() makes the filtered result an independent frame so the column
# assignments below do not raise SettingWithCopyWarning.
usable_rows = (stroke['gender'] != 'Other') & (stroke['smoking_status'] != 'Unknown')
stroke = stroke[usable_rows].copy()
# Relabel the three indicator columns: 1 -> 'Present', anything else -> 'Absent'.
for indicator in ('hypertension', 'heart_disease', 'stroke'):
    stroke[indicator] = np.where(stroke[indicator] == 1, 'Present', 'Absent')

In [18]:
# Give the stroke columns their display names, then show the cleaned frame.
stroke_column_names = {
    'gender': 'Gender',
    'age': 'Age',
    'hypertension': 'Hypertension',
    'heart_disease': 'CoronaryArteryDisease',
    'avg_glucose_level': 'GlucoseLevel',
    'bmi': 'BMI',
    'smoking_status': 'SmokingStatus',
    'stroke': 'Stroke',
}
stroke = stroke.rename(columns=stroke_column_names)
stroke

Unnamed: 0,Gender,Age,Hypertension,CoronaryArteryDisease,GlucoseLevel,BMI,SmokingStatus,Stroke
0,Male,67.0,Absent,Present,228.69,36.6,formerly smoked,Present
2,Male,80.0,Absent,Present,105.92,32.5,never smoked,Present
3,Female,49.0,Absent,Absent,171.23,34.4,smokes,Present
4,Female,79.0,Present,Absent,174.12,24.0,never smoked,Present
5,Male,81.0,Absent,Absent,186.21,29.0,formerly smoked,Present
...,...,...,...,...,...,...,...,...
5100,Male,82.0,Present,Absent,71.97,28.3,never smoked,Absent
5102,Female,57.0,Absent,Absent,77.93,21.7,never smoked,Absent
5106,Female,81.0,Absent,Absent,125.20,40.0,never smoked,Absent
5107,Female,35.0,Absent,Absent,82.99,30.6,never smoked,Absent


In [19]:
# get X and Y with normalised values
X = stroke.iloc[:,:-1]
X_cont = X.iloc[:,[1,4,5]]
X_cat = X.iloc[:,[0,2,3,6]]
Y = stroke.iloc[:,-1]

# create scaler for continuous variables
stroke_scaler = StandardScaler()
columns = X_cont.columns
X_cont = stroke_scaler.fit_transform(X_cont)

# inverse transform (used for later)
stroke_inverse = stroke_scaler.inverse_transform(X_cont)
X_cont = pd.DataFrame(X_cont, columns=columns)

# create one-hot for categorical variables
ohe_stroke = OneHotEncoder().fit(X_cat)
columns = X_cat.columns
columns = ohe_stroke.get_feature_names_out(columns)
X_cat = pd.DataFrame(ohe_stroke.transform(X_cat).todense(), columns=columns)

X = pd.concat([X_cont.reset_index(drop=True),X_cat.reset_index(drop=True)], axis=1)

# train test split with seed of 1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

# SMOTE resampling to balance predictor
over = SMOTENC(categorical_features=[i for i in range(3,12)], sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.6)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_train, Y_train = pipeline.fit_resample(X_train, Y_train)

In [20]:
# Stroke classifier: MLP with three hidden layers (8, 16, 8), adam solver, fixed seed.
stroke_nn = MLPClassifier(
    hidden_layer_sizes=(8, 16, 8),
    solver="adam",
    max_iter=10000,
    random_state=1,
)

In [21]:
# Train the stroke network on the resampled training split.
# Left as a bare expression so the cell displays the fitted estimator.
stroke_nn.fit(X_train, Y_train)

MLPClassifier(hidden_layer_sizes=(8, 16, 8), max_iter=10000, random_state=1)

In [22]:
# Predicted labels for the held-out split.
# NOTE(review): `pred` is not read by the visible evaluation cells, which call
# predict() again - confirm whether this cell is still needed.
pred = stroke_nn.predict(X_test)

In [23]:
# Confusion matrix of the stroke network on the held-out split.
stroke_test_pred = stroke_nn.predict(X_test)
print(confusion_matrix(Y_test, stroke_test_pred))

[[285  35]
 [  8  15]]


In [24]:
# Accuracy of the stroke network on the training split and the held-out split.
stroke_train_accuracy = accuracy_score(Y_train, stroke_nn.predict(X_train))
stroke_test_accuracy = accuracy_score(Y_test, stroke_nn.predict(X_test))
print(f'Train set accuracy: {stroke_train_accuracy}')
print(f'Test set accuracy: {stroke_test_accuracy}')

Train set accuracy: 0.8368589743589744
Test set accuracy: 0.8746355685131195


## Diabetes

In [26]:
# Load the raw diabetes survey records from the CSV in the working directory.
diabetes = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [28]:
# Preview the raw frame before any cleaning (pandas truncates the display).
diabetes

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,0.0,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,2.0,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,0.0,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,0.0,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


In [29]:
# Remove the survey columns that are not used as model features.
unused_diabetes_columns = [
    'CholCheck', 'AnyHealthcare', 'NoDocbcCost', 'MentHlth',
    'PhysHlth', 'DiffWalk', 'Education',
]
diabetes = diabetes.drop(columns=unused_diabetes_columns)
def convert_age(x):
    """Turn an age-category code into a jittered age value.

    - code 14 returns NaN;
    - code 1 returns 21 plus a random integer in [-3, 3];
    - any other code returns 17 + 5 * code plus a random integer in [-2, 2].

    The jitter comes from the stdlib `randint`, so results vary between calls
    unless the `random` module has been seeded.
    """
    if x == 14:
        return np.nan
    if x == 1:
        return 21 + randint(-3, 3)
    return 17 + (x * 5) + randint(-2, 2)
def convert_income(x, scale=1.5):
    """Turn an income-category code into a random amount within that bracket.

    Brackets (before scaling), by code:
        1 -> 0..9999; 2..4 -> code * 5000 + 0..4999; 5 -> 25000..34999;
        6 -> 35000..49999; 7 -> 50000..74999; anything higher -> 75000..100000.

    Parameters
    ----------
    x : number
        Income category code. A NaN code returns NaN instead of falling through
        to the top bracket, so missing incomes are not given a made-up value.
    scale : float, default 1.5
        Multiplier applied to the sampled amount.
        NOTE(review): the meaning of 1.5 is not documented here (possibly a
        currency conversion) - confirm.

    Returns
    -------
    float
        The scaled amount, or NaN for a NaN code. The draw uses the stdlib
        `randint`, so results vary unless the `random` module has been seeded.
    """
    if x != x:  # only NaN is unequal to itself
        return np.nan
    if x == 1:
        amount = randint(0, 9999)
    elif x <= 4:
        amount = x * 5000 + randint(0, 4999)
    elif x == 5:
        amount = randint(25000, 34999)
    elif x == 6:
        amount = randint(35000, 49999)
    elif x == 7:
        amount = randint(50000, 74999)
    else:
        amount = randint(75000, 100000)
    return amount * scale
# Seed the stdlib RNG so the jittered Age/Income values, and everything derived
# from them (the cleaned CSV and the trained model), are the same on every
# Restart & Run All.
random.seed(1)
diabetes['Age'] = diabetes['Age'].apply(convert_age)
diabetes['Income'] = diabetes['Income'].apply(convert_income)
# drop rows whose conversion produced NaN; .copy() gives an independent frame
# for the column assignments below
diabetes = diabetes.dropna().copy()

# Coded column -> (label for 0, label for any other value).
# For Diabetes_012 every non-zero code is reported as 'Present'.
BINARY_LABELS = {
    'Diabetes_012': ('Absent', 'Present'),
    'HighBP': ('Absent', 'Present'),
    'HighChol': ('Normal', 'High'),
    'Smoker': ('Never', 'Previously/Existing'),
    'Stroke': ('Never', 'Previously'),
    'PhysActivity': ('Normal', 'Active'),
    'HeartDiseaseorAttack': ('Absent', 'Present'),
    'Fruits': ('Infrequent', 'Frequent'),
    'Veggies': ('Infrequent', 'Frequent'),
    'HvyAlcoholConsump': ('Infrequent', 'Frequent'),
    'Sex': ('Female', 'Male'),
}
for coded_column, (zero_label, other_label) in BINARY_LABELS.items():
    diabetes[coded_column] = np.where(diabetes[coded_column] == 0, zero_label, other_label)

# flip the GenHlth scale (value becomes 6 - value)
diabetes['GenHlth'] = 6 - diabetes['GenHlth']

In [30]:
# Give the diabetes columns their display names, then show the cleaned frame.
diabetes_column_names = {
    'Diabetes_012': 'Diabetes',
    'HighBP': 'Hypertension',
    'HighChol': 'HighCholesterol',
    'HeartDiseaseorAttack': 'CoronaryArteryDisease',
    'PhysActivity': 'ActiveLifestyle',
    'Fruits': 'FruitsConsumption',
    'Veggies': 'VegetableConsumption',
    'HvyAlcoholConsump': 'AlcoholConsumption',
    'GenHlth': "GeneralHealth",
}
diabetes = diabetes.rename(columns=diabetes_column_names)
diabetes

Unnamed: 0,Diabetes,Hypertension,HighCholesterol,BMI,Smoker,Stroke,CoronaryArteryDisease,ActiveLifestyle,FruitsConsumption,VegetableConsumption,AlcoholConsumption,GeneralHealth,Sex,Age,Income
0,Absent,Present,High,40.0,Previously/Existing,Never,Absent,Normal,Infrequent,Frequent,Infrequent,1.0,Female,60.0,29788.5
1,Absent,Absent,Normal,25.0,Previously/Existing,Never,Absent,Active,Infrequent,Infrequent,Infrequent,3.0,Female,50.0,3400.5
2,Absent,Present,High,28.0,Never,Never,Absent,Normal,Frequent,Infrequent,Infrequent,1.0,Female,63.0,125104.5
3,Absent,Present,Normal,27.0,Never,Never,Absent,Active,Frequent,Frequent,Infrequent,4.0,Female,72.0,53820.0
4,Absent,Present,High,24.0,Never,Never,Absent,Active,Frequent,Frequent,Infrequent,4.0,Female,73.0,35901.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,Absent,Present,High,45.0,Never,Never,Absent,Normal,Frequent,Frequent,Infrequent,3.0,Male,42.0,85630.5
253676,Present,Present,High,18.0,Never,Never,Absent,Normal,Infrequent,Infrequent,Infrequent,2.0,Female,71.0,34782.0
253677,Absent,Absent,Normal,28.0,Never,Never,Absent,Active,Frequent,Infrequent,Infrequent,5.0,Female,26.0,17796.0
253678,Absent,Present,Normal,23.0,Never,Never,Absent,Normal,Frequent,Frequent,Infrequent,3.0,Male,52.0,5379.0


In [31]:
# Feature groups selected by name instead of by position.
DIABETES_CONTINUOUS = ['BMI', 'GeneralHealth', 'Age', 'Income']
DIABETES_CATEGORICAL = [
    'Hypertension', 'HighCholesterol', 'Smoker', 'Stroke', 'CoronaryArteryDisease',
    'ActiveLifestyle', 'FruitsConsumption', 'VegetableConsumption',
    'AlcoholConsumption', 'Sex',
]

X_cont = diabetes[DIABETES_CONTINUOUS]
X_cat = diabetes[DIABETES_CATEGORICAL]
Y = diabetes['Diabetes']

# standardise the continuous variables; the fitted scaler is exported later
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test rows influence the scaling statistics. Fitting on the train split only
# would remove that leakage but would change the exported scaler - confirm first.
diabetes_scaler = StandardScaler()
columns = X_cont.columns
X_cont = diabetes_scaler.fit_transform(X_cont)

# inverse transform (used for later)
diabetes_inverse = diabetes_scaler.inverse_transform(X_cont)
X_cont = pd.DataFrame(X_cont, columns=columns)

# one-hot encode the categorical variables; the fitted encoder is exported later
ohe_diabetes = OneHotEncoder().fit(X_cat)
columns = ohe_diabetes.get_feature_names_out(X_cat.columns)
# .toarray() gives a plain ndarray (todense() would return the legacy np.matrix type)
X_cat = pd.DataFrame(ohe_diabetes.transform(X_cat).toarray(), columns=columns)

# continuous columns first, then the one-hot columns
X = pd.concat([X_cont.reset_index(drop=True), X_cat.reset_index(drop=True)], axis=1)

# train test split with seed of 1
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=1)

# Rebalance the TRAIN split only with SMOTENC oversampling.
# The categorical positions are every column after the continuous block, derived
# from the frame shapes rather than hard-coded.
categorical_idx = list(range(X_cont.shape[1], X.shape[1]))
# random_state makes the synthetic samples (and the model trained on them) reproducible
over = SMOTENC(categorical_features=categorical_idx, sampling_strategy=0.3, k_neighbors=10, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.3, random_state=1) # basically no undersampling here
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_train, Y_train = pipeline.fit_resample(X_train, Y_train)

In [32]:
# Diabetes classifier: MLP with three hidden layers (12, 24, 12), adam solver, fixed seed.
diabetes_nn = MLPClassifier(
    hidden_layer_sizes=(12, 24, 12),
    solver="adam",
    max_iter=10000,
    random_state=1,
)

In [33]:
# Train the diabetes network on the resampled training split.
# Left as a bare expression so the cell displays the fitted estimator.
diabetes_nn.fit(X_train, Y_train)

MLPClassifier(hidden_layer_sizes=(12, 24, 12), max_iter=10000, random_state=1)

In [34]:
# Predicted labels for the held-out split.
# NOTE(review): `pred` is not read by the visible evaluation cells, which call
# predict() again - confirm whether this cell is still needed.
pred = diabetes_nn.predict(X_test)

In [35]:
# Confusion matrix of the diabetes network on the held-out split.
diabetes_test_pred = diabetes_nn.predict(X_test)
print(confusion_matrix(Y_test, diabetes_test_pred))

[[19776  1564]
 [ 2500  1528]]


In [36]:
# Train accuracy comes out lower than test accuracy here. The training split was
# resampled (SMOTENC) while the test split was not, so the two scores are measured
# on different class mixes; the gap is under 5 percentage points.
diabetes_train_accuracy = accuracy_score(Y_train, diabetes_nn.predict(X_train))
diabetes_test_accuracy = accuracy_score(Y_test, diabetes_nn.predict(X_test))
print(f'Train set accuracy: {diabetes_train_accuracy}')
print(f'Test set accuracy: {diabetes_test_accuracy}')

Train set accuracy: 0.8058648047731017
Test set accuracy: 0.8397981709239988


In [37]:
# Write the cleaned, relabelled diabetes frame to disk without the index column.
diabetes.to_csv(path_or_buf='diabetes_cleaned.csv', index=False)

## Export Models

In [38]:
import joblib

In [39]:
# Persist the three trained models. The last dump stays a bare expression so the
# cell still displays its return value.
for fitted_model, model_path in ((heart_rf, "coronary_rf.gz"), (stroke_nn, "stroke_nn.gz")):
    joblib.dump(fitted_model, model_path)
joblib.dump(diabetes_nn, "diabetes_nn.gz")

['diabetes_nn.gz']

In [40]:
# Persist the fitted preprocessors. There is no heart scaler to export: the
# heart features go into the random forest unscaled.
one_hot_exports = (
    (ohe_heart, 'coronary_onehot.joblib'),
    (ohe_stroke, 'stroke_onehot.joblib'),
    (ohe_diabetes, 'diabetes_onehot.joblib'),
)
for encoder, encoder_path in one_hot_exports:
    joblib.dump(encoder, encoder_path)
# the scalers are written compressed; the last dump stays a bare expression so
# the cell still displays its return value
joblib.dump(stroke_scaler, "stroke_scaler.bin", compress=True)
joblib.dump(diabetes_scaler, "diabetes_scaler.bin", compress=True)

['diabetes_scaler.bin']