In [35]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
import random
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import IsolationForest

In [82]:
n = 10000
skip = sorted(random.sample(range(1,319795),319795-n))
DF = pd.read_csv('../../datasets/heart_2020_cleaned.csv',skiprows=skip).reset_index().rename(columns={'index':'id'})

In [153]:
# DF = pd.read_csv('../../datasets/heart_2020_cleaned.csv').reset_index().rename(columns={'index':'id'})
CatCols = [
    'Smoking','AlcoholDrinking','Stroke','DiffWalking','Sex','AgeCategory','Race','Diabetic',
    'PhysicalActivity','GenHealth','Asthma','KidneyDisease','SkinCancer'
]
NumCols = [
    'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'
]

X = DF[DF.columns.difference(['id','HeartDisease'])]
Y = DF['HeartDisease']

In [154]:
X.head()

Unnamed: 0,AgeCategory,AlcoholDrinking,Asthma,BMI,Diabetic,DiffWalking,GenHealth,KidneyDisease,MentalHealth,PhysicalActivity,PhysicalHealth,Race,Sex,SkinCancer,SleepTime,Smoking,Stroke
0,55-59,No,Yes,16.6,Yes,No,Very good,No,30.0,Yes,3.0,White,Female,Yes,5.0,Yes,No
1,80 or older,No,No,20.34,No,No,Very good,No,0.0,Yes,0.0,White,Female,No,7.0,No,Yes
2,65-69,No,Yes,26.58,Yes,No,Fair,No,30.0,Yes,20.0,White,Male,No,8.0,Yes,No
3,75-79,No,No,24.21,No,No,Good,No,0.0,No,0.0,White,Female,Yes,6.0,No,No
4,40-44,No,No,23.71,No,Yes,Very good,No,0.0,Yes,28.0,White,Female,No,8.0,No,No


In [155]:
def columns_OHC_transforme(DF,columns):
    merged = pd.DataFrame()
    for column_name in columns:
        label_encoder = LabelEncoder()
        encoded = label_encoder.fit_transform(DF[column_name])
        labels =pd.Series(label_encoder.classes_)

        clmt = ColumnTransformer([(column_name,OneHotEncoder(),[0])], remainder='passthrough')
        clmt_fited_data = clmt.fit_transform(DF[[column_name]]) 
        clmt_fited_data = clmt_fited_data if type(clmt_fited_data)== np.ndarray else clmt_fited_data.toarray()
        NDF = pd.DataFrame(clmt_fited_data,columns=[f'{column_name}_{x}' for x in labels]).set_index(DF.index)
        merged = pd.concat([merged,NDF],axis=1)
    DF = DF.drop(columns=columns)
    DF = DF.merge(merged, left_index=True,right_index=True)

    return DF

In [156]:
X = columns_OHC_transforme(X,CatCols) 
Y_label_encoder = LabelEncoder()
Y = DF.iloc[X.index]['HeartDisease']
Y = Y_label_encoder.fit_transform(Y)

In [157]:
ss = StandardScaler()
X[NumCols] = ss.fit_transform(X[NumCols])

In [158]:
isoforest = IsolationForest(n_estimators=3000,max_features=50)
X['is_noise'] = isoforest.fit_predict(X)
X = X[X['is_noise'] == 1]
Y = Y[X.index]
X = X[X.columns.difference(['is_noise'])]

In [159]:
toml =RandomOverSampler(random_state=3020)
X, Y = toml.fit_resample(X,pd.DataFrame(Y,columns=['HeartDisease']))
# DF = pd.concat([X,pd.DataFrame(Y,columns=['HeartDisease'])],axis=1)

In [160]:
Y = pd.get_dummies(Y['HeartDisease'])

In [161]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state=3020,train_size=.90)

In [162]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(50,activation='relu',input_shape=(50,)),
    tf.keras.layers.Dense(100,activation='relu'),
#     tf.keras.layers.Dense(100,activation='relu'),
#     tf.keras.layers.Dense(100,activation='relu'),
#     tf.keras.layers.Dense(200,activation='relu'),
    tf.keras.layers.Dense(2,activation='softmax')
])

model.compile(
    optimizer=tf.keras.optimizers.Nadam(0.01),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.CategoricalAccuracy()]
)

In [163]:
model.fit(X_train, Y_train, epochs=200, validation_data=(X_test,Y_test), batch_size=60000)

Epoch 1/200


  output, from_logits = _get_logits(




  output, from_logits = _get_logits(


Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200


Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200


Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200


KeyboardInterrupt: 

In [164]:
model.summary()

Model: "sequential_32"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_110 (Dense)           (None, 50)                2550      
                                                                 
 dense_111 (Dense)           (None, 100)               5100      
                                                                 
 dense_112 (Dense)           (None, 2)                 202       
                                                                 
Total params: 7,852
Trainable params: 7,852
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.predict(X_test)[0]



array([1.0000000e+00, 4.7968434e-09], dtype=float32)

In [33]:
np.argmax(model.predict(X_test)[0])



0