In [1]:
import pandas as pd
import numpy  as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import random
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Model,Sequential,load_model
from tensorflow.keras.layers import Input,Dense,Flatten,Embedding,Conv1D,MaxPool1D,concatenate,Dropout
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from imblearn.under_sampling import RandomUnderSampler 

2022-12-14 19:40:06.651717: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-14 19:40:06.651748: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-14 19:40:07.558985: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-14 19:40:07.559097: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
# Number of data rows expected in the CSV (header excluded) and how many of
# them to keep; lower `n` to work on a random subsample.
TOTAL_ROWS = 319795
n = 319795
# Dedicated, seeded generator so that a subsample is identical on every
# Restart & Run All (the module-level `random` functions are unseeded here).
_row_sampler = random.Random(42)
# Line 0 of the file is the header, so data rows are lines 1..TOTAL_ROWS.
# The range must end at TOTAL_ROWS + 1, otherwise the last row can never be
# skipped and n = 0 would ask for more samples than the population holds.
skip = sorted(_row_sampler.sample(range(1, TOTAL_ROWS + 1), TOTAL_ROWS - n))
DF = (
    pd.read_csv('../../../datasets/heart_2020_cleaned.csv', skiprows=skip)
    .reset_index(drop=True)
    .rename(columns={'HeartDisease': 'target'})
)

In [3]:
# Column roles: the label, the numerical features, and everything else
# (except 'Race', which is left out of the working frame) as categorical.
TARGET = 'target'
NumCols = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
_not_categorical = [*NumCols, TARGET, 'Race']
CatCols = DF.columns.difference(_not_categorical)
# Keep only the columns in use, ordered target -> numerical -> categorical.
_working_columns = [TARGET, *NumCols, *CatCols]
DF = DF.loc[:, _working_columns]

In [4]:
def label_encoder(DF,cols = None):
    """Return a copy of ``DF`` with the given columns integer-encoded.

    Parameters
    ----------
    DF : pandas.DataFrame
        Source frame; it is copied and never modified.
    cols : iterable of column labels, optional
        Columns to encode. ``None`` (the default) or an empty iterable
        leaves every column unchanged.

    Returns
    -------
    pandas.DataFrame
        The copy, with each listed column replaced by the output of a
        fresh ``LabelEncoder`` fitted on that column alone. The fitted
        encoders are not kept, so the mapping cannot be inverted later.
    """
    # `None` instead of a mutable `[]` default: a list default is shared
    # between calls and is an easy source of accidental state.
    selected = () if cols is None else cols
    encoded = DF.copy()
    for col in selected:
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

# <font color='green'>Label Encoder </font>

In [5]:
# Integer-encode the target and every categorical column, then put the
# columns back in target -> numerical -> categorical order.
_columns_to_encode = [TARGET, *CatCols]
_column_order = [TARGET, *NumCols, *CatCols]
DF = label_encoder(DF, cols=_columns_to_encode)[_column_order]

# <font color='green'>Scale Numerical Features</font>

In [6]:
def feature_scaler(DF,Cols):
    """Return a copy of ``DF`` with every column in ``Cols`` min-max scaled.

    Each column gets its own ``MinMaxScaler`` fitted on that column only;
    the input frame is not modified and the fitted scalers are discarded.
    An empty ``Cols`` returns an unchanged copy.

    NOTE(review): the scaler is fitted on whatever rows are passed in, so
    calling this before a train/test split lets test rows influence the
    min/max used for the training rows.
    """
    scaled = DF.copy()
    for name in Cols:
        column_scaler = MinMaxScaler()
        # Double brackets: the scaler is handed a one-column frame.
        single_column = scaled[[name]]
        scaled[name] = column_scaler.fit_transform(single_column)
    return scaled

In [7]:
# Min-max scale the numerical columns; the other columns pass through.
DF = feature_scaler(DF, Cols=NumCols)

# <font color='green'>Train Test Split</font>

In [8]:
# 80/20 hold-out split with a fixed seed. `Index.difference` returns the
# feature labels sorted, so X_train and X_test share that column order.
_feature_columns = DF.columns.difference([TARGET])
X_train, X_test, Y_train, Y_test = train_test_split(
    DF[_feature_columns],
    DF[TARGET],
    random_state=42,
    test_size=0.2,
)
# Training features and label side by side, for the row filtering below.
DF_train = pd.concat([X_train, Y_train], axis='columns')

# <font color='green'>Drop Noisy Data</font>

In [9]:
# (rows, columns) of the training frame before outlier removal; the column
# count includes the target column.
DF_train.shape

(255836, 17)

In [None]:
# Local Outlier Factor over the training features, using 1% of the training
# row count as the neighbourhood size. The code below treats a prediction
# of -1 as noise and 1 as a row to keep.
_neighbour_count = DF_train.shape[0] // 100
OutlierDetector = LocalOutlierFactor(n_neighbors=_neighbour_count)
_lof_features = DF_train[DF_train.columns.difference([TARGET])]
Y_predit = OutlierDetector.fit_predict(_lof_features)
_noise_percent = Y_predit[Y_predit == -1].shape[0] / DF_train.shape[0] * 100
print(f'noise data : {_noise_percent}%')

In [None]:
# Keep only the rows whose outlier prediction is 1. The flag lives on a
# temporary copy, and the remaining columns come back sorted by label
# (`Index.difference` sorts), target included.
_flagged = DF_train.assign(is_noise=Y_predit)
_kept_columns = _flagged.columns.difference(['is_noise'])
DF_train = _flagged.loc[_flagged['is_noise'] == 1, _kept_columns]

In [None]:
# (rows, columns) of the training frame after the outlier rows were dropped.
DF_train.shape

# <font color='green'>Handle Imbalanced Data</font>

In [None]:
# Random under-sampling of the training rows. The variable name is
# historical; the object is a RandomUnderSampler.
smoteomek = RandomUnderSampler(random_state=3020)
_features_before = DF_train[DF_train.columns.difference([TARGET])]
_labels_before = DF_train[TARGET]
X_train, Y_train = smoteomek.fit_resample(_features_before, _labels_before)

In [None]:
# 'balanced' class weights for the resampled training labels, keyed by the
# position of each class in the sorted unique labels (0, 1, ...).
_present_classes = np.unique(Y_train)
_balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=_present_classes,
    y=Y_train,
)
class_weight = {position: weight for position, weight in enumerate(_balanced_weights)}
class_weight

# <font color='green'>Model Evaluation : (Random Forest)</font>

In [None]:
# Random forest trained on the resampled training set and scored on the
# untouched hold-out split.
model = RandomForestClassifier(
    random_state=3020,
    # Was 17, which is the column count of the training frame *including*
    # the target; there are only 16 feature columns. None means "consider
    # every feature at each split" regardless of how many columns there are.
    max_features=None,
    class_weight=class_weight,
    verbose=True
)
model.fit(X_train, Y_train)

Y_predicted = model.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

In [None]:
# CatBoost with library defaults apart from the seed and the class weights,
# trained on the resampled training set and scored on the hold-out split.
_catboost_options = {
    'random_state': 3020,
    'class_weights': class_weight,
}
model = CatBoostClassifier(**_catboost_options)
model.fit(X_train, Y_train)

Y_predicted = model.predict(X_test)
for _summarise in (metrics.classification_report, metrics.confusion_matrix):
    print(_summarise(Y_test, Y_predicted))

# <font color='green'>Model Evaluation : (Dense Neural Network, Deep learning)</font>

In [None]:
# Small fully connected network: each sample is fed as (n_features, 1),
# flattened, passed through one hidden layer and a 2-way softmax.
input1 = Input(shape=(X_train.shape[1],1))
flat1 = Flatten()(input1)

# Only one branch is concatenated here, so this step does not add features.
flatX = concatenate([flat1])
dense1 = Dense(50,activation='relu')(flatX)
output = Dense(2,activation='softmax')(dense1)

model = Model(inputs=[input1],outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    # The output layer already applies softmax, so the loss receives
    # probabilities, not logits. from_logits=True would apply softmax a
    # second time inside the loss and distort the gradients.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)

In [None]:
# Train the network on the resampled training set. The hold-out split is
# passed as validation data, so the per-epoch validation metrics are
# computed on the same rows used for the final report.
_fit_options = dict(
    epochs=40,
    batch_size=200,
    validation_data=([X_test], Y_test),
    class_weight=class_weight,
)
history = model.fit([X_train], Y_train, **_fit_options)

In [None]:
# Score the hold-out split. Instead of taking the arg-max class, a row is
# labelled 1 whenever the second output (index 1) exceeds the threshold.
_positive_threshold = .4
_class_scores = model.predict([X_test])
Y_predicted = [
    1 if scores[1] > _positive_threshold else 0
    for scores in _class_scores
]

for _summarise in (metrics.classification_report, metrics.confusion_matrix):
    print(_summarise(Y_test, Y_predicted))