In [1]:
import pandas as pd
import numpy  as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import random
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Model,Sequential,load_model
from tensorflow.keras.layers import Input,Dense,Flatten,Embedding,Conv1D,MaxPool1D,concatenate,Dropout
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from imblearn.under_sampling import RandomUnderSampler 

2022-12-14 19:36:34.714617: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-14 19:36:34.714650: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-14 19:36:35.622535: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-14 19:36:35.622659: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
n = 319795
skip = sorted(random.sample(range(1,319795),319795-n))
DF = pd.read_csv('../../../datasets/heart_2020_cleaned.csv',skiprows=skip).reset_index(drop=True).rename(
columns={'HeartDisease':'target'})

In [3]:
TARGET = 'target'
NumCols = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']
CatCols = DF.columns.difference([*NumCols,'target','Race'])
DF = DF[[TARGET,*NumCols,*CatCols]]

In [4]:
def label_encoder(DF,cols = []):
    DF = DF.copy()
    for col in cols:
        encoder = LabelEncoder()
        DF[col] = encoder.fit_transform(DF[col])
    return DF

# <font color='green'>Label Encoder </font>

In [5]:
DF = label_encoder(DF,[TARGET,*CatCols])[[TARGET,*NumCols,*CatCols]]

# <font color='green'>Scale Numerical Feature</font>

In [6]:
def feature_scaler(DF,Cols):
    DF = DF.copy()
    for col in Cols:
        ss = MinMaxScaler()
        DF[col] = ss.fit_transform(DF[[col]])
    return DF

In [7]:
DF = feature_scaler(DF,NumCols)

# <font color='green'>Train Test Split</font>

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(
    DF[DF.columns.difference([TARGET])],
    DF[TARGET],
    test_size=0.2, random_state=42
)
DF_train = pd.concat([X_train,Y_train],axis=1)

# <font color='green'>Drop Noise Data</font>

In [26]:
DF_train.shape

(8000, 17)

In [27]:
OutlierDetector = LocalOutlierFactor(n_neighbors=int(DF_train.shape[0]/100))
Y_predit = OutlierDetector.fit_predict(DF_train[DF_train.columns.difference([TARGET])])
print('noise data : '+str(Y_predit[Y_predit==-1].shape[0] / DF_train.shape[0] * 100)+'%') 

noise data : 1.0%


In [28]:
DF_train['is_noise'] = Y_predit
DF_train = DF_train[DF_train['is_noise'] == 1]
DF_train = DF_train[DF_train.columns.difference(['is_noise'])]

In [29]:
DF_train.shape

(7920, 17)

# <font color='green'>Handle Imbalanced Data</font>

In [30]:
smoteomek = RandomUnderSampler(random_state=3020)
X_train, Y_train = smoteomek.fit_resample(DF_train[DF_train.columns.difference([TARGET])], DF_train[TARGET] )

In [31]:
class_weight = dict(enumerate(
    compute_class_weight(
        class_weight='balanced',
        classes=np.unique(Y_train),
        y=Y_train
    )
))
class_weight

{0: 1.0, 1: 1.0}

# <font color='green'>Model Evaluation : (Random Forest)</font>

In [32]:
model = RandomForestClassifier(
    random_state=3020,
    max_features=17,
#     max_depth=10,
    class_weight=class_weight,
    verbose=True
)
model.fit(X_train, Y_train)

Y_predicted = model.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           0       0.97      0.71      0.82      1826
           1       0.21      0.80      0.33       174

    accuracy                           0.72      2000
   macro avg       0.59      0.76      0.58      2000
weighted avg       0.91      0.72      0.78      2000

[[1301  525]
 [  35  139]]


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [33]:
model = CatBoostClassifier(
#     iterations=200,
#     learning_rate=.01,
#     depth=16,
#     eval_metric='Accuracy',
    random_state=3020,
#     verbose=False
    class_weights=class_weight
)
model.fit(X_train, Y_train)

Y_predicted = model.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

Learning rate set to 0.011763
0:	learn: 0.6890221	total: 51.7ms	remaining: 51.7s
1:	learn: 0.6843978	total: 54.8ms	remaining: 27.4s
2:	learn: 0.6799196	total: 56.7ms	remaining: 18.8s
3:	learn: 0.6756833	total: 58.1ms	remaining: 14.5s
4:	learn: 0.6712604	total: 59.9ms	remaining: 11.9s
5:	learn: 0.6668182	total: 62.5ms	remaining: 10.4s
6:	learn: 0.6641845	total: 63.5ms	remaining: 9.01s
7:	learn: 0.6596604	total: 65.1ms	remaining: 8.07s
8:	learn: 0.6559276	total: 66.9ms	remaining: 7.36s
9:	learn: 0.6522875	total: 68.7ms	remaining: 6.8s
10:	learn: 0.6490226	total: 70.7ms	remaining: 6.36s
11:	learn: 0.6452323	total: 72.6ms	remaining: 5.98s
12:	learn: 0.6411705	total: 74.4ms	remaining: 5.65s
13:	learn: 0.6382418	total: 76.2ms	remaining: 5.37s
14:	learn: 0.6341870	total: 78.1ms	remaining: 5.13s
15:	learn: 0.6302595	total: 79.9ms	remaining: 4.91s
16:	learn: 0.6273724	total: 81.8ms	remaining: 4.73s
17:	learn: 0.6241081	total: 83.6ms	remaining: 4.56s
18:	learn: 0.6210009	total: 85.5ms	remaining:

181:	learn: 0.4465780	total: 382ms	remaining: 1.72s
182:	learn: 0.4462975	total: 386ms	remaining: 1.72s
183:	learn: 0.4457804	total: 387ms	remaining: 1.72s
184:	learn: 0.4454649	total: 389ms	remaining: 1.72s
185:	learn: 0.4451298	total: 391ms	remaining: 1.71s
186:	learn: 0.4449048	total: 393ms	remaining: 1.71s
187:	learn: 0.4446195	total: 396ms	remaining: 1.71s
188:	learn: 0.4440438	total: 398ms	remaining: 1.71s
189:	learn: 0.4435570	total: 400ms	remaining: 1.7s
190:	learn: 0.4431756	total: 401ms	remaining: 1.7s
191:	learn: 0.4426609	total: 403ms	remaining: 1.7s
192:	learn: 0.4422807	total: 405ms	remaining: 1.69s
193:	learn: 0.4419310	total: 407ms	remaining: 1.69s
194:	learn: 0.4415831	total: 411ms	remaining: 1.7s
195:	learn: 0.4412904	total: 413ms	remaining: 1.7s
196:	learn: 0.4409253	total: 417ms	remaining: 1.7s
197:	learn: 0.4407119	total: 419ms	remaining: 1.7s
198:	learn: 0.4404078	total: 421ms	remaining: 1.69s
199:	learn: 0.4400662	total: 423ms	remaining: 1.69s
200:	learn: 0.43980

379:	learn: 0.3910492	total: 767ms	remaining: 1.25s
380:	learn: 0.3909548	total: 769ms	remaining: 1.25s
381:	learn: 0.3907580	total: 771ms	remaining: 1.25s
382:	learn: 0.3904496	total: 773ms	remaining: 1.25s
383:	learn: 0.3901773	total: 775ms	remaining: 1.24s
384:	learn: 0.3899814	total: 777ms	remaining: 1.24s
385:	learn: 0.3896604	total: 779ms	remaining: 1.24s
386:	learn: 0.3895286	total: 781ms	remaining: 1.24s
387:	learn: 0.3892997	total: 783ms	remaining: 1.24s
388:	learn: 0.3891285	total: 785ms	remaining: 1.23s
389:	learn: 0.3890080	total: 788ms	remaining: 1.23s
390:	learn: 0.3888544	total: 790ms	remaining: 1.23s
391:	learn: 0.3886170	total: 792ms	remaining: 1.23s
392:	learn: 0.3883613	total: 794ms	remaining: 1.23s
393:	learn: 0.3881074	total: 795ms	remaining: 1.22s
394:	learn: 0.3879364	total: 797ms	remaining: 1.22s
395:	learn: 0.3878564	total: 799ms	remaining: 1.22s
396:	learn: 0.3876476	total: 801ms	remaining: 1.22s
397:	learn: 0.3874463	total: 803ms	remaining: 1.21s
398:	learn: 

585:	learn: 0.3518317	total: 1.15s	remaining: 811ms
586:	learn: 0.3516115	total: 1.15s	remaining: 810ms
587:	learn: 0.3514556	total: 1.16s	remaining: 811ms
588:	learn: 0.3513981	total: 1.16s	remaining: 809ms
589:	learn: 0.3513538	total: 1.16s	remaining: 807ms
590:	learn: 0.3510425	total: 1.16s	remaining: 805ms
591:	learn: 0.3510151	total: 1.16s	remaining: 802ms
592:	learn: 0.3508818	total: 1.17s	remaining: 800ms
593:	learn: 0.3508000	total: 1.17s	remaining: 798ms
594:	learn: 0.3504885	total: 1.17s	remaining: 797ms
595:	learn: 0.3502796	total: 1.17s	remaining: 795ms
596:	learn: 0.3501165	total: 1.17s	remaining: 793ms
597:	learn: 0.3499548	total: 1.18s	remaining: 792ms
598:	learn: 0.3497514	total: 1.18s	remaining: 790ms
599:	learn: 0.3494807	total: 1.18s	remaining: 788ms
600:	learn: 0.3493629	total: 1.18s	remaining: 786ms
601:	learn: 0.3492425	total: 1.19s	remaining: 784ms
602:	learn: 0.3489895	total: 1.19s	remaining: 782ms
603:	learn: 0.3488277	total: 1.19s	remaining: 780ms
604:	learn: 

783:	learn: 0.3175103	total: 1.53s	remaining: 422ms
784:	learn: 0.3173607	total: 1.53s	remaining: 420ms
785:	learn: 0.3172889	total: 1.53s	remaining: 418ms
786:	learn: 0.3170774	total: 1.54s	remaining: 416ms
787:	learn: 0.3169969	total: 1.54s	remaining: 414ms
788:	learn: 0.3169260	total: 1.54s	remaining: 412ms
789:	learn: 0.3166697	total: 1.54s	remaining: 410ms
790:	learn: 0.3164318	total: 1.54s	remaining: 408ms
791:	learn: 0.3162258	total: 1.55s	remaining: 406ms
792:	learn: 0.3162109	total: 1.55s	remaining: 404ms
793:	learn: 0.3161985	total: 1.55s	remaining: 402ms
794:	learn: 0.3160283	total: 1.55s	remaining: 400ms
795:	learn: 0.3159692	total: 1.55s	remaining: 398ms
796:	learn: 0.3156443	total: 1.55s	remaining: 396ms
797:	learn: 0.3154633	total: 1.56s	remaining: 394ms
798:	learn: 0.3152748	total: 1.56s	remaining: 392ms
799:	learn: 0.3150130	total: 1.56s	remaining: 390ms
800:	learn: 0.3149217	total: 1.56s	remaining: 388ms
801:	learn: 0.3147887	total: 1.56s	remaining: 386ms
802:	learn: 

              precision    recall  f1-score   support

           0       0.97      0.74      0.84      1826
           1       0.22      0.79      0.35       174

    accuracy                           0.74      2000
   macro avg       0.60      0.77      0.59      2000
weighted avg       0.91      0.74      0.80      2000

[[1347  479]
 [  36  138]]


# <font color='green'>Model Evaluation : (CNN Deep learning)</font>

In [34]:
input1 = Input(shape=(X_train.shape[1],1))
flat1 = Flatten()(input1)

flatX = concatenate([flat1])
dense1 = Dense(50,activation='relu')(flatX)
output = Dense(2,activation='softmax')(dense1)

model = Model(inputs=[input1],outputs=output)
model.compile(
    optimizer=tf.keras.optimizers.Adam(0.01),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
)

2022-12-14 19:39:29.366803: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-12-14 19:39:29.366841: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-14 19:39:29.366877: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: masoud-Aspire-V3-571G
2022-12-14 19:39:29.366893: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: masoud-Aspire-V3-571G
2022-12-14 19:39:29.367020: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
2022-12-14 19:39:29.367089: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is:

In [35]:
history = model.fit(
    [X_train],
    Y_train,
    epochs=40,
    batch_size=200,
    validation_data=([X_test],Y_test),
    class_weight=class_weight
)

Epoch 1/40


  output, from_logits = _get_logits(


Epoch 2/40
Epoch 3/40
1/7 [===>..........................] - ETA: 0s - loss: 0.5065 - sparse_categorical_accuracy: 0.7650

  output, from_logits = _get_logits(


Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [36]:
Y_predicted = model.predict([X_test])
# Y_predicted = [np.argmax(x) for x in Y_predicted]
Y_predicted = [1 if x[1]>.4 else 0 for x in Y_predicted]

print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

              precision    recall  f1-score   support

           0       0.99      0.59      0.74      1826
           1       0.18      0.92      0.30       174

    accuracy                           0.62      2000
   macro avg       0.58      0.76      0.52      2000
weighted avg       0.92      0.62      0.70      2000

[[1085  741]
 [  14  160]]
