# Imbalance Data

In [3]:
import pandas as pd
import numpy  as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import random
from sklearn.utils import resample

In [23]:
n = 30000
skip = sorted(random.sample(range(1,319795),319795-n))
DF = pd.read_csv('../../../datasets/heart_2020_cleaned.csv',skiprows=skip).reset_index().rename(columns={'index':'id'})

In [3]:
# DF = pd.read_csv('../../datasets/heart_2020_cleaned.csv').reset_index().rename(columns={'index':'id'})

In [24]:
CatCols = [
    'Smoking','AlcoholDrinking','Stroke','DiffWalking','Sex','AgeCategory','Race','Diabetic',
    'PhysicalActivity','GenHealth','Asthma','KidneyDisease','SkinCancer'
]
NumCols = [
    'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'
]

X = DF[DF.columns.difference(['id','HeartDisease'])]
Y = DF['HeartDisease']

In [6]:
X.head()

Unnamed: 0,AgeCategory,AlcoholDrinking,Asthma,BMI,Diabetic,DiffWalking,GenHealth,KidneyDisease,MentalHealth,PhysicalActivity,PhysicalHealth,Race,Sex,SkinCancer,SleepTime,Smoking,Stroke
0,65-69,No,Yes,26.58,Yes,No,Fair,No,30.0,Yes,20.0,White,Male,No,8.0,Yes,No
1,80 or older,No,No,39.53,Yes,Yes,Poor,Yes,3.0,No,3.0,Black,Female,No,7.0,No,No
2,60-64,No,No,32.08,Yes,Yes,Fair,No,30.0,No,10.0,White,Male,No,3.0,Yes,Yes
3,80 or older,No,No,24.82,No,No,Good,No,0.0,Yes,0.0,White,Male,Yes,8.0,Yes,No
4,45-49,No,No,29.41,Yes,No,Very good,No,2.0,Yes,30.0,White,Male,No,7.0,No,No


In [25]:
# Drop rows with out of rage BMI
BMI_Out_Of_Range = X[X['BMI']>50]
X = X.drop(BMI_Out_Of_Range.index,errors='ignore')

In [26]:
# Drop rows with out of rage SleepTime
X = X.drop(X[X['SleepTime']>16]['SleepTime'].index,axis=0)

# <font color="Green">One Hot Encode Categorical Label :</font>

In [27]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def columns_OHC_transforme(DF,columns):
    merged = pd.DataFrame()
    for column_name in columns:
        label_encoder = LabelEncoder()
        encoded = label_encoder.fit_transform(DF[column_name])
        labels =pd.Series(label_encoder.classes_)

        clmt = ColumnTransformer([(column_name,OneHotEncoder(),[0])], remainder='passthrough')
        clmt_fited_data = clmt.fit_transform(DF[[column_name]]) 
        clmt_fited_data = clmt_fited_data if type(clmt_fited_data)== np.ndarray else clmt_fited_data.toarray()
        NDF = pd.DataFrame(clmt_fited_data,columns=[f'{column_name}_{x}' for x in labels]).set_index(DF.index)
        merged = pd.concat([merged,NDF],axis=1)
    DF = DF.drop(columns=columns)
    DF = DF.merge(merged, left_index=True,right_index=True)

    return DF

In [28]:
X_OHC = columns_OHC_transforme(X,CatCols) 
Y_label_encoder = LabelEncoder()
Y = DF.iloc[X_OHC.index]['HeartDisease']
Y_OHC = Y_label_encoder.fit_transform(Y)

 # <font color="Green">Normalize  :</font>

In [29]:
X_OHC_Norm = X_OHC.copy()
ss = StandardScaler()
X_OHC_Norm[NumCols] = ss.fit_transform(X_OHC[NumCols])

 # <font color="Green">Split Data  :</font>

In [30]:
X_train,X_test,Y_train,Y_test = train_test_split(
    X_OHC_Norm,
    pd.DataFrame(Y_OHC,columns=['HeartDisease']),
    random_state=3020,train_size=.90)

# <font color="Green">Imbalance Data  :</font>

In [31]:
from imblearn.over_sampling import RandomOverSampler
toml =RandomOverSampler(random_state=3020)
X_imb, Y_imb = toml.fit_resample(X_train,Y_train['HeartDisease'])
DF_imb = pd.concat([X_imb,Y_imb],axis=1)

In [15]:
DF_imb.shape

(8178, 51)

In [32]:
X_train, Y_train = DF_imb[DF_imb.columns.difference(['HeartDisease'])] ,DF_imb['HeartDisease']

In [33]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(
    random_state=3020,
    n_estimators=500,
    verbose=10
)

random_forest_classifier.fit(X_train,Y_train)
# random_forest_classifier.score(X_test,Y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s


building tree 1 of 500
building tree 2 of 500
building tree 3 of 500
building tree 4 of 500


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.4s remaining:    0.0s


building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.5s remaining:    0.0s


building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
building tree 43 of 500
building tree 44 of 500
building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 48 of 500
building tree 49 of 500
building tree 50 

building tree 342 of 500
building tree 343 of 500
building tree 344 of 500
building tree 345 of 500
building tree 346 of 500
building tree 347 of 500
building tree 348 of 500
building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500
building tree 353 of 500
building tree 354 of 500
building tree 355 of 500
building tree 356 of 500
building tree 357 of 500
building tree 358 of 500
building tree 359 of 500
building tree 360 of 500
building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500
building tree 380 of 500
building tree 381 of 500


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   29.6s finished


In [34]:
Y_predicted = random_forest_classifier.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

Feature names must be in the same order as they were in fit.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s


              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2714
           1       0.50      0.00      0.01       263

    accuracy                           0.91      2977
   macro avg       0.71      0.50      0.48      2977
weighted avg       0.88      0.91      0.87      2977

[[2713    1]
 [ 262    1]]


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.3s finished
