# Imbalance Data

In [1]:
import pandas as pd
import numpy  as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
import random

In [2]:
n = 5000
skip = sorted(random.sample(range(1,319795),319795-n))
DF = pd.read_csv('../../datasets/heart_2020_cleaned.csv',skiprows=skip).reset_index().rename(columns={'index':'id'})

In [3]:
CatCols = [
    'Smoking','AlcoholDrinking','Stroke','DiffWalking','Sex','AgeCategory','Race','Diabetic',
    'PhysicalActivity','GenHealth','Asthma','KidneyDisease','SkinCancer'
]
NumCols = [
    'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime'
]

X = DF[DF.columns.difference(['id','HeartDisease'])]
Y = DF['HeartDisease']

In [4]:
X.head()

Unnamed: 0,AgeCategory,AlcoholDrinking,Asthma,BMI,Diabetic,DiffWalking,GenHealth,KidneyDisease,MentalHealth,PhysicalActivity,PhysicalHealth,Race,Sex,SkinCancer,SleepTime,Smoking,Stroke
0,80 or older,No,No,27.98,No,No,Excellent,No,0.0,No,0.0,White,Male,No,3.0,No,No
1,80 or older,No,No,24.8,No,No,Excellent,No,0.0,Yes,0.0,White,Female,Yes,6.0,No,No
2,70-74,No,No,27.26,No,No,Good,No,0.0,No,0.0,White,Male,No,8.0,Yes,No
3,50-54,No,Yes,30.29,Yes,No,Fair,No,15.0,Yes,30.0,White,Female,No,4.0,Yes,No
4,80 or older,No,No,23.33,No,No,Good,No,0.0,Yes,5.0,White,Female,No,7.0,No,No


In [5]:
# Drop rows with out of rage BMI
BMI_Out_Of_Range = X[X['BMI']>50]
X = X.drop(BMI_Out_Of_Range.index,errors='ignore')

In [6]:
# CatCols = [
#     'Smoking','AlcoholDrinking','Stroke','DiffWalking','Sex','Race','Diabetic',
#     'PhysicalActivity','GenHealth','Asthma','KidneyDisease','SkinCancer'
# ]
# NumCols = [
#     'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime','AgeCategory'
# ]

In [7]:
# _1 = X[X['AgeCategory']!='80 or older']['AgeCategory'].str.split('-').apply(
#     lambda x : (int(x[0])+int(x[1]))/2
# )
# _2 = X[X['AgeCategory']=='80 or older']['AgeCategory'].str.split(' or ').apply(
#     lambda x : 80
# )
# X.loc[_1.index,'AgeCategory'] = _1
# X.loc[_2.index,'AgeCategory'] = _2
# del _1,_2

In [8]:
# X['AgeCategory'] = X['AgeCategory'].astype(np.int64) 

In [9]:
# CatCols = [
#     'Smoking','AlcoholDrinking','Stroke','DiffWalking','Sex','Race','Diabetic',
#     'PhysicalActivity','GenHealth','Asthma','KidneyDisease','SkinCancer'
# ]
# NumCols = [
#     'BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime','AgeCategory'
# ]

In [10]:
# Drop rows with out of rage SleepTime
X = X.drop(X[X['SleepTime']>16]['SleepTime'].index,axis=0)

# <font color="Green">One Hot Encode Categorical Label :</font>

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

def OHC_transformer_single_column(DF,columns):
    
    for column_name in columns:
        label_encoder = LabelEncoder()
        encoded = label_encoder.fit_transform(DF[column_name])
        labels =pd.Series(label_encoder.classes_)

        clmt = ColumnTransformer([(column_name,OneHotEncoder(),[0])], remainder='passthrough')
        clmt_fited_data = clmt.fit_transform(DF[[column_name]]) 
        clmt_fited_data = clmt_fited_data if type(clmt_fited_data)== np.ndarray else clmt_fited_data.toarray()  
        NDF = pd.DataFrame(clmt_fited_data,columns=[f'{column_name}_{x}' for x in labels])
        
        merged = DF.merge(NDF, left_index=True,right_index=True)
        merged = merged.drop(columns=[column_name])
        DF = merged
    return DF

In [12]:
X_OHC = OHC_transformer_single_column(X,CatCols) 
Y = DF.iloc[X_OHC.index]['HeartDisease']
Y_label_encoder = LabelEncoder()
Y_OHC = Y_label_encoder.fit_transform(Y)

In [13]:
NDF = pd.concat([X,Y],axis=1)
NDF_OHC = pd.concat([X_OHC,pd.DataFrame(Y_OHC,columns=['HeartDisease'])],axis=1)

 # <font color="Green">Normalize  :</font>

In [14]:
X_OHC_Norm = X_OHC.copy()
ss = StandardScaler()
X_OHC_Norm[NumCols] = ss.fit_transform(X_OHC[NumCols])

In [15]:
X_OHC_Norm.shape

(4429, 50)

# <font color="Green">Imbalance Data  :</font>

In [16]:
from imblearn.over_sampling import RandomOverSampler
toml =RandomOverSampler(random_state=3020)
X_imb, Y_imb = toml.fit_resample(X_OHC_Norm,pd.DataFrame(Y_OHC,columns=['HeartDisease']))
DF_imb = pd.concat([X_imb,Y_imb],axis=1)

In [17]:
X_imb.shape

(8034, 50)

 # <font color="Green">Modeling  :</font>

In [18]:
X_train,X_test,Y_train,Y_test = train_test_split(DF_imb.iloc[:,:-1],DF_imb.iloc[:,-1],random_state=3020)

In [19]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(
    random_state=3020,
    n_estimators=3000,
    verbose=True
)

random_forest_classifier.fit(X_train,Y_train)
# random_forest_classifier.score(X_test,Y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 3000 out of 3000 | elapsed:   21.8s finished


RandomForestClassifier(n_estimators=3000, random_state=3020, verbose=True)

In [20]:
Y_predicted = random_forest_classifier.predict(X_test)
print(metrics.classification_report(Y_test,Y_predicted))
print(metrics.confusion_matrix(Y_test,Y_predicted))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1020
           1       0.98      1.00      0.99       989

    accuracy                           0.99      2009
   macro avg       0.99      0.99      0.99      2009
weighted avg       0.99      0.99      0.99      2009

[[999  21]
 [  4 985]]


[Parallel(n_jobs=1)]: Done 3000 out of 3000 | elapsed:    1.6s finished
