In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [33]:
df = pd.read_csv('heart failure.csv')

In [3]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [4]:
xtrain, xtest, ytrain, ytest = train_test_split(df.drop('DEATH_EVENT',axis=1) , df['DEATH_EVENT'] , train_size=.70 , random_state=1)

In [5]:
xtrain.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
14,49.0,1,80,0,30,1,427000.0,1.0,138,0,0,12
210,70.0,0,212,1,17,1,389000.0,1.0,136,1,1,188
236,75.0,0,119,0,50,1,248000.0,1.1,148,1,0,209
44,60.0,1,588,1,60,0,194000.0,1.1,142,0,0,33
163,50.0,1,2334,1,35,0,75000.0,0.9,142,0,0,126


In [6]:
clf = DecisionTreeClassifier()

In [7]:
clf.fit(xtrain, ytrain)

DecisionTreeClassifier()

In [8]:
clf.predict(xtest)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1], dtype=int64)

In [9]:
np.array(ytest)

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1], dtype=int64)

# Random forest classifier

In [10]:
rfc =  RandomForestClassifier()

In [11]:
rfc.fit(xtrain, ytrain)

RandomForestClassifier()

In [12]:
rfc.score(xtest,ytest)

0.8777777777777778

# Hyper Parameter Tunuing & RandomizedSearchCV

In [13]:
estimators = np.random.randint(25,200,25)
criterion = ['gini', 'entropy']
max_depth = np.random.randint(1,15,20)
min_samples_split = [2,3,4]
max_feature = ['sqrt', 'log2']

In [14]:
rcv_param = {
    'n_estimators' : estimators,
    'criterion' : criterion,
    'max_depth' : max_depth,
    'min_samples_split' : min_samples_split,
    'max_features' : max_feature
}

In [15]:
rcv = RandomizedSearchCV(rfc, rcv_param, n_iter = 200, cv = 3)

In [16]:
rcv.fit(xtrain, ytrain)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=200,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([12, 14,  8, 10, 12,  4, 14, 11,  1,  4,  5,  2, 10,  1,  4, 14,  2,
       13,  8,  4]),
                                        'max_features': ['sqrt', 'log2'],
                                        'min_samples_split': [2, 3, 4],
                                        'n_estimators': array([174, 140, 163,  53, 114, 182, 156, 155,  53, 135,  95, 181,  96,
        71, 189, 163,  28,  88,  65, 191, 157, 180, 178, 186,  51])})

In [17]:
rcv.cv_results_

{'mean_fit_time': array([0.17752687, 0.19780469, 0.1738778 , 0.11635359, 0.22206434,
        0.18150417, 0.19382469, 0.10272535, 0.19082276, 0.07214046,
        0.06249825, 0.20677153, 0.19248629, 0.17784778, 0.06116891,
        0.17818324, 0.12400301, 0.08643603, 0.2041138 , 0.06383832,
        0.07779376, 0.10905329, 0.21377134, 0.19914397, 0.07312814,
        0.10870075, 0.19880835, 0.10537553, 0.20310521, 0.20278144,
        0.05752381, 0.17984319, 0.07247281, 0.22774371, 0.20244948,
        0.19878976, 0.05784535, 0.0578266 , 0.17253955, 0.12666074,
        0.03325224, 0.18351102, 0.21842384, 0.12732697, 0.10371486,
        0.15025584, 0.14924788, 0.20975391, 0.03292139, 0.17386882,
        0.20909985, 0.17320236, 0.1761957 , 0.03091764, 0.03257775,
        0.19946694, 0.19482136, 0.07181899, 0.07346106, 0.10106174,
        0.21873864, 0.05718048, 0.21276498, 0.19812751, 0.19182253,
        0.05984036, 0.12965266, 0.06183449, 0.10738087, 0.06181622,
        0.03390876, 0.03257926,

In [18]:
rcv_result = pd.DataFrame(rcv.cv_results_)
rcv_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_max_features,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.177527,0.015558,0.012299,0.000471,155,4,log2,2,entropy,"{'n_estimators': 155, 'min_samples_split': 4, ...",0.828571,0.785714,0.826087,0.813458,0.019644,40
1,0.197805,0.000921,0.014627,0.000470,174,2,log2,12,entropy,"{'n_estimators': 174, 'min_samples_split': 2, ...",0.785714,0.814286,0.840580,0.813527,0.022405,33
2,0.173878,0.008983,0.013963,0.001411,155,3,log2,4,gini,"{'n_estimators': 155, 'min_samples_split': 3, ...",0.785714,0.785714,0.826087,0.799172,0.019032,126
3,0.116354,0.011114,0.008652,0.000476,88,4,sqrt,10,entropy,"{'n_estimators': 88, 'min_samples_split': 4, '...",0.814286,0.800000,0.826087,0.813458,0.010666,40
4,0.222064,0.011053,0.017297,0.001705,189,4,log2,4,gini,"{'n_estimators': 189, 'min_samples_split': 4, ...",0.771429,0.814286,0.855072,0.813596,0.034151,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.105717,0.001629,0.008643,0.000470,95,4,sqrt,10,gini,"{'n_estimators': 95, 'min_samples_split': 4, '...",0.785714,0.828571,0.826087,0.813458,0.019644,40
196,0.202800,0.000485,0.014961,0.000001,181,2,sqrt,11,entropy,"{'n_estimators': 181, 'min_samples_split': 2, ...",0.785714,0.828571,0.797101,0.803796,0.018125,111
197,0.200114,0.000477,0.014637,0.000477,186,4,sqrt,4,gini,"{'n_estimators': 186, 'min_samples_split': 4, ...",0.771429,0.800000,0.855072,0.808834,0.034714,65
198,0.102051,0.000477,0.007989,0.000015,95,3,log2,2,entropy,"{'n_estimators': 95, 'min_samples_split': 3, '...",0.828571,0.785714,0.826087,0.813458,0.019644,40


In [19]:
rcv.best_score_

0.8325741890959281

In [20]:
rcv.best_params_

{'n_estimators': 157,
 'min_samples_split': 3,
 'max_features': 'sqrt',
 'max_depth': 10,
 'criterion': 'gini'}

In [21]:
rfc1 =   RandomForestClassifier(n_estimators=71, criterion='entropy', max_depth=12, min_samples_split=3, max_features = 'log2')

In [22]:
rfc1.fit(xtrain, ytrain)

RandomForestClassifier(criterion='entropy', max_depth=12, max_features='log2',
                       min_samples_split=3, n_estimators=71)

In [23]:
rfc1.score(xtest,ytest)

0.8666666666666667

# GridSearchCV

In [24]:
grid_prams = {
    'n_estimators' : estimators,
    'criterion' : criterion,
    'max_depth' : max_depth,
    'max_features' : max_feature
}

In [25]:
gcv = GridSearchCV(rfc, grid_prams, cv = 3)

In [26]:
gcv.fit(xtrain, ytrain)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([12, 14,  8, 10, 12,  4, 14, 11,  1,  4,  5,  2, 10,  1,  4, 14,  2,
       13,  8,  4]),
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': array([174, 140, 163,  53, 114, 182, 156, 155,  53, 135,  95, 181,  96,
        71, 189, 163,  28,  88,  65, 191, 157, 180, 178, 186,  51])})

In [27]:
gcv.best_score_

0.8469289164941339

In [28]:
gcv.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'max_features': 'sqrt',
 'n_estimators': 28}

In [29]:
rfc2 =   RandomForestClassifier(n_estimators=28, criterion='gini', max_depth=4, max_features = 'sqrt')

In [30]:
rfc2.fit(xtrain,ytrain)

RandomForestClassifier(max_depth=4, max_features='sqrt', n_estimators=28)

In [31]:
rfc2.score(xtest,ytest)

0.8888888888888888