In [150]:
import numpy as np 
import pandas as pd 

In [151]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [152]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [153]:
from sklearn.model_selection import train_test_split

In [154]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=8)

In [155]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# 

# Voting Ensemble

In [156]:
bm1 = RandomForestClassifier()
bm2= LogisticRegression()
bm3= KNeighborsClassifier()

In [157]:
estimators = [('bm1',bm1),('bm2',bm2),('bm3',bm3)]

In [158]:
from sklearn.ensemble import VotingClassifier

In [159]:
vc = VotingClassifier(estimators=estimators,voting='hard')

In [160]:
x = cross_val_score(vc,X,y,scoring='accuracy',cv=10,n_jobs=-1,verbose=2)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.7s finished


In [161]:
np.round(np.mean(x),2)

0.83

# 

# Bagging

In [162]:
from sklearn.ensemble import BaggingClassifier

In [163]:
# Bagging using decision tree
bag = BaggingClassifier(
              base_estimator=DecisionTreeClassifier(),
               n_estimators=500,
               max_samples=0.5,
               random_state=42)

In [164]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=500, random_state=42)

In [165]:
y_pred = bag.predict(X_test)

In [166]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.819672131147541

In [167]:
x = cross_val_score(BaggingClassifier(),X_train,y_train,scoring='accuracy',cv=10,n_jobs=-1)
np.round(np.mean(x),2)

0.76

# 

### Gridsearch CV

In [168]:
from sklearn.model_selection import GridSearchCV

In [169]:
parameter = {
    'n_estimators': [50,100,200,500], 
    'max_samples' : [0.25,0.5,0.75,1.0],
    'bootstrap'   : [True,False],
    'max_features': [0.1,0.4,0.7,1.0],
    
}

In [170]:
search = GridSearchCV(BaggingClassifier(),parameter,n_jobs=-1,cv=5)

In [171]:
search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=BaggingClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_features': [0.1, 0.4, 0.7, 1.0],
                         'max_samples': [0.25, 0.5, 0.75, 1.0],
                         'n_estimators': [50, 100, 200, 500]})

In [172]:
print(search.best_estimator_)
print(search.best_score_)

BaggingClassifier(bootstrap=False, max_features=0.4, max_samples=0.75,
                  n_estimators=50)
0.8225340136054422


# 

In [173]:
# Bagging using svm
from sklearn.svm import SVC

In [174]:
bag = BaggingClassifier(
     base_estimator=SVC(),
     bootstrap_features=True,
     n_estimators=500,
      max_samples=1.0,
      random_state=42,
max_features=0.75)
    

In [175]:
bag.fit(X_train,y_train)

BaggingClassifier(base_estimator=SVC(), bootstrap_features=True,
                  max_features=0.75, n_estimators=500, random_state=42)

In [176]:
y_pred = bag.predict(X_test)
accuracy_score(y_test,y_pred)

0.7377049180327869

# 

# Random Forest

In [177]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_samples=0.75,random_state=42)

In [178]:
rf.fit(X_train,y_train)

RandomForestClassifier(max_samples=0.75, random_state=42)

In [180]:
y_pred = rf.predict(X_test)

In [181]:
accuracy_score(y_test,y_pred)

0.8360655737704918

In [182]:
x =cross_val_score(RandomForestClassifier(),X_train,y_train,scoring='accuracy',cv=10,n_jobs=-1)

In [183]:
np.round(np.mean(x),2)

0.81

# 

### RandomSearchCV

In [184]:
from sklearn.model_selection import RandomizedSearchCV

In [185]:
# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [186]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples,
              'bootstrap':bootstrap,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [187]:
rf_grid = RandomizedSearchCV(estimator=abc,param_distributions=param_grid,cv=5,n_jobs=-1,verbose=2)

In [188]:
rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


ValueError: Invalid parameter min_samples_split for estimator AdaBoostClassifier(n_estimators=60). Check the list of available parameters with `estimator.get_params().keys()`.

In [189]:
print(rf_grid.best_params_)
print(rf_grid.best_score_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

# 

# AdaBoost

In [190]:
from sklearn.ensemble import AdaBoostClassifier

In [191]:
abc = AdaBoostClassifier()
c =cross_val_score(abc,X_train,y_train,scoring='accuracy',n_jobs=-1,cv=10)
np.round(np.mean(c),2)

0.76

In [192]:
from sklearn.model_selection import GridSearchCV

grid = dict()
grid['n_estimators'] = [10,20,40,80,50,100,400,500]
grid['learning_rate'] = [0.0001,0.001,0.01,0.1,0.25,0.5,1.0]
grid['algorithm'] = ['SAMME','SAMME.R']


grid_search = GridSearchCV(estimator=AdaBoostClassifier(),param_grid=grid,n_jobs=-1,cv=10,scoring='accuracy')

In [193]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=AdaBoostClassifier(), n_jobs=-1,
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.25, 0.5,
                                           1.0],
                         'n_estimators': [10, 20, 40, 80, 50, 100, 400, 500]},
             scoring='accuracy')

In [194]:
print("Best accuracy score: %f using %s" % (grid_search.best_score_, grid_search.best_params_))

Best accuracy score: 0.826500 using {'algorithm': 'SAMME', 'learning_rate': 0.01, 'n_estimators': 500}


# 

# Stacking

In [195]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

In [196]:
estimators = [
('rf',RandomForestClassifier(n_estimators=10,random_state=42)),
('knn',KNeighborsClassifier(n_neighbors=10)),
('gbdt',GradientBoostingClassifier())]

In [197]:
from sklearn.ensemble import StackingClassifier
scl = StackingClassifier(estimators=estimators,
                        final_estimator=LogisticRegression(),
                        cv=10,n_jobs=-1)

In [198]:
scl.fit(X_train,y_train)

StackingClassifier(cv=10,
                   estimators=[('rf',
                                RandomForestClassifier(n_estimators=10,
                                                       random_state=42)),
                               ('knn', KNeighborsClassifier(n_neighbors=10)),
                               ('gbdt', GradientBoostingClassifier())],
                   final_estimator=LogisticRegression(), n_jobs=-1)

In [199]:
y_pred = scl.predict(X_test)
accuracy_score(y_test,y_pred)

0.8688524590163934