In [76]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

In [77]:
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",header=None)
df=shuffle(df)

#The second data set I use : https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia//arrhythmia.data
# The third data set I use : https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test

In [78]:
X = df.loc[:, 2:]
y = df.loc[:, 1]
le = LabelEncoder()
y = le.fit_transform(y)

## Bagging

## (1) Compare with Single Decision Tree

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.80, shuffle=True)

#Here I change the test_size to change the partition

In [80]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion = 'gini', max_depth = None)
bag = BaggingClassifier(base_estimator = tree, n_estimators=500, max_samples=1.0, max_features=1.0, bootstrap=True,
                       bootstrap_features=False, n_jobs=-1, random_state=10)

In [81]:
from sklearn.metrics import accuracy_score

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)

bag_train = accuracy_score(y_train, y_train_pred) 
bag_test = accuracy_score(y_test, y_test_pred) 
print('Bagging train/test accuracies %.3f/%.3f'
      % (bag_train, bag_test))

Decision tree train/test accuracies 1.000/0.895
Bagging train/test accuracies 1.000/0.919


## (2) Tuning Parameters for Bagging

In [82]:
from sklearn.model_selection import GridSearchCV
from sklearn import  metrics

## Adjust number of estimators
param_test1 = {'n_estimators':range(5,60,5)} 
gsearch1 = GridSearchCV(estimator = BaggingClassifier(max_samples=10, max_features=10 ,random_state=10), 
                       param_grid = param_test1, scoring='accuracy',cv=5,return_train_score=True, iid=True)
gsearch1.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=10, max_samples=10,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=10,
         verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=None,
       param_grid={'n_estimators': range(5, 60, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [83]:
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 25}, 0.9557522123893806)

In [84]:
param_test2 = {'max_features': range(1,20,2), 'max_samples': range(5,55,5)}
gsearch2 = GridSearchCV(estimator = BaggingClassifier(n_estimators=45, 
                                  oob_score=True, random_state=10),
   param_grid = param_test2, scoring='accuracy',iid=False, cv=5,return_train_score=True)
gsearch2.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=45, n_jobs=None, oob_score=True, random_state=10,
         verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'max_features': range(1, 20, 2), 'max_samples': range(5, 55, 5)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [85]:
gsearch2.best_params_, gsearch2.best_score_

({'max_features': 3, 'max_samples': 20}, 0.9739130434782609)

## (3) Best accuracy after tuning parameters

In [86]:
def get_three_accuracy(estimator):
    position=list(estimator.cv_results_['mean_test_score']).index(estimator.best_score_)
    print('Train accuracy is ', estimator.cv_results_['mean_train_score'][position])
    print('Validation accuracy is ', estimator.best_score_)
    y_pred=estimator.predict(X_test)
    t=accuracy_score(y_test,y_pred)
    print('Test accuracy is', t)

In [87]:
get_three_accuracy(gsearch2)

Train accuracy is  0.9845410628019324
Validation accuracy is  0.9739130434782609
Test accuracy is 0.9407894736842105


## Boosting

## (1) Compare with Single Decision Tree

In [88]:
from sklearn.ensemble import AdaBoostClassifier

tree = DecisionTreeClassifier(criterion='gini', 
                              max_depth=1,
                              random_state=0)

ada = AdaBoostClassifier(base_estimator=tree,
                         n_estimators=500, 
                         learning_rate=0.1,
                         random_state=0)

In [89]:
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)

ada_train = accuracy_score(y_train, y_train_pred) 
ada_test = accuracy_score(y_test, y_test_pred) 
print('AdaBoost train/test accuracies %.3f/%.3f'
      % (ada_train, ada_test))

Decision tree train/test accuracies 0.965/0.886
AdaBoost train/test accuracies 1.000/0.934


## (2) Tuning Parameters

In [90]:
param_test1 = {'n_estimators':range(5,51,5), 'learning_rate': np.linspace(0.1, 3, 20)}
gsearch1 = GridSearchCV(estimator = AdaBoostClassifier(random_state=10), 
                       param_grid = param_test1, scoring='accuracy',cv=5, iid=True, return_train_score=True)
gsearch1.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=10),
       fit_params=None, iid=True, n_jobs=None,
       param_grid={'n_estimators': range(5, 51, 5), 'learning_rate': array([0.1    , 0.25263, 0.40526, 0.55789, 0.71053, 0.86316, 1.01579,
       1.16842, 1.32105, 1.47368, 1.62632, 1.77895, 1.93158, 2.08421,
       2.23684, 2.38947, 2.54211, 2.69474, 2.84737, 3.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [91]:
gsearch1.best_params_, gsearch1.best_score_

({'learning_rate': 1.6263157894736844, 'n_estimators': 35}, 0.9823008849557522)

## (3) Best accuracy after tuning parameters

In [92]:
get_three_accuracy(gsearch1)

Train accuracy is  1.0
Validation accuracy is  0.9823008849557522
Test accuracy is 0.9364035087719298


## Random Forest

## (1) Compare with Single Decision Tree

In [93]:
from sklearn.ensemble import RandomForestClassifier

tree = DecisionTreeClassifier(criterion='gini', 
                              max_depth=1,
                              random_state=0)

RF = RandomForestClassifier(
                         n_estimators=100, 
                         oob_score=True,
                         random_state=10,
                           criterion='gini')

In [94]:
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

RF=RF.fit(X_train, y_train)
y_train_pred = RF.predict(X_train)
y_test_pred = RF.predict(X_test)

RF_train = accuracy_score(y_train, y_train_pred)
RF_test = accuracy_score(y_test, y_test_pred)
print('Random Forest train/test accuracites %.3f/%.3f'
     % (RF_train, RF_test))

Decision tree train/test accuracies 0.965/0.886
Random Forest train/test accuracites 1.000/0.928


## (2) Tune Parameters

In [95]:
## Adjust number of estimators
param_test1 = {'n_estimators':range(20,301,10)} 
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=10), 
                       param_grid = param_test1, scoring='accuracy',cv=5, return_train_score=True, iid=True)
gsearch1.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=None,
       param_grid={'n_estimators': range(20, 301, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [96]:
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 130}, 0.9646017699115044)

In [97]:
param_test2 = {'max_depth':range(1,20,3), 'min_samples_leaf': range(1, 15, 1)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=25, random_state=10),
   param_grid = param_test2, scoring='accuracy',iid=False, cv=5)
gsearch2.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=25, n_jobs=None,
            oob_score=False, random_state=10, verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'max_depth': range(1, 20, 3), 'min_samples_leaf': range(1, 15)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [98]:
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 4, 'min_samples_leaf': 7}, 0.9652173913043478)

## (3) Best accuracy after tuning parameters

In [99]:
get_three_accuracy(gsearch2)

Train accuracy is  0.9822705314009662
Validation accuracy is  0.9652173913043478
Test accuracy is 0.9276315789473685




## SVM with Linear Kernal

## (1) Compare with Single Decision Tree

In [100]:
from sklearn import svm
SVM= svm.LinearSVC()

tree = DecisionTreeClassifier(criterion='gini', 
                              max_depth=1,
                              random_state=0)



In [101]:
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)

tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f'
      % (tree_train, tree_test))

SVM=SVM.fit(X_train, y_train)
y_train_pred = SVM.predict(X_train)
y_test_pred = SVM.predict(X_test)

SVM_train = accuracy_score(y_train, y_train_pred)
SVM_test = accuracy_score(y_test, y_test_pred)
print('SVM with Linear kernal train/test accuracites %.3f/%.3f'
     % (RF_train, RF_test))

Decision tree train/test accuracies 0.965/0.886
SVM with Linear kernal train/test accuracites 1.000/0.928




## (2) Tune Parameters

In [102]:
param_test = {'C':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}
grid = GridSearchCV(SVM, param_grid = param_test, n_jobs = -1, cv = 3, return_train_score= True, iid = True)
grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [103]:
grid.best_params_, grid.best_score_

({'C': 1e-05}, 0.9646017699115044)

## (3) Best accuracy after tuning parameters

In [104]:
get_three_accuracy(grid)

Train accuracy is  0.9690643274853802
Validation accuracy is  0.9646017699115044
Test accuracy is 0.8903508771929824
