In [24]:
import pandas as pd
from sklearn import cross_validation
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
#bagging ~ bootstrap aggregation, sampling training data with replacement 
#& then averaging predictions of sub-models

In [4]:
names = ['pregnancies', 'plasma_glucose', 'blood_pressure', 'skin_thickness', 'serum_insulin',
        'BMI', 'diabetes_pedigree', 'age', 'class']

In [5]:
df = pd.read_csv("pima-indians-diabetes.data", names=names )

In [6]:
for i,x in enumerate(names):
    print(x, i)

pregnancies 0
plasma_glucose 1
blood_pressure 2
skin_thickness 3
serum_insulin 4
BMI 5
diabetes_pedigree 6
age 7
class 8


In [27]:
array = df.values
X = array[:, 0:8]
Y = array[:,8]
num_folds = 10 #number of samples to pull from the training data
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
#nice to be explicit, but not necessary since the default base_estimator for BaggingClassifier 
#is already a decision tree
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator = cart, n_estimators=num_trees, random_state=seed)
results=cross_validation.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.770745044429


In [28]:
revised_model = BaggingClassifier(n_estimators=num_trees, random_state=seed, 
                                  max_samples=num_folds, oob_score=True)

In [20]:
revised_results = cross_validation.cross_val_score(revised_model, X, Y, cv=kfold)

In [21]:
print(results.mean())

0.759125085441


In [46]:
RF_model = RandomForestClassifier(n_estimators=num_trees, max_features=3)
RF_results = cross_validation.cross_val_score(RF_model, X, Y, cv=kfold)
print(RF_results.mean())

0.766848940533


In [29]:
from sklearn.ensemble import ExtraTreesClassifier

In [30]:
#ExtraTrees is short for 'extremely randomized' meaning that the thresholds for splitting
#are not chosen to maximize discrimination, but rather chosen randomly
#idea is to reduce variance (but increases bias)

In [44]:
ET_model = ExtraTreesClassifier(n_estimators=num_trees, max_features=7)
ET_results = cross_validation.cross_val_score(ET_model, X, Y, cv=kfold)
print(ET_results.mean())

0.773342447027


In [39]:
from sklearn.ensemble import AdaBoostClassifier

In [40]:
#AdaBoost starts with weak learners and initial weights
#all set to w=1/N, then adjusts the weights of the incorrectly classified
#instances so that subsequent rounds focus on difficult cases
#predictions are combined by majority vote (sum)

In [56]:
AB_model = AdaBoostClassifier(n_estimators=20, random_state=seed)
AB_results = cross_validation.cross_val_score(AB_model, X, Y, cv=kfold)
print(AB_results.mean()) #up to to a point, actually does better with lower n_estimators

0.763055365687


In [50]:
from sklearn.ensemble import GradientBoostingClassifier

In [51]:
#can handle heterogeneous features
#robust to outliers
#this version can't be parallelized - see XGBoost package for that
#can use either logistic regression or adaboost as the loss function

In [57]:
GB_model = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed)
GB_results = cross_validation.cross_val_score(GB_model, X, Y, cv=kfold)
print(GB_results.mean())

0.764285714286


In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [59]:
#combines predictions from multiple models, either by majority vote ("hard") or 
#averaging ("soft")

In [60]:
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))
model3 = SVC()
estimators.append(('svm', model2))
estimators

[('logistic',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False)),
 ('cart',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best')),
 ('svm',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              presort=False, random_state=None, splitter='best'))]

In [61]:
# create the ensemble model
ensemble = VotingClassifier(estimators)
V_results = cross_validation.cross_val_score(ensemble, X, Y, cv=kfold)
print(V_results.mean())

0.709586466165


In [65]:
combine_better = [('GBT', GB_model), ('adaboost', AB_model), ('extratrees', ET_model)]
combine_better

[('GBT',
  GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
                max_depth=3, max_features=None, max_leaf_nodes=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=100,
                presort='auto', random_state=7, subsample=1.0, verbose=0,
                warm_start=False)),
 ('adaboost', AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
            learning_rate=1.0, n_estimators=20, random_state=7)),
 ('extratrees',
  ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
             max_depth=None, max_features=7, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
             oob_score=False, random_state=None, verbose=0, warm_start=False))]

In [66]:
combined = VotingClassifier(combine_better)
cb_results = cross_validation.cross_val_score(combined, X, Y, cv=kfold)
print(cb_results.mean())

0.777289815448
