In [2]:
# load required packages
import numpy as np
import pandas as pd
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Bagging flow: a tiny hand-made dataset (7 animals) used to walk through
# bagging step by step. `animal_type` is the label; the rest are features.
data = pd.DataFrame(dict(
    animal_type=['dog', 'wolf', 'wolf', 'dog', 'dog', 'wolf', 'dog'],
    weight=[30, 40, 45, 50, 25, 15, 35],
    anger_level=[0, 10, 7, 8, 5, 3, 2],
    cuddly=['cuddly', 'not', 'not', 'not', 'cuddly', 'cuddly', 'cuddly']
))

In [6]:
# Display the whole toy dataset (only 7 rows) to sanity-check the values.
data

Unnamed: 0,anger_level,animal_type,cuddly,weight
0,0,dog,cuddly,30
1,10,wolf,not,40
2,7,wolf,not,45
3,8,dog,not,50
4,5,dog,cuddly,25
5,3,wolf,cuddly,15
6,2,dog,cuddly,35


In [7]:
# Binary target as a numpy array: 1 for 'dog', 0 for anything else.
Y = (data.animal_type == 'dog').astype('int64').values

In [8]:
# Show the encoded target vector (1 = dog, 0 = not dog).
Y

array([1, 0, 0, 1, 1, 0, 1])

In [20]:
import patsy

# Build the feature design matrix from a patsy formula; the result is a
# DataFrame that also carries an 'Intercept' column.
X = patsy.dmatrix(
    '~ cuddly + anger_level + weight',
    data,
    return_type='dataframe'
)

In [21]:
# Remove patsy's 'Intercept' column from the feature matrix.
# A column mask (instead of an in-place drop) keeps this cell idempotent:
# re-running it after the column is already gone is a no-op rather than an error.
X = X.loc[:, X.columns != 'Intercept']

In [23]:
# Underlying values of the design matrix X, used as the feature input to the
# scikit-learn estimators in the following cells.
Xmat = X.values

In [24]:
# first we create the "base model" estimator: a single decision tree with no
# depth limit (max_depth = None)
dtc = DecisionTreeClassifier(max_depth = None)

# fit on the full dataset; fit() is the last expression of the cell, so the
# fitted estimator is what the cell displays
dtc.fit(Xmat, Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [25]:
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydot

# Render the fitted decision tree `dtc` as a PNG:
# export it to Graphviz dot text in memory, parse that with pydot, display the image.
dot_data = StringIO()

export_graphviz(dtc, out_file = dot_data,
                # the design matrix is named `X`; the previous lowercase `x`
                # was never defined and raised NameError
                feature_names = X.columns,
                filled = True, rounded = True,
                special_characters = True)

graph = pydot.graph_from_dot_data(dot_data.getvalue())
# some pydot releases return a list of graphs rather than a single graph;
# accept both shapes
if isinstance(graph, list):
    graph = graph[0]
Image(graph.create_png())

ImportError: No module named pydot

In [26]:
from sklearn.cross_validation import cross_val_score

scores = cross_val_score(dtc, Xmat, Y, cv=3)

print scores
print np.mean(scores)

[ 0.66666667  0.5         0.5       ]
0.555555555556


In [34]:
dtc_simple = DecisionTreeClassifier(max_depth = 1)
scores_simple = cross_val_score(dtc_simple, Xmat, Y, cv= 3)

print scores_simple
print np.mean(scores_simple)

[ 0.66666667  0.5         0.5       ]
0.555555555556


In [47]:
# first we create the "base model" estimator
dtc = DecisionTreeClassifier(max_depth = None)

# n_estimators is the number of "base models" (copies of the dtc classifier blueprint)
# that we want to fit and then average
n_estimators = 10

# max_samples tells us what fraction of the size of the original dataset the bootstrapped
# datasets are going to be
max_samples = 0.5

# max_features is the fraction of the feature columns handed to each base model
max_features = 0.66

# Pass the settings above to the ensemble. Previously they were defined but
# never handed over, so `bag` silently ran with the library defaults.
bag = BaggingClassifier(dtc,
                        n_estimators = n_estimators,
                        max_samples = max_samples,
                        max_features = max_features)

In [56]:
rows = range(X.shape[0])
cols = range(X.shape[1])
print 'original rows:', rows
print 'original cols:', cols

number_of_resampled_rows = int(round(max_samples * X.shape[0]))
number_of_columns = int(round(max_features * X.shape[1]))
                               
# print number_of_resampled_rows
# print number_of_columns

resampled_rows =  np.random.choice(rows, size = number_of_resampled_rows, replace = True)
resampled_cols = np.random.choice(cols, size = number_of_columns, replace = False)

print 'resampled rows:', resampled_rows
print 'resampled cols:', resampled_cols

original rows: [0, 1, 2, 3, 4, 5, 6]
original cols: [0, 1, 2]
resampled rows: [2 5 5 5]
resampled cols: [2 0]


In [57]:
# Preview the bootstrapped training set: the resampled rows (repeats allowed)
# restricted to the resampled feature columns, selected by position.
X.iloc[resampled_rows, resampled_cols]

Unnamed: 0,weight,cuddly[T.not]
2,45.0,1.0
5,15.0,0.0
5,15.0,0.0
5,15.0,0.0


In [58]:
from sklearn.cross_validation import KFold

In [60]:
folds = KFold(len(rows), n_folds=3)

for train_indices, test_indices in folds:
    
    Xtrain, Xtest = X[train_indices], X[test_indices]
    Ytrain, Ytest = Y[train_indices], Y[test_indices]
    
    for estimator_num in range(n_estimators):
        print 'estimator number:', estimator_num
        
        train_rows = range(Xtrain.shape[0])
        train_cols = range(Xtrain.shape[1])
        
        number_of_resampled_rows = int(round(max_samples * Xtrain.shape[0]))
        number_of_columns = int(round(max_features * Xtrain.shape[1]))
        
        resampled_rows =  np.random.choice(train_rows, size = number_of_resampled_rows, replace = True)
        resampled_cols = np.random.choice(train_cols, size = number_of_columns, replace = False)
        
        Xtrain_resamp = Xtrain.iloc[resampled_rows, resampled_cols]
        Ytrain_resamp = Ytrain.iloc[resampled_rows, resampled_cols]

        dtc = DecisionTreeClassifier(max_depth = None)
        dtc.fit(Xtrain_resamp, Ytrain_resamp)
        
        
        
        
        
#     print 'training indices', train_indices
#     print 'test indices', test_indices

training indices [3 4 5 6]
test indices [0 1 2]
training indices [0 1 2 5 6]
test indices [3 4]
training indices [0 1 2 3 4]
test indices [5 6]
