In [1]:
from sklearn import datasets
from scipy.io.arff import loadarff
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd




In [2]:
forest = datasets.fetch_covtype()

In [3]:
type(forest)

sklearn.datasets.base.Bunch

In [4]:
X, X_test, y, y_test = train_test_split(
    forest.data, forest.target == 2, test_size=0.3, random_state=1234)

In [5]:
print(X.shape, y.shape, X_test.shape, y_test.shape)

((406708, 54), (406708,), (174304, 54), (174304,))


In [6]:
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_leaf=1000, min_samples_split=3000)

clf = clf.fit(X, y)

y_hat = clf.predict(X_test)

print("AUC = %.2f" % roc_auc_score(y_test, y_hat))

AUC = 0.79


In [7]:
from IPython.display import Image  
from sklearn.externals.six import StringIO  
import pydot_ng as pydot

%matplotlib inline

def plot_dtc(clf, features, classes):
    dot_data = StringIO()  
    tree.export_graphviz(clf, out_file=dot_data,  
                             feature_names=features,  
                             class_names=classes,  
                             filled=True, rounded=True,  
                             special_characters=True)  
    graph = pydot.graph_from_dot_data(dot_data.getvalue())  
    return Image(graph.create_png()) 
    
plot_dtc(clf, [str(i) for i in range(X.shape[1])], ["0","1"])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [8]:
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedKFold

paramgrid = {"max_features":      [4, "sqrt", 30, 40],
             "max_depth":         [10, 15, 20, 40, 50, 60],
             "min_samples_split": [100, 200, 500, 2000, 4000]
            }

random.seed(1)

from evolutionary_search import EvolutionaryAlgorithmSearchCV
cv = EvolutionaryAlgorithmSearchCV(estimator            =  DecisionTreeClassifier(),
                                   params               =  paramgrid,
                                   scoring              =  "roc_auc",
                                   cv                   =  StratifiedKFold(y, n_folds=4),
                                   verbose              = 1,
                                   population_size      = 10,
                                   gene_mutation_prob   = 0.2,
                                   gene_crossover_prob  = 0.5,
                                   tournament_size      = 5,
                                   generations_number   = 10,
                                   n_jobs=4)
%time cv.fit(X, y)

Types [1, 1, 1] and maxint [3, 4, 5] detected
--- Evolve in 120 possible combinations ---
gen	nevals	avg     	min     	max    
0  	10    	0.864516	0.774208	0.94592
1  	5     	0.938068	0.881377	0.946875
2  	8     	0.938308	0.869802	0.94592 
3  	3     	0.935995	0.84667 	0.94592 
4  	8     	0.939477	0.869802	0.957608
5  	5     	0.949426	0.94592 	0.957608
6  	1     	0.951953	0.924439	0.957608
7  	2     	0.957608	0.957608	0.957608
8  	7     	0.953218	0.924439	0.957608
9  	6     	0.953249	0.925711	0.957608
10 	2     	0.957608	0.957608	0.957608
Best individual is: {'max_features': 40, 'min_samples_split': 100, 'max_depth': 60}
with fitness: 0.957607833545
CPU times: user 11.7 s, sys: 11.2 s, total: 22.9 s
Wall time: 1min 33s


In [9]:
cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=60,
            max_features=40, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [10]:
clf = cv.best_estimator_
clf.fit(X,y)
print("AUC = %.2f" % roc_auc_score(y_test, clf.predict(X_test)))

AUC = 0.90


In [11]:
importance = pd.DataFrame(
    {'imp': clf.tree_.compute_feature_importances(), 'f': range(X.shape[1])})
importance.sort_values(by="imp", ascending=False).head(10)

Unnamed: 0,f,imp
0,0,0.286072
5,5,0.139168
9,9,0.124827
13,13,0.106105
3,3,0.054553
4,4,0.039235
7,7,0.037195
6,6,0.020082
8,8,0.019812
1,1,0.019714


## Maintenant un peu de xgboost!

### D'abord sans deap

In [13]:
import xgboost as xgb

dtrain = xgb.DMatrix(X, label=y)
dtest  = xgb.DMatrix(X_test, label=y_test)
evallist = [(dtest, 'eval'), (dtrain, 'train')]

param = {
    'bst:max_depth': 5, 
    'bst:eta': 0.5, 
    'silent': True, 
    'verbose': 0,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': 'auc'
}

num_round = 201
bst = xgb.train(param,
                dtrain,
                num_round,
                evallist,
                verbose_eval    = 40
               )


[0]	eval-auc:0.81286	train-auc:0.812862
[40]	eval-auc:0.920766	train-auc:0.924006
[80]	eval-auc:0.940982	train-auc:0.945078
[120]	eval-auc:0.956681	train-auc:0.961603
[160]	eval-auc:0.964821	train-auc:0.970127
[200]	eval-auc:0.970554	train-auc:0.976311


In [14]:
param = {
    'bst:max_depth': 9, 
    'bst:eta': 0.6,
    'colsample_bytree': 0.74,
    'subsample': 0.66,
    'silent': True, 
    'verbose': 0,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': 'auc'
}

num_round = 201
bst = xgb.train(param,
                dtrain,
                num_round,
                evallist,
                verbose_eval    = 40
               )


[0]	eval-auc:0.814078	train-auc:0.815259
[40]	eval-auc:0.91925	train-auc:0.922001
[80]	eval-auc:0.940377	train-auc:0.944257
[120]	eval-auc:0.954253	train-auc:0.958981
[160]	eval-auc:0.96236	train-auc:0.967631
[200]	eval-auc:0.967797	train-auc:0.973647


###  Ensuite avec deap

In [22]:
import xgboost as xgb

import deapexp as dp

grid_gen1 = {
    "max_depth":          range(3, 11, 2),
    "learning_rate":      np.logspace(-3, 0, 4, base=10),
    "subsample":          np.arange(0.5, 1, .1),
    "colsample_bytree":   np.arange(0.1, 1, .1)
}

genetic_params = {
    "generations_number": 5,
    "population_size": 20,
    "gene_mutation_prob": 0.2,
    "tournament_size": 3
}

num_rounds = 250

train = pd.DataFrame(X)
train["target"] = y

class IdentityTransformer(object):
    def __init__(*args, **kwargs):
        None
    def fit_transform(self, X):
        return X
    def transform(self, X):
        return X

e = dp.DEAPSplitXgboostExperiment(IdentityTransformer(),
                               grid_gen1,
                               test_prop=0.2,
                               genetic_params=genetic_params,
                               num_rounds=num_rounds)

e.experiment(train, "target")

Searching with train 325366 / test 81342
with params grid {'subsample': array([ 0.5,  0.6,  0.7,  0.8,  0.9]), 'learning_rate': array([ 0.001,  0.01 ,  0.1  ,  1.   ]), 'colsample_bytree': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9]), 'max_depth': [3, 5, 7, 9]}
on genetic params {'tournament_size': 3, 'population_size': 20, 'gene_mutation_prob': 0.2, 'generations_number': 5, 'gene_crossover_prob': 0.5}



Types [2, 2, 2, 1] and maxint [4, 3, 8, 3] detected
--- Evolve in 720 possible combinations ---
[CV] subsample=0.7, learning_rate=1.0, colsample_bytree=0.5, max_depth=5 
[CV]  subsample=0.7, learning_rate=1.0, colsample_bytree=0.5, max_depth=5, score=0.966433, total=  28.3s
[CV] subsample=0.9, learning_rate=0.1, colsample_bytree=0.5, max_depth=7 
[CV]  subsample=0.9, learning_rate=0.1, colsample_bytree=0.5, max_depth=7, score=0.953245, total=  47.2s
[CV] subsample=0.6, learning_rate=0.01, colsample_bytree=0.1, max_depth=3 
[CV]  subsample=0.6, learning_rate=0.01, colsample_bytree=0.1, max_depth=3, score=0.828632, total=  10.7s
[CV] subsample=0.8, learning_rate=0.001, colsample_bytree=0.2, max_depth=3 
[CV]  subsample=0.8, learning_rate=0.001, colsample_bytree=0.2, max_depth=3, score=0.818982, total=  11.0s
[CV] subsample=0.6, learning_rate=0.001, colsample_bytree=0.4, max_depth=7 
[CV]  subsample=0.6, learning_rate=0.001, colsample_bytree=0.4, max_depth=7, score=0.867858, total=  38.8s

<deapexp.DEAPSplitXgboostExperiment at 0x7f1d8b9c7350>

In [23]:
(X.shape, y.shape)

((406708, 54), (406708,))

In [25]:
import xgboost as xgb

dtrain = xgb.DMatrix(X, label=y)
dtest  = xgb.DMatrix(X_test, label=y_test)
evallist = [(dtest, 'eval'), (dtrain, 'train')]

param = {
    'bst:max_depth': 7, 
    'bst:eta': 1.0, 
    'silent': False, 
    'verbose': 0,
    'objective': 'binary:logistic',
    'nthread': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'eval_metric': 'auc'
}

num_round = 251
bst = xgb.train(param,
                dtrain,
                num_round,
                evallist,
                verbose_eval    = 40
               )

[0]	eval-auc:0.823641	train-auc:0.825341
[40]	eval-auc:0.918839	train-auc:0.921866
[80]	eval-auc:0.945117	train-auc:0.949082
[120]	eval-auc:0.956481	train-auc:0.961079
[160]	eval-auc:0.964963	train-auc:0.970227
[200]	eval-auc:0.969845	train-auc:0.975541
[240]	eval-auc:0.974818	train-auc:0.980868
