In [None]:
from sklearn import datasets
from scipy.io.arff import loadarff
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd


In [None]:
forest = datasets.fetch_covtype()

In [None]:
type(forest)

In [None]:
X, X_test, y, y_test = train_test_split(
    forest.data, forest.target == 2, test_size=0.3, random_state=1234)

In [None]:
print(X.shape, y.shape, X_test.shape, y_test.shape)

In [None]:
from sklearn import tree

# A deliberately constrained tree: large minimum leaf / split sizes keep it small.
clf = tree.DecisionTreeClassifier(min_samples_leaf=1000, min_samples_split=3000)

clf = clf.fit(X, y)

# Hard class predictions, kept for any later inspection.
y_hat = clf.predict(X_test)

# ROC AUC needs a continuous score for the positive class. Feeding it the
# hard 0/1 labels from predict() collapses the ROC curve to a single point
# and under-reports the AUC. Column 1 of predict_proba is the probability of
# the positive (True) class.
y_score = clf.predict_proba(X_test)[:, 1]

print("AUC = %.2f" % roc_auc_score(y_test, y_score))

In [None]:
from IPython.display import Image  
from sklearn.externals.six import StringIO  
import pydot_ng as pydot

%matplotlib inline

def plot_dtc(clf, features, classes):
    """Render a fitted decision tree as a PNG image for inline display.

    Parameters
    ----------
    clf : fitted decision tree estimator passed to ``tree.export_graphviz``.
    features : sequence of feature names used to label the split nodes.
    classes : sequence of class names used to label the leaves.

    Returns
    -------
    IPython.display.Image wrapping the PNG bytes produced from the DOT graph.
    """
    # Graphviz DOT source is written into an in-memory text buffer.
    dot_buffer = StringIO()
    export_options = dict(
        feature_names=features,
        class_names=classes,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    tree.export_graphviz(clf, out_file=dot_buffer, **export_options)

    # Parse the DOT text and rasterise it to PNG.
    dot_source = dot_buffer.getvalue()
    rendered = pydot.graph_from_dot_data(dot_source)
    png_bytes = rendered.create_png()
    return Image(png_bytes)
    
# The feature matrix has no column names, so label each feature by its index.
plot_dtc(clf, list(map(str, range(X.shape[1]))), ["0", "1"])

In [None]:
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedKFold

paramgrid = {"max_features":      [4, "sqrt", 30, 40],
             "max_depth":         [10, 15, 20, 40, 50, 60],
             "min_samples_split": [100, 200, 500, 2000, 4000]
            }

random.seed(1)

from evolutionary_search import EvolutionaryAlgorithmSearchCV
cv = EvolutionaryAlgorithmSearchCV(estimator            =  DecisionTreeClassifier(),
                                   params               =  paramgrid,
                                   scoring              =  "roc_auc",
                                   cv                   =  StratifiedKFold(y, n_folds=4),
                                   verbose              = 1,
                                   population_size      = 10,
                                   gene_mutation_prob   = 0.2,
                                   gene_crossover_prob  = 0.5,
                                   tournament_size      = 5,
                                   generations_number   = 10,
                                   n_jobs=4)
%time cv.fit(X, y)

In [None]:
# Tree with hand-picked hyper-parameters (all three values appear in `paramgrid`).
clf = tree.DecisionTreeClassifier(min_samples_split=100, max_features=40, max_depth=50)

clf = clf.fit(X, y)

# Hard class predictions, kept for any later inspection.
y_hat = clf.predict(X_test)

# ROC AUC must be computed from a continuous score, not from hard labels:
# with 0/1 predictions the ROC curve has a single operating point and the
# reported AUC is not comparable with the "roc_auc" scorer used by the search.
y_score = clf.predict_proba(X_test)[:, 1]

print("AUC = %.2f" % roc_auc_score(y_test, y_score))

In [None]:
# Ten most important features of the fitted tree, most important first.
# 'f' is the column index of the feature in X. The public
# `feature_importances_` attribute is used (as for the ensemble models below)
# instead of reaching into the low-level `tree_` object.
importance = pd.DataFrame(
    {'imp': clf.feature_importances_, 'f': range(X.shape[1])})
importance.sort_values(by="imp", ascending=False).head(10)

In [None]:
cv.best_estimator_

In [None]:
clf = cv.best_estimator_
clf.fit(X,y)
print("AUC = %.2f" % roc_auc_score(y_test, clf.predict(X_test)))

In [None]:
# Ten most important features of the current `clf`, most important first.
# 'f' is the column index of the feature in X. Uses the public
# `feature_importances_` attribute rather than the low-level `tree_` object,
# consistent with how the ensemble models are inspected.
importance = pd.DataFrame(
    {'imp': clf.feature_importances_, 'f': range(X.shape[1])})
importance.sort_values(by="imp", ascending=False).head(10)

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs = -1, n_estimators=400)

%time rf.fit(X,y)

print(roc_auc_score(y_test, rf.predict(X_test)))

In [None]:
# Ten most important features according to the random forest, highest first.
# 'f' holds the column index of each feature in X.
importance = pd.DataFrame({
    'imp': rf.feature_importances_,
    'f': range(X.shape[1]),
})
importance.sort_values("imp", ascending=False).head(10)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

extra = ExtraTreesClassifier(n_jobs = -1, n_estimators=400)

%time extra.fit(X,y)

print(roc_auc_score(y_test, extra.predict(X_test)))

In [None]:
# Ten most important features according to the extra-trees model, highest first.
# 'f' holds the column index of each feature in X.
importance = pd.DataFrame({
    'imp': extra.feature_importances_,
    'f': range(X.shape[1]),
})
importance.sort_values("imp", ascending=False).head(10)

## Maintenant un peu de xgboost!

### D'abord sans deap

In [None]:
import xgboost as xgb

# Wrap the train / test splits in xgboost's DMatrix container.
dtrain = xgb.DMatrix(X, label=y)
dtest  = xgb.DMatrix(X_test, label=y_test)
# Both sets are evaluated during training; each tuple is (data, display name).
evallist = [(dtest, 'eval'), (dtrain, 'train')]

# Booster parameters use their plain names ('max_depth', 'eta'). The
# 'bst:'-prefixed spellings are not names in the documented parameter list,
# so they risk being ignored and the defaults used instead.
param = {
    'max_depth': 5,
    'eta': 0.5,
    'silent': True,
    'verbose': 0,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': 'auc'
}

num_round = 201
bst = xgb.train(param,
                dtrain,
                num_round,
                evallist,
                verbose_eval    = 40  # print evaluation metrics every 40 rounds
               )


In [None]:
# Second configuration: deeper trees plus row / column subsampling.
# Booster parameters use their plain names ('max_depth', 'eta'). The
# 'bst:'-prefixed spellings are not names in the documented parameter list,
# so they risk being ignored and the defaults used instead.
param = {
    'max_depth': 9,
    'eta': 0.6,
    'colsample_bytree': 0.74,
    'subsample': 0.66,
    'silent': True,
    'verbose': 0,
    'objective': 'binary:logistic',
    'nthread': 4,
    'eval_metric': 'auc'
}

num_round = 201
# Reuses the `dtrain` and `evallist` objects built in the previous xgboost cell.
bst = xgb.train(param,
                dtrain,
                num_round,
                evallist,
                verbose_eval    = 40  # print evaluation metrics every 40 rounds
               )


###  Ensuite avec deap

In [None]:
import xgboost as xgb

import deapexp as dp

# Candidate values per xgboost hyper-parameter for the genetic search.
grid_gen1 = dict(
    max_depth=range(3, 11),
    learning_rate=np.logspace(-4, -1, num=40, base=10),
    subsample=np.arange(0.5, 0.8, 0.02),
    colsample_bytree=np.arange(0.1, 0.4, 0.02),
)

# Settings of the genetic algorithm itself.
genetic_params = dict(
    generations_number=10,
    population_size=30,
    gene_mutation_prob=0.2,
    tournament_size=3,
)

num_rounds = 100

# Training frame: the feature matrix plus the label in a "target" column.
train = pd.DataFrame(X).assign(target=y)

class IdentityTransformer(object):
    """No-op transformer: returns its input unchanged.

    Exposes ``fit_transform`` and ``transform`` so it can be passed where a
    transformer object is expected without altering the data.
    """

    def __init__(self, *args, **kwargs):
        """Accept and ignore any constructor arguments."""
        pass

    def fit_transform(self, X):
        """Return ``X`` unchanged; there is nothing to fit."""
        return X

    def transform(self, X):
        """Return ``X`` unchanged."""
        return X

e = dp.DEAPSplitXgboostExperiment(IdentityTransformer(),
                               grid_gen1,
                               test_prop=0.2,
                               genetic_params=genetic_params,
                               num_rounds=num_rounds)

e.experiment(train, "target")

In [None]:
(X.shape, y.shape)

In [None]:
e