In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from skutil.preprocessing import BoxCoxTransformer
from skutil.feature_selection import MulticollinearityFilterer

In [2]:
iris = load_iris()
X = pd.DataFrame.from_records(data=iris.data, columns=iris.feature_names)

In [3]:
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [4]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, iris.target, train_size=0.7)

### Build a simple Pipeline

In [6]:
from sklearn.pipeline import Pipeline
from skutil.preprocessing import SelectiveScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# build a pipeline
pipe = Pipeline([
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('scaler'      , SelectiveScaler()),
        ('boxcox'      , BoxCoxTransformer()),
        ('pca'         , PCA(n_components=0.9)),
        ('model'       , RandomForestClassifier())
    ])

# fit the pipe, report scores
pipe.fit(X_train, y_train)

# report scores
print 'Train RF accuracy: %.5f' % accuracy_score(y_train, pipe.predict(X_train))
print 'Test RF accuracy: %.5f'  % accuracy_score(y_test,  pipe.predict(X_test))

Train RF accuracy: 1.00000
Test RF accuracy: 0.82222


### Can we make this better with a gridsearch?

In [8]:
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from scipy.stats import randint, uniform

# default CV does not shuffle, so we define our own
custom_cv = KFold(n=y_train.shape[0], n_folds=5, shuffle=True, random_state=42)

# build a pipeline
pipe = Pipeline([
        ('collinearity', MulticollinearityFilterer(threshold=0.85)),
        ('scaler'      , SelectiveScaler()),
        ('boxcox'      , BoxCoxTransformer()),
        ('pca'         , PCA(n_components=0.9)),
        ('model'       , RandomForestClassifier())
    ])

# let's define a set of hyper-parameters over which to search
hp = {
    'collinearity__threshold' : uniform(0.8, 0.15),
    'collinearity__method'    : ['pearson','kendall','spearman'],
    'scaler__scaler'          : [StandardScaler(), RobustScaler()],
    'pca__n_components'       : randint(1,2),
    'pca__whiten'             : [True, False],
    'model__n_estimators'     : randint(10,100),
    'model__max_depth'        : randint(4,15),
    'model__min_samples_leaf' : randint(1,10),
    'model__max_features'     : uniform(loc=.5, scale=.5),
    'model__max_leaf_nodes'   : randint(10,50)
}

# define the gridsearch
search = RandomizedSearchCV(pipe, hp,
                            n_iter=30,
                            scoring='accuracy',
                            cv=custom_cv,
                            random_state=42)

# fit the search
search.fit(X_train, y_train)

# get the best estimator:
best_model = search.best_estimator_

# report scores
print 'Train RF accuracy: %.5f' % accuracy_score(y_train, best_model.predict(X_train))
print 'Test RF accuracy: %.5f' % accuracy_score(y_test, best_model.predict(X_test))

Train RF accuracy: 0.94286
Test RF accuracy: 0.91111
