## Using make_pipeline and make_union in Sklearn
**Transformer** in scikit-learn - some class that have fit and transform method, or fit_transform method.

**Predictor** - some class that has fit and predict methods, or fit_predict method.

**Pipeline** is just an abstract notion, it's not some existing ml algorithm. Often in ML tasks you need to perform sequence of different transformations (find set of features, generate new features, select only some good features) of raw dataset before applying final estimator.

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns 
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cross_validation import StratifiedKFold, cross_val_score, KFold
from sklearn.naive_bayes import BernoulliNB
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss
import pickle

In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X_train = train.ix[:, train.columns != 'target']
y_train = train.ix[:, train.columns == 'target']

test_id = test['ID']

X_train = X_train.drop('ID', axis =1)
X_test = test.drop('ID', axis =1 )


In [7]:
# Split training set and test set into 2 part for validation
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.3, random_state = 1)

In [8]:
# Drop all columns which have percentage of missing values superior 40%
class DropColumnsWithMissingData(BaseEstimator, TransformerMixin):
    
    def __init__(self, thresholds=0.40):
        self.thresholds = thresholds
    
    def fit(self, X, y=None):
        a = X.isnull().mean()
        self.kept_columns = a.index[a < self.thresholds].tolist()
        return self
    
    def transform(self, X):
        return X[self.kept_columns]

In [9]:
X_train_n = DropColumnsWithMissingData(thresholds=0.40).fit_transform(X_train)

categorical_features = X_train_n.select_dtypes(include=["object"]).columns
numerical_features = X_train_n.select_dtypes(exclude=["object"]).columns


In [10]:
# select categorical features or numerical features 
class select_features(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features = features
         
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.features]

    
class FillMissingValues(BaseEstimator, TransformerMixin):
    
    def __init__(self, replace_value):
        self.replace_value = replace_value
        # replace_value = 'nan' for filling missing data in categorical features
        # or -999 in numerical features
       
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.fillna(self.replace_value)
    
    
class ColumnApplier(BaseEstimator, TransformerMixin):
    """
    Some sklearn transformers can apply only on ONE column at a time (such as LabelEnconder())
    Wrap them with ColumnApplier to apply on all columns in the dataset
    """

    def __init__(self, underlying):
        self.underlying = underlying
        #TODO: underlying is one model method

    def fit(self, X, y=None):
        m = {}
        X = pd.DataFrame(X)  # TODO: :( reimplement in pure numpy?
        for c in X.columns:
            k = clone(self.underlying) 
            #TODO: clone helps to construct a new estimator with the same parameters.
            #      deep copy of the model in an estimator without actually copying attached data
            
            k.fit(X[c])
            # fit model k for every column in X 
            
            m[c] = k
            # put it in dictionary with column c as key and k as items
        
        self._column_stages = m
        # self.column_stages is a dictionary with column c in X as key and model k.fit as items 
        return self

    def transform(self, X):
        ret = {}
        X = pd.DataFrame(X)
        for c, k in self._column_stages.items():
            ret[c] = k.transform(X[c])
            # ret is a dict which has c as key and k.transform as items
        return pd.DataFrame(ret)[X.columns]  # keep the same order

class TolerantLabelEncoder(LabelEncoder):
    """
    LabelEncoder is not tolerant to unseen values
    """

    def transform(self, y):
        return np.searchsorted(self.classes_, y)
    

In [12]:
preproc = make_pipeline (
    DropColumnsWithMissingData(thresholds=0.40),
    make_union(
    make_pipeline(
        select_features(categorical_features),
        FillMissingValues('nan'),
        ColumnApplier(TolerantLabelEncoder())
    ),
    make_pipeline(
        select_features(numerical_features),
        FillMissingValues(-999),
        StandardScaler()
        
    )
  )
)

In [13]:
y_train_n = y_train['target'].values
skf = list(StratifiedKFold(y_train_n, n_folds= 5, shuffle=True, random_state=1))

## Optimize hyperparameters of models

### LogisticRegression

In [140]:
pipeline = make_pipeline(
        preproc, LogisticRegression())
pipeline.get_params().keys()

dict_keys(['steps', 'pipeline', 'logisticregression', 'pipeline__steps', 'pipeline__dropcolumnswithmissingdata', 'pipeline__featureunion', 'pipeline__dropcolumnswithmissingdata__thresholds', 'pipeline__featureunion__n_jobs', 'pipeline__featureunion__transformer_list', 'pipeline__featureunion__transformer_weights', 'pipeline__featureunion__pipeline-1', 'pipeline__featureunion__pipeline-2', 'pipeline__featureunion__pipeline-1__steps', 'pipeline__featureunion__pipeline-1__select_features', 'pipeline__featureunion__pipeline-1__fillmissingvalues', 'pipeline__featureunion__pipeline-1__columnapplier', 'pipeline__featureunion__pipeline-1__select_features__features', 'pipeline__featureunion__pipeline-1__fillmissingvalues__replace_value', 'pipeline__featureunion__pipeline-1__columnapplier__underlying', 'pipeline__featureunion__pipeline-2__steps', 'pipeline__featureunion__pipeline-2__select_features', 'pipeline__featureunion__pipeline-2__fillmissingvalues', 'pipeline__featureunion__pipeline-2__st

In [146]:
params_lg = {
    'logisticregression__penalty': ('l1', 'l2'),
    'logisticregression__C': [0.01, 0.1, 10]
}

In [147]:
model_grid = GridSearchCV(pipeline, params_lg, cv = skf) 

In [148]:
model_grid.fit(X_train, y_train_n)

In [149]:
best_score = model_grid.best_score_
best_parameters = model_grid.best_estimator_.get_params()

In [151]:
best_score

0.7645331400579826

In [185]:
best_parameters['logisticregression']

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [155]:
clf_lg = best_parameters['logisticregression']

In [156]:
lg = make_pipeline(
    preproc, clf_lg
)

In [157]:
mod_lg = lg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [159]:
y_pred_lg = mod_lg.predict_proba(X_test)
score = log_loss(y_test, y_pred_lg[:,1:])
score

0.49578993619090211

### Xgboost

In [170]:
pipeline_xgb = make_pipeline(
        preproc, XGBClassifier()
)
pipeline_xgb.get_params().keys()

dict_keys(['steps', 'pipeline', 'xgbclassifier', 'pipeline__steps', 'pipeline__dropcolumnswithmissingdata', 'pipeline__featureunion', 'pipeline__dropcolumnswithmissingdata__thresholds', 'pipeline__featureunion__n_jobs', 'pipeline__featureunion__transformer_list', 'pipeline__featureunion__transformer_weights', 'pipeline__featureunion__pipeline-1', 'pipeline__featureunion__pipeline-2', 'pipeline__featureunion__pipeline-1__steps', 'pipeline__featureunion__pipeline-1__select_features', 'pipeline__featureunion__pipeline-1__fillmissingvalues', 'pipeline__featureunion__pipeline-1__columnapplier', 'pipeline__featureunion__pipeline-1__select_features__features', 'pipeline__featureunion__pipeline-1__fillmissingvalues__replace_value', 'pipeline__featureunion__pipeline-1__columnapplier__underlying', 'pipeline__featureunion__pipeline-2__steps', 'pipeline__featureunion__pipeline-2__select_features', 'pipeline__featureunion__pipeline-2__fillmissingvalues', 'pipeline__featureunion__pipeline-2__standar

In [174]:
params_xgb = {
    'xgbclassifier__n_estimators' : [30, 100, 300, 800],
    'xgbclassifier__max_depth' : [3, 5, 7]
}

In [175]:
xgb_search = GridSearchCV(pipeline_xgb, params_xgb, cv = skf) 

In [176]:
xgb_search.fit(X_train, y_train_n)

GridSearchCV(cv=[(array([    0,     1, ..., 80022, 80023]), array([    2,    21, ..., 80014, 80015])), (array([    1,     2, ..., 80021, 80022]), array([    0,     4, ..., 80016, 80023])), (array([    0,     1, ..., 80022, 80023]), array([    3,    10, ..., 80013, 80019])), (array([    0,     1, ..., 80022, 80023]), array([    9,    15, ..., 80012, 80018])), (array([    0,     2, ..., 80019, 80023]), array([    1,     5, ..., 80021, 80022]))],
       error_score='raise',
       estimator=Pipeline(steps=[('pipeline', Pipeline(steps=[('dropcolumnswithmissingdata', DropColumnsWithMissingData(thresholds=0.4)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('select_features', select_features(features=Index(['v3', 'v22', 'v24', 'v31', 'v47', 'v52',...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'xgbclassifier__n_estimators': 

In [180]:
bestparams_xgb = xgb_search.best_estimator_.get_params()

In [184]:
bestparams_xgb['xgbclassifier']

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [183]:
best_score_xgb = model_grid.best_score_
best_score_xgb

0.7645331400579826

In [186]:
xgb = make_pipeline(
    preproc, bestparams_xgb['xgbclassifier']
)

In [187]:
mod_xgb = xgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [189]:
y_pred_xgb = mod_xgb.predict_proba(X_test)
score_xgb = log_loss(y_test, y_pred_xgb[:,1:])
score_xgb

0.46865203486185841

In [190]:
def SearchBestParams(X_train, y_train, clf, params, cv = skf):
    pipeline = make_pipeline(
        preproc, clf)
    search = GridSearchCV(pipeline, params, cv)
    search.fit(X_train, y_train)
    
    GetParams = search.best_estimator_.get_params()
    return GetParams[clf.lower()]

### LogisticRegression

In [46]:
pipeline_lg = make_pipeline(
        preproc, 
        GridSearchCV(
            LogisticRegression(),
                {
                    'C' : [0.05, 0.1 , 1, 10], 
                    'penalty' : ('l2', 'l1') 
                    
                },
            cv = skf,
            verbose=1, 
            scoring='log_loss'
        )
    )

In [47]:
pipeline_lg.fit(X_train, y_train_n)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 52.6min finished


Pipeline(steps=[('pipeline', Pipeline(steps=[('dropcolumnswithmissingdata', DropColumnsWithMissingData(thresholds=0.4)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('select_features', select_features(features=Index(['v3', 'v22', 'v24', 'v31', 'v47', 'v52',...re_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='log_loss', verbose=1))])

In [50]:
pickle.dump(pipeline_lg, open('pipeline_lg.pkl', 'wb'))

### Xgboost

In [195]:
pipeline_xgb = make_pipeline(
        preproc, 
        GridSearchCV(
            XGBClassifier(),
                {
                    'n_estimators' : [30, 100, 300, 800],
                    'max_depth' : [3, 5, 7]
                },
            cv = skf,
            verbose=1
        )
    )

In [196]:
pipeline_xgb.fit(X_train, y_train_n)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 96.4min finished


Pipeline(steps=[('pipeline', Pipeline(steps=[('dropcolumnswithmissingdata', DropColumnsWithMissingData(thresholds=0.4)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('select_features', select_features(features=Index(['v3', 'v22', 'v24', 'v31', 'v47', 'v52',...max_depth': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1))])

In [314]:
a = pipeline_xgb.steps[-1][1]
a.best_params_

{'max_depth': 3, 'n_estimators': 300}

In [315]:
a.grid_scores_

[mean: 0.77842, std: 0.00178, params: {'max_depth': 3, 'n_estimators': 30},
 mean: 0.78032, std: 0.00214, params: {'max_depth': 3, 'n_estimators': 100},
 mean: 0.78074, std: 0.00184, params: {'max_depth': 3, 'n_estimators': 300},
 mean: 0.78057, std: 0.00119, params: {'max_depth': 3, 'n_estimators': 800},
 mean: 0.77993, std: 0.00203, params: {'max_depth': 5, 'n_estimators': 30},
 mean: 0.78059, std: 0.00236, params: {'max_depth': 5, 'n_estimators': 100},
 mean: 0.78018, std: 0.00183, params: {'max_depth': 5, 'n_estimators': 300},
 mean: 0.77877, std: 0.00205, params: {'max_depth': 5, 'n_estimators': 800},
 mean: 0.77989, std: 0.00202, params: {'max_depth': 7, 'n_estimators': 30},
 mean: 0.78063, std: 0.00221, params: {'max_depth': 7, 'n_estimators': 100},
 mean: 0.77972, std: 0.00251, params: {'max_depth': 7, 'n_estimators': 300},
 mean: 0.77592, std: 0.00225, params: {'max_depth': 7, 'n_estimators': 800}]

In [198]:
y_pred_xgb = pipeline_xgb.predict_proba(X_test)

In [200]:
score_xgb_1 = log_loss(y_test, y_pred_xgb[:,1:])
score_xgb_1

0.46926432423787995

### ExtraTreesClassifier

In [None]:
pipeline_et = make_pipeline(
        preproc, 
        GridSearchCV(
            ExtraTreesClassifier(),
                {
                    'n_estimators' : [30, 100, 300, 800],
                    'max_depth' : [3, 5, 7]
                },
            cv = skf,
            verbose=1, 
            scoring='accuracy'
        )
    )

In [None]:
pipeline_xgb.fit(X_train, y_train_n)
BestEt = pipeline_xgb.best_estimator_.get_params()['extratreesclassifier']
print(pipeline_xgb.best_score_)

### RandomForestClassifier

In [None]:
pipeline_rf = make_pipeline(
        preproc, 
        GridSearchCV(
            RandomForestClassifier(),
                {
                    'n_estimators' : [30, 100, 300, 800],
                    'criterion' : ('gini', 'entropy'),
                    'max_depth' : [3, 5, 7]
                },
            cv = skf,
            verbose=1, 
            scoring='accuracy'
        )
    )



In [None]:
pipeline_rf.fit(X_train, y_train_n)
BestEt = pipeline_xgb.best_estimator_.get_params()['extratreesclassifier']
print(pipeline_xgb.best_score_)

### One-hot Encoding

In [33]:
class TreatmentSpecialColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self, column = 'v22', threshold = 50 ):
        self.column = column
        self.threshold = threshold 
        
    def fit(self, X, y = None):
        values, counts = np.unique(X[self.column], return_counts=True)
        counts = {x : y for x, y in zip(values, counts)}
        X[self.column] = X[self.column].apply(lambda x: x if counts.get(x, 0) > self.threshold else 0)
        return self
    
    def transform(self, X):
        return X

In [None]:
preproc = make_pipeline (
    DropColumnsWithMissingData(thresholds=0.40),
    make_union(
    make_pipeline(
        select_features(categorical_features),
        FillMissingValues('nan'),
        ColumnApplier(TolerantLabelEncoder()),
        OneHotEncoder(handle_unknown = 'ignore')
    ),
    make_pipeline(
        select_features(numerical_features),
        FillMissingValues(-999),
        StandardScaler()        
    )
  )
)

In [None]:
xgb_ohe = make_pipeline(
        preproc,
        GridSearchCV(
            XGBClassifier(),
                {
                    'n_estimators' : [30, 100, 300, 800],
                    # Number of boosted trees to fit.
                    'max_depth' : [ 3, 5, 7],
                    'learning_rate': [0.1, 0.5]
                }
            
        )
    )


In [45]:
xgb_ohe.fit(X_train, y_train_n)

Pipeline(steps=[('pipeline', Pipeline(steps=[('dropcolumnswithmissingdata', DropColumnsWithMissingData(thresholds=0.4)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('select_features', select_features(features=Index(['v3', 'v22', 'v24', 'v31', 'v47', 'v52',...     pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0))])

In [51]:
y_pred_xgb = xgb_ohe.predict_proba(X_test)

In [53]:
score_xgb = log_loss(y_test, y_pred_xgb[:,1:])
score_xgb

0.46675254991029314

In [54]:
pickle.dump(xgb_ohe, open('xgb_ohe.pkl', 'wb'))