## Using make_pipeline and make_union in Sklearn
**Transformer** in scikit-learn - some class that have fit and transform method, or fit_transform method.

**Predictor** - some class that has fit and predict methods, or fit_predict method.

**Pipeline** is just an abstract notion, it's not some existing ml algorithm. Often in ML tasks you need to perform sequence of different transformations (find set of features, generate new features, select only some good features) of raw dataset before applying final estimator.

In [96]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns 
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cross_validation import StratifiedKFold, cross_val_score, KFold
from sklearn.naive_bayes import BernoulliNB
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import clone
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import log_loss
import pickle

In [97]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X_train = train.ix[:, train.columns != 'target']
y_train = train.ix[:, train.columns == 'target']

test_id = test['ID']

X_train = X_train.drop('ID', axis =1)
X_test_w = test.drop('ID', axis =1 )


In [98]:
# Split training set and test set into 2 part for validation
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.3, random_state = 1)

In [6]:
# Drop all columns which have percentage of missing values superior 40%
class DropColumnsWithMissingData(BaseEstimator, TransformerMixin):
    
    def __init__(self, thresholds=0.40):
        self.thresholds = thresholds
    
    def fit(self, X, y=None):
        a = X.isnull().mean()
        self.kept_columns = a.index[a < self.thresholds].tolist()
        return self
    
    def transform(self, X):
        return X[self.kept_columns]

In [7]:
X_train_n = DropColumnsWithMissingData(thresholds=0.40).fit_transform(X_train)

categorical_features = X_train_n.select_dtypes(include=["object"]).columns
numerical_features = X_train_n.select_dtypes(exclude=["object"]).columns


In [8]:
# select categorical features or numerical features 
class select_features(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features = features
         
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.features]

    
class FillMissingValues(BaseEstimator, TransformerMixin):
    
    def __init__(self, replace_value):
        self.replace_value = replace_value
        # replace_value = 'nan' for filling missing data in categorical features
        # or -999 in numerical features
       
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.fillna(self.replace_value)
    
    
class ColumnApplier(BaseEstimator, TransformerMixin):
    """
    Some sklearn transformers can apply only on ONE column at a time (such as LabelEnconder())
    Wrap them with ColumnApplier to apply on all columns in the dataset
    """

    def __init__(self, underlying):
        self.underlying = underlying
        #TODO: underlying is one model method

    def fit(self, X, y=None):
        m = {}
        X = pd.DataFrame(X)  # TODO: :( reimplement in pure numpy?
        for c in X.columns:
            k = clone(self.underlying) 
            #TODO: clone helps to construct a new estimator with the same parameters.
            #      deep copy of the model in an estimator without actually copying attached data
            
            k.fit(X[c])
            # fit model k for every column in X 
            
            m[c] = k
            # put it in dictionary with column c as key and k as items
        
        self._column_stages = m
        # self.column_stages is a dictionary with column c in X as key and model k.fit as items 
        return self

    def transform(self, X):
        ret = {}
        X = pd.DataFrame(X)
        for c, k in self._column_stages.items():
            ret[c] = k.transform(X[c])
            # ret is a dict which has c as key and k.transform as items
        return pd.DataFrame(ret)[X.columns]  # keep the same order

class TolerantLabelEncoder(LabelEncoder):
    """
    LabelEncoder is not tolerant to unseen values
    """

    def transform(self, y):
        return np.searchsorted(self.classes_, y)

In [9]:
preproc = make_pipeline (
    DropColumnsWithMissingData(thresholds=0.40),
    make_union(
    make_pipeline(
        select_features(categorical_features),
        FillMissingValues('nan'),
        ColumnApplier(TolerantLabelEncoder())
    ),
    make_pipeline(
        select_features(numerical_features),
        FillMissingValues(-999),
        StandardScaler()
        
    )
  )
)

In [29]:
y_train_n = y_train['target'].values
skf = list(StratifiedKFold(y_train_n, n_folds= 5, shuffle=True, random_state=1))

## Optimize hyperparameters of models

### LogisticRegression

### Xgboost

In [11]:
pipeline_xgb = make_pipeline(
        preproc, 
        GridSearchCV(
            XGBClassifier(),
                {
                    'n_estimators' : [30, 100, 300, 800],
                    'max_depth' : [ 3, 5, 7]
                },
            cv = skf,
            verbose= 1, 
            scoring='log_loss',
            n_jobs = 1
            # nb of jobs to run in parallel
        )
    )


In [30]:
pipeline_xgb.fit(X_train, y_train_n)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


KeyboardInterrupt: 

In [15]:
best_xgb = pipeline_xgb.steps[-1][1]

In [16]:
y_pred_xgb = pipeline_xgb.predict_proba(X_test)
score_xgb = log_loss(y_test, y_pred_xgb[:,1:])

In [17]:
score_xgb

0.46646328202720494

In [18]:
y_submit_xgb = pipeline_xgb.predict_proba(X_test_w)

In [24]:
pickle.dump(pipeline_xgb, open('pipeline_xgb.pkl', 'wb'))

### ExtraTreesClassifier

In [27]:
pipeline_et = make_pipeline(
        preproc, 
        GridSearchCV(
                ExtraTreesClassifier(),
                {
                    'n_estimators' : [30, 100, 300, 800],
                    'criterion' : ('gini', 'entropy'),
                    'max_depth' : [3, 5, 7]
                },
            cv = skf,
            verbose= 1, 
            scoring='log_loss'
        )
    )

In [31]:
pipeline_et.fit(X_train, y_train_n)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 14.7min finished


Pipeline(steps=[('pipeline', Pipeline(steps=[('dropcolumnswithmissingdata', DropColumnsWithMissingData(thresholds=0.4)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('select_features', select_features(features=Index(['v3', 'v22', 'v24', 'v31', 'v47', 'v52',...re_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='log_loss', verbose=1))])

In [60]:
y_pred_et = pipeline_et.predict_proba(X_test)
score_et = log_loss(y_test, y_pred_et[:,1:])
score_et

0.51102877363203014

In [110]:
y_submit_et = pipeline_et.predict_proba(X_test_w)

In [111]:
pickle.dump(pipeline_et, open('pipeline_et.pkl', 'wb'))

## OneHotEncoder

In [107]:
class TreatmentSpecialColumns(BaseEstimator, TransformerMixin):
    
    def __init__(self, column = 'v22'):
        self.column = column
        
    def fit(self, X, y = None):
        values, counts = np.unique(X[self.column].fillna('nan'), return_counts=True)
        counts = {x : y for x, y in zip(values, counts)}
        X[self.column] = X[self.column].apply(lambda x: x if counts.get(x, 0) > 50 else 0)
        return self
    
    def transform(self, X):
        return X

In [108]:
### One-hot Encoding
preproc = make_pipeline (
    TreatmentSpecialColumns(column = 'v22'),
    DropColumnsWithMissingData(thresholds=0.40),
    make_union(
    make_pipeline(
        select_features(categorical_features),
        FillMissingValues('nan'),
        ColumnApplier(TolerantLabelEncoder()),
        OneHotEncoder(handle_unknown = 'ignore')
    ),
    make_pipeline(
        select_features(numerical_features),
        FillMissingValues(-999),
        StandardScaler()        
    )
  )
)

In [36]:
xgb_ohe = make_pipeline(
        preproc, 
        GridSearchCV(
                ExtraTreesClassifier(),
                {
                    'n_estimators' : [30, 100, 300, 800],
                    'criterion' : ('gini', 'entropy'),
                    'max_depth' : [3, 5, 7]
                },
            cv = skf,
            verbose= 1, 
            scoring='log_loss'
        )
    )

In [37]:
xgb_ohe.fit(X_train, y_train_n)

TypeError: '>' not supported between instances of 'float' and 'str'

## One Shot

In [169]:
def BestParamsModel(X_train, y_train_n, preproc, clfs, skf):
    ListModel = {}
    for clf, params in clfs.items():
        pipeline = make_pipeline(
            preproc, 
            GridSearchCV(
                clf,
                params,
                cv = skf,
                verbose= 1, 
                scoring='log_loss'
                )
            )
        pipeline.fit(X_train, y_train_n)
        ListModel[clf] = pipeline
    return ListModel


In [158]:
clfs = {
    XGBClassifier() : { 'n_estimators' : [30, 100, 300, 800],
                             'max_depth' : [ 3, 5, 7] } ,
    ExtraTreesClassifier() : {
                                    'n_estimators' : [30, 100, 300, 800],
                                    'criterion' : ('gini', 'entropy'),
                                    'max_depth' : [3, 5, 7]
                              }, 
    LogisticRegression() : {   'C' : [0.05, 0.1 , 1, 10], 
                                'penalty' : ('l2', 'l1') 
                            }
    }


In [159]:
preproc = make_pipeline (
    DropColumnsWithMissingData(thresholds=0.40),
    make_union(
    make_pipeline(
        select_features(categorical_features),
        FillMissingValues('nan'),
        ColumnApplier(TolerantLabelEncoder()),
        OneHotEncoder(handle_unknown = 'ignore')
    ),
    make_pipeline(
        select_features(numerical_features),
        FillMissingValues(-999),
        StandardScaler()
        
    )
  )
)

In [None]:
pipe = BestParamsModel(X_train, y_train_n, preproc, clfs, skf)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
list(clfs.keys())[0]