## Using make_pipeline and make_union in Sklearn
**Transformer** in scikit-learn - some class that have fit and transform method, or fit_transform method.

**Predictor** - some class that has fit and predict methods, or fit_predict method.

**Pipeline** is just an abstract notion, it's not some existing ml algorithm. Often in ML tasks you need to perform sequence of different transformations (find set of features, generate new features, select only some good features) of raw dataset before applying final estimator.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from scipy import stats
from scipy.stats import norm
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cross_validation import StratifiedKFold, cross_val_score, KFold
from sklearn.naive_bayes import BernoulliNB
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import clone
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import log_loss



In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X_train = train.ix[:, train.columns != 'target']
y_train = train.ix[:, train.columns == 'target']

X_test = test.ix[:, test.columns != 'target']

test_id = X_test['ID']

X_train = X_train.drop('ID', axis =1)
X_test = X_test.drop('ID', axis =1 )

#categorical_features = X_train.select_dtypes(include=["object"]).columns
#numerical_features = X_train.select_dtypes(exclude=["object"]).columns

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  after removing the cwd from sys.path.
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  import sys


In [3]:
# Drop all columns which have percentage of missing values superior 40%
class DropColumnsWithMissingData(BaseEstimator, TransformerMixin):
    
    def __init__(self, thresholds=0.40):
        self.thresholds = thresholds
    
    def fit(self, X, y=None):
        a = X.isnull().mean()
        self.kept_columns = a.index[a < self.thresholds].tolist()
        return self
    
    def transform(self, X):
        return X[self.kept_columns]

In [4]:
def factorize(train, test):
    for (train_name, train_series), (test_name, test_series) in zip(train.items(), test.items()):
        
        #LabelEncoder 
        if train_series.dtype == 'O': #check if object
            train[train_name], tmp_indexer = pd.factorize(train[train_name])
            # pd.factorize will return, in second position, the list of unique values (or categorical labels) in the provided 
            # column, in first position, the indices that would let you recreate the original column from the unique values
            
            # train[train_name] will be replaced by its index-based representation, 
            # tmp_indexer will contains the unique values in the original train[train_name].

            test[train_name] = tmp_indexer.get_indexer(test[train_name])
            #  get_indexer will return the indices where the values in test[test_name] are to be found in tmp_indexer
            #  the current test column is replaced by a list of indices in the exact same way the corresponding train column was in the line above.
    return train, test

def preprocess_data(train, test):
    train = DropColumnsWithMissingData(thresholds=0.40).fit_transform(train)

    columns_n = train.columns
    X_test = test[columns_n]
    
    train, test = factorize(train, test)

    return (train, test)

X_train, X_test = preprocess_data(X_train,X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [15]:
numerical_features = X_train.select_dtypes(exclude= ['object']).columns
categorical_features = X_train_n.select_dtypes(include=["object"]).columns


In [6]:
# select categorical features or numerical features 
class select_features(BaseEstimator, TransformerMixin):
    def __init__(self, features):
        self.features = features
         
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.features]

    
class FillMissingValues(BaseEstimator, TransformerMixin):
    
    def __init__(self, replace_value):
        self.replace_value = replace_value
        # replace_value = 'nan' for filling missing data in categorical features
        # or -999 in numerical features
       
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.fillna(self.replace_value)

In [7]:
preproc = make_union(
    make_pipeline (
        select_features(numerical_features),
        FillMissingValues(-999),
        StandardScaler()
    ),
    make_pipeline(
        select_features(cate)
    )
)

In [8]:
xgb = make_pipeline(
    preproc, 
    XGBClassifier(n_estimators=800)                  
)

In [9]:
X_train_n, X_test_n, y_train_n, y_test = train_test_split( X_train, y_train, test_size=0.3, random_state=0)

In [10]:
model1 = xgb.fit(X_train_n, y_train_n)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [11]:
y_pred = model1.predict_proba(X_test_n)

In [12]:
y_predict = model1.predict_proba(X_test)

In [13]:
score = log_loss(y_test, y_pred[:, 1])

In [14]:
score

0.46858837559110444

In [129]:
result = pd.DataFrame({"ID": test_id, "PredictedProb": y_predict[1]})

In [134]:
result.to_csv('predict_bnp_5.csv', index = False)

## ExtraTressClassifier

In [86]:
ext = make_pipeline(
    preproc , 
    ExtraTreesClassifier(n_estimators = 800),  
)   

In [87]:
model_ext = ext.fit(X_train_n, y_train_n)

  self._final_estimator.fit(Xt, y, **fit_params)


In [88]:
y_pred2 = model_ext.predict_proba(X_test_n)

In [89]:
score = log_loss(y_test, y_pred2)
score

0.47659111744476035

In [94]:
clf.fit(X_train, y_train) 
y_pred3 = clf.predict_proba(X_test)

  y = column_or_1d(y, warn=True)


In [95]:
score = log_loss(y_test, y_pred3)
score

0.47964501495561412

In [None]:
from sklearn.externals.joblib import dump, load
dump()

In [None]:
from sklearn.externals.joblib import dump, load

dump(xgb, 'fitted/xgb.pkl')