In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns 
from scipy import stats
from scipy.stats import norm
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler
sns.set(style = "whitegrid", color_codes = True)
sns.set(font_scale = 1)
#from astropy.table import Table, Column
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.cross_validation import StratifiedKFold, cross_val_score, KFold
from sklearn.naive_bayes import BernoulliNB
#import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn import clone

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.metrics import log_loss, roc_auc_score, roc_curve, mean_squared_error
import pickle
import random
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 59.6 ms


In [16]:
# Drop all columns which have percentage of missing values superior threshold
class DropColumnsWithMissingData(BaseEstimator, TransformerMixin):
    """
    drop all columns which have percentage of missing values superior threshold
    """
    def __init__(self, thresholds=0.40):
        self.thresholds = thresholds
    
    def fit(self, X, y=None):
        a = X.isnull().mean()
        self.kept_columns = a.index[a < self.thresholds].tolist()
        return self
    
    def transform(self, X):
        return X[self.kept_columns]

time: 9.56 ms


In [17]:
class select_features(BaseEstimator, TransformerMixin):
    """
    Select categorical features or numerical features 
    """
    def __init__(self, features):
        self.features = features
         
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.features]

    
class FillMissingValues(BaseEstimator, TransformerMixin):
    """
    Fill missing values 
    'nan' for categorical features
    or -999 for numerical features
    """
    
    def __init__(self, replace_value):
        self.replace_value = replace_value
        # replace_value = 'nan' for filling missing data in categorical features
        # or -999 in numerical features
       
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.fillna(self.replace_value)
    
    
class ColumnApplier(BaseEstimator, TransformerMixin):
    """
    Some sklearn transformers can apply only on ONE column at a time (such as LabelEnconder())
    Wrap them with ColumnApplier to apply on all columns in the dataset
    """

    def __init__(self, underlying):
        self.underlying = underlying
        #TODO: underlying is one model method

    def fit(self, X, y=None):
        m = {}
        X = pd.DataFrame(X)  # TODO: :( reimplement in pure numpy?
        for c in X.columns:
            k = clone(self.underlying) 
            #TODO: clone helps to construct a new estimator with the same parameters.
            #      deep copy of the model in an estimator without actually copying attached data
            
            k.fit(X[c])
            # fit model k for every column in X 
            
            m[c] = k
            # put it in dictionary with column c as key and k as items
        
        self._column_stages = m
        # self.column_stages is a dictionary with column c in X as key and model k.fit as items 
        return self

    def transform(self, X):
        ret = {}
        X = pd.DataFrame(X)
        for c, k in self._column_stages.items():
            ret[c] = k.transform(X[c])
            # ret is a dict which has c as key and k.transform as items
        return pd.DataFrame(ret)[X.columns]  # keep the same order

class TolerantLabelEncoder(LabelEncoder):
    """
    LabelEncoder is not tolerant to unseen values
    """
    def transform(self, y):
        return np.searchsorted(self.classes_, y)

time: 109 ms


## Import data

In [160]:
#Import training data
train = pd.read_csv("data/train.csv")
#Import test data
test = pd.read_csv("data/test.csv")

time: 36.3 ms


In [161]:
train.drop(['Id'], axis= 1, inplace= True)
test.drop(['Id'], axis= 1 , inplace= True)

time: 2.69 ms


In [163]:
Xtrain = train.ix[:, train.columns != 'SalePrice']
ytrain = train.SalePrice

time: 3.26 ms


## Encoding pipeline

In [164]:
X_train = DropColumnsWithMissingData(thresholds=0.90).fit_transform(Xtrain)

CAT = X_train.select_dtypes(include=["object"]).columns
NUM = X_train.select_dtypes(exclude=["object"]).columns

time: 13.9 ms


In [166]:
# Label Encoding
preproc_le = make_pipeline (
    DropColumnsWithMissingData(thresholds=0.90),
    make_union(make_pipeline(
        select_features(CAT),
        FillMissingValues('nan'),
        ColumnApplier(TolerantLabelEncoder())
    ),
    make_pipeline(
        select_features(NUM),
        FillMissingValues(-999),
        StandardScaler()
    )
  )
)

time: 4.39 ms


In [167]:
### One-hot Encoding
preproc_ohe = make_pipeline (
    DropColumnsWithMissingData(thresholds=0.9),
    make_union(
    make_pipeline(
        select_features(CAT),
        FillMissingValues('nan'),
        ColumnApplier(TolerantLabelEncoder()),
        OneHotEncoder(handle_unknown = 'ignore')
    ),
    make_pipeline(
        select_features(NUM),
        FillMissingValues(-999),
        StandardScaler()        
    )
  )
)

time: 5.25 ms


## Split training data set into 2 part: train et test

In [168]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xtrain, ytrain, test_size = 0.2)

time: 3.87 ms


In [169]:
Xtest.shape

(292, 79)

time: 2.32 ms


## XGBoost with random search or GridSearch

#### Predict results of Xtest

In [170]:
xgb = make_pipeline(
            preproc_ohe, 
            GridSearchCV(
                XGBRegressor(),
                param_grid = {'n_estimators' : [30, 100, 300, 800],
                                        'max_depth' : [ 3, 5, 7] },
                cv = 5,
                verbose = 1,
                error_score = "neg_mean_squared_error"
                )
            )

time: 3.24 ms


In [171]:
xgb.fit(Xtrain, Ytrain)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.0min finished


Pipeline(steps=[('pipeline', Pipeline(steps=[('dropcolumnswithmissingdata', DropColumnsWithMissingData(thresholds=0.9)), ('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('select_features', select_features(features=Index(['MSZoning', 'Street', 'LotShape', 'LandC...     pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1))])

time: 1min 4s


In [172]:
xgb.score(Xtest, Ytest)

0.90933699412817404

time: 47.2 ms


In [174]:
np.sqrt(mean_squared_error(Ytest, xgb.predict(Xtest)))

22273.077012616151

time: 47.4 ms


### Backtest model