In [1]:
import time
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.metrics import make_scorer
#
import xgboost as xgb
#
from blending import BlendedRegressor
from cleaning import TargetThresholdFilter
#
pd.set_option('display.max_columns', 500)  # force pandas to display all columns for better visual inspection
# plot plots inline
%matplotlib inline  

In [2]:
trn = pd.read_csv('../data/train.csv')
# trn = pd.read_csv('../data/train_10.csv', index_col=0)  # column #0 in our file is DataFrame index
# trn = pd.read_csv('../data/train_1.csv', index_col=0)
# test = pd.read_csv('../data/test.csv')

In [3]:
t = time.time()
trn_withRef = trn[trn['Ref'].notnull()]
# test_withRef = test[test['Ref'].notnull()]
del trn
print 'Time elapsed: {:.4f} secs'.format(time.time()-t)  # toc

Time elapsed: 1.4094 secs


In [4]:
# Combine observations by 'Id', aggregate features
t = time.time()
trn_withRef_comb = trn_withRef.groupby('Id').agg(['mean','std','median','count','min', 'max'])
trn_withRef_comb.columns = ['_'.join(tup) for (i,tup) in enumerate(trn_withRef_comb.columns.values)]
trn_withRef_comb = trn_withRef_comb.drop(['Expected_count', 'Expected_median', 'Expected_std', 'Expected_min','Expected_max'], axis =1)
# test_withRef_comb = test_withRef.groupby('Id').agg(['mean','std','median','count','min', 'max'])
# test_withRef_comb.columns = ['_'.join(tup) for (i,tup) in enumerate(test_withRef_comb.columns.values)]
# test_withRef_comb = test_withRef_comb.drop(['Expected_count', 'Expected_median', 'Expected_std', 'Expected_min','Expected_max'], axis =1)
del trn_withRef
print 'Time elapsed: {:.4f} secs'.format(time.time()-t)  # toc

Time elapsed: 18.4084 secs


# Impute after cv split

In [5]:
X = trn_withRef_comb.loc[:, 'minutes_past_mean':'Kdp_5x5_90th_max']  # NOTE: used range slicing on column
y = np.log10(1+trn_withRef_comb['Expected_mean'])
def MAE_logy(ground_truth, predictions):
    """Custom scoring function for log(y) or log(1+y)
       NOTE: please change this if you use another non-linearity on y
    """
    return np.float64(np.mean(np.abs(10**ground_truth - 10**predictions)))

In [12]:
# preprocessing
imp1 = Imputer(strategy='mean', copy=True)  # Get a imputor with column-mean filling config
ss1 = StandardScaler(copy=False, with_mean=True, with_std=True)
clf1 = RandomForestRegressor(n_estimators=40, max_features='sqrt', max_depth=5, n_jobs=4)
pip1 = Pipeline([('imp',imp1), ('ss',ss1), ('clf', clf1)])  # a Pipeline wrapper to chain'em up

In [13]:
scorer = make_scorer(MAE_logy, greater_is_better=True)  # define scoring metric
scores = cross_val_score(estimator=pip1, X=X, y=y, scoring=scorer, cv=10, n_jobs=1, verbose=1)
print np.mean(scores), np.std(scores)

23.2683538325 1.58087644069


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  4.7min finished


# Impute before cv split

In [14]:
imp2 = Imputer(strategy='mean', copy=True)
X_imp = imp2.fit_transform(X)
ss2 = StandardScaler(copy=False, with_mean=True, with_std=True)
clf2 = RandomForestRegressor(n_estimators=40, max_features='sqrt', max_depth=5, n_jobs=4)
pip2 = Pipeline([('ss',ss2), ('clf', clf2)])  # a Pipeline wrapper to chain'em up

In [15]:
# preprocessing
scorer = make_scorer(MAE_logy, greater_is_better=True)  # define scoring metric
scores = cross_val_score(estimator=pip2, X=X_imp, y=y, scoring=scorer, cv=10, n_jobs=1, verbose=1)
print np.mean(scores), np.std(scores)

23.268932147 1.58039111499


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  5.1min finished


# Impute after cv split + median

In [6]:
# preprocessing
imp3 = Imputer(strategy='median')  # Get a imputor with column-mean filling config
ss3 = StandardScaler(copy=False, with_mean=True, with_std=True)
clf3 = RandomForestRegressor(n_estimators=40, max_features='sqrt', max_depth=5, n_jobs=4)
pip3 = Pipeline([('imp',imp3), ('ss',ss3), ('clf', clf3)])  # a Pipeline wrapper to chain'em up

In [7]:
scorer = make_scorer(MAE_logy, greater_is_better=True)  # define scoring metric
scores = cross_val_score(estimator=pip3, X=X, y=y, scoring=scorer, cv=10, n_jobs=1, verbose=1)
print np.mean(scores), np.std(scores)

23.2674629028 1.57987458739


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  7.8min finished
