In [1]:
import time
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.metrics import make_scorer
#
import xgboost as xgb
#
from blending import BlendedRegressor
from cleaning import TargetThresholdFilter, LogPlusOne
#
# Let pandas show up to 500 columns when displaying a DataFrame, so wide frames are not truncated.
pd.options.display.max_columns = 500
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline  

In [8]:
# Load the training data from train.csv.
trn = pd.read_csv(filepath_or_buffer='../data/train.csv')
# Alternative inputs kept for reference (presumably smaller subsets of the training data -- TODO confirm):
# trn = pd.read_csv('../data/train_10.csv', index_col=0)  # column #0 in our file is DataFrame index
# trn = pd.read_csv('../data/train_1.csv', index_col=0)

In [9]:
# Keep only the rows whose 'Ref' value is present, and report how long the filtering took.
t = time.time()  # tic
trn_withRef = trn.loc[trn['Ref'].notnull()]
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 0.3542 secs


In [10]:
# Collapse all rows that share an 'Id' into a single row holding six summary
# statistics (mean, std, median, count, min, max) of every column.
t = time.time()  # tic
trn_withRef_comb = trn_withRef.groupby('Id').agg(['mean', 'std', 'median', 'count', 'min', 'max'])
# Each aggregated column is labelled by a (column, statistic) tuple; flatten the
# labels to '<column>_<statistic>' so they can be addressed by a plain string.
trn_withRef_comb.columns = ['_'.join(tup) for tup in trn_withRef_comb.columns.values]
# Of the six 'Expected' statistics only 'Expected_mean' is kept; the other five are dropped.
trn_withRef_comb = trn_withRef_comb.drop(
    ['Expected_count', 'Expected_median', 'Expected_std', 'Expected_min', 'Expected_max'],
    axis=1)
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 7.0609 secs


In [11]:
# Features: every aggregated column from 'minutes_past_mean' through 'Kdp_5x5_90th_max'
# (a label-based column slice, so it depends on the column order).
X = trn_withRef_comb.loc[:, slice('minutes_past_mean', 'Kdp_5x5_90th_max')]
# Target: the 'Expected_mean' column.
y = trn_withRef_comb.loc[:, 'Expected_mean']

In [12]:
# Individual pipeline stages, created in the order in which they are chained below.
ttf = TargetThresholdFilter(threshold=np.inf)  # project-local step from the `cleaning` module
lpo = LogPlusOne()  # project-local step from the `cleaning` module
imp = Imputer(copy=False, strategy='median')  # imputer configured for median filling, without copying its input
ss = StandardScaler(with_mean=True, with_std=True, copy=False)  # centre and scale, without copying its input
clf = xgb.sklearn.XGBRegressor(learning_rate=0.3, max_depth=5, n_estimators=500, nthread=5)
# Chain the stages; the step names become the prefixes of the pipeline's parameter keys.
pip = Pipeline(steps=[
    ('ttf', ttf),
    ('lpo', lpo),
    ('imp', imp),
    ('ss', ss),
    ('clf', clf),
])
def LogPlusOne_score(ground_truth, predictions, lpo=lpo):
    """Mean absolute error after mapping the predictions through ``10**p - 1``.

    Parameters
    ----------
    ground_truth : array-like
        Reference target values, compared as given (no transformation applied).
    predictions : array-like
        Model outputs; each value ``p`` is converted to ``10**p - 1`` before
        being compared with ``ground_truth``.
    lpo : object, optional
        Accepted for interface compatibility; not used in the computation.

    Returns
    -------
    numpy.float64
        Mean of the absolute differences.
    """
    back_transformed = np.power(10, predictions) - 1
    absolute_errors = np.abs(ground_truth - back_transformed)
    return np.float64(np.mean(absolute_errors))
# Wrap the metric as a scikit-learn scorer object.
# NOTE(review): LogPlusOne_score is an error (lower is better), yet greater_is_better=True
# declares that larger values are better. The error is therefore reported with its natural
# positive sign, which is convenient for printing, but this scorer would favour the worst
# model if it were ever used for model selection -- confirm before reusing it that way.
scorer = make_scorer(score_func=LogPlusOne_score, greater_is_better=True)

# Cross-validation (CV)

In [13]:
# Cross-validate the whole pipeline with cv=10, scoring each split with `scorer`.
# Runs in a single process (n_jobs=1); verbose=1 makes scikit-learn print progress.
scores = cross_val_score(
    estimator=pip,
    X=X,
    y=y,
    scoring=scorer,
    cv=10,
    n_jobs=1,
    verbose=1,
)

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 71.8min finished


In [7]:
# Record of a pipeline configuration kept for reference (presumably the one
# that produced the scores reported by this cell -- TODO confirm):
# Pipeline(steps=[
# ('imp', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)),
# ('ss', StandardScaler(copy=False, with_mean=True, with_std=True)),
# ('clf', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
#       learning_rate=0.1, max_delta_step=0, max_depth=3,
#       min_child_weight=1, missing=None, n_estimators=500, nthread=5,
#       objective='reg:linear', reg_alpha=0, reg_lambda=1,
#       scale_pos_weight=1, seed=0, silent=True, subsample=1))])
# Mean and standard deviation of the cross-validation scores, separated by one space.
print('%s %s' % (np.mean(scores), np.std(scores)))

23.1870628005 1.5801378357


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 46.5min finished


In [19]:
# Mean and standard deviation of the cross-validation scores, separated by one space.
print('%s %s' % (np.mean(scores), np.std(scores)))
# Last expression of the cell, so the notebook displays the pipeline's parameter dictionary.
pip.get_params()

23.2120059037 1.55426425433


{'clf': XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
        learning_rate=0.3, max_delta_step=0, max_depth=5,
        min_child_weight=1, missing=None, n_estimators=500, nthread=5,
        objective='reg:linear', reg_alpha=0, reg_lambda=1,
        scale_pos_weight=1, seed=0, silent=True, subsample=1),
 'clf__base_score': 0.5,
 'clf__colsample_bylevel': 1,
 'clf__colsample_bytree': 1,
 'clf__gamma': 0,
 'clf__learning_rate': 0.3,
 'clf__max_delta_step': 0,
 'clf__max_depth': 5,
 'clf__min_child_weight': 1,
 'clf__missing': None,
 'clf__n_estimators': 500,
 'clf__nthread': 5,
 'clf__objective': 'reg:linear',
 'clf__reg_alpha': 0,
 'clf__reg_lambda': 1,
 'clf__scale_pos_weight': 1,
 'clf__seed': 0,
 'clf__silent': True,
 'clf__subsample': 1,
 'imp': Imputer(axis=0, copy=False, missing_values='NaN', strategy='median',
     verbose=0),
 'imp__axis': 0,
 'imp__copy': False,
 'imp__missing_values': 'NaN',
 'imp__strategy': 'median',
 'imp__verbose': 0,
 'lpo

# Generate submission

In [15]:
# Fit the pipeline on the complete feature matrix and target.
# As the cell's last expression, the value returned by fit() is what the notebook displays.
pip.fit(X=X, y=y)

Pipeline(steps=[('ttf', TargetThresholdFilter(threshold=inf)), ('lpo', LogPlusOne()), ('imp', Imputer(axis=0, copy=False, missing_values='NaN', strategy='median',
    verbose=0)), ('ss', StandardScaler(copy=False, with_mean=True, with_std=True)), ('clf', XGBRegressor(base_score=0.5, colsample_bylevel=1, cols...g:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

In [16]:
# Load the test data and prepare it the same way as the training data:
# keep rows with a 'Ref' value, then aggregate per 'Id'.
test = pd.read_csv('../data/test.csv')
test_withRef = test[test['Ref'].notnull()]
test_withRef_comb = test_withRef.groupby('Id').agg(['mean', 'std', 'median', 'count', 'min', 'max'])
# Each aggregated column is labelled by a (column, statistic) tuple; flatten the
# labels to '<column>_<statistic>' so they can be addressed by a plain string.
test_withRef_comb.columns = ['_'.join(tup) for tup in test_withRef_comb.columns.values]
# Same label-based feature range as used for the training matrix (depends on column order).
test_X = test_withRef_comb.loc[:, 'minutes_past_mean':'Kdp_5x5_90th_max']  # NOTE: used range slicing on column
# Map the pipeline's outputs through 10**p - 1, the same mapping LogPlusOne_score applies to predictions.
test_y_predict = 10**pip.predict(X=test_X) - 1

In [17]:
# Predictions are available only for the Ids present in the aggregated 'Ref' frame.
test_result_withRef = pd.DataFrame(
    {'Id': test_withRef_comb.index, 'Expected': test_y_predict},
    columns=['Id', 'Expected'],
)
# One row for every distinct Id in the test file.
test_result = pd.DataFrame({'Id': test['Id'].unique()})
# Left-join the predictions onto all Ids (sorted by Id); Ids without a prediction get -1.
test_result = test_result.merge(test_result_withRef, how='left', on=['Id'], sort=True)
test_result['Expected'] = test_result['Expected'].fillna(-1)
# Write the result to a time-stamped CSV file and print its path.
datetime_str = time.strftime('%Y%m%d_%H%M%S')
result_path = '../data/result_' + datetime_str + '.csv'
test_result.to_csv(result_path, index=False)
print(result_path)

../data/result_20151122_102648.csv
