In [1]:
# Basics
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Sklearn
from sklearn.preprocessing import StandardScaler, Imputer 
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.metrics import make_scorer
# xgboost
import xgboost as xgb
# Our custom modules
sys.path.append('..')
from anrg.pipeline import Pipeline # my customized Pipeline
from anrg.blending import BlendedRegressor
from anrg.cleaning import TargetThresholdFilter, LogPlusOne
# ---- Settings ----
# Raise pandas' column display cap to 500 so wide frames are shown in full for visual inspection.
pd.options.display.max_columns = 500
%matplotlib inline

In [2]:
# Read the training data from CSV.
# Alternative input files, kept commented out for reference:
# trn = pd.read_csv('../data/train_10.csv', index_col=0)  # column #0 in our file is DataFrame index
# trn = pd.read_csv('../data/train_1.csv', index_col=0)
train_csv = '../data/train.csv'
trn = pd.read_csv(train_csv)

In [3]:
# Keep only the rows whose 'Ref' value is present, and report how long the filter took.
t = time.time()  # tic
has_ref = trn['Ref'].notnull()
trn_withRef = trn[has_ref]
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 0.3516 secs


In [4]:
# Collapse all observations sharing an 'Id' into one row of summary statistics per column.
t = time.time()  # tic
agg_stats = ['mean', 'std', 'median', 'count', 'min', 'max']
trn_withRef_comb = trn_withRef.groupby('Id').agg(agg_stats)
# Flatten the (column, statistic) label pairs into single 'column_statistic' names.
flat_names = ['_'.join(pair) for pair in trn_withRef_comb.columns.values]
trn_withRef_comb.columns = flat_names
# Of the aggregates computed on 'Expected', only 'Expected_mean' is kept.
unused_target_cols = ['Expected_count', 'Expected_median', 'Expected_std', 'Expected_min', 'Expected_max']
trn_withRef_comb = trn_withRef_comb.drop(unused_target_cols, axis=1)
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 6.9771 secs


In [5]:
# Features: the aggregated columns selected by a label range slice on the column axis.
first_feature, last_feature = 'minutes_past_mean', 'Kdp_5x5_90th_max'
X = trn_withRef_comb.loc[:, first_feature:last_feature]
# Target: the per-Id mean of 'Expected'.
y = trn_withRef_comb['Expected_mean']
print('%s %s' % (X.shape, y.shape))

(731556, 132) (731556,)


In [None]:
# --- Preprocessing steps ---
ttf = TargetThresholdFilter(threshold=45)
lpo = LogPlusOne()
# Imputer configured for median filling, operating in place (copy=False).
imp = Imputer(strategy='median', copy=False)
ss = StandardScaler(copy=False, with_mean=True, with_std=True)

# --- Base regressors fed to the blender ---
# NOTE: n_jobs=-1 will use all of your cores; set to a preferred number, e.g. 4.
base1 = RandomForestRegressor(
    n_estimators=500,
    max_features=0.2,
    max_depth=25,
    n_jobs=7
)
base2 = xgb.sklearn.XGBRegressor(n_estimators=500, nthread=6)
base3 = LinearRegression()
base4 = Ridge(alpha=2.0)
base5 = Lasso(alpha=0.01)

# --- Blending stage ---
# NOTE(review): the saved outputs further down show blending_split=10, so they appear to come
# from a different configuration than the blending_split=0.1 set here -- confirm before comparing.
blender = LinearRegression()
clf = BlendedRegressor(
    base_models=(base1, base2, base3, base4, base5),
    blending_model=blender,
    blending_split=0.1
)

# Chain every step into one estimator using the project's customized Pipeline.
pip = Pipeline([
    ('ttf', ttf),
    ('lpo', lpo),
    ('imp', imp),
    ('ss', ss),
    ('clf', clf)
])
def LogPlusOne_score(ground_truth, predictions, lpo=lpo):
    """Mean absolute error between `ground_truth` and back-transformed `predictions`.

    `predictions` are mapped through ``10**p - 1`` before being compared with
    `ground_truth` (presumably the inverse of the LogPlusOne step -- confirm
    against anrg.cleaning). The `lpo` argument is accepted but not used in
    the body. The result is returned as ``np.float64``.
    """
    restored = np.power(10, predictions) - 1
    abs_errors = np.abs(ground_truth - restored)
    return np.float64(np.mean(abs_errors))


# Wrap the metric as a scorer object, flagged as lower-is-better (greater_is_better=False).
scorer = make_scorer(LogPlusOne_score, greater_is_better=False)

# Cross-validation (CV)

In [None]:
# Cross-validate the whole pipeline with cv=10, in a single job, with verbose progress output.
scores = cross_val_score(
    estimator=pip,
    X=X,
    y=y,
    scoring=scorer,
    cv=10,
    n_jobs=1,
    verbose=2
)

In [13]:
# Configuration snapshot kept for reference (commented out, one statement per line):
# ttf = TargetThresholdFilter(threshold=45)
# base1 = RandomForestRegressor(n_estimators=500, max_features=0.2, max_depth=25, n_jobs=7)  # NOTE: n_jobs=-1 will use all of your cores, set to a preferred number e.g. 4
# base2 = xgb.sklearn.XGBRegressor(n_estimators=500, nthread=6)
# base3 = LinearRegression()
# base4 = Ridge(alpha=2.0)
# base5 = Lasso(alpha=0.01)
# blender = LinearRegression()
# clf = BlendedRegressor(base_models=(base1, base2, base3, base4, base5), blending_model=blender, blending_split=10)
# pip = Pipeline([('ttf',ttf), ('lpo',lpo), ('imp',imp), ('ss',ss), ('clf',clf)])  # a Pipeline wrapper to chain'em up

# Summarise the cross-validation scores (mean, then standard deviation) and list the pipeline steps.
cv_mean = np.mean(scores)
cv_std = np.std(scores)
print('%s %s' % (cv_mean, cv_std))
print(pip.get_params()['steps'])

23.0912481243 1.58129656429
[('ttf', TargetThresholdFilter(threshold=45)), ('lpo', LogPlusOne()), ('imp', Imputer(axis=0, copy=False, missing_values='NaN', strategy='median',
    verbose=0)), ('ss', StandardScaler(copy=False, with_mean=True, with_std=True)), ('clf', BlendedRegressor(base_models=(RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
           max_features=0.2, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=7, oob_score=False, random_state=None,
           ...ve=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)),
         blending_model=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
         blending_split=10))]
BlendedRegressor(base_models=(RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
           max_features=0.2, max_leaf_nodes=None, min_samples_leaf=1,
          

In [None]:
# Configuration snapshot kept for reference (commented out, one statement per line):
# ttf = TargetThresholdFilter(threshold=45)
# base1 = RandomForestRegressor(n_estimators=500, max_features=0.2, max_depth=25, n_jobs=7)  # NOTE: n_jobs=-1 will use all of your cores, set to a preferred number e.g. 4
# base2 = xgb.sklearn.XGBRegressor(n_estimators=500, nthread=6)
# base3 = LinearRegression()
# base4 = Ridge(alpha=2.0)
# base5 = Lasso(alpha=0.01)
# blender = LinearRegression()
# clf = BlendedRegressor(base_models=(base1, base2, base3, base4, base5), blending_model=blender, blending_split=0.1)
# pip = Pipeline([('ttf',ttf), ('lpo',lpo), ('imp',imp), ('ss',ss), ('clf',clf)])  # a Pipeline wrapper to chain'em up

# Summarise the cross-validation scores (mean, then standard deviation) and list the pipeline steps.
cv_mean = np.mean(scores)
cv_std = np.std(scores)
print('%s %s' % (cv_mean, cv_std))
print(pip.get_params()['steps'])

# Generate submission

In [14]:
# Fit the complete pipeline on all of X and y; the value returned by fit() is the cell's displayed output.
pip.fit(X, y)

Pipeline(steps=[('ttf', TargetThresholdFilter(threshold=45)), ('lpo', LogPlusOne()), ('imp', Imputer(axis=0, copy=False, missing_values='NaN', strategy='median',
    verbose=0)), ('ss', StandardScaler(copy=False, with_mean=True, with_std=True)), ('clf', BlendedRegressor(base_models=(RandomForestRegressor(boo...egression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
         blending_split=10))])

In [15]:
# Prepare the test set with the same steps applied to the training data:
# keep rows with a 'Ref' value, aggregate per 'Id', flatten the column labels.
test = pd.read_csv('../data/test.csv')
test_has_ref = test['Ref'].notnull()
test_withRef = test[test_has_ref]
test_withRef_comb = test_withRef.groupby('Id').agg(['mean', 'std', 'median', 'count', 'min', 'max'])
test_withRef_comb.columns = ['_'.join(pair) for pair in test_withRef_comb.columns.values]
test_X = test_withRef_comb.loc[:, 'minutes_past_mean':'Kdp_5x5_90th_max']  # NOTE: range slicing on columns
# Map the pipeline's output back with 10**p - 1 (the same back-transform the CV metric applies).
test_y_raw = pip.predict(X=test_X)
test_y_predict = 10 ** test_y_raw - 1

In [16]:
# Predictions for the Ids that had Ref readings
test_result_withRef = pd.DataFrame({'Id': test_withRef_comb.index})
test_result_withRef['Expected'] = test_y_predict
# One row for every Id present in the test file
test_result = pd.DataFrame({'Id': test['Id'].unique()})
# Left-merge the predictions onto the full Id list; Ids without a prediction get -1
test_result = pd.merge(test_result, test_result_withRef, how='left', on=['Id'], sort=True)
test_result['Expected'] = test_result['Expected'].fillna(-1)
# Write a timestamped result file and echo its path
datetime_str = time.strftime('%Y%m%d_%H%M%S')
result_path = '../data/result_' + datetime_str + '.csv'
test_result.to_csv(result_path, index=False)
print(result_path)

../data/result_20151125_092807.csv
