In [3]:
import time
#
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.metrics import make_scorer
#
import xgboost as xgb
#
from blending import BlendedRegressor
from cleaning import TargetThresholdFilter, LogPlusOne
#
pd.set_option('display.max_columns', 500)  # let pandas show up to 500 columns so wide frames are not truncated during visual inspection
# render matplotlib figures inline in the notebook output
%matplotlib inline  

In [4]:
# Load the training observations into `trn`.
# Alternative inputs, kept for reference (presumably the full set and a different sample -- confirm):
#   trn = pd.read_csv('../data/train.csv')
#   trn = pd.read_csv('../data/train_10.csv', index_col=0)  # column #0 in that file is the DataFrame index
trn = pd.read_csv('../data/train_1.csv', index_col=0)
# The test set is currently not loaded:
#   test = pd.read_csv('../data/test.csv')

In [5]:
# Keep only the observations whose 'Ref' value is present, and time the step.
t = time.time()  # tic
trn_withRef = trn.loc[trn['Ref'].notnull()]
# Same filter for the test set (disabled while `test` is not loaded):
#   test_withRef = test.loc[test['Ref'].notnull()]
del trn  # drop the name of the unfiltered frame
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 0.0175 secs


In [6]:
# Collapse all observations that share an 'Id' into a single row of summary
# statistics (one column per feature/statistic pair), and time the step.
t = time.time()  # tic
trn_withRef_comb = trn_withRef.groupby('Id').agg(['mean', 'std', 'median', 'count', 'min', 'max'])
# Flatten the (feature, statistic) column tuples into 'feature_statistic' names.
trn_withRef_comb.columns = ['_'.join(name_parts) for name_parts in trn_withRef_comb.columns.values]
# Among the aggregated 'Expected' columns every statistic except the mean is dropped.
trn_withRef_comb = trn_withRef_comb.drop(
    ['Expected_' + stat for stat in ('count', 'median', 'std', 'min', 'max')],
    axis=1,
)
# The same three steps for the test set (disabled while `test_withRef` is not built):
#   test_withRef_comb = test_withRef.groupby('Id').agg(['mean','std','median','count','min', 'max'])
#   test_withRef_comb.columns = ['_'.join(name_parts) for name_parts in test_withRef_comb.columns.values]
#   test_withRef_comb = test_withRef_comb.drop(['Expected_count', 'Expected_median', 'Expected_std', 'Expected_min','Expected_max'], axis =1)
del trn_withRef  # drop the name of the un-aggregated frame
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 0.4887 secs


In [7]:
# Targets: y2 is the per-Id mean of 'Expected'; y1 is the same quantity as log10(1 + y).
y2 = trn_withRef_comb['Expected_mean']
y1 = np.log10(1 + y2)
# Features: label-based column range slice, so both end columns are included.
X = trn_withRef_comb.loc[:, 'minutes_past_mean':'Kdp_5x5_90th_max']

In [8]:
# Preprocessing steps and the regressor used by both pipelines.
lpo = LogPlusOne()
imp = Imputer(strategy='median')  # fill missing values with the median (not the mean)
ss = StandardScaler(copy=False, with_mean=True, with_std=True)  # centre and scale, in place
clf = RandomForestRegressor(
    n_estimators=10,
    max_features='sqrt',
    max_depth=5,
    n_jobs=4,
    random_state=0,
)
# pip1: impute -> scale -> forest.  pip2: the same chain with the LogPlusOne step in front.
common_steps = [('imp', imp), ('ss', ss), ('clf', clf)]
pip1 = Pipeline(list(common_steps))
pip2 = Pipeline([('lpo', lpo)] + common_steps)

In [9]:
def MAE_logy(ground_truth, predictions):
    """Mean absolute error measured after undoing a base-10 log on both inputs.

    Both arguments are mapped through 10**x and the absolute differences are
    averaged. For a log10(1+y) target the "+1" offsets cancel in the
    difference, so the same function serves log10(y) and log10(1+y).

    NOTE: adapt this function if a different non-linearity is applied to y.

    Returns the error as a numpy float64.
    """
    truth_linear, pred_linear = (
        np.power(10, values) for values in (ground_truth, predictions)
    )
    abs_errors = np.abs(truth_linear - pred_linear)
    return np.float64(np.mean(abs_errors))
def LogPlusOne_score(ground_truth, predictions, lpo=None):
    """Mean absolute error between raw targets and log10(1+y) predictions.

    The predictions are mapped back to the original scale with 10**p - 1
    before the absolute differences to `ground_truth` are averaged.

    Parameters
    ----------
    ground_truth : array-like
        Targets on the original (untransformed) scale.
    predictions : array-like
        Predictions on the log10(1+y) scale.
    lpo : object, optional
        Unused. It is kept only so existing three-argument calls keep
        working; it is optional because scikit-learn's `make_scorer` calls
        the score function as `score_func(y_true, y_pred)` unless extra
        keyword arguments were registered, which made the previously
        required third argument raise a TypeError.

    Returns
    -------
    numpy.float64
        The mean absolute error on the original scale.
    """
    predictions_linear = np.power(10, predictions) - 1
    return np.float64(np.mean(np.abs(ground_truth - predictions_linear)))

In [10]:
# MAE is an error, i.e. lower is better, so the scorer has to be registered with
# greater_is_better=False; otherwise any model selection driven by this scorer
# would prefer the model with the LARGEST error. scikit-learn then reports the
# negated error, which is flipped back here so `scores` still holds plain MAE values.
scorer = make_scorer(MAE_logy, greater_is_better=False)  # define scoring metric
scores = -cross_val_score(estimator=pip1, X=X, y=y1, scoring=scorer, cv=10, n_jobs=2, verbose=1)
# mean and spread of the 10 per-fold MAE values
print('{} {}'.format(np.mean(scores), np.std(scores)))

17.3314446451 2.44936404358


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:   11.1s finished


In [11]:
# LogPlusOne_score is an error (lower is better), so it is registered with
# greater_is_better=False and the negated values returned by scikit-learn are
# flipped back so `scores` holds plain MAE values.
# `lpo` is forwarded as a keyword argument: make_scorer otherwise calls the score
# function with only (y_true, y_pred), which does not satisfy a three-parameter
# LogPlusOne_score(ground_truth, predictions, lpo).
# NOTE(review): pip2 is fitted on the raw target y2 while LogPlusOne_score treats the
# predictions as log10(1+y) values -- confirm that LogPlusOne really transforms the target.
scorer = make_scorer(LogPlusOne_score, greater_is_better=False, lpo=lpo)  # define scoring metric
scores = -cross_val_score(estimator=pip2, X=X, y=y2, scoring=scorer, cv=10, n_jobs=2, verbose=1)
# mean and spread of the 10 per-fold MAE values
print('{} {}'.format(np.mean(scores), np.std(scores)))

17.3314446451 2.44936404358
aha
aha
ahaaha

ahaaha

ahaaha

ahaaha



[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:   11.1s finished


In [14]:
# Sanity check on a toy example: MAE_logy on log10(1+y) inputs and lpo.metric on
# (raw truth, log10(1+y) prediction) are printed side by side for comparison.
# Dedicated names are used so the training targets `y1` / `y2` are not overwritten;
# rebinding them here would break the cross-validation cells on any later re-run.
y_true_demo = pd.Series(np.array([1, 2, 3, 4]))
y_pred_demo = pd.Series(np.array([1.1, 1.9, 3.2, 3.9]))
print(MAE_logy(np.log10(1 + y_true_demo), np.log10(1 + y_pred_demo)))
print(lpo.metric(y_true_demo, np.log10(1 + y_pred_demo)))

0.125
0.125
