In [1]:
# Basics
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Sklearn
from sklearn.preprocessing import StandardScaler, Imputer 
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score, ShuffleSplit
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.metrics import make_scorer
# xgboost
import xgboost as xgb
from sklearn.grid_search import GridSearchCV
# Our custom modules
sys.path.append('..')
from anrg.pipeline import Pipeline # my customized Pipeline
from anrg.blending import BlendedRegressor
from anrg.cleaning import TargetThresholdFilter, LogPlusOne
from sklearn.ensemble import ExtraTreesRegressor
##### settings #######
# Raise pandas' column display limit so frames with up to this many columns
# are shown without truncation during visual inspection.
MAX_DISPLAY_COLUMNS = 500
pd.set_option('display.max_columns', MAX_DISPLAY_COLUMNS)
%matplotlib inline

In [2]:
# Load the training table; the path is resolved relative to the kernel's
# current working directory.
train_csv = '../data/train_10.csv'
trn = pd.read_csv(train_csv)

In [3]:
# Keep only the rows whose 'Ref' value is present, and report how long the
# filtering took.
t = time.time()  # tic
trn_withRef = trn[pd.notnull(trn['Ref'])]
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 0.1136 secs


In [4]:
# Collapse all rows that share an 'Id' into a single row holding summary
# statistics of every other column.
t = time.time()  # tic
agg_funcs = ['mean', 'std', 'median', 'count', 'min', 'max']
trn_withRef_comb = trn_withRef.groupby('Id').agg(agg_funcs)
# Aggregating with a list of functions yields (column, statistic) label pairs;
# flatten each pair into a single 'column_statistic' name.
trn_withRef_comb.columns = ['_'.join(pair) for pair in trn_withRef_comb.columns.values]
# Disabled: dropping the extra statistics of the target column.
#trn_withRef_comb = trn_withRef_comb.drop(['Expected_count', 'Expected_median', 'Expected_std', 'Expected_min','Expected_max'], axis =1)
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 7.1565 secs


In [5]:
# Feature matrix: a label-based column slice, so both end columns are included
# and the selection depends on the column order of the aggregated frame.
first_feature, last_feature = 'minutes_past_mean', 'Kdp_5x5_90th_max'
X = trn_withRef_comb.loc[:, first_feature:last_feature]
# Target: the per-Id mean of the 'Expected' column.
y = trn_withRef_comb.loc[:, 'Expected_mean']

In [6]:
# --- Pipeline stages -------------------------------------------------------
# Project-specific cleaning steps from anrg.cleaning.
# NOTE(review): presumably ttf filters samples by target value and lpo applies
# a log(1 + y) target transform -- confirm against anrg.cleaning.
ttf = TargetThresholdFilter(threshold=69)
lpo = LogPlusOne()

# Replace NaN entries with the mean of their column.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Optional standardization step, currently disabled:
#ss = StandardScaler(copy=False, with_mean=True, with_std=True)

# Final regressor; random_state is fixed so repeated runs use the same seed.
clf = ExtraTreesRegressor(
    n_estimators=100,
    max_depth=None,
    min_samples_split=1,
    random_state=0,
    n_jobs=5
)

# Chain the stages with the project's customized Pipeline wrapper.
# Variant including the scaler, currently disabled:
#pip = Pipeline([('ttf',ttf), ('lpo',lpo), ('imp',imp), ('ss',ss), ('clf',clf)])
pip = Pipeline([
    ('ttf', ttf),
    ('lpo', lpo),
    ('imp', imp),
    ('clf', clf)
])
def LogPlusOne_score(ground_truth, predictions, lpo=lpo):
    """Mean absolute error between targets and back-transformed predictions.

    Every prediction ``p`` is mapped to ``10**p - 1`` before it is compared
    with the ground truth.
    NOTE(review): this presumably undoes a base-10 log(1 + y) transform applied
    by LogPlusOne -- confirm the base against anrg.cleaning.

    Parameters
    ----------
    ground_truth : array-like
        Reference target values.
    predictions : array-like
        Model outputs, exponentiated with base 10 inside this function.
    lpo : object, optional
        Not used by the body; defaults to the module-level ``lpo`` instance.

    Returns
    -------
    numpy.float64
        The mean of the absolute differences (lower means a better fit).
    """
    restored = np.power(10, predictions) - 1
    abs_errors = np.abs(ground_truth - restored)
    return np.float64(np.mean(abs_errors))
# LogPlusOne_score returns a mean absolute error, i.e. a loss where lower is
# better. greater_is_better=False makes make_scorer sign-flip the value so that
# model selection, which maximizes the score, prefers the smallest error
# instead of the largest one.
scorer = make_scorer(LogPlusOne_score, greater_is_better=False)

In [None]:
###############################################################################
# Hyper-parameter search over the number of trees of the final regressor.
#
# Parameters of a pipeline step are addressed as '<step name>__<parameter>',
# so 'clf__n_estimators' targets n_estimators of the 'clf' step.
param_name, param_range = 'clf__n_estimators', np.arange(20, 400, 50)

# Build the grid from the names defined above and score every candidate with
# the custom scorer.
estimator = GridSearchCV(
    pip,
    {param_name: param_range},
    scoring=scorer,
    n_jobs=2,
    verbose=2
)

estimator.fit(X, y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
