In [3]:
# Basics
import sys
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Sklearn
from sklearn.preprocessing import StandardScaler, Imputer 
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score, ShuffleSplit, train_test_split, KFold
from sklearn.learning_curve import learning_curve, validation_curve
from sklearn.metrics import make_scorer, confusion_matrix
# xgboost
import xgboost as xgb
# Our custom modules
sys.path.append('..')
from anrg.pipeline import Pipeline, SelectTailK, LeaveTailK, SelectK2Last, DummyRegressor
from anrg.blending import BlendedRegressor
from anrg.cleaning import TargetThresholdFilter, LogPlusOne
from anrg.classified_regression import ClassifiedRegressor
##### settings #######
# Raise pandas' column display cap so the wide aggregated frames are not truncated on inspection.
pd.set_option('display.max_columns', 500)  # shows up to 500 columns (a cap, not unlimited)
%matplotlib inline

In [46]:
# Load the full training set; rows are later grouped by their 'Id' column.
trn = pd.read_csv('../data/train.csv')
# Alternative inputs kept for quick experiments (presumably smaller subsets of the
# training data -- confirm). In those files column #0 holds the saved DataFrame index.
# trn = pd.read_csv('../data/train_10.csv', index_col=0)
# trn = pd.read_csv('../data/train_1.csv', index_col=0)

In [47]:
# Collapse every group of observations sharing an 'Id' into one row of summary statistics
t = time.time()  # tic
trn_comb = trn.groupby('Id').agg(['mean', 'std', 'median', 'count', 'min', 'max'])
# Flatten the (column, statistic) column tuples into '<column>_<statistic>' names
trn_comb.columns = ['_'.join(pair) for pair in trn_comb.columns.values]
# Ignore Ids where all Ref values are NaN (their Ref_mean is null)
trn_withRef_comb = trn_comb[trn_comb['Ref_mean'].notnull()]
# Garbage collection: drop the raw and the unfiltered frames
del trn, trn_comb
# Timing
print('Time elapsed: {:.4f} secs'.format(time.time() - t))  # toc

Time elapsed: 11.9227 secs


In [48]:
# Add four deterministic estimators (MP, KDP, KDP_ZDR, REF_ZDR) as trailing columns.
# NOTE(review): later pipeline steps appear to address these columns by their position
# from the end of the frame, so the insertion order below is kept as-is -- confirm.


def _db_to_linear(column):
    """Return 10 ** (x / 10) for the named column of trn_withRef_comb."""
    return 10 ** (trn_withRef_comb[column] / 10)


def _signed_kdp_power(coef, exponent):
    """Return sign(Kdp_mean) * coef * |Kdp_mean| ** exponent."""
    kdp = trn_withRef_comb['Kdp_mean']
    return np.sign(kdp) * coef * np.abs(kdp) ** exponent


# MP params=0.82
trn_withRef_comb.loc[:, 'MP'] = 1 + (_db_to_linear('Ref_mean') / 200) ** (0.625 * 0.82)

# KDP
kdp_aa, kdp_bb, kdp_aa_scaling, kdp_bb_scaling = 4.06, 0.0866, 0.33, 0.79
trn_withRef_comb.loc[:, 'KDP'] = _signed_kdp_power(kdp_aa * kdp_aa_scaling, kdp_bb * kdp_bb_scaling)

# KDP_ZDR
kdpzdr_aa, kdpzdr_bb, kdpzdr_cc, kdpzdr_aa_scaling = 13.6, 0.0968, -0.286, 0.003
trn_withRef_comb.loc[:, 'KDP_ZDR'] = (
    _signed_kdp_power(kdpzdr_aa * kdpzdr_aa_scaling, kdpzdr_bb)
    * _db_to_linear('Zdr_mean') ** kdpzdr_cc
)

# REF_ZDR
refzdr_aa, refzdr_bb, refzdr_cc, refzdr_aa_scaling, refzdr_bb_scaling, refzdr_cc_scaling = 0.00746, 0.945, -4.76, 0.0017, 0.9, 0.8
trn_withRef_comb.loc[:, 'REF_ZDR'] = (
    (refzdr_aa * refzdr_aa_scaling)
    * _db_to_linear('Ref_mean') ** (refzdr_bb * refzdr_bb_scaling)
    * _db_to_linear('Zdr_mean') ** (refzdr_cc * refzdr_cc_scaling)
)

# Clamp negative estimates to 0 (NaN entries compare False and are left untouched)
for name in ['MP', 'KDP', 'KDP_ZDR', 'REF_ZDR']:
    is_negative = trn_withRef_comb[name] < 0
    trn_withRef_comb.loc[is_negative, name] = 0

# Apply log10(1 + x) to all four estimator columns
trn_withRef_comb.loc[:, ['MP', 'KDP', 'KDP_ZDR', 'REF_ZDR']] = np.log10(
    1 + trn_withRef_comb.loc[:, ['MP', 'KDP', 'KDP_ZDR', 'REF_ZDR']]
)

In [49]:
# Extract the target (y) and the feature matrix (X).
# y is the per-Id mean of 'Expected'; X keeps every column whose name does not
# contain 'Expected', in the original column order.
y = trn_withRef_comb['Expected_mean']
feature_cols = [col for col in trn_withRef_comb.columns if 'Expected' not in col]
# Label-based selection with .loc: .ix mixes label/positional semantics and was
# removed from later pandas releases.
X = trn_withRef_comb.loc[:, feature_cols]
# Garbage collection
del trn_withRef_comb
# Single-argument print keeps the output identical under both print syntaxes
print('{} {}'.format(X.shape, y.shape))

(731556, 136) (731556,)


In [50]:
# Shared hyper-parameters: seed is passed as random_state/seed to every estimator below,
# n_tree as n_estimators, and n_jobs as n_jobs/nthread.
seed = 12345
n_tree = 500
n_jobs = 7
# Preprocessing steps shared by both pipelines defined at the end of this cell
lpo = LogPlusOne()
imp = Imputer(strategy='median', copy=False)  # Imputer that fills missing values with the column median (in place)
ss = StandardScaler(copy=False, with_mean=True, with_std=True)
def LogPlusOne_score(ground_truth, predictions):
    """Mean absolute error between targets and back-transformed predictions.

    `predictions` are mapped back with 10 ** p - 1 before being compared with
    `ground_truth`, which is used as given. Returns a numpy float64.
    """
    restored = np.power(10, predictions) - 1
    abs_err = np.abs(ground_truth - restored)
    return np.float64(np.mean(abs_err))
# Scoring metric: an error, so it is registered with greater_is_better=False
scorer = make_scorer(LogPlusOne_score, greater_is_better=False)


def _leave_tail_base(step_name, estimator):
    """Wrap a learned estimator behind the LeaveTailK(K=4) column selector."""
    return Pipeline([('sel', LeaveTailK(K=4)), (step_name, estimator)], copy=False)


def _select_last_base(step_name, k):
    """Wrap a DummyRegressor behind the SelectK2Last(K=k) column selector."""
    return Pipeline([('sel', SelectK2Last(K=k)), (step_name, DummyRegressor())], copy=False)


# Two independently constructed blended regressors, one per class of the classifier
# (presumably below / above the labeling threshold -- confirm in ClassifiedRegressor).
reg_sub = []
for i in range(2):
    base1 = _leave_tail_base('rf', RandomForestRegressor(n_estimators=n_tree, max_features=0.2, max_depth=25, n_jobs=n_jobs, random_state=seed))
    base2 = _leave_tail_base('xgb', xgb.sklearn.XGBRegressor(n_estimators=n_tree, nthread=n_jobs, seed=seed))
    base3 = _leave_tail_base('ri', Ridge(alpha=2.0, random_state=seed))
    base4 = _leave_tail_base('la', Lasso(alpha=0.01, random_state=seed))
    base5 = _select_last_base('mp', 4)
    base6 = _select_last_base('kdp', 3)
    base7 = _select_last_base('kdp_zdr', 2)
    base8 = _select_last_base('ref_zdr', 1)
    blender = Ridge(alpha=1.45, random_state=seed)
    base_models = (base1, base2, base3, base4, base5, base6, base7, base8)
    reg_sub.append(
        BlendedRegressor(
            base_models=base_models,
            blending_model=blender,
            blending_split=0.15,
            with_feature=True,
            random_state=seed,
        )
    )
reg_sub = tuple(reg_sub)

# Classifier + classified regressor; the label threshold is 45 expressed in log10(1 + y) space
clf = xgb.sklearn.XGBClassifier(n_estimators=n_tree, nthread=n_jobs, seed=seed)
reg = ClassifiedRegressor(
    labeling_thresh=np.log10(1 + 45),
    classifier=clf,
    proba_thresh=0.4,
    regressors=reg_sub,
    verbose=0,
)

# Full pipeline (ends in the classified regressor) and a sibling pipeline that shares
# the same preprocessing steps and classifier object but ends in the classifier.
pip = Pipeline([('lpo', lpo), ('imp', imp), ('ss', ss), ('reg', reg)], copy=True)
pip_sub = Pipeline([('lpo', lpo), ('imp', imp), ('ss', ss), ('clf', clf)], copy=True)

# CV

In [51]:
# 10-fold split over the row positions of y.
# NOTE(review): shuffle is not enabled here, so random_state presumably has no effect and
# the folds are contiguous row ranges -- confirm against the installed scikit-learn version.
kf = KFold(n=y.shape[0], n_folds=10, random_state=seed)
# Probability thresholds to sweep: 0.0, 0.05, ..., 1.05
param_range = np.arange(0, 1.1, 0.05)
# Score table: one row per CV fold (10 rows, matching n_folds of the KFold split) and
# one column per threshold. It is built in a single step: wrapping an existing DataFrame
# with new `columns=` labels reindexes it and would turn the zeros into NaN.
score = pd.DataFrame(
    np.zeros([10, len(param_range)]),
    index=range(0, 10),
    columns=param_range,
)

In [52]:
# Cross-validated sweep of the classifier probability threshold.
# For every fold: fit the full pipeline once, then for each threshold in param_range
# record the confusion matrix of the thresholded classifier output and the
# LogPlusOne_score of the full pipeline's predictions on the validation rows.
configured_thresh = reg.proba_thresh  # value to restore once a fold's sweep is done
for ifold, (train_index, val_index) in enumerate(kf):
    print("Fold {} starts...".format(ifold))
    t = time.time()
    X_trn, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_trn, y_val = y.iloc[train_index], y.iloc[val_index]
    pip.fit(X_trn, y_trn)
    # Column 0 of predict_proba does not depend on the threshold being swept,
    # so it is computed once per fold instead of once per threshold.
    y_label_val_predict_prob = pip_sub.predict_proba(X_val)[:, 0]
    try:
        for iparam, th in enumerate(param_range):
            reg.proba_thresh = th
            # Rows of c are the true labels (y_val > 45), columns the predicted labels
            c = confusion_matrix(y_val>45, y_label_val_predict_prob<th)
            TP, FN, TN, FP = c[1,1], c[1,0], c[0,0], c[0,1]
            score.iloc[ifold, iparam] = LogPlusOne_score(y_val, pip.predict(X_val))
            print("th {} (TP {}, FN {}, TN {}, FP {}) score {}".format(th, TP, FN, TN, FP, score.iloc[ifold, iparam]))
    finally:
        # Do not leak the last swept threshold into the next fold's fit or into later
        # cells: put the regressor back to its configured setting, even on interrupt.
        reg.proba_thresh = configured_thresh
    print(".....................................used {} mins".format((time.time()-t)/60.0))

Fold 0 starts...
th 0.0 (TP 0, FN 1886, TN 71270, FP 0) score 22.188780163
th 0.05 (TP 0, FN 1886, TN 71270, FP 0) score 22.188780163
th 0.1 (TP 0, FN 1886, TN 71270, FP 0) score 22.188780163
th 0.15 (TP 0, FN 1886, TN 71270, FP 0) score 22.188780163
th 0.2 (TP 0, FN 1886, TN 71270, FP 0) score 22.188780163
th 0.25 (TP 3, FN 1883, TN 71270, FP 0) score 22.1904996166
th 0.3 (TP 13, FN 1873, TN 71269, FP 1) score 22.2015467566
th 0.35 (TP 18, FN 1868, TN 71269, FP 1) score 22.194981058
th 0.4 (TP 30, FN 1856, TN 71269, FP 1) score 22.1959373541
th 0.45 (TP 37, FN 1849, TN 71266, FP 4) score 22.1311727236
th 0.5 (TP 52, FN 1834, TN 71261, FP 9) score 22.0407349409
th 0.55 (TP 58, FN 1828, TN 71241, FP 29) score 22.2650052692
th 0.6 (TP 67, FN 1819, TN 71236, FP 34) score 22.3135918951
th 0.65 (TP 80, FN 1806, TN 71223, FP 47) score 22.3157089185
th 0.7 (TP 97, FN 1789, TN 71196, FP 74) score 22.6009785456
th 0.75 (TP 124, FN 1762, TN 71137, FP 133) score 23.0205634853
th 0.8 (TP 170, FN 1

In [53]:
# Write file
datetime_str = time.strftime('%Y%m%d_%H%M%S')
score.to_csv('../data/score_'+datetime_str+'.csv')
print '../data/score_'+datetime_str+'.csv'

../data/score_20151205_172418.csv


In [6]:
# Reload a previously saved score table; index_col=0 restores the fold index.
# NOTE(review): the timestamped file name is hard-coded -- update it after re-running the sweep.
score = pd.read_csv('../data/score_20151205_172418.csv', index_col=0)

In [7]:
# Average each threshold column over the CV folds (rows)
score.mean(axis=0)

0.0           23.079747
0.05          23.079747
0.1           23.084853
0.15          23.085086
0.2           23.086460
0.25          23.091100
0.3           23.055389
0.35          23.034485
0.4           23.034717
0.45          22.987305
0.5           22.970050
0.55          23.013693
0.6           23.066809
0.65          23.162571
0.7           23.307174
0.75          23.515441
0.8           24.116975
0.85          25.582544
0.9           31.657403
0.95    20759166.181774
1.0     20759345.673563
1.05    20759345.673563
dtype: float64

In [8]:
# Display the full score table: one row per fold, one column per threshold
score

Unnamed: 0,0.0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1.0,1.05
0,22.18878,22.18878,22.18878,22.18878,22.18878,22.1905,22.201547,22.194981,22.195937,22.131173,22.040735,22.265005,22.313592,22.315709,22.600979,23.020563,22.993475,24.731291,31.286663,58.66267,239.6651,239.6651
1,22.463131,22.463131,22.463131,22.463131,22.463131,22.471869,22.438637,22.40989,22.484114,22.544989,22.501259,22.58345,22.707611,23.117594,23.304335,23.429278,24.130974,25.533862,31.477217,1722.483,1897.552,1897.552
2,23.337152,23.337152,23.337152,23.337152,23.337152,23.337715,23.347309,23.347309,23.316142,23.274108,23.278292,23.18199,23.243769,23.28539,23.282946,23.304265,23.692311,24.768297,29.892314,57.07477,233.8361,233.8361
3,24.860735,24.860735,24.860735,24.860735,24.860735,24.860735,24.786021,24.821211,24.844904,24.822038,24.67453,24.712245,24.733967,24.830766,24.898019,25.103273,25.851245,26.883883,31.548907,64.04928,240.6041,240.6041
4,23.701079,23.701079,23.756463,23.756463,23.756463,23.761619,23.747249,23.775619,23.773047,23.744754,23.713981,23.853494,23.997656,23.760778,23.8123,23.87716,24.359738,26.109668,33.10706,207589500.0,207589600.0,207589600.0
5,23.856624,23.856624,23.856624,23.856624,23.856624,23.856624,23.827223,23.812875,23.798228,23.80485,23.897853,24.010103,23.913885,24.07817,24.244677,24.262142,24.803892,26.105183,32.450986,61.9247,245.4042,245.4042
6,19.846963,19.846963,19.846963,19.846963,19.846963,19.850582,19.821652,19.790658,19.773004,19.766379,19.764636,19.755207,19.827193,19.957553,20.10338,20.405684,21.094712,22.537438,28.345786,54.43788,235.3107,235.3107
7,25.538065,25.538065,25.538065,25.538065,25.551802,25.551802,25.38287,25.306744,25.381081,25.290849,25.257279,25.196753,25.258953,25.400411,25.619236,25.743284,26.366266,27.866106,33.687489,59.35282,238.3991,238.3991
8,23.602911,23.602911,23.598592,23.60092,23.60092,23.627523,23.629799,23.530169,23.378954,23.22484,23.280344,23.258369,23.351681,23.305936,23.418538,23.701543,24.402891,25.865925,32.14979,62.24993,239.835,239.835
9,21.402027,21.402027,21.402027,21.402027,21.402027,21.402027,21.371578,21.355396,21.401761,21.269068,21.291595,21.320312,21.319784,21.573407,21.78733,22.307217,23.474244,25.423791,32.627817,61.99589,243.6798,243.6798
