In [166]:
import numpy as np
import pandas as pd
import time
import random

from scipy import sparse
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
%matplotlib inline

# Load Data

In [194]:
train_data = pd.read_csv('../input/train.csv')
train_size=train_data.shape[0]

test_data = pd.read_csv('../input/test.csv')
print train_data.shape, test_data.shape

(188318, 132) (125546, 131)


In [195]:
full_data=pd.concat((train_data,test_data), ignore_index = True, axis = 0)
del( train_data, test_data)

# Group features

In [196]:
data_types = full_data.dtypes  
cat_cols = list(data_types[data_types=='object'].index)
num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

id_col = 'id'
target_col = 'loss'
num_cols.remove('id')
num_cols.remove('loss')

print ("Categorical features:", len(cat_cols))
print ( "Numerica features:", len(num_cols))
print ( "ID: %s, target: %s" %( id_col, target_col))

('Categorical features:', 116)
('Numerica features:', 14)
ID: id, target: loss


# Feature Generation (cat)

In [197]:
#comb_features = [['cat103','cat111'],['cat72','cat103'],['cat80','cat81'],['cat73','cat1'],['cat6','cat103'],['cat80','cat79'],['cat111','cat2'],['cat50','cat111'],['cat9','cat90'],['cat76','cat111'],['cat111','cat13'],['cat79','cat12'],['cat103','cat11'],['cat103','cat4'],['cat111','cat87'],['cat111','cat38'],['cat111','cat36'],['cat25','cat2'],['cat103','cat23'],['cat103','cat10'],['cat111','cat5'],['cat80','cat57'],['cat24','cat103'],['cat7','cat87'],['cat80','cat3'],['cat73','cat40'],['cat85','cat79'],['cat16','cat57'],['cat24','cat28'],['cat46','cat79'],['cat87','cat89'],['cat60','cat73'],['cat9','cat22'],['cat6','cat14'],['cat9','cat47'],['cat9','cat70'],['cat34','cat57'],['cat55','cat57']]
# 101_comb_features, ['cat72_cat103', 'cat80_cat79', 'cat81_cat90'] may increase the mae
comb_features = [['cat103','cat111'], ['cat72','cat103'], ['cat80','cat81'], ['cat73','cat1'], ['cat72','cat111'], \
                ['cat6','cat103'], ['cat6','cat111'], ['cat80','cat79'], ['cat1','cat111'], ['cat79','cat103'], \
                ['cat111','cat2'], ['cat79','cat111'], ['cat50','cat111'], ['cat73','cat81'], ['cat72','cat2'], \
                ['cat50','cat103'], ['cat1','cat81'], ['cat1','cat103'], ['cat73','cat103'], ['cat6','cat2'], \
                ['cat80','cat111'], ['cat80','cat103'], ['cat1','cat72'], ['cat1','cat79'], ['cat103','cat2'], \
                ['cat81','cat103'], ['cat9','cat90'], ['cat73','cat111'], ['cat111','cat9'], ['cat76','cat111'], \
                ['cat76','cat103'], ['cat72','cat9'], ['cat6','cat79'], ['cat103','cat9'], ['cat79','cat81'], \
                ['cat111','cat13'], ['cat79','cat72'], ['cat80','cat72'], ['cat1','cat50'], ['cat72','cat81'], \
                ['cat79','cat12'], ['cat81','cat90'], ['cat81','cat111'], ['cat80','cat1'], ['cat103','cat11'], \
                ['cat73','cat79'], ['cat6','cat72'], ['cat103','cat4'], ['cat73','cat50'], ['cat103','cat12'], \
                ['cat111','cat87'], ['cat6','cat87'], ['cat111','cat12'], ['cat111','cat38'], ['cat6','cat9'], \
                ['cat111','cat36'], ['cat50','cat2'], ['cat103','cat87'], ['cat25','cat2'], ['cat103','cat23'], \
                ['cat73','cat72'], ['cat72','cat36'], ['cat80','cat2'], ['cat103','cat10'], ['cat50','cat72'], \
                ['cat72','cat87'], ['cat1','cat9'], ['cat79','cat2'], ['cat80','cat87'], ['cat111','cat11'], \
                ['cat76','cat79'], ['cat72','cat10'], ['cat80','cat73'], ['cat72','cat12'], ['cat79','cat87'], \
                ['cat6','cat73'], ['cat72','cat23'], ['cat12','cat38'], ['cat36','cat23'], ['cat6','cat36'], \
                ['cat50','cat9'], ['cat111','cat10'], ['cat111','cat5'], ['cat36','cat9'], ['cat36','cat2'], \
                ['cat50','cat87'], ['cat80','cat57'], ['cat6','cat1'], ['cat73','cat9'], ['cat1','cat2'], \
                ['cat23','cat2'], ['cat11','cat87'], ['cat38','cat2'], ['cat76','cat2'], ['cat72','cat25'], \
                ['cat24','cat103'], ['cat6','cat12'], ['cat6','cat80'], ['cat72','cat11'], ['cat103','cat13'], \
                ['cat79','cat9']]

cat_add = []
for comb in comb_features:
    full_data[comb[0] + "_" + comb[1]] = full_data[comb[0]] + full_data[comb[1]]
    cat_add.append(comb[0] + "_" + comb[1])
cat_cols = cat_cols + cat_add

# Original combination features: https://www.kaggle.com/misfyre/allstate-claims-severity/encoding-feature-comb-modkzs-1108-72665/discussion
#import itertools
#comb_list = ['cat80', 'cat87', 'cat57', 'cat12', 'cat79', 'cat10', 'cat7', 'cat89', 'cat2', 'cat72', 'cat81', 'cat11', \
#             'cat1', 'cat13', 'cat9', 'cat3', 'cat16', 'cat90', 'cat23', 'cat36', 'cat73', 'cat103', 'cat40', 'cat28', \
#             'cat111', 'cat6', 'cat76', 'cat50', 'cat5', 'cat4', 'cat14', 'cat38', 'cat24', 'cat82', 'cat25']
# Added 'cat37','cat27','cat53','cat44'
#comb_list = ['cat37','cat27','cat53','cat44'] + comb_list
#cat_add = []
#for comb in itertools.combinations(comb_list, 2):
#    full_data[comb[0] + "_" + comb[1]] = full_data [comb[0]] + full_data [ comb[1]]
#    cat_add.append(comb[0] + "_" + comb[1])
cat_cols = cat_cols + cat_add

# Categorical features
### 1. Label Encoding (Factorizing)

In [198]:
LBL = preprocessing.LabelEncoder()
for cat_col in cat_cols:
    full_data[cat_col] = LBL.fit_transform(full_data[cat_col])

### 2. One Hot Encoding (get dummies)

In [199]:
OHE = preprocessing.OneHotEncoder(sparse=True)
full_data_sparse=OHE.fit_transform(full_data[cat_cols])

### 3. Leave-one-out Encoding

In [None]:
# start=time.time()
# loo_cols =[]
# for col in cat_cols:
#     print ("Leave-One-Out Encoding  %s" % (col))
#     print ("Leave-one-out encoding column %s for %s......" % (col, target_col))
#     aggr=full_data.groupby(col)[target_col].agg([np.mean]).join(full_data[:train_size].groupby(col)[target_col].agg([np.sum,np.size]),how='left')        
#     meanTagetAggr = np.mean(aggr['mean'].values)
#     aggr=full_data.join(aggr,how='left', on=col)[list(aggr.columns)+[target_col]]
#     loo_col = 'MEAN_BY_'+col+'_'+target_col
#     full_data[loo_col] = \
#     aggr.apply(lambda row: row['mean'] if math.isnan(row[target_col]) 
#                                                        else (row['sum']-row[target_col])/(row['size']-1)*random.uniform(0.95, 1.05) , axis=1)
#     loo_cols.append(loo_col)
#     print ("New feature %s created." % (loo_col))
# print ('Leave-one-out enconding finished in %f seconds' % (time.time()-start))

# Numeric features
### Calculate skewness of each numeric features:

In [201]:
from scipy.stats import skew, boxcox
skewed_cols = full_data[:train_size][num_cols].apply(lambda x: skew(x.dropna()))
print (skewed_cols.sort_values())

cont2           -0.310939
cont14_cat113   -0.242597
cont3           -0.010002
cont14           0.248672
cont14_cat112    0.250998
cont11           0.280819
cont12           0.291990
cont10           0.354998
cont13           0.380739
cont4            0.416093
cont6            0.461211
cont1            0.516420
cont8            0.676629
cont5            0.681617
cont7            0.826046
cont14_cat100    0.893204
cont9            1.072420
cont14_cat116    1.409188
cont14_cat110    1.672612
dtype: float64


### Apply box-cox transformations:

In [202]:
skewed_cols = skewed_cols[skewed_cols > 0.24].index.values
for skewed_col in skewed_cols:
    full_data[skewed_col], lam = boxcox(full_data[skewed_col] + 1)

### Apply Standard Scaling:

In [203]:
SSL = preprocessing.StandardScaler()
for num_col in num_cols:
    full_data[num_col] = SSL.fit_transform(full_data[num_col])



# Feature Generation (numeric)

In [200]:
# https://www.kaggle.com/tilii7/allstate-claims-severity/feature-importance-from-linear-and-tree-models/code
# Generate interaction between cont7/cont14 and 5 most important cats from tree-based and 6 most important cats from LR
#target_keys = ['cont1', 'cont7', 'cont14']
#aList = ['cat100', 'cat112', 'cat113', 'cat110', 'cat116', 'cat57', 'cat53', 'cat77', 'cat12', 'cat44', 'cat90']
target_keys = ['cont14']
aList = ['cat100', 'cat112', 'cat113', 'cat110', 'cat116']
num_add = []
for target_key in target_keys:
    for key in aList:
        gb = full_data[[target_key,key]].groupby(key).agg({target_key: 'mean'})
        full_data[target_key+'_'+key] = full_data[key].map(lambda x: gb.loc[x, target_key])
        num_add.append(target_key+'_'+key)
#print len(num_add)

# 20 num_cat combs decrease mae
#num_add = []
#comb_features = [['cont1','cat100'], ['cont1','cat112'], ['cont1','cat116'], ['cont1','cat57'], ['cont1','cat12'], ['cont1','cat44'], ['cont7','cat100'], ['cont7','cat110'], ['cont7','cat116'], ['cont7','cat57'], ['cont7','cat12'], ['cont7','cat44'], ['cont7','cat90'], ['cont14','cat100'], ['cont14','cat112'], ['cont14','cat116'], ['cont14','cat57'], ['cont14','cat53'], ['cont14','cat12'], ['cont14','cat90']]
#for comb in comb_features:
#    gb = full_data[[comb[0],comb[1]]].groupby(comb[1]).agg({comb[0]: 'mean'})
#    full_data[comb[0]+'_'+comb[1]] = full_data[comb[1]].map(lambda x: gb.ix[x, comb[0]])
#    num_add.append(comb[0]+'_'+comb[1])
num_cols = num_cols + num_add
full_data.head()

Unnamed: 0,cat1,cat10,cat100,cat101,cat102,cat103,cat104,cat105,cat106,cat107,...,cat6_cat12,cat6_cat80,cat72_cat11,cat103_cat13,cat79_cat9,cont14_cat100,cont14_cat112,cont14_cat113,cont14_cat110,cont14_cat116
0,0,0,1,6,0,0,8,4,6,9,...,0,3,1,0,3,0.504525,0.718677,0.511825,0.513514,0.529868
1,0,1,11,5,0,0,4,4,8,10,...,0,3,0,0,3,0.499385,0.310124,0.512084,0.489072,0.462916
2,0,1,11,14,0,1,4,5,7,5,...,1,1,1,3,3,0.499385,0.767099,0.442002,0.441747,0.471563
3,1,0,8,3,0,0,4,4,8,10,...,0,3,0,0,3,0.485629,0.599762,0.489333,0.488853,0.479936
4,0,1,5,9,0,0,3,4,10,6,...,1,1,2,0,7,0.496052,0.44213,0.512084,0.494981,0.492886


In [41]:
# Feature engineering (add interacted features again)
from pylightgbm.models import GBMRegressor
execpath = '/Users/didle/OtherSoftwares/LightGBM/lightgbm'

lift = 200
train_y = np.log(full_data[:train_size].loss.values + lift)

#full_data_sparse = sparse.hstack((full_data_sparse, full_data[num_cols+num_add]), format='csr')
#train_x = full_data_sparse[:train_size]
#X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, train_size=.80, random_state=1234)

rgr = GBMRegressor(exec_path=execpath,learning_rate=0.1,metric = 'l1',num_threads = 4,num_iterations=10000,
                       early_stopping_round=50,max_bin=483, num_leaves=121, min_data_in_leaf=107, 
                       feature_fraction=0.195979, bagging_fraction=0.918178, verbose = False)
#rgr.fit(X_train, y_train, test_data=[(X_val,y_val)])
#y_pred = rgr.predict(X_val)
#print("Basic, Best round: ", rgr.best_round, "MAE: ", log_mae(y_val,y_pred))
    
for col in cat_add:
    OHE = preprocessing.OneHotEncoder(sparse=True)
    full_data_sparse=OHE.fit_transform(full_data[cat_cols+[col]])

    full_data_sparse = sparse.hstack((full_data_sparse, full_data[num_cols]), format='csr')
    train_x = full_data_sparse[:train_size]
    X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, train_size=.80, random_state=1234)
    
    rgr.fit(X_train, y_train, test_data=[(X_val,y_val)])
    y_pred = rgr.predict(X_val)
    print("key: ", col, "Best round: ", rgr.best_round, "MAE: ", log_mae(y_val,y_pred))

# basic: 1151.7255222311139

pyLightGBM is looking for 'LIGHTGBM_EXEC' environment variable, cannot be found.
exec_path will be deprecated in favor of environment variable
('key: ', 'cat103_cat111', 'Best round: ', 195, 'MAE: ', 1148.5096622134056)
('key: ', 'cat72_cat103', 'Best round: ', 223, 'MAE: ', 1151.726495506688)
('key: ', 'cat80_cat81', 'Best round: ', 278, 'MAE: ', 1147.1898693139321)
('key: ', 'cat73_cat1', 'Best round: ', 253, 'MAE: ', 1150.869728739375)
('key: ', 'cat72_cat111', 'Best round: ', 278, 'MAE: ', 1149.3486489581198)
('key: ', 'cat6_cat103', 'Best round: ', 242, 'MAE: ', 1151.4513942030712)
('key: ', 'cat6_cat111', 'Best round: ', 207, 'MAE: ', 1149.8048013618293)
('key: ', 'cat80_cat79', 'Best round: ', 183, 'MAE: ', 1152.2043645298631)
('key: ', 'cat1_cat111', 'Best round: ', 222, 'MAE: ', 1151.643939564628)
('key: ', 'cat79_cat103', 'Best round: ', 224, 'MAE: ', 1149.4073680516533)
('key: ', 'cat111_cat2', 'Best round: ', 264, 'MAE: ', 1148.7091043346877)
('key: ', 'cat79_cat111', 'Best

In [204]:
lift = 200

full_data_sparse = sparse.hstack((full_data_sparse, full_data[num_cols]), format='csr')
print (full_data_sparse.shape)
train_x = full_data_sparse[:train_size]
test_x = full_data_sparse[train_size:]
train_y = np.log(full_data[:train_size].loss.values + lift)
ID = full_data.id[train_size:].values
del full_data, full_data_sparse

#train_x = full_data[:train_size]
#test_x = full_data[train_size:]
#train_x = train_x[num_cols + cat_cols].values
#test_x = test_x[num_cols + cat_cols].values
#del full_data

(313864, 4815)


In [119]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, train_size=.80, random_state=1234)

In [190]:
# grad - is first derivative of a function and hess is a second derivative of your loss function
# objective = 'reg:linear': minimize MSE, optimizes for the mean
# This problem should minimize MAE, optimizes for the median

# The median for log-normal distribution is exp(u) (log(Loss) ~ N(u, var)) 
# Thus, if you do log transformation on dependent variable and then use MSE to estimate mean of log(Loss) 
# would essentially equal with estimation of median of Loss.
def logregobj(labels, preds):
    con = 2
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess 

def xgb_logregobj(preds, dtrain):
    con = 2
    labels = dtrain.get_label()
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess

def fairobj(labels, preds):
    fair_constant = 0.7
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

def xgb_fairobj(preds, dtrain):
    fair_constant = 0.7
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

# loss function for Cachy function
def cauchylobj(labels, preds):
    c = 2  #the lower the "slower/smoother" the loss is. Cross-Validate.
    x = (preds - labels)
    grad = x / ((x**2)/(c**2)+1)
    hess = -c**2 * (x**2 - c**2) / (x**2+c**2)**2
    return grad, hess

def xgb_cauchylobj(preds, dtrain):
    c = 2  #the lower the "slower/smoother" the loss is. Cross-Validate.
    labels = dtrain.get_label()
    x = (preds - labels)
    grad = x / ((x**2)/(c**2)+1)
    hess = -c**2 * (x**2 - c**2) / (x**2+c**2)**2
    return grad, hess

# loss function for ln(cosh(x)) objective, where x is absolute error of normally distributed random variable
# convergent slowly
#def logcoshobj(preds, dtrain):
#    labels = dtrain.get_label()
#    grad = np.tanh(preds - labels)
#    hess = 1.0 - grad*grad
#    return grad, hess

# loss function for log(exp(-x) + exp(x)), eqaulivent to the loss function ln(cosh(x))
#def logexpexp(preds, dtrain):
#    labels = dtrain.get_label()
#    x= preds - labels
#    grad = (np.exp(2.0*x) - 1) / (np.exp(2.0*x) + 1)
#    hess = (4.0*np.exp(2.0*x)) / (np.exp(2.0*x) + 1)**2 
#    return grad, hess

def log_mae(labels,preds, lift=200):
    return mean_absolute_error(np.exp(labels)-lift, np.exp(preds)-lift)

def eval_mae(yhat, dtrain, lift=200):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-lift, np.exp(yhat)-lift)

def lgbm_eval_mae(yhat, dtrain, lift=200):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-lift, np.exp(yhat)-lift), False

# XGBoost

In [191]:
import xgboost as xgb

### 1. Mannual Tunning

In [158]:
# mannual: max_depth (5~10), min_child_weight (1+, 5*), colsample_bytree (0.3-0.9), subsample (0.6-1), gamma (0~3, 0.3)
learning_rate = 0.1
obj = logregobj

max_depth = 5
min_child_weight = 200
colsample_bytree = 1.0
subsample = 1.0

for val in [0, 0.003, 0.01, 0.03, 0.1]:
    rgr = xgb.XGBRegressor( seed = 1234, # use a fixed seed during tuning so we can reproduce the results
                       learning_rate = learning_rate,
                       n_estimators = 10000,
                       objective = obj,
                       nthread = -1,
                       silent = True,
                       max_depth = max_depth,
                       min_child_weight = min_child_weight,
                       colsample_bytree = colsample_bytree,
                       subsample = subsample,
                       gamma = val
                      )
    rgr.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=eval_mae, early_stopping_rounds=50, verbose = False)
    y_pred = rgr.predict(X_val)
    print("val: ", val, "MAE: ", log_mae(y_val,y_pred), "best_round:", rgr.best_iteration)

('val: ', 0, 'MAE: ', 1146.1576988535892, 'best_round:', 1004)
('val: ', 0.003, 'MAE: ', 1146.1576988535892, 'best_round:', 1004)
('val: ', 0.01, 'MAE: ', 1146.1576988535892, 'best_round:', 1004)
('val: ', 0.03, 'MAE: ', 1146.1576988535892, 'best_round:', 1004)


KeyboardInterrupt: 

In [52]:
max_depth = 5
min_child_weight = 200
colsample_bytree = 1.0
subsample = 1.0
gamma = 0

### 2. Automated tuning - Bayesian Optimization

In [53]:
from bayes_opt import BayesianOptimization
xgtrain = xgb.DMatrix(train_x, label=train_y,missing=np.nan) #used for Bayersian Optimization

In [54]:
def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample, gamma):
    params = dict()
    params['eta'] = 0.1
    params['verbose_eval'] = True
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)

    # change objective function?
    cv_result = xgb.cv(params, xgtrain, num_boost_round=10000, nfold=4,
                       obj = xgb_fairobj, # change it if necessary
                       feval=eval_mae,
                       seed=1234, callbacks=[xgb.callback.early_stop(50)])

    return -cv_result['test-mae-mean'].values[-1]


xgb_BO = BayesianOptimization(xgb_evaluate, 
                             {'max_depth': (max_depth - 1, max_depth + 3),
                              'min_child_weight': (min_child_weight - 20, min_child_weight + 20),
                              'colsample_bytree': (max(colsample_bytree - 0.2, 0.1), min(colsample_bytree + 0.2, 1)),
                              'subsample': (max(subsample - 0.2, 0.1), min(subsample + 0.2, 1)),
                              'gamma': (max(gamma - 0.25, 0), gamma + 0.2)
                             }
                            )
xgb_BO.maximize(init_points=5, n_iter=25)
del xgtrain

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[5476]	train-mae:1104.56+1.89461	test-mae:1142.17+6.53569

    1 | 110m44s | [35m-1142.17499[0m | [32m            0.9429[0m | [32m   0.1939[0m | [32m     2.6780[0m | [32m          193.1654[0m | [32m     0.8772[0m | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[1536]	train-mae:1070.11+1.6556	test-mae:1138.78+6.26471

    2 | 49m27s | [35m-1138.77875[0m | [32m            0.9174[0m | [32m   0.1188[0m | [32m     4.9612[0m | [32m   

  " state: %s" % convergence_dict)


    8 | 90m06s | -1143.95685 |             0.8000 |    0.2000 |      2.0000 |           172.6061 |      0.8000 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[789]	train-mae:1053.09+2.56312	test-mae:1140.06+7.57559

    9 | 33m16s | -1140.06323 |             0.8000 |    0.2000 |      6.0000 |           200.0000 |      0.8000 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[791]	train-mae:1048.05+2.45294	test-mae:1139.27+6.35895



  " state: %s" % convergence_dict)


   10 | 36m08s | -1139.26886 |             0.8000 |    0.2000 |      6.0000 |           176.5578 |      0.8000 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[961]	train-mae:1065.02+2.10662	test-mae:1139.12+5.55878

   11 | 45m36s | -1139.12286 |             0.9857 |    0.0822 |      5.9400 |           193.0848 |      0.8005 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[4778]	train-mae:1106.97+1.95695	test-mae:1142.99+6.55656

   12 | 112m44s | -1142.99216 |             0.9783 |    0.1099 |      2.6692 |           163.0974 |      0.8423 | 
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 50 rounds.
Stopping. Best iteration:
[1051]	train-mae:1057.95+2.44399	test-m

In [55]:
xgb_bo_scores = pd.DataFrame([[s[0]['max_depth'],
                               s[0]['min_child_weight'],
                               s[0]['subsample'],
                               s[0]['colsample_bytree'],
                               s[0]['gamma'],
                               s[1]] for s in zip(xgb_BO.res['all']['params'],xgb_BO.res['all']['values'])],
                            columns = ['max_depth',
                                       'min_child_weight',
                                       'subsample',
                                       'colsample_bytree',
                                       'gamma',
                                       'score'])
xgb_bo_scores=xgb_bo_scores.sort_values('score',ascending=False)
xgb_bo_scores

Unnamed: 0,max_depth,min_child_weight,subsample,colsample_bytree,gamma,score
10,6.0,189.466444,1.0,0.8,0.2,-1138.383515
7,5.874609,186.322461,0.893639,0.944709,0.030789,-1138.798157
1,6.0,173.891017,1.0,1.0,0.0,-1138.913239
12,4.805533,190.839429,0.958068,0.927896,0.125444,-1139.122345
14,4.805533,190.839429,0.958068,0.927896,0.125444,-1139.122345
23,4.805533,190.839429,0.958068,0.927896,0.125444,-1139.122345
22,4.805533,190.839429,0.958068,0.927896,0.125444,-1139.122345
21,4.805533,190.839429,0.958068,0.927896,0.125444,-1139.122345
20,4.805533,190.839429,0.958068,0.927896,0.125444,-1139.122345
19,4.805533,190.839429,0.958068,0.927896,0.125444,-1139.122345


## Cross Validation and fit

In [192]:
fold = 10
obj = logregobj
train_pred = np.zeros((train_x.shape[0], 1))
test_pred = np.zeros((test_x.shape[0], fold))
    
skf = list(KFold(len(train_y), fold))
best_rounds = []
scores=[]
for i, (train, val) in enumerate(skf):
    est = xgb.XGBRegressor(
                       learning_rate = 0.005,
                       n_estimators = 500000,
                       max_depth= 7,
                       min_child_weight=183, 
                       colsample_bytree=0.2,
                       subsample=0.8,
                       gamma =0.5,
                       objective = obj,
                       nthread = -1,
                       silent = True
                      )
    est.fit(train_x[train],train_y[train],
            eval_set=[(train_x[val], train_y[val])], 
            eval_metric = eval_mae, 
            early_stopping_rounds = 500, verbose = False)
    val_y_predict_fold = est.predict(train_x[val])
    train_pred[val,0] = val_y_predict_fold
    test_pred[:,i] = est.predict(test_x)
    score = log_mae(train_y[val], val_y_predict_fold)
    print (score, est.best_iteration)
    scores.append(score)
    best_rounds.append(est.best_iteration)
print (-np.mean(scores), np.mean(best_rounds))

train_pred = np.exp(train_pred) - lift
test_pred = np.exp(test_pred) - lift

(1134.8576701708903, 27419)
(1116.6790009215167, 17972)
(1131.1384039528332, 21360)
(1115.2645121508447, 14962)
(1140.7384546532385, 17910)
(1130.7031351835888, 20260)
(1130.2607257929228, 19997)
(1130.3115546841811, 20444)
(1119.252611666559, 16626)
(1118.2582621231877, 23023)
(-1126.7464331299761, 19997.299999999999)


In [193]:
pd.DataFrame(train_pred, columns = ['Pred']).to_csv('../output/train_pred.csv', index = False)
test_pred = pd.DataFrame(test_pred, columns = ['Pred_'+str(k) for k in range(fold)])
test_pred['avgPred'] = test_pred.mean(axis = 1)
test_pred.to_csv('../output/test_pred.csv', index = False)
pd.DataFrame({'id': ID, 'loss': test_pred['avgPred']}).to_csv('../output/submission.csv', index = False)

# LightGBM

In [205]:
import lightgbm as lgb

### 1. Mannual Tunning

In [137]:
# tune num_leaves (255, doulbe/half) -> min_data_in_leaf (100, +/-20), feature_fraction (1, -0.1), 
# bagging_fraction (1, -0.1), bagging_freq (0, 1, 2, ...), max_bin (255, doulbe/half)
learning_rate = 0.1
obj = logregobj

num_leaves=7
min_child_weight=180
colsample_bytree=0.6
subsample=1
subsample_freq=0
max_bin=255
#reg_alpha=0.003

for val in [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3]:
    rgr = lgb.LGBMRegressor(seed = 1234, # use a fixed seed during tuning so we can reproduce the results
                   learning_rate=learning_rate,
                   n_estimators=10000,
                   objective=obj,
                   nthread = -1, #The acutal cores of CPU
                   num_leaves=num_leaves,
                   min_child_weight = min_child_weight,
                   colsample_bytree = colsample_bytree,
                   subsample = subsample,
                   subsample_freq = subsample_freq,
                   max_bin = max_bin,
                   reg_alpha = val,
                   silent = True)
    rgr.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric=lgbm_eval_mae, early_stopping_rounds=50, verbose = False)
    y_pred = rgr.predict(X_val)
    print("val: ", val, "MAE: ", log_mae(y_val,y_pred), "best_round:", rgr.best_iteration)


('val: ', 0, 'MAE: ', 1145.2003352659963, 'best_round:', 2002)
('val: ', 0.001, 'MAE: ', 1149.6040074938803, 'best_round:', 1370)
('val: ', 0.003, 'MAE: ', 1145.1468359398334, 'best_round:', 2442)
('val: ', 0.01, 'MAE: ', 1147.9205449647668, 'best_round:', 1549)
('val: ', 0.03, 'MAE: ', 1146.300458525963, 'best_round:', 1696)
('val: ', 0.1, 'MAE: ', 1146.477290412138, 'best_round:', 1975)
('val: ', 0.3, 'MAE: ', 1148.031414890273, 'best_round:', 1536)
('val: ', 1, 'MAE: ', 1146.9586082476646, 'best_round:', 1810)
('val: ', 3, 'MAE: ', 1146.3344093325347, 'best_round:', 1971)


In [206]:
num_leaves=50
min_child_weight=180
colsample_bytree=0.1
subsample=0.8
subsample_freq=1
max_bin=484
reg_alpha=0.003

### 2. Automated tuning - Bayesian Optimization

In [207]:
from bayes_opt import BayesianOptimization

In [208]:
learning_rate = 0.1
obj = logregobj
def lgbm_cv_bagging(max_bin, num_leaves, min_child_weight, colsample_bytree, subsample, reg_alpha, learning_rate=0.1):
    skf = list(KFold(len(train_y), 4))
    scores=[]
    for i, (train, val) in enumerate(skf):
        est = lgb.LGBMRegressor(seed = 1234, # use a fixed seed during tuning so we can reproduce the results
                   learning_rate=learning_rate,
                   n_estimators=10000,
                   objective=obj,
                   nthread = -1, #The acutal cores of CPU
                   num_leaves=int(num_leaves),
                   min_child_weight = int(min_child_weight),
                   colsample_bytree = colsample_bytree,
                   subsample = subsample,
                   subsample_freq = int(subsample_freq),
                   max_bin = int(max_bin),
                   reg_alpha = reg_alpha,
                   silent = True)
        train_x_fold = train_x[train]
        train_y_fold = train_y[train]
        val_x_fold = train_x[val]
        val_y_fold = train_y[val]
        est.fit(train_x_fold, train_y_fold, eval_set=[(val_x_fold, val_y_fold)], eval_metric=lgbm_eval_mae, early_stopping_rounds=50, verbose = False)
        val_y_predict_fold = est.predict(val_x_fold)
        score = log_mae(val_y_fold, val_y_predict_fold)
        print (score, est.best_iteration)
        scores.append(score)
    return -np.mean(scores)

def lgbm_cv_nobagging(max_bin, num_leaves, min_child_weight, colsample_bytree, reg_alpha, learning_rate=0.1):
    skf = list(KFold(len(train_y), 4))
    scores=[]
    for i, (train, val) in enumerate(skf):
        est = lgb.LGBMRegressor(seed = 1234, # use a fixed seed during tuning so we can reproduce the results
                   learning_rate=learning_rate,
                   n_estimators=10000,
                   objective=obj,
                   nthread = -1, #The acutal cores of CPU
                   num_leaves=int(num_leaves),
                   min_child_weight = int(min_child_weight),
                   colsample_bytree = colsample_bytree,
                   subsample = 1,
                   subsample_freq = 0,
                   max_bin = int(max_bin),
                   reg_alpha = reg_alpha,
                   silent = True)
        train_x_fold = train_x[train]
        train_y_fold = train_y[train]
        val_x_fold = train_x[val]
        val_y_fold = train_y[val]
        est.fit(train_x_fold, train_y_fold, eval_set=[(val_x_fold, val_y_fold)], eval_metric=lgbm_eval_mae, early_stopping_rounds=50, verbose = False)
        val_y_predict_fold = est.predict(val_x_fold)
        score = log_mae(val_y_fold, val_y_predict_fold)
        print (score, est.best_iteration)
        scores.append(score)
    return -np.mean(scores)

lgbm_BO = BayesianOptimization(lgbm_cv_bagging, {
                                     'max_bin': (255, 511),
                                     'num_leaves': (31, 127),
                                     'min_child_weight' :(160, 220),
                                     'colsample_bytree': (0.01, 0.2),
                                     'subsample' : (0.7, 1),
                                     'reg_alpha': (0, 0.01)})

#lgbm_BO = BayesianOptimization(lgbm_cv_nobagging, {
#                                     'max_bin': (255, 511),
#                                     'num_leaves': (31, 127),
#                                     'min_child_weight' :(140, 240),
#                                     'colsample_bytree': (0.1, 0.8),
#                                     'reg_alpha': (0, 0.01)})
lgbm_BO.maximize(init_points=5, n_iter=30)

[31mInitialization[0m
[94m------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   colsample_bytree |   max_bin |   min_child_weight |   num_leaves |   reg_alpha |   subsample | 
(1142.4069211093899, 1389)
(1153.4811690241861, 1166)
(1147.1241301297898, 1300)
(1145.0921910299503, 1325)
    1 | 24m43s | [35m-1147.02610[0m | [32m            0.0116[0m | [32m 352.9165[0m | [32m          208.9087[0m | [32m     90.6704[0m | [32m     0.0015[0m | [32m     0.8028[0m | 
(1142.5668075373803, 489)
(1148.5171100983987, 379)
(1144.2946569336498, 369)
(1143.3360282357949, 470)
    2 | 10m05s | [35m-1144.67865[0m | [32m            0.0713[0m | [32m 459.6612[0m | [32m          212.3745[0m | [32m     94.4639[0m | [32m     0.0054[0m | [32m     0.7071[0m | 
(1141.0394608107106, 608)
(1147.1890154565699, 764)
(1143.3356862634337, 565)
(1139.7332630428054, 577)
    3 | 12m1

  " state: %s" % convergence_dict)


    9 | 07m25s | -1143.39592 |             0.2000 |  382.1268 |           160.0000 |      31.0000 |      0.0000 |      0.7000 | 
(1140.7759344541539, 903)
(1144.906313506069, 748)
(1143.9340420381127, 883)
(1142.3140691095168, 887)
   10 | 08m32s | -1142.98259 |             0.2000 |  390.6923 |           220.0000 |      31.0000 |      0.0000 |      1.0000 | 
(1145.95613508055, 1098)
(1153.0461585346436, 1062)
(1147.104359596515, 1377)
(1147.3060495386319, 1443)
   11 | 20m41s | -1148.35318 |             0.0100 |  255.0000 |           160.0000 |     127.0000 |      0.0000 |      1.0000 | 
(1142.9376155235273, 1611)
(1150.9856633824013, 1502)
(1148.7007253408169, 1465)
(1147.2306155604042, 1707)
   12 | 17m48s | -1147.46365 |             0.0100 |  386.8388 |           220.0000 |      60.1823 |      0.0000 |      0.7000 | 
(1146.2808096417707, 212)
(1146.8047664250946, 248)
(1148.1099264757588, 312)
(1143.4470080060532, 278)
   13 | 06m56s | -1146.16063 |             0.2000 |  511.0000 | 

  " state: %s" % convergence_dict)


   22 | 10m38s | -1144.00246 |             0.2000 |  367.8985 |           203.3066 |      31.0000 |      0.0100 |      1.0000 | 
(1137.2692522112668, 766)
(1144.5144231353736, 750)
(1141.6512614047936, 733)
(1138.5227630583731, 775)


  " state: %s" % convergence_dict)


   23 | 11m22s | [35m-1140.48942[0m | [32m            0.0549[0m | [32m 424.8913[0m | [32m          160.4803[0m | [32m     41.5332[0m | [32m     0.0083[0m | [32m     0.7553[0m | 
(1139.5764197044923, 506)
(1144.4577249000829, 509)
(1143.1163485362224, 459)
(1142.10659256442, 549)
   24 | 12m43s | -1142.31427 |             0.0639 |  362.8903 |           206.2861 |      78.9789 |      0.0006 |      0.9121 | 
(1142.8938779601274, 539)
(1146.1017059047183, 400)
(1144.6648630011669, 320)
(1143.3059809401154, 447)
   25 | 10m57s | -1144.24161 |             0.0571 |  484.9354 |           206.7286 |     101.6738 |      0.0053 |      0.7767 | 
(1140.7108210887407, 434)
(1148.4262357361306, 488)
(1146.1394910899328, 593)
(1142.4953920871264, 490)
   26 | 09m37s | -1144.44299 |             0.1230 |  453.8276 |           160.4153 |      53.5665 |      0.0024 |      0.7022 | 
(1142.1579119940977, 1945)
(1150.3592475494152, 1503)
(1146.3569679854315, 1875)
(1146.2343618406551, 1726)
   

  " state: %s" % convergence_dict)


   30 | 25m03s | -1146.79867 |             0.0100 |  416.6993 |           160.0000 |      52.3347 |      0.0000 |      0.7000 | 
(1140.4757824921196, 349)
(1151.4633353799902, 254)
(1146.6472026729778, 294)
(1143.3490886231871, 359)
   31 | 09m43s | -1145.48385 |             0.1801 |  386.1441 |           181.5562 |     101.2623 |      0.0090 |      0.8047 | 
(1139.7336118058795, 501)
(1146.7807043108555, 639)
(1144.6715590799881, 856)
(1138.8146426316732, 873)
   32 | 10m11s | -1142.50013 |             0.1222 |  433.3767 |           168.8109 |      35.3339 |      0.0080 |      0.8825 | 
(1142.8869426446361, 441)
(1148.0578310235217, 398)
(1145.4569996699884, 352)
(1142.1483668258034, 410)
   33 | 11m15s | -1144.63754 |             0.0695 |  446.7031 |           179.0185 |     108.1515 |      0.0011 |      0.7853 | 
(1141.7666671883608, 315)
(1147.6011921974773, 344)
(1144.947497439736, 311)
(1143.6227701353603, 301)
   34 | 10m29s | -1144.48453 |             0.1314 |  455.0124 |      

In [211]:
gbm_bo_scores = pd.DataFrame([[s[0]['num_leaves'],
                               s[0]['min_child_weight'],
                               s[0]['max_bin'],
                               s[0]['colsample_bytree'],
                               s[0]['subsample'],
                               s[0]['reg_alpha'],
                               s[1]] for s in zip(lgbm_BO.res['all']['params'],lgbm_BO.res['all']['values'])],
                            columns = ['num_leaves',
                                       'min_child_weight',
                                       'max_bin',
                                       'colsample_bytree',
                                       'subsample',
                                       'reg_alpha',
                                       'score'])
gbm_bo_scores=gbm_bo_scores.sort_values('score',ascending=False)
gbm_bo_scores

Unnamed: 0,num_leaves,min_child_weight,max_bin,colsample_bytree,subsample,reg_alpha,score
17,41.533235,160.480263,424.891268,0.054892,0.755303,0.008349064,-1140.489425
13,32.821725,169.632318,448.124003,0.042388,0.719071,0.008139682,-1141.108369
15,35.970286,212.3441,368.303197,0.158388,0.769226,4.163086e-05,-1141.362391
14,64.95226,206.887199,380.61391,0.104926,0.987334,0.00851389,-1142.109702
10,32.595404,172.350784,418.064367,0.117317,0.773481,0.003703591,-1142.178851
18,78.978943,206.286086,362.890256,0.06389,0.912071,0.0006213211,-1142.314271
26,35.333856,168.810942,433.376664,0.122177,0.882492,0.007954188,-1142.500129
4,31.0,220.0,390.692263,0.2,1.0,0.0,-1142.98259
29,60.761983,195.806403,425.892291,0.025045,0.782639,0.006830397,-1143.189453
0,31.0,220.0,255.0,0.2,1.0,0.0,-1143.241385


## Cross Validation & Fit

In [212]:
fold = 10
train_pred = np.zeros((train_x.shape[0], 1))
test_pred = np.zeros((test_x.shape[0], fold))
    
skf = list(KFold(len(train_y), fold))
scores=[]
best_rounds=[]
for i, (train, val) in enumerate(skf):
    est=lgb.LGBMRegressor(learning_rate=0.005,
                   n_estimators=500000,
                   objective=obj,
                   nthread = -1, #The acutal cores of CPU
                   num_leaves=42,
                   min_child_weight = 160,
                   colsample_bytree = 0.055,
                   subsample = 0.76,
                   subsample_freq = 1,
                   max_bin = 424,
                   reg_alpha = 0.008,
                   silent = True)
    est.fit(train_x[train], train_y[train], eval_set=[(train_x[val], train_y[val])], 
            eval_metric=lgbm_eval_mae, early_stopping_rounds=500, verbose = False)
    val_y_predict_fold = est.predict(train_x[val])
    train_pred[val,0] = val_y_predict_fold
    test_pred[:,i] = est.predict(test_x)
    score = log_mae(train_y[val], val_y_predict_fold)
    print (score, est.best_iteration)
    best_rounds.append(est.best_iteration)
    scores.append(score)
print (-np.mean(scores), np.mean(best_rounds))
train_pred = np.exp(train_pred) - lift
test_pred = np.exp(test_pred) - lift

(1133.1663002518465, 19214)
(1117.0386514953771, 15911)
(1131.7961480318784, 18736)
(1114.0050047316508, 15253)
(1140.0568886975177, 15355)
(1131.9771909108647, 17264)
(1129.8376509126649, 18345)
(1129.9701815935912, 17848)
(1119.4439506783817, 18554)
(1117.2647914303179, 15547)
(-1126.4556758734091, 17202.700000000001)


In [213]:
pd.DataFrame(train_pred, columns = ['Pred']).to_csv('../output/train_pred.csv', index = False)
test_pred = pd.DataFrame(test_pred, columns = ['Pred_'+str(k) for k in range(fold)])
test_pred['avgPred'] = test_pred.mean(axis = 1)
test_pred.to_csv('../output/test_pred.csv', index = False)
pd.DataFrame({'id': ID, 'loss': test_pred['avgPred']}).to_csv('../output/submission.csv', index = False)

In [214]:
Repnum = 5
pred_train_y = np.zeros((train_x.shape[0], Repnum))
pred_test_y = np.zeros((test_x.shape[0], Repnum))
for k in range(Repnum):
    rgr = lgb.LGBMRegressor(learning_rate=0.005,                             
                     n_estimators=17202,
                     objective=obj,
                     nthread = -1, #The acutal cores of CPU
                     max_bin=424,
                     num_leaves=42,
                     min_child_samples=160,
                     colsample_bytree=0.055,
                     subsample=0.76,
                     subsample_freq=1,
                     reg_alpha=0.008,
                     silent=True)
    rgr.fit(train_x, train_y)
    
    pred_test_y[:, k] = np.exp(rgr.predict(test_x)) - lift
    pd.DataFrame({'prob': pred_test_y[:,k]}).to_csv('../output/test_'+str(k)+'.csv', index = False, header = False)

# Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.optimizers import SGD,Nadam
from keras.regularizers import WeightRegularizer, ActivityRegularizer,l2, activity_l2

## Comment out following lines if you are using Theano as backend
#import tensorflow as tf
#tf.python.control_flow_ops = tf

In [None]:
# custom metric function for Keras
def mae_log(y_true, y_pred): 
    lift = 200
    return K.mean(K.abs((K.exp(y_pred)-lift) - (K.exp(y_true)-lift)))

# Keras deosn't support sparse matrix. 
# The following functions are useful to split a large sparse matrix into smaller batches so they can be loaded into mem.

def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0 
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

## Tuning

In [None]:
early_stop = EarlyStopping(monitor='mae_log', # custom metric
                           patience=5, #early stopping for epoch
                           verbose=0, mode='auto')
checkpointer = ModelCheckpoint(filepath="weights.hdf5", monitor='mae_log', verbose=1, save_best_only=True, mode='min')

def create_model(input_dim):
    model = Sequential()
    
    model.add(Dense(400, # number of input units: needs to be tuned
                    input_dim = input_dim # fixed length: number of columns of X
                   ))
    
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
        
    model.add(Dense(200)) # number of hidden units. needs to be tuned.
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.2)) #dropout rate. needs to be tuned
    
    
    model.add(Dense(1)) # 1 for regression 
    model.compile(loss = 'mae',
                  metrics=[mae_log],
                  optimizer = 'adadelta' # optimizer. you may want to try different ones
                 )
    return(model)

model = create_model(X_train.shape[1])
fit = model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=1000,
                         samples_per_epoch=train_size,
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop,checkpointer]
                         )

## Cross Validation and Fit

In [None]:
from sklearn.cross_validation import KFold

early_stop = EarlyStopping(monitor='mae_log', # custom metric
                           patience=5, #early stopping for epoch
                           verbose=0, mode='auto')
checkpointer = ModelCheckpoint(filepath="weights.hdf5", monitor='mae_log', verbose=1, save_best_only=True, mode='min')

def nn_model(params):
    model = Sequential()
    model.add(Dense(params['input_size'], input_dim = params['input_dim']))

    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(params['input_drop_out']))
        
    model.add(Dense(params['hidden_size']))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(params['hidden_drop_out']))
    
    nadam = Nadam(lr=params['learning_rate'])
    
    model.add(Dense(1))
    model.compile(loss = 'mae', metrics=[mae_log], optimizer = 'adadelta')
    return(model)

def nn_blend_data(parameters, train_x, train_y, test_x, fold, early_stopping_rounds=0, batch_size=128):
    print ("Blend %d estimators for %d folds" % (len(parameters), fold))
    skf = list(KFold(len(train_y), fold))
    
    train_blend_x = np.zeros((train_x.shape[0], len(parameters)))
    test_blend_x = np.zeros((test_x.shape[0], len(parameters)))
    scores = np.zeros ((len(skf),len(parameters)))
    best_rounds = np.zeros ((len(skf),len(parameters)))
 
    for j, nn_params in enumerate(parameters):
        print ("Model %d: %s" %(j+1, nn_params))
        test_blend_x_j = np.zeros((test_x.shape[0], len(skf)))
        for i, (train, val) in enumerate(skf):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x[train]
            train_y_fold = train_y[train]
            val_x_fold = train_x[val]
            val_y_fold = train_y[val]
            
            # early stopping
            model = nn_model(nn_params)
            print (model)
            
            fit= model.fit_generator(generator=batch_generator(train_x_fold, train_y_fold, batch_size, True),
                                     nb_epoch=60,
                                     samples_per_epoch=train_x_fold.shape[0],
                                     validation_data=(val_x_fold.todense(), val_y_fold),
                                     callbacks=[
#                                                 EarlyStopping(monitor='mae_log'
#                                                               , patience=early_stopping_rounds, verbose=0, mode='auto'),
                                                ModelCheckpoint(filepath="weights.hdf5"
                                                                , monitor='mae_log', 
                                                                verbose=0, save_best_only=True, mode='min')
                                                ]
                                     )
            best_round=len(fit.epoch)-early_stopping_rounds-1
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'mae', metrics=[mae_log], optimizer = 'adadelta')

            # print (mean_absolute_error(np.exp(y_val)-200, pred_y))
            val_y_predict_fold = model.predict_generator(generator=batch_generatorp(val_x_fold, batch_size, False),
                                        val_samples=val_x_fold.shape[0]
                                     )
            
            score = log_mae(val_y_fold, val_y_predict_fold)
            print ("Score: ", score, mean_absolute_error(val_y_fold, val_y_predict_fold))
            scores[i,j]=score
            train_blend_x[val, j] = val_y_predict_fold.reshape(val_y_predict_fold.shape[0])
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'mae', metrics=[mae_log], optimizer = 'adadelta')            
            test_blend_x_j[:,i] = model.predict_generator(generator=batch_generatorp(test_x, batch_size, True),
                                        val_samples=test_x.shape[0]
                                     ).reshape(test_x.shape[0])
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
   
        test_blend_x[:,j] = test_blend_x_j.mean(1)
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x, scores,best_rounds )

In [None]:
nn_parameters = [
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :450 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 250 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.5 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.5 ,
     'hidden_size' : 250 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :450 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.5 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :350 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.5 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.6 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.5 ,
     'hidden_size' : 150 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.5 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.1,
     'learning_rate': 0.1}
]

(train_blend_x, test_blend_x, blend_scores,best_round) = nn_blend_data(nn_parameters, train_x, train_y, test_x,
                                                         4,
                                                         5)



In [None]:
train_blend_x = np.exp(train_blend_x) - 200
test_blend_x = np.exp(test_blend_x) - 200
np.savetxt('../output/cv_train_keras10.csv', train_blend_x, delimiter=",")
np.savetxt('../output/cv_test_keras10.csv', test_blend_x, delimiter=",")

In [None]:
pred_y = np.exp(test_blend_x[:,3:4]) - 200 # the forth column of test_blend_x
results = pd.DataFrame()
results['id'] = ID
results['loss'] = pred_y
results.to_csv("../output/sub_keras_starter.csv", index=False)
print ("Submission created.")

pred_y = np.exp(np.mean(test_blend_x,axis=1)) - 200

results = pd.DataFrame()
results['id'] = full_data[train_size:].id
results['loss'] = pred_y
results.to_csv("../output/sub_keras_mean.csv", index=False)
print ("Submission created.")

In [None]:
def MAE(pred,y):
    error = np.exp(pred) -np.exp(y)
    error = np.mean((error**2)**.5)
    return 'mcc error',error
    
# original form
def MAE2(pred,y):
    error = pred - y
    error = np.mean((error**2)**.5)
    return 'mcc error',error

#### NOW THE MCMC PART to find individal weights for ensemble####
num = train.shape[1]
#weight = np.array([1.0/num,]*num)
weight = np.array([ 0.02523733,  0.00312543,  0.00631371,  0.00700795,  0.00810079,
        0.01606908,  0.01690393,  0.07645415,  0.04172416,  0.00971234,
        0.06701633,  0.001     ,  0.02779518,  0.02927811,  0.01636343,
        0.00781184,  0.001     ,  0.001     ,  0.08138719,  0.001     ,
        0.01751576,  0.04822087,  0.02539103,  0.02555227,  0.0514218 ,
        0.01235628,  0.03889011,  0.20541707,  0.06427481,  0.0818134 ])
# This is to define variables to be used later
Repnum = 10

train_mcmc=np.zeros((train.shape[0], Repnum))
test_mcmc_1=np.zeros((test_1.shape[0], Repnum))
test_mcmc_2=np.zeros((test_2.shape[0], Repnum))
for k in range(Repnum):
    pred_new = np.zeros(train.shape[0])
    pred_old = np.zeros(train.shape[0])
    counter = 0
    n=10000 ###MCMC steps
    result={}
    
    print('\n Finding weights by MCMC ...')
    for i in range(num):
        pred_new += train[:,i]*weight[i]
    pred_old = pred_new
    
    #### MCMC  #### 
    ### MCMC algo for dummies 
    ### 1. Get initialize ensemble weights
    ### 2. Generate new weights 
    ### 3. if MAE is lower, accept new weights immediately , or else accept new weights with probability of np.exp(-diff/.3)
    ### 4. repeat 2-3
    for i in range(n):
        new_weights = weight+ np.array([0.005,]*num)*np.random.normal(loc=0.0, scale=1.0, size=num)
        new_weights[new_weights < 0.001]=0.001 #0.01=>0.001
        pred_new=np.zeros(train.shape[0])
        for ii in range(num):
            pred_new += train[:,ii]*new_weights[ii]
        diff = MAE2(pred_new,train_y)[1] - MAE2(pred_old,train_y)[1]
        prob = min(1,np.exp(-diff/.5)) #0.3 -> 0.5
        random_prob = np.random.rand()
        if random_prob < prob:
            weight = new_weights
            pred_old = pred_new
            result[i] = (MAE2(pred_new,train_y)[1] ,MAE2(pred_old,train_y)[1],prob,random_prob ,weight)
            counter +=1
    print (counter *1.0 / n, 'Acceptance Ratio') #keep this [0.4,0.6] for best results
    print ('best result MAE', sorted([result[i] for i in result])[0:1][0])
        
    weight=sorted([result[i] for i in result])[0:1][-1]
    weight = weight[-1]
        
    for i in range(num):
        train_mcmc[:,k] += train[:,i]*weight[i]
        test_mcmc_1[:,k] += test_1[:,i]*weight[i]
        test_mcmc_2[:,k] += test_2[:,i]*weight[i]
    print ('combined all features plus MCMC weights:',',MAE=', MAE2(train_mcmc[:,k],train_y))
    print ('weights:', weight)
### notice the weights do not necessarily sum to 1 ###

#train_mcmc=np.exp(train_mcmc) - lift
#test_mcmc_1=np.exp(test_mcmc_1) - lift
#test_mcmc_2=np.exp(test_mcmc_2) - lift

train_pred = pd.DataFrame(train_mcmc, columns = ['Pred_'+str(k) for k in range(Repnum)])
train_pred['avgPred'] = train_pred.mean(axis=1)
train_pred['id'] = train_id['id']
train_pred['loss'] = train_id['loss']
train_pred.to_csv('../output/final_mcmc30K_train_97.csv', index = False)

test_pred1 = pd.DataFrame(test_mcmc_1, columns = ['Pred_'+str(k) for k in range(Repnum)])
test_pred1['avgPred'] = test_pred1.mean(axis = 1)
test_pred1['id'] = test_id['id']
#test_pred1.to_csv('../output/final_mcmc30K_test1_9.csv', index = False)
test_pred1[['id','avgPred']].to_csv('../output/sub_final_mcmc30K_test1_97.csv', header = ['id','loss'], index = False)

test_pred2 = pd.DataFrame(test_mcmc_2, columns = ['Pred_'+str(k) for k in range(Repnum)])
test_pred2['avgPred'] = test_pred2.mean(axis = 1)
test_pred2['id'] = test_id['id']
#test_pred2.to_csv('../output/final_mcmc30K_test2_9.csv', index = False)
test_pred2[['id','avgPred']].to_csv('../output/sub_final_mcmc30K_test2_97.csv', header = ['id','loss'], index = False)

test_id['loss'] = 0.5*test_pred1['avgPred']+0.5*test_pred2['avgPred']
test_id.to_csv('../output/final_mcmc30K_test_97.csv', index = False)


 Finding weights by MCMC ...
