In [1]:
import numpy as np
import pandas as pd
import time
import random
import itertools
from scipy.stats import skew, boxcox
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
import xgboost as xgb
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_absolute_error
%matplotlib inline



# Load Data

In [2]:
# Try some combinations on numeric features
lin_list = []
for k in range(1, 15):
    lin_list.append('lin_cont'+str(k))
    
train_lin = pd.read_csv('../input/lin_train.csv', usecols=lin_list)
test_lin = pd.read_csv('../input/lin_test.csv', usecols=lin_list)

df_lin = pd.concat((train_lin, test_lin), axis = 0, ignore_index = True)
df_lin.describe()

Unnamed: 0,lin_cont1,lin_cont2,lin_cont3,lin_cont4,lin_cont5,lin_cont6,lin_cont7,lin_cont8,lin_cont9,lin_cont10,lin_cont11,lin_cont12,lin_cont13,lin_cont14
count,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0,313864.0
mean,581.671896,25.16662,52.72836,37.626042,27.023271,2317.240047,22085.974435,48.959696,343.005722,131.056808,186.048065,187.897421,246.021054,49120.127144
std,161.101318,3.719585,11.374909,24.474584,28.78499,841.712513,8899.382617,41.839242,113.732822,37.752062,64.128107,65.519916,72.695026,32155.655789
min,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,471.0,23.0,45.0,20.0,0.0,1739.0,16074.0,17.0,273.0,106.0,135.0,137.0,188.0,21123.0
50%,563.0,26.0,54.0,33.0,18.0,2115.0,19779.0,39.0,314.0,123.0,175.0,178.0,204.0,37210.0
75%,668.0,28.0,59.0,53.0,44.0,2866.0,26036.0,70.0,375.0,153.0,234.0,237.0,306.0,79759.0
max,1235.0,32.0,85.0,113.0,145.0,5608.0,61527.0,217.0,908.0,290.0,579.0,585.0,509.0,104060.0


In [9]:
# Try some combinations on numeric features
for comb in itertools.combinations(lin_list, 2):
    feat = comb[0] + "+" + comb[1]
    df_lin[feat] = df_lin[comb[0]] + df_lin[comb[1]]
    feat = comb[0] + "-" + comb[1]
    df_lin[feat] = df_lin[comb[0]] - df_lin[comb[1]]
    feat = comb[0] + "*" + comb[1]
    df_lin[feat] = df_lin[comb[0]] * df_lin[comb[1]]
    feat = comb[0] + "/" + comb[1]
    df_lin[feat] = 1.0* (df_lin[comb[0]]+0.001) / (df_lin[comb[1]]+0.001)
df_lin.describe()

Unnamed: 0,lin_cont1,lin_cont2,lin_cont3,lin_cont4,lin_cont5,lin_cont6,lin_cont7,lin_cont8,lin_cont9,lin_cont10,...,lin_cont12*lin_cont13,lin_cont12/lin_cont13,lin_cont12+lin_cont14,lin_cont12-lin_cont14,lin_cont12*lin_cont14,lin_cont12/lin_cont14,lin_cont13+lin_cont14,lin_cont13-lin_cont14,lin_cont13*lin_cont14,lin_cont13/lin_cont14
count,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,...,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0,188318.0
mean,581.497329,25.168592,52.740689,37.598472,27.018559,2315.369614,22065.75398,48.891173,342.901528,130.991631,...,48367.71751,235.156935,49309.788629,-48934.266576,9325662.0,0.00806546,49367.77564,-48876.279564,12181870.0,0.0105998
std,161.022329,3.720725,11.386339,24.48889,28.798484,841.655763,8887.524506,41.83983,113.636739,37.751194,...,27485.807233,7968.031362,32148.976932,32142.602242,7343005.0,0.01344683,32149.234045,32142.376306,9153247.0,0.01754524
min,0.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2e-06,485.0,-103915.0,0.0,9.841067e-09,413.0,-103846.0,0.0,9.638926e-09
25%,470.0,23.0,45.0,20.0,0.0,1739.0,16024.0,16.0,273.0,106.0,...,27072.0,0.591625,21324.0,-79547.0,3360927.0,0.002427268,21373.0,-79492.0,4504829.0,0.00316607
50%,563.0,26.0,54.0,33.0,18.0,2115.0,19763.0,39.0,314.0,123.0,...,40128.0,0.732759,37482.5,-37100.0,7349296.0,0.004224954,37545.5,-37053.5,9837878.0,0.005616623
75%,667.0,28.0,59.0,53.0,44.0,2863.0,26031.0,70.0,374.0,152.0,...,64688.0,0.901961,79957.0,-20949.0,13635300.0,0.008866874,80009.0,-20910.0,17838910.0,0.01175564
max,1235.0,32.0,85.0,112.0,145.0,5608.0,61527.0,212.0,908.0,290.0,...,180856.0,557001.0,104327.0,36.0,56048850.0,1.151898,104429.0,2.0,51656880.0,1.005249


In [10]:
keys = ['+', '-', '*', '/']
comb_feats = []
for key in keys:
    comb_feats += [item for i, item in enumerate(df_lin.columns.ravel()) if key in item]
len(comb_feats)

364

In [5]:
# Load data
start = time.time() 
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

print train.shape, test.shape

(188318, 132) (125546, 131)


In [6]:
shift = 200
train_num = train.shape[0]
train_target = np.log(train.loss + shift)
test_id = test['id']

train = train.drop(['id','loss'], axis = 1)
test = test.drop(['id'], axis = 1)
df = pd.concat((train, test), axis = 0, ignore_index = True)
df = pd.concat([df, df_lin[comb_feats]], axis = 1)
del df_lin, train, test
df.shape

(313864, 494)

In [12]:
data_types = df.dtypes  
cat_cols = list(data_types[data_types=='object'].index)
num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)
print len(cat_cols), len(num_cols)

116 378


# Feature Preprocessing
### Categorical features

In [8]:
# combine feature: https://www.kaggle.com/modkzs/allstate-claims-severity/lexical-encoding-feature-comb/code
#COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,cat4,cat14,cat38,cat24,cat82,cat25'.split(',')
#def encode(charcode):
#    r = 0
#    ln = len(charcode)
#    for i in range(ln):
#        r += (ord(charcode[i])-ord('A')+1)*26**(ln-i-1)
#    return r

In [13]:
#### cat_feats interaction
LBL = preprocessing.LabelEncoder()
#### 52 least important
COMB_FEATURE = 'cat6,cat76,cat80,cat85,cat60,cat73,cat46,cat1,cat79,cat24,cat50,cat72,cat81,cat103,cat111,cat14,cat36,cat48,cat4,cat9,cat40,cat58,cat7,cat10,cat11,cat12,cat23,cat25,cat28,cat34,cat38,cat47,cat2,cat3,cat5,cat13,cat15,cat16,cat20,cat22,cat35,cat55,cat57,cat62,cat63,cat64,cat68,cat69,cat70,cat87,cat89,cat90'.split(',')
for comb in itertools.combinations(COMB_FEATURE, 2):
    feat = comb[0] + "_" + comb[1]
    df[feat] = df[comb[0]] + df[comb[1]]
    df[feat] = LBL.fit_transform(df[feat]).astype(np.uint16)
    #df[feat] = df[feat].apply(encode)
#for col in cat_cols:
    #df[col] = df[col].apply(encode)

In [None]:
LBL = preprocessing.LabelEncoder()
#### 38 selected cat pairs
COMB_FEATURE = [['cat103','cat111'],['cat72','cat103'],['cat80','cat81'],['cat73','cat1'],['cat6','cat103'],['cat80','cat79'],['cat111','cat2'],['cat50','cat111'],['cat9','cat90'],['cat76','cat111'],['cat111','cat13'],['cat79','cat12'],['cat103','cat11'],['cat103','cat4'],['cat111','cat87'],['cat111','cat38'],['cat111','cat36'],['cat25','cat2'],['cat103','cat23'],['cat103','cat10'],['cat111','cat5'],['cat80','cat57'],['cat24','cat103'],['cat7','cat87'],['cat80','cat3'],['cat73','cat40'],['cat85','cat79'],['cat16','cat57'],['cat24','cat28'],['cat46','cat79'],['cat87','cat89'],['cat60','cat73'],['cat9','cat22'],['cat6','cat14'],['cat9','cat47'],['cat9','cat70'],['cat34','cat57'],['cat55','cat57']]


#### 1. Label Encoding (Factorizing)

In [14]:
LBL = preprocessing.LabelEncoder()
start=time.time()
for cat_col in cat_cols:
#     print ("Factorize feature %s" % (cat))
    df[cat_col] = LBL.fit_transform(df[cat_col]).astype(np.uint8)
print ('Label enconding finished in %f seconds' % (time.time()-start))

Label enconding finished in 18.618808 seconds


#### 2. One Hot Encoding (get dummies)

In [None]:
OHE = preprocessing.OneHotEncoder(sparse=True)
start=time.time()
df_sparse=OHE.fit_transform(df[cat_cols])
print ('One-hot-encoding finished in %f seconds' % (time.time()-start))
print (df_sparse.shape)

#### 3. Leave-one-out Encoding

In [None]:
# Leave-one-out Encoding
# start=time.time()
# loo_cols =[]
# for col in cat_cols:
#     print ("Leave-One-Out Encoding  %s" % (col))
#     print ("Leave-one-out encoding column %s for %s......" % (col, target_col))
#     aggr=full_data.groupby(col)[target_col].agg([np.mean]).join(full_data[:train_size].groupby(col)[target_col].agg([np.sum,np.size]),how='left')        
#     meanTagetAggr = np.mean(aggr['mean'].values)
#     aggr=full_data.join(aggr,how='left', on=col)[list(aggr.columns)+[target_col]]
#     loo_col = 'MEAN_BY_'+col+'_'+target_col
#     full_data[loo_col] = \
#     aggr.apply(lambda row: row['mean'] if math.isnan(row[target_col]) 
#                                                        else (row['sum']-row[target_col])/(row['size']-1)*random.uniform(0.95, 1.05) , axis=1)
#     loo_cols.append(loo_col)
#     print ("New feature %s created." % (loo_col))
# print ('Leave-one-out enconding finished in %f seconds' % (time.time()-start))

### Numeric features
#### 1. Box-cox transformation / log transformation
#### 2. Standardize scaler

In [15]:
# compute skew and do Box-Cox transformation (Tilli)
def BoxCox4Skew(df, train_num, numeric_feats, cutoff=0.25):
    skewed_feats = df.ix[:train_num, numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > cutoff].index
    print("\nSkew in numeric features:")
    print(skewed_feats)
    
    for feats in skewed_feats:
        df[feats] = df[feats] - df[feats].min() + 1
        df[feats], lam = boxcox(df[feats])
    return df

def log4Skew(df, train_num, numeric_feats, cutoff=0.25):
    skewed_feats = df.ix[:train_num, numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > cutoff].index
    print("\nSkew in numeric features:")
    print(skewed_feats)
    
    for feats in skewed_feats:
        df[feats] = np.log(df[feats] - df[feats].min() + 1)
    return df    

In [16]:
#### 1. skew transformation for numeric_feats
cutoff = 0.25 # other values
df = BoxCox4Skew(df, train_num, num_cols, cutoff)


Skew in numeric features:
Index([u'lin_cont1+lin_cont2', u'lin_cont1+lin_cont3', u'lin_cont1+lin_cont4',
       u'lin_cont1+lin_cont5', u'lin_cont1+lin_cont6', u'lin_cont1+lin_cont7',
       u'lin_cont1+lin_cont8', u'lin_cont1+lin_cont9', u'lin_cont1+lin_cont10',
       u'lin_cont1+lin_cont11',
       ...
       u'lin_cont9/lin_cont14', u'lin_cont10/lin_cont11',
       u'lin_cont10/lin_cont12', u'lin_cont10/lin_cont13',
       u'lin_cont10/lin_cont14', u'lin_cont11/lin_cont13',
       u'lin_cont11/lin_cont14', u'lin_cont12/lin_cont13',
       u'lin_cont12/lin_cont14', u'lin_cont13/lin_cont14'],
      dtype='object', length=304)


In [17]:
#### 2. Standardize numeric_feats
SSL = preprocessing.StandardScaler()
df[num_cols] = SSL.fit_transform(df[num_cols].values).astype(np.float32)

### Input data

In [14]:
train_x = df[:train_num]
test_x = df[train_num:]

#### One-hot encoding
#df_sparse = sparse.hstack((df_sparse,df[num_cols]), format='csr')
#print (df_sparse.shape)
#train_x = df_sparse[:train_num]
#test_x = df_sparse[train_num:]
#ID = df.id[:train_num].values

#xgtrain = xgb.DMatrix(train_x, label=train_target) #used for Bayersian Optimization
del df

# Model Selection

In [18]:
def logregobj(labels, preds):
    con = 2
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess 

def log_mae(y,yhat):
    return mean_absolute_error(np.exp(y)-shift, np.exp(yhat)-shift)

log_mae_scorer = metrics.make_scorer(log_mae, greater_is_better = False)

def xg_eval_mae(yhat, dtrain, shift=200):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-shift, np.exp(yhat)-shift)

In [29]:
# 2122, 338, 743, 1542
train_x = pd.concat((X_train, X_val), axis = 0, ignore_index = True)
train_target = pd.concat((y_train, y_val), axis = 0, ignore_index = True)
from sklearn.cross_validation import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_x, train_target, train_size=.80, random_state=1542)

In [30]:
params = {'n_estimators': 20000, 'learning_rate': 0.03, \
          'max_depth': 12, 'min_child_weight': 100, \
          'subsample': 0.7, 'colsample_bytree': 0.7, \
          'gamma': 0, 'objective': 'reg:linear', \
          'reg_alpha': 0, 'reg_lambda': 1
}

clf = xgb.XGBRegressor(n_estimators = params['n_estimators'], learning_rate = params['learning_rate'], \
                        max_depth = params['max_depth'], min_child_weight = params['min_child_weight'], \
                        subsample = params['subsample'], colsample_bytree = params['colsample_bytree'], \
                        gamma = params['gamma'], objective = params['objective'], \
                        reg_alpha = params['reg_alpha'], reg_lambda = params['reg_lambda'], missing = np.nan, \
                        silent = True, nthread = 1, seed = 1542)

clf.fit(X_train, y_train, eval_set=[(X_val,y_val)], eval_metric=xg_eval_mae, early_stopping_rounds=20)

[0]	validation_0-mae:3225.1
Will train until validation_0-mae hasn't improved in 20 rounds.
[1]	validation_0-mae:3224.62
[2]	validation_0-mae:3224.04
[3]	validation_0-mae:3223.34
[4]	validation_0-mae:3222.53
[5]	validation_0-mae:3221.56
[6]	validation_0-mae:3220.44
[7]	validation_0-mae:3219.13
[8]	validation_0-mae:3217.62
[9]	validation_0-mae:3215.88
[10]	validation_0-mae:3213.88
[11]	validation_0-mae:3211.61
[12]	validation_0-mae:3209.04
[13]	validation_0-mae:3206.13
[14]	validation_0-mae:3202.86
[15]	validation_0-mae:3199.2
[16]	validation_0-mae:3195.12
[17]	validation_0-mae:3190.6
[18]	validation_0-mae:3185.6
[19]	validation_0-mae:3180.09
[20]	validation_0-mae:3174.04
[21]	validation_0-mae:3167.43
[22]	validation_0-mae:3160.24
[23]	validation_0-mae:3152.42
[24]	validation_0-mae:3143.96
[25]	validation_0-mae:3134.83
[26]	validation_0-mae:3125.01
[27]	validation_0-mae:3114.5
[28]	validation_0-mae:3103.24
[29]	validation_0-mae:3091.25
[30]	validation_0-mae:3078.5
[31]	validation_0-mae:

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=12,
       min_child_weight=100, missing=None, n_estimators=20000, nthread=1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1542, silent=True, subsample=0.7)

In [31]:
fscores = clf.booster().get_fscore()

importances = np.zeros(X_train.shape[1])
for k, v in fscores.iteritems():
    #importances[int(k[1:])] = v
    print k, v

cat50_cat9 59
cat111_cat35 1
cat50_cat4 12
cat10_cat13 17
cat76_cat46 3
cat10_cat11 20
cat10_cat16 7
cat46_cat12 1
cat50_cat2 60
cat50_cat3 20
cat40_cat22 3
cat6_cat76 24
cat6_cat70 1
cat6_cat72 71
cat6_cat73 61
cat24_cat23 2
cat6_cat79 91
cat111_cat11 61
cat111_cat10 48
cat111_cat13 69
cat111_cat12 66
cat1_cat68 2
cat1_cat69 3
cat7_cat90 8
cat73_cat40 37
lin_cont1/lin_cont5 131
cat1_cat62 1
cat1_cat63 2
cat10_cat12 21
lin_cont13-lin_cont14 178
cat50_cat5 19
cat50_cat63 1
lin_cont10/lin_cont14 341
lin_cont2-lin_cont12 58
cat34_cat57 15
lin_cont2-lin_cont14 71
cat80_cat10 22
cat79_cat11 17
cat79_cat10 26
cat79_cat13 23
cat79_cat12 79
cat40_cat3 12
cat80_cat89 7
cat79_cat16 4
cat103_cat2 101
cat103_cat3 25
cat24_cat47 1
cat24_cat40 21
cat103_cat7 11
cat103_cat4 60
cat73_cat58 1
cat14_cat16 3
lin_cont8*lin_cont14 482
lin_cont8*lin_cont11 101
lin_cont8*lin_cont10 78
lin_cont8*lin_cont13 102
lin_cont8*lin_cont12 81
cat40_cat68 1
cat1_cat13 26
lin_cont12-lin_cont13 157
lin_cont12-lin_cont14 

# Merge Results

In [121]:
import numpy as np
import pandas as pd

Repnum = 5

pred = pd.read_csv('../input/test.csv', usecols=['id'])
for k in range(Repnum):
    sub_pred = pd.read_csv('../output/sub_final_mcmc30K_test1_'+str(k)+'.csv', usecols = ['loss'])
    sub_pred.columns = ['pred_'+str(k)]
    pred = pd.concat([pred, sub_pred], axis = 1)
aList = [item for i, item in enumerate(pred.columns.ravel()) if 'pred' in item]
pred['avgPred'] = pred[aList].mean(axis = 1)
pred.to_csv('../output/final_mcmc30K_test1.csv', index = False)
pred[['id','avgPred']].to_csv('../output/sub_final_mcmc30K_test1.csv', header = ['id','loss'], index = False)

In [4]:
sub1 = pd.read_csv('../output/sub_xgb_starter.csv', usecols = ['id','loss'])
sub2 = pd.read_csv('../output/test_XGB_CatComb38_OHE.csv', usecols = ['avgPred'])
sub3 = pd.read_csv('../output/test_GBM_ContNS_OHE.csv', usecols = ['avgPred'])
sub4 = pd.read_csv('../output/sub_keras_starter.csv', usecols = ['loss'])
sub5 = pd.read_csv('../output/sub_keras_mean.csv', usecols = ['loss'])
sub6 = pd.read_csv('../output/test_GBM_Comb101+5_OHE_v1.csv', usecols = ['avgPred'])
sub7 = pd.read_csv('../output/test_GBM_Comb101+5_OHE_v2.csv', usecols = ['avgPred'])
sub = pd.concat([sub1, sub2, sub3, sub4, sub5, sub6, sub7], axis = 1)
sub.columns =  ['id', 'Pred_1', 'Pred_2', 'Pred_3', 'Pred_4', 'Pred_5', 'Pred_6', 'Pred_7']
#aList = [item for i, item in enumerate(sub.columns.ravel()) if 'Pred' in item]
#sub['loss'] = sub[aList].mean(axis = 1)
sub['loss'] = 0.1 * sub.Pred_1 + 0.15 * (sub.Pred_5+sub.Pred_4+sub.Pred_2+sub.Pred_3)  + 0.15 * (sub.Pred_6+sub.Pred_7)
sub[['id','loss']].to_csv('../output/submission.csv', index = False)

In [2]:
import numpy as np
import pandas as pd
sub1 = pd.read_csv('../output/test_GBM_Comb101+5_OHE_v1.csv', usecols = ['id', 'avgPred'])
sub2 = pd.read_csv('../output/test_GBM_Comb101+5_OHE_v2.csv', usecols = ['avgPred'])
sub = pd.concat([sub1, sub2], axis = 1)
sub.columns = ['id', 'Pred_1', 'Pred_2']
sub['loss'] = 0.5 * (sub.Pred_1 + sub.Pred_2)
sub[['id','loss']].to_csv('../output/submission.csv', index = False)
sub.head(2)

Unnamed: 0,id,Pred_1,Pred_2,loss
0,4,1475.959344,1512.830259,1494.394801
1,6,2083.024392,2011.230726,2047.127559


# Ensemble

In [13]:
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.linear_model import Ridge

In [104]:
sub1 = pd.read_csv('../output/cv_train_XGB_starter.csv') # 1113.30717
sub2 = pd.read_csv('../output/cv_train_XGB_CatComb38_OHE.csv') # 1112.23090
sub3 = pd.read_csv('../output/cv10_train_LGB_ContNS_OHE.csv') # 1111.13920 (*)
sub4 = pd.read_csv('../output/cv_train_LGB_Comb101+5_OHE_v1.csv') # 1110.53253
sub5 = pd.read_csv('../output/cv10_train_LGB_Comb101+5_OHE_v2.csv') # 1109.69728 (*)
sub6 = pd.read_csv('../output/cv_train_XGB_Comb101+20_OHE.csv') # 1109.41721 (*)
sub7 = pd.read_csv('../output/cv_train_Keras_Basic10.csv', usecols = ['pred_0','pred_1','pred_2','pred_3','pred_4','pred_5','pred_6','pred_7','pred_8','pred_9']) # 1114.18662 (*)
#sub7 = pd.read_csv('../output/cv_train_Keras_Basic10.csv', usecols = ['avgPred']) # 1114.18662 (*)
sub8 = pd.read_csv('../output/cv_train_LGB_Comb101+20_OHE.csv') # 1112.87792
sub9 = pd.read_csv('../output/cv_train_Keras_mt.csv', usecols = ['avgPred']) # 1116.36436
sub10 = pd.read_csv('../output/cv10_train_LGB_Comb101+20Aft_LE_fairobj.csv') # 1113.95720
sub11 = pd.read_csv('../output/cv_train_XGB_Comb101+20_LE_fairobj.csv') # 1110.89367
sub12 = pd.read_csv('../output/cv10_train_LGB_Comb101+20Aft_LE.csv') # 1111.44509 (*)
sub13 = pd.read_csv('../output/cv10_train_LGB_CombC+20Aft_LEC1.csv')
sub14 = pd.read_csv('../output/cv10_train_LGB_CombC+20Aft_LE.csv')
sub15 = pd.read_csv('../output/cv10_train_LGB_CombC+20Aft_LEC2.csv')
sub16 = pd.read_csv('../output/cv10_train_LGB_CombC+20Aft_LE2.csv')

train = pd.concat([sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15, sub16], axis = 1).as_matrix()
#train = pd.concat([sub1, sub2, sub3, sub5, sub6, sub7, sub9, sub11, sub13], axis = 1).as_matrix()
train_id = pd.read_csv('../input/train.csv', usecols = ['id','loss'])

train_y = train_id.loss.values

#lift = 200
#train_y = np.log(train_id.loss.values + lift)
#train = np.log(train + lift)

In [105]:
sub1 = pd.read_csv('../output/cv_test_XGB_starter.csv', usecols = ['avgPred'])
sub2 = pd.read_csv('../output/cv_test_XGB_CatComb38_OHE.csv', usecols = ['avgPred'])
sub3 = pd.read_csv('../output/cv10_test_LGB_ContNS_OHE.csv', usecols = ['avgPred'])
sub4 = pd.read_csv('../output/cv_test_LGB_Comb101+5_OHE_v1.csv', usecols = ['avgPred'])
sub5 = pd.read_csv('../output/cv10_test_LGB_Comb101+5_OHE_v2.csv', usecols = ['avgPred'])
sub6 = pd.read_csv('../output/cv_test_XGB_Comb101+20_OHE.csv', usecols = ['avgPred'])
sub7 = pd.read_csv('../output/cv_test_Keras_Basic10.csv', usecols = ['pred_0','pred_1','pred_2','pred_3','pred_4','pred_5','pred_6','pred_7','pred_8','pred_9'])
#sub7 = pd.read_csv('../output/cv_test_Keras_Basic10.csv', usecols = ['avgPred'])
sub8 = pd.read_csv('../output/cv_test_LGB_Comb101+20_OHE.csv', usecols = ['avgPred'])
sub9 = pd.read_csv('../output/cv_test_Keras_mt.csv', usecols = ['avgPred'])
sub10 = pd.read_csv('../output/cv10_test_LGB_Comb101+20Aft_LE_fairobj.csv', usecols = ['avgPred'])
sub11 = pd.read_csv('../output/cv_test_XGB_Comb101+20_LE_fairobj.csv', usecols = ['avgPred'])
sub12 = pd.read_csv('../output/cv10_test_LGB_Comb101+20Aft_LE.csv', usecols = ['avgPred'])
sub13 = pd.read_csv('../output/cv10_test_LGB_CombC+20Aft_LEC1.csv', usecols = ['avgPred'])
sub14 = pd.read_csv('../output/cv10_test_LGB_CombC+20Aft_LE.csv', usecols = ['avgPred'])
sub15 = pd.read_csv('../output/cv10_test_LGB_CombC+20Aft_LEC2.csv', usecols = ['avgPred'])
sub16 = pd.read_csv('../output/cv10_test_LGB_CombC+20Aft_LE2.csv', usecols = ['avgPred'])

test_1 = pd.concat([sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15, sub16], axis = 1).as_matrix()
#test_1 = pd.concat([sub1, sub2, sub3, sub5, sub6, sub7, sub9, sub11, sub13], axis = 1).as_matrix()
#test_1 = np.log(test_1 + lift)

sub1 = pd.read_csv('../output/all_test_XGB_starter.csv', usecols = ['avgPred'])
sub2 = pd.read_csv('../output/all_test_XGB_CatComb38_OHE.csv', usecols = ['avgPred'])
sub3 = pd.read_csv('../output/all_test_LGB_ContNS_OHE.csv', usecols = ['avgPred'])
sub4 = pd.read_csv('../output/all_test_LGB_Comb101+5_OHE_v1.csv', usecols = ['avgPred'])
sub5 = pd.read_csv('../output/all_test_LGB_Comb101+5_OHE_v2.csv', usecols = ['avgPred'])
sub6 = pd.read_csv('../output/all_test_XGB_Comb101+20_OHE.csv', usecols = ['avgPred'])
sub7 = pd.read_csv('../output/cv_test_Keras_Basic10.csv', usecols = ['pred_0','pred_1','pred_2','pred_3','pred_4','pred_5','pred_6','pred_7','pred_8','pred_9'])
#sub7 = pd.read_csv('../output/cv_test_Keras_Basic10.csv', usecols = ['avgPred'])
sub8 = pd.read_csv('../output/all_test_LGB_Comb101+20_OHE.csv', usecols = ['avgPred'])
sub9 = pd.read_csv('../output/cv_test_Keras_mt.csv', usecols = ['avgPred'])
sub10 = pd.read_csv('../output/all_test_LGB_Comb101+20Aft_LE_fairobj.csv', usecols = ['avgPred'])
sub11 = pd.read_csv('../output/all_test_XGB_Comb101+20_LE_fairobj.csv', usecols = ['avgPred'])
sub12 = pd.read_csv('../output/all_test_LGB_Comb101+20Aft_LE.csv', usecols = ['avgPred'])
sub13 = pd.read_csv('../output/all_test_LGB_CombC+20Aft_LEC1.csv', usecols = ['avgPred'])
sub14 = pd.read_csv('../output/cv10_test_LGB_CombC+20Aft_LE.csv', usecols = ['avgPred'])
sub15 = pd.read_csv('../output/all_test_LGB_CombC+20Aft_LEC2.csv', usecols = ['avgPred'])
sub16 = pd.read_csv('../output/cv10_test_LGB_CombC+20Aft_LE2.csv', usecols = ['avgPred'])

test_2 = pd.concat([sub1, sub2, sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15, sub16], axis = 1).as_matrix()
#test_2 = pd.concat([sub1, sub2, sub3, sub5, sub6, sub7, sub9, sub11, sub13], axis = 1).as_matrix()
#test_2 = np.log(test_2 + lift)

#sub1 = pd.read_csv('../output/cv_test_XGB_starter.csv', usecols = ['Pred_1', 'Pred_2', 'Pred_3', 'Pred_4'])
#sub2 = pd.read_csv('../output/cv_test_XGB_CatComb38_OHE.csv', usecols = ['Pred_1', 'Pred_2', 'Pred_3', 'Pred_4'])
#sub3 = pd.read_csv('../output/cv_test_GBM_ContNS_OHE.csv', usecols = ['Pred_1', 'Pred_2', 'Pred_3', 'Pred_4'])
#sub4 = pd.read_csv('../output/cv_test_GBM_Comb101+5_OHE_v1.csv', usecols = ['Pred_1', 'Pred_2', 'Pred_3', 'Pred_4'])
#sub5 = pd.read_csv('../output/cv_test_GBM_Comb101+5_OHE_v2.csv', usecols = ['Pred_1', 'Pred_2', 'Pred_3', 'Pred_4'])

#sub1 = sub1.apply(lambda x: np.log(x+lift))
#sub2 = sub2.apply(lambda x: np.log(x+lift))
#sub3 = sub3.apply(lambda x: np.log(x+lift))
#sub4 = sub4.apply(lambda x: np.log(x+lift))
#sub5 = sub5.apply(lambda x: np.log(x+lift))

#sub1['avg'] = sub1.mean(axis = 1)
#sub2['avg'] = sub2.mean(axis = 1)
#sub3['avg'] = sub3.mean(axis = 1)
#sub4['avg'] = sub4.mean(axis = 1)
#sub5['avg'] = sub5.mean(axis = 1)

#test = pd.concat([sub1.avg, sub2.avg, sub3.avg, sub4.avg, sub5.avg], axis = 1).as_matrix()

test_id = pd.read_csv('../input/test.csv', usecols = ['id'])

## Ridge

In [83]:
sub7s = pd.read_csv('../output/cv_train_Keras_Basic10.csv', usecols = ['pred_6','pred_9', 'avgPred']) # 1114.18662 (*)
train = pd.concat([sub3, sub5, sub6, sub7, sub8, sub12], axis = 1).as_matrix()
   
lift = 200
train_y = train_id.loss.values
train_y = np.log(train_id.loss.values + lift)
train = np.log(train + lift)

In [128]:
def log_mae(labels,preds):
    lift=200
    return mean_absolute_error(np.exp(labels)-lift, np.exp(preds)-lift)

log_mae_scorer = metrics.make_scorer(log_mae, greater_is_better = False)

print  ("Blending.")
param_grid = {
    'alpha':[0,0.00001,0.00003,0.0001,0.0003,0.001,0.003,0.01,0.03,0.1,0.3,1,3,10,15,20,25,30,35,40,45,50,55,60,70]
              }

est = Ridge()
model = GridSearchCV(estimator = est, param_grid = param_grid, scoring = log_mae_scorer, verbose    = 10, 
                     n_jobs = 1, iid = True, refit = True, cv = 10)

model.fit(train, train_y)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:", model.best_params_)
print("Scores:", model.grid_scores_)

Blending.
Fitting 10 folds for each of 25 candidates, totalling 250 fits
[CV] alpha=0 .........................................................
[CV] ............................ alpha=0, score=-1131.129363 -   0.6s
[CV] alpha=0 .........................................................
[CV] ............................ alpha=0, score=-1112.987006 -   0.1s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] ............................ alpha=0, score=-1127.188104 -   0.2s
[CV] alpha=0 .........................................................
[CV] ............................ alpha=0, score=-1109.760597 -   0.1s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.2s remaining:    0.0s


[CV] ............................ alpha=0, score=-1134.126635 -   0.1s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.3s remaining:    0.0s


[CV] ............................ alpha=0, score=-1127.034504 -   0.3s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.6s remaining:    0.0s


[CV] ............................ alpha=0, score=-1124.752708 -   0.2s
[CV] alpha=0 .........................................................
[CV] ............................ alpha=0, score=-1126.266737 -   0.1s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    2.0s remaining:    0.0s


[CV] ............................ alpha=0, score=-1114.263827 -   0.2s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    2.2s remaining:    0.0s


[CV] ............................ alpha=0, score=-1115.020753 -   0.2s
[CV] alpha=1e-05 .....................................................
[CV] ........................ alpha=1e-05, score=-1131.129363 -   0.1s
[CV] alpha=1e-05 .....................................................
[CV] ........................ alpha=1e-05, score=-1112.987006 -   0.2s
[CV] alpha=1e-05 .....................................................
[CV] ........................ alpha=1e-05, score=-1127.188104 -   0.2s
[CV] alpha=1e-05 .....................................................
[CV] ........................ alpha=1e-05, score=-1109.760597 -   0.1s
[CV] alpha=1e-05 .....................................................
[CV] ........................ alpha=1e-05, score=-1134.126635 -   0.1s
[CV] alpha=1e-05 .....................................................
[CV] ........................ alpha=1e-05, score=-1127.034504 -   0.2s
[CV] alpha=1e-05 .....................................................
[CV] .

[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:   43.3s finished


In [129]:
est = Ridge(alpha = 0)
est.fit(train, train_y)
print est.coef_, est.intercept_

[  7.72705730e-03   5.49269769e-02   1.39671526e-01   1.08675089e-02
   1.82746996e-01   7.14249004e-02   3.07320651e-02   6.35371004e-02
   1.48541193e-02   2.81236254e-02   5.12699376e-02   3.71828420e-02
   1.18318321e-02   1.58742066e-02  -1.02945099e-01  -2.00651189e-02
  -1.61213080e-01   2.16902374e-05  -8.59897012e-02   5.81482729e-02
   8.06269338e-02   1.96739080e-01   2.96515609e-03   8.07734988e-02
   2.24181209e-01] 0.0456593318716


In [130]:
fold = 10
train_pred = np.zeros((train.shape[0], 1))
test_pred1 = np.zeros((test_1.shape[0], fold))
test_pred2 = np.zeros((test_2.shape[0], fold))
    
skf = list(KFold(len(train_y), fold))
scores=[]
for i, (tr, val) in enumerate(skf):
    est = Ridge(alpha = 0)
    est.fit(train[tr], train_y[tr])
    print est.coef_, est.intercept_
    train_pred[val,0] = est.predict(train[val])
    test_pred1[:,i] = est.predict(test_1)
    test_pred2[:,i] = est.predict(test_2)
    score = log_mae(train_y[val], train_pred[val,0])
    print (score)
    scores.append(score)
print (-np.mean(scores))

[  3.45626027e-03   7.71621468e-02   1.35709347e-01   2.39002732e-03
   1.79088790e-01   1.00475250e-01   2.52962325e-02   6.66694376e-02
   2.44503964e-02   2.22157795e-02   5.40704424e-02   4.05129240e-02
   1.74568638e-03   1.78629941e-02  -1.05544674e-01  -1.73945013e-02
  -1.88407324e-01   1.48116504e-05  -8.97182193e-02   4.93459618e-02
   7.34062441e-02   2.06110591e-01   2.43821426e-02   7.34686276e-02
   2.16520121e-01] 0.0521256436468
1131.12936286
[  4.70861719e-03   4.85930856e-02   1.44356406e-01   3.26079960e-02
   1.72340192e-01   8.33806429e-02   3.18502847e-02   6.23587880e-02
   4.44350124e-03   2.78065388e-02   4.63735189e-02   4.75026614e-02
   2.19344884e-04   2.65778508e-02  -9.72871318e-02  -1.89666034e-02
  -1.76677875e-01   1.11196651e-04  -7.15243385e-02   5.08145342e-02
   6.17434983e-02   2.13961268e-01   9.71790106e-03   7.90216811e-02
   2.10508138e-01] 0.0422236005909
1112.98700574
[ -9.46476244e-03   6.09660494e-02   1.48768209e-01  -5.06781629e-03
   1.

In [131]:
train_id['pred'] = np.exp(train_pred) - lift
train_id[['id','pred','loss']].to_csv('../output/final_ridge_train.csv', index = False)

test_ridge_1 = (np.exp(test_pred1) - lift).mean(axis = 1)
test_ridge_2 = (np.exp(test_pred2) - lift).mean(axis = 1)

test_id['loss'] = test_ridge_1
test_id.to_csv('../output/final_ridge_test1.csv', index = False)

test_id['loss'] = test_ridge_2
test_id.to_csv('../output/final_ridge_test2.csv', index = False)

test_id['loss'] = 0.5*test_ridge_1+0.5*test_ridge_2
test_id.to_csv('../output/final_ridge_test.csv', index = False)

In [102]:
dat1 = pd.read_csv('../output/best/final_ridge_test1.csv')
dat2 = pd.read_csv('../output/best/sub_final_mcmc10Ks_test1.csv')
data = pd.concat([dat1, dat2.loss], axis = 1)
data.head()

Unnamed: 0,id,loss,loss.1
0,4,1520.416589,1524.44289
1,6,1927.429379,1935.53801
2,9,9191.731758,9500.035905
3,12,6403.205811,6650.414665
4,15,784.556758,801.872597


In [103]:
data.columns = ['id', 'ridge', 'mcmc']
data['loss'] = 0.5 * data.ridge + 0.5 * data.mcmc
data[['id','loss']].to_csv('../output/findal_ridge_mcmc10Ks_test1.csv', index = False)

## XGBoost

In [23]:
import xgboost as xgb

In [24]:
# XGBoost gblinear
def xg_eval_mae(yhat, dtrain,lift=200):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-lift, np.exp(yhat)-lift)

params = {
    'eta': 0.1,
    'booster': 'gblinear',
    'lambda': 0,
    'alpha': 0, # you can try different values for alpha, e.g. [0, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
    'lambda_bias' : 0,
    'silent': 0,
    'verbose_eval': False,
    'seed': 1234
}

xgb.cv(params,
       xgb.DMatrix(train, label=train_y,missing=np.nan),
       num_boost_round=100000, nfold=4, feval=xg_eval_mae,seed=1234,callbacks=[xgb.callback.early_stop(500)])

Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 500 rounds.
Stopping. Best iteration:
[32632]	train-mae:1124.98+2.15005	test-mae:1125.01+6.10645



Unnamed: 0,test-mae-mean,test-mae-std,train-mae-mean,train-mae-std
0,2899.541016,29.602490,2899.538940,25.677586
1,1813.552368,36.800550,1813.578217,33.975418
2,1307.419647,12.278176,1307.455902,8.750483
3,1211.776306,5.924390,1211.761963,1.392287
4,1193.792206,5.353592,1193.765595,1.080698
5,1189.666168,5.314946,1189.639404,1.200382
6,1188.554901,5.288341,1188.527191,1.245769
7,1188.206146,5.278724,1188.177979,1.262266
8,1188.073333,5.278633,1188.045257,1.263735
9,1188.004547,5.279331,1187.976562,1.265115


In [56]:
def xg_eval_mae(yhat, dtrain,lift=200):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-lift, np.exp(yhat)-lift)

def logregobj(labels, preds):
    con = 2
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess 

def xgb_logregobj(preds, dtrain):
    con = 2
    labels = dtrain.get_label()
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess

params = {
    'eta': 0.1,
    'booster': 'gblinear',
    'obj': xgb_logregobj,
    'lambda': 0,
    'alpha': 0, # you can try different values for alpha
    'lambda_bias' : 0,
    'silent': 0,
    'verbose_eval': False,
    'seed': 1234
}

fold = 4
train_pred = np.zeros((train.shape[0], 1))
test_pred1 = np.zeros((test.shape[0], fold))
test_pred2 = np.zeros((test.shape[0], fold))

skf = list(KFold(len(train_y), fold))
scores=[]
for i, (tr, val) in enumerate(skf):
    xgtrain = xgb.DMatrix(train[tr], label=train_y[tr])
    xgval=xgb.DMatrix(train[val], label=train_y[val])
    watchlist  = [ (xgtrain,'train'),(xgval,'eval')]
    
    est = xgb.train(params, xgtrain,
          num_boost_round=100000,
          evals = watchlist,
          feval=xg_eval_mae,
          early_stopping_rounds=500
          )
    train_pred[val,0] = est.predict(xgb.DMatrix(train[val]))
    test_pred1[:,i] = est.predict(xgb.DMatrix(test_1))
    test_pred2[:,i] = est.predict(xgb.DMatrix(test_2))
    score = log_mae(train_y[val], train_pred[val,0])
    print (score)
    scores.append(score)
print (-np.mean(scores))


[0]	train-mae:2917.45	eval-mae:2906.99
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 500 rounds.
[1]	train-mae:1819.26	eval-mae:1801.81
[2]	train-mae:1312.36	eval-mae:1297.12
[3]	train-mae:1212.37	eval-mae:1201.83
[4]	train-mae:1195.24	eval-mae:1186.72
[5]	train-mae:1191.05	eval-mae:1183.19
[6]	train-mae:1189.87	eval-mae:1182.19
[7]	train-mae:1189.5	eval-mae:1181.87
[8]	train-mae:1189.34	eval-mae:1181.72
[9]	train-mae:1189.27	eval-mae:1181.66
[10]	train-mae:1189.2	eval-mae:1181.6
[11]	train-mae:1189.17	eval-mae:1181.56
[12]	train-mae:1189.12	eval-mae:1181.51
[13]	train-mae:1189.07	eval-mae:1181.47
[14]	train-mae:1189.03	eval-mae:1181.42
[15]	train-mae:1188.98	eval-mae:1181.38
[16]	train-mae:1188.93	eval-mae:1181.33
[17]	train-mae:1188.88	eval-mae:1181.28
[18]	train-mae:1188.84	eval-mae:1181.24
[19]	train-mae:1188.79	eval-mae:1181.19
[20]	train-mae:1188.74	eval-mae:1181.14
[21]	train-mae:1188.7	eval-mae:

In [57]:
train_id['prediction'] = np.exp(train_pred) - lift
train_id[['id','prediction','loss']].to_csv('../output/final_gbline_train.csv', index = False)

test_gbline_1 = (np.exp(test_pred1) - lift).mean(axis = 1)
test_gbline_2 = (np.exp(test_pred2) - lift).mean(axis = 1)

test_id['loss'] = test_gbline_1
test_id.to_csv('../output/final_gbline_test1.csv', index = False)

test_id['loss'] = test_gbline_2
test_id.to_csv('../output/final_gbline_test2.csv', index = False)

test_id['loss'] = 0.5*test_gbline_1+0.5*test_gbline_2
test_id.to_csv('../output/final_gbline_test.csv', index = False)

In [None]:
test_id['loss'] = 0.5*test_gbline_1+0.5*test_gbline_2

In [26]:
xgb_model=xgb.train(params, xgb.DMatrix(train, label=train_y,missing=np.nan),
          num_boost_round=32632,
          feval=xg_eval_mae)

pred_y_gblinear = np.exp(xgb_model.predict(xgb.DMatrix(test))) - lift

results = pd.DataFrame()
results['id'] = test_id.id
results['loss'] = pred_y_gblinear
results.to_csv("../output/sub_gblinear.csv", index=False)
print ("Submission created.")

results = pd.DataFrame()
results['id'] = test_id.id
results['loss'] = 0.5*pred_y_ridge + 0.5*pred_y_gblinear
results.to_csv("../output/submission.csv", index=False)
print ("Submission created.")

Submission created.


In [4]:
pred_ridge = pd.read_csv('../output/test_final_ridge.csv')
pred_gblinear = pd.read_csv('../output/test_final_gbline.csv')
pd.DataFrame({'id': pred_ridge.id, 'loss': 0.5 * pred_ridge.loss + 0.5 * pred_gblinear.loss}).to_csv('../output/submission.csv', index = False)

# Ensemble by MCMC

In [156]:
train = pd.read_csv('../output/cv_train.csv')
sub3 = pd.read_csv('../output/cv10_train_LGB_ContNS_OHE.csv') # 1111.13920 (*)
sub5 = pd.read_csv('../output/cv10_train_LGB_Comb101+5_OHE_v2.csv') # 1109.69728 (*)
sub6 = pd.read_csv('../output/cv10_train_XGB_Comb101+20_OHE.csv') # 1111.13920 (*)
sub16 = pd.read_csv('../output/cv10_train_LGB_CombC+20Aft_LE2.csv')
sub17 = pd.read_csv('../output/cv10_train_LGB_Comb101+5_OHE_v3.csv')
sub18 = pd.read_csv('../output/cv10_train_XGB_Mb.csv')
#sub0 = pd.read_csv('../output/final_ridge_train.csv')
train['sub17'] = sub3.Pred
train['sub18'] = sub5.Pred
train['sub19'] = sub6.Pred
train['sub16'] = sub16.Pred
train['sub20'] = sub17.Pred
train['sub21'] = sub18.Pred
train = train.values
train_id = pd.read_csv('../input/train.csv', usecols = ['id','loss'])
train_y = train_id.loss.values

#lift = 200
#train_y = np.log(train_id.loss.values + lift)
#train = np.log(train + lift)

test_1 = pd.read_csv('../output/cv_test.csv')
sub3 = pd.read_csv('../output/cv10_test_LGB_ContNS_OHE.csv', usecols = ['avgPred']) # 1111.13920 (*)
sub5 = pd.read_csv('../output/cv10_test_LGB_Comb101+5_OHE_v2.csv', usecols = ['avgPred']) # 1109.69728 (*)
sub6 = pd.read_csv('../output/cv10_test_XGB_Comb101+20_OHE.csv', usecols = ['avgPred']) # 1111.13920 (*)
sub16 = pd.read_csv('../output/cv10_test_LGB_CombC+20Aft_LE2.csv', usecols = ['avgPred'])
sub17 = pd.read_csv('../output/cv10_test_LGB_Comb101+5_OHE_v3.csv', usecols = ['avgPred'])
sub18 = pd.read_csv('../output/cv10_test_XGB_Mb.csv', usecols = ['avgPred'])
#sub0 = pd.read_csv('../output/final_ridge_test1.csv')
test_1['sub17'] = sub3.avgPred
test_1['sub18'] = sub5.avgPred
test_1['sub19'] = sub6.avgPred
test_1['sub16'] = sub16.avgPred
test_1['sub20'] = sub17.avgPred
test_1['sub21'] = sub18.avgPred
test_1 = test_1.values
#test_1 = np.log(test_1 + lift)

test_2 = pd.read_csv('../output/all_test.csv')
sub16 = sub16 = pd.read_csv('../output/all_test_LGB_CombC+20Aft_LE2.csv', usecols = ['avgPred'])
sub17 = pd.read_csv('../output/all_test_LGB_Comb101+5_OHE_v3.csv', usecols = ['avgPred'])
sub18 = pd.read_csv('../output/cv10_test_XGB_Mb.csv', usecols = ['avgPred'])
#sub0 = pd.read_csv('../output/final_ridge_test2.csv')
test_2['sub17'] = test_2.sub3
test_2['sub18'] = test_2.sub5
test_2['sub19'] = test_2.sub6
test_2['sub16'] = sub16.avgPred
test_2['sub20'] = sub17.avgPred
test_2['sub21'] = sub18.avgPred
test_2 = test_2.values
#test_2 = np.log(test_2 + lift)

test_id = pd.read_csv('../input/test.csv', usecols = ['id'])

In [None]:
def MAE(pred,y):
    error = np.exp(pred) -np.exp(y)
    error = np.mean((error**2)**.5)
    return 'mcc error',error
    
# original form
def MAE2(pred,y):
    error = pred - y
    error = np.mean((error**2)**.5)
    return 'mcc error',error

#### NOW THE MCMC PART to find individal weights for ensemble####
num = train.shape[1]
weight = np.array([ 0.04513211,  0.01778232,  0.00634926,  0.03413778,  0.03508045,
        0.00471817,  0.04195227,  0.001     ,  0.0101694 ,  0.10815785,
        0.03417272,  0.01253025,  0.01596475,  0.001     ,  0.01670606,
        0.00337885,  0.02099652,  0.001     ,  0.01157573,  0.00764824,
        0.00898724,  0.04282531,  0.00283935,  0.04339588,  0.01611678,
        0.08784073,  0.09465961,  0.19837183,  0.04070897,  0.04876591])

# This is to define variables to be used later
Repnum = 10

train_mcmc=np.zeros((train.shape[0], Repnum))
test_mcmc_1=np.zeros((test_1.shape[0], Repnum))
test_mcmc_2=np.zeros((test_2.shape[0], Repnum))
for k in range(Repnum):
    pred_new = np.zeros(train.shape[0])
    pred_old = np.zeros(train.shape[0])
    counter = 0
    n=10000 ###MCMC steps
    result={}
    
    print('\n Finding weights by MCMC ...')
    for i in range(num):
        pred_new += train[:,i]*weight[i]
    pred_old = pred_new
    
    #### MCMC  #### 
    ### MCMC algo for dummies 
    ### 1. Get initialize ensemble weights
    ### 2. Generate new weights 
    ### 3. if MAE is lower, accept new weights immediately , or else accept new weights with probability of np.exp(-diff/.3)
    ### 4. repeat 2-3
    for i in range(n):
        new_weights = weight+ np.array([0.005,]*num)*np.random.normal(loc=0.0, scale=1.0, size=num)
        new_weights[new_weights < 0.001]=0.001 #0.01=>0.001
        pred_new=np.zeros(train.shape[0])
        for ii in range(num):
            pred_new += train[:,ii]*new_weights[ii]
        diff = MAE2(pred_new,train_y)[1] - MAE2(pred_old,train_y)[1]
        prob = min(1,np.exp(-diff/.3)) #0.3 -> 0.5
        random_prob = np.random.rand()
        if random_prob < prob:
            weight = new_weights
            pred_old = pred_new
            result[i] = (MAE2(pred_new,train_y)[1] ,MAE2(pred_old,train_y)[1],prob,random_prob ,weight)
            counter +=1
    print (counter *1.0 / n, 'Acceptance Ratio') #keep this [0.4,0.6] for best results
    print ('best result MAE', sorted([result[i] for i in result])[0:1][0])
        
    weight=sorted([result[i] for i in result])[0:1][-1]
    weight = weight[-1]
        
    for i in range(num):
        train_mcmc[:,k] += train[:,i]*weight[i]
        test_mcmc_1[:,k] += test_1[:,i]*weight[i]
        test_mcmc_2[:,k] += test_2[:,i]*weight[i]
    print ('combined all features plus MCMC weights:',',MAE=', MAE2(train_mcmc[:,k],train_y))
    print ('weights:', weight)
### notice the weights do not necessarily sum to 1 ###

#train_mcmc=np.exp(train_mcmc) - lift
#test_mcmc_1=np.exp(test_mcmc_1) - lift
#test_mcmc_2=np.exp(test_mcmc_2) - lift

train_pred = pd.DataFrame(train_mcmc, columns = ['Pred_'+str(k) for k in range(Repnum)])
train_pred['avgPred'] = train_pred.mean(axis=1)
train_pred['id'] = train_id['id']
train_pred['loss'] = train_id['loss']
train_pred.to_csv('../output/final_mcmc30K_train_96.csv', index = False)

test_pred1 = pd.DataFrame(test_mcmc_1, columns = ['Pred_'+str(k) for k in range(Repnum)])
test_pred1['avgPred'] = test_pred1.mean(axis = 1)
test_pred1['id'] = test_id['id']
#test_pred1.to_csv('../output/final_mcmc30K_test1_0.csv', index = False)
test_pred1[['id','avgPred']].to_csv('../output/sub_final_mcmc30K_test1_96.csv', header = ['id','loss'], index = False)

test_pred2 = pd.DataFrame(test_mcmc_2, columns = ['Pred_'+str(k) for k in range(Repnum)])
test_pred2['avgPred'] = test_pred2.mean(axis = 1)
test_pred2['id'] = test_id['id']
#test_pred2.to_csv('../output/final_mcmc30K_test2_0.csv', index = False)
test_pred2[['id','avgPred']].to_csv('../output/sub_final_mcmc30K_test2_96.csv', header = ['id','loss'], index = False)

test_id['loss'] = 0.5*test_pred1['avgPred']+0.5*test_pred2['avgPred']
test_id.to_csv('../output/final_mcmc30K_test_96.csv', index = False)


 Finding weights by MCMC ...


In [154]:
#train_mcmc=np.exp(train_mcmc) - lift
#test_mcmc_1=np.exp(test_mcmc_1) - lift
#test_mcmc_2=np.exp(test_mcmc_2) - lift

train_pred = pd.DataFrame(train_mcmc, columns = ['Pred_'+str(k) for k in range(Repnum)])
train_pred['avgPred'] = train_pred.mean(axis=1)
train_pred['id'] = train_id['id']
train_pred['loss'] = train_id['loss']
train_pred.to_csv('../output/final_mcmc30K_train_0.csv', index = False)

test_pred1 = pd.DataFrame(test_mcmc_1, columns = ['Pred_'+str(k) for k in range(Repnum)])
test_pred1['avgPred'] = test_pred1.mean(axis = 1)
test_pred1['id'] = test_id['id']
test_pred1.to_csv('../output/final_mcmc30K_test1_0.csv', index = False)
test_pred1[['id','avgPred']].to_csv('../output/sub_final_mcmc30K_test1_0.csv', header = ['id','loss'], index = False)

test_pred2 = pd.DataFrame(test_mcmc_2, columns = ['Pred_'+str(k) for k in range(Repnum)])
test_pred2['avgPred'] = test_pred2.mean(axis = 1)
test_pred2['id'] = test_id['id']
test_pred2.to_csv('../output/final_mcmc30K_test2.csv', index = False)
test_pred2[['id','avgPred']].to_csv('../output/sub_final_mcmc30K_test2_0.csv', header = ['id','loss'], index = False)

test_id['loss'] = 0.5*test_pred1['avgPred']+0.5*test_pred2['avgPred']
test_id.to_csv('../output/final_mcmc30K_test_0.csv', index = False)

# Ensemble by minimization

In [None]:
from scipy.optimize import minimize
from sklearn.metrics import mean_absolute_error

def mae_func(weights):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = 0
    for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction

    return mean_absolute_error(Y_values, final_prediction)

print('\n Finding weights by minimization ...')
Y_values = train['loss2'].values
predictions = []
lls = []
wghts = []

for i in range(num):
    predictions.append(train[:,i])

for i in range(1000):
    starting_values = np.random.uniform(size=num)
    cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
    bounds = [(0,1)]*len(predictions)

    res = minimize(mae_func, starting_values, method='L-BFGS-B', bounds=bounds, options={'disp': False, 'maxiter': 100000})

    lls.append(res['fun'])
    wghts.append(res['x'])
    # Uncomment the next line if you want to see the weights and scores calculated in real time
    #print('Weights: {weights}  Score: {score}'.format(weights=res['x'], score=res['fun']))

bestSC = np.min(lls)
bestWght = wghts[np.argmin(lls)]

print('\n Ensemble Score: {best_score}'.format(best_score=bestSC))
print('\n Best Weights: {weights}'.format(weights=bestWght))

In [None]:
weight = np.asarray(bestWght)
test_min_1=np.zeros(test_1.shape[0])
for i in range(num):
    test_min_1 += test_1[:,i]*weight[i]

test_min_2=np.zeros(test_2.shape[0])
for i in range(num):
    test_min_2 += test_2[:,i]*weight[i]

test_id['loss'] = test_min_1
test_id.to_csv('../output/final_mcmc_test1.csv', index = False)

test_id['loss'] = test_min_2
test_id.to_csv('../output/final_mcmc_test2.csv', index = False)

test_id['loss'] = 0.5*test_min_1+0.5*test_min_2
test_id.to_csv('../output/final_mcmc_test.csv', index = False)