In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
import gc
import warnings

warnings.filterwarnings('ignore')

In [2]:
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score, r2_score, mean_squared_error, silhouette_score, normalized_mutual_info_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, HuberRegressor
from sklearn.cluster import KMeans
from sklearn.svm import SVR


In [3]:
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

from pystacknet.pystacknet import StackNetRegressor
from bayes_opt import BayesianOptimization


In [4]:
import tensorflow as tf
import keras
from keras import layers, models, optimizers
from keras.regularizers import L1L2
from tensorflow.keras.losses import Huber
import keras.backend as K

es = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

Using TensorFlow backend.


In [5]:
tr = pd.read_csv('./data/FIFA_train.csv').drop(['id', 'name'], axis=1)
te = pd.read_csv('./data/FIFA_test.csv').drop(['id', 'name'], axis=1)

sub = pd.read_csv('./data/submission.csv')

target = tr['value']
tr = tr.drop('value', axis=1)

In [6]:
tr['idx'] = np.ones(len(tr))
te['idx'] = np.zeros(len(te))
merge = pd.concat([tr, te])

categorical = ['continent', 'position', 'prefer_foot']
for c in categorical:
    lbe = LabelEncoder()
    merge[c] = lbe.fit_transform(merge[c])
    
dum = pd.get_dummies(merge['position'], prefix='position')
merge = pd.concat([merge, dum], axis=1)
merge = merge.drop('position', axis=1)

merge['age'] = (merge['age'] - np.min(merge['age']))/(np.max(merge['age']) - np.min(merge['age']))

tr = merge[merge['idx'] == 1].drop('idx', axis=1)
te = merge[merge['idx'] == 0].drop('idx', axis=1)

In [7]:
tr['left_cont'] = tr['contract_until'].copy()
te['left_cont'] = te['contract_until'].copy()

for i in range(len(tr['contract_until'])):
    try:
        tr['contract_until'][i] = int(tr['contract_until'][i])
        tr['left_cont'][i] = int(tr['left_cont'][i]) - 2018
    except:
        tr['contract_until'][i] = int(tr['contract_until'][i][len(tr['contract_until'][i])-4:len(tr['contract_until'][i])])
        tr['left_cont'][i] = tr['contract_until'][i] - 2018
        
for i in range(len(te['contract_until'])):
    try:
        te['contract_until'][i] = int(te['contract_until'][i])
        te['left_cont'][i] = int(te['left_cont'][i]) - 2018
    except:
        te['contract_until'][i] = int(te['contract_until'][i][len(te['contract_until'][i])-4:len(te['contract_until'][i])])
        te['left_cont'][i] = te['contract_until'][i] - 2018

In [8]:
tr = tr.astype('float')
te = te.astype('float')
    

In [9]:
tr['slack'] = tr['stat_potential'] - tr['stat_overall']
te['slack'] = te['stat_potential'] - te['stat_overall']

In [10]:
tr = tr.drop(['contract_until', 'continent'], axis=1)
te = te.drop(['contract_until', 'continent'], axis=1)

In [11]:
tr['stat_overall'] /= 100
te['stat_overall'] /= 100

tr['stat_potential'] /= 100
te['stat_potential'] /= 100

tr['reputation'] /= 5
te['reputation'] /= 5

tr['stat_skill_moves'] /= 5
te['stat_skill_moves'] /= 5





In [12]:
tr.head()

Unnamed: 0,age,prefer_foot,reputation,stat_overall,stat_potential,stat_skill_moves,position_0,position_1,position_2,position_3,left_cont,slack
0,0.576923,0.0,1.0,0.94,0.94,0.8,0.0,0.0,0.0,1.0,3.0,0.0
1,0.423077,1.0,0.8,0.91,0.93,0.2,0.0,1.0,0.0,0.0,2.0,2.0
2,0.576923,1.0,1.0,0.91,0.91,0.6,0.0,0.0,0.0,1.0,3.0,0.0
3,0.615385,1.0,0.8,0.91,0.91,0.6,1.0,0.0,0.0,0.0,2.0,0.0
4,0.346154,1.0,0.6,0.9,0.93,0.2,0.0,1.0,0.0,0.0,3.0,3.0


In [None]:
sns.distplot(tr['age'])

In [13]:
train_X = tr.copy()
train_y = target.copy()

test_X = te.copy()

In [None]:
lr = LinearRegression()
las = Lasso()
els = ElasticNet(l1_ratio=1)
rf = RandomForestRegressor(random_state=12,
                          max_depth=13)

test_models = [lr, las, els, rf]

In [None]:
for m in test_models:
    print(-1 * round(np.mean(cross_val_score(m, train_X, target, cv=4, scoring='neg_root_mean_squared_error')), 3))

In [14]:
tr_X, te_X, tr_y, te_y = train_test_split(train_X, target, test_size=0.3, random_state=12, shuffle=True)


In [15]:
def LGB_bayesian(
    #learning_rate,
    num_leaves, 
    bagging_fraction,
    feature_fraction,
    min_child_weight, 
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda
     ):
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    

    params = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'min_child_weight': min_child_weight,
              'bagging_fraction' : bagging_fraction,
              'feature_fraction' : feature_fraction,
#               'learning_rate' : 0.03,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'objective': 'regression',
              'save_binary': True,
              'seed': 12,
              'feature_fraction_seed': 12,
              'bagging_seed': 12,
              'drop_seed': 12,
              'data_random_seed': 12,
              'boosting': 'gbdt', ## some get better result using 'dart'
              'verbose': 1,
              'is_unbalance': False,
              'boost_from_average': True,
              'metric':'rmse'}    
    
    ## set clf options
    clf = lgb.LGBMRegressor(**params).fit(tr_X, tr_y, early_stopping_rounds=100,eval_set=[(te_X, te_y)], eval_metric='rmse', verbose=0)
    
    score = -mean_squared_error(te_y, clf.predict(te_X))/len(te_y)

    return score


In [16]:
bounds_LGB = {
    'num_leaves': (10, 500), 
    'min_data_in_leaf': (0, 150),
    'bagging_fraction' : (0.1, 0.9),
    'feature_fraction' : (0.1, 0.9),
#     'learning_rate': (0.01, 0.3),
    'min_child_weight': (0.001, 3),   
    'reg_alpha': (0.1, 3), 
    'reg_lambda': (0.1, 3),
    'max_depth':(3, 19),
}

In [17]:
optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=12)


In [18]:
init_points = 20
n_iter = 20

optimizer.maximize(init_points=init_points, n_iter=n_iter)


|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_da... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-3.114e+0[0m | [0m 0.2233  [0m | [0m 0.692   [0m | [0m 7.213   [0m | [0m 1.602   [0m | [0m 2.186   [0m | [0m 460.2   [0m | [0m 2.712   [0m | [0m 0.1969  [0m |
| [0m 2       [0m | [0m-2.224e+0[0m | [0m 0.8656  [0m | [0m 0.2098  [0m | [0m 7.541   [0m | [0m 1.819   [0m | [0m 141.6   [0m | [0m 427.8   [0m | [0m 0.1066  [0m | [0m 1.612   [0m |
| [0m 3       [0m | [0m-1.817e+0[0m | [0m 0.5416  [0m | [0m 0.4883  [0m | [0m 15.29   [0m | [0m 0.483   [0m | [0m 114.7   [0m | [0m 20.2    [0m | [0m 0.4921  [0m | [0m 0.4372  [0m |
| [0m 4       [0m | [0m-7.754e+0[0m | [0m 0.3479  [0m | [0m 0.6372  [0m | [0m 10.54   [0m | [0m 2.449   [0m | [0m 43.44   [0m | [0m 369

In [19]:
param_lgb = {
        'min_data_in_leaf': int(optimizer.max['params']['min_data_in_leaf']), 
        'num_leaves': int(optimizer.max['params']['num_leaves']), 
        #'learning_rate': LGB_BO.max['params']['learning_rate'],
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'bagging_fraction': optimizer.max['params']['bagging_fraction'], 
        'feature_fraction': optimizer.max['params']['feature_fraction'],
        'reg_lambda': optimizer.max['params']['reg_lambda'],
        'reg_alpha': optimizer.max['params']['reg_alpha'],
        'max_depth': int(optimizer.max['params']['max_depth']), 
        'objective': 'regression',
        'save_binary': True,
        'seed': 12,
        'feature_fraction_seed': 12,
        'bagging_seed': 12,
        'drop_seed': 12,
        'data_random_seed': 12,
        'boosting_type': 'gbdt',  # also consider 'dart'
        'verbose': 1,
        'is_unbalance': False,
        'boost_from_average': True,
        'metric':'rmse'
    }

params = param_lgb.copy()

In [20]:
lgb_clf = lgb.LGBMRegressor(**params, early_stoppong_rounds = 100)
lgb_clf.fit(train_X, target)

LGBMRegressor(bagging_fraction=0.21352301463492807, bagging_seed=12,
              boost_from_average=True, boosting_type='gbdt', class_weight=None,
              colsample_bytree=1.0, data_random_seed=12, drop_seed=12,
              early_stoppong_rounds=100, feature_fraction=0.6266369578052369,
              feature_fraction_seed=12, importance_type='split',
              is_unbalance=False, learning_rate=0.1, max_depth=7, metric='rmse',
              min_child_samples=20, min_child_weight=1.6463011973330988,
              min_data_in_leaf=1, min_split_gain=0.0, n_estimators=100,
              n_jobs=-1, num_leaves=459, objective='regression',
              random_state=None, reg_alpha=2.4953852940360384,
              reg_lambda=0.1430473985570573, save_binary=True, seed=12,
              silent=True, ...)

In [21]:
sub['value'] = lgb_clf.predict(test_X)
print(sub)

         id         value
0         1  7.374257e+07
1         2  1.058097e+08
2         4  6.953201e+07
3         5  7.238886e+07
4         6  6.256181e+07
...     ...           ...
3823  16924  8.344692e+04
3824  16929  6.467118e+04
3825  16932  7.770136e+04
3826  16937  5.971908e+04
3827  16943  5.667375e+04

[3828 rows x 2 columns]


In [22]:
sub.to_csv('./sub/pre/lgb2.csv', index=False)

In [23]:
# parmas for xgboost
params_fx = {'min_data_in_leaf': params['min_data_in_leaf'],
             'num_leaves': params['num_leaves'],
             'min_child_weight': params['min_child_weight'],
             'bagging_fraction': params['bagging_fraction'],
             'feature_fraction': params['feature_fraction'],
             'reg_lambda': params['reg_lambda'],
             'reg_alpha': params['reg_alpha'],
             'max_depth': params['max_depth'],
#              'obective': 'reg:squarederror',
             'eval_metric': 'rmse'
             
}

In [24]:
xgb_clf = xgb.XGBRegressor(
            **params_fx,
            early_stoppong_rounds = 100         
    ).fit(train_X.values, target)



In [25]:
sub['value'] = xgb_clf.predict(test_X.values)
print(sub)

         id         value
0         1  6.378290e+07
1         2  1.059075e+08
2         4  7.039196e+07
3         5  7.172346e+07
4         6  6.067336e+07
...     ...           ...
3823  16924  7.882162e+04
3824  16929  3.336033e+04
3825  16932  6.518775e+04
3826  16937  7.228119e+04
3827  16943  7.354127e+04

[3828 rows x 2 columns]


In [26]:
sub.to_csv('./sub/pre/xgb2.csv', index=False)

In [27]:
cat_clf = cat.CatBoostRegressor(random_seed=42, n_estimators=1500, early_stopping_rounds=50, learning_rate=0.5).fit(train_X, target)


0:	learn: 3590583.5106852	total: 66.5ms	remaining: 1m 39s
1:	learn: 2413048.4525564	total: 68ms	remaining: 51s
2:	learn: 1679248.3999477	total: 69.8ms	remaining: 34.8s
3:	learn: 1262328.6547248	total: 71.5ms	remaining: 26.7s
4:	learn: 1042310.0214422	total: 73.1ms	remaining: 21.9s
5:	learn: 1011383.8608316	total: 74.4ms	remaining: 18.5s
6:	learn: 921070.9470389	total: 75.9ms	remaining: 16.2s
7:	learn: 903961.5101294	total: 77.1ms	remaining: 14.4s
8:	learn: 887364.3723581	total: 78.5ms	remaining: 13s
9:	learn: 849628.5087261	total: 79.7ms	remaining: 11.9s
10:	learn: 836145.0636925	total: 81.1ms	remaining: 11s
11:	learn: 770190.3887171	total: 82.9ms	remaining: 10.3s
12:	learn: 760485.5044871	total: 84.3ms	remaining: 9.65s
13:	learn: 714062.9813750	total: 85.8ms	remaining: 9.11s
14:	learn: 662123.2584750	total: 87.3ms	remaining: 8.65s
15:	learn: 656652.3884498	total: 92.9ms	remaining: 8.62s
16:	learn: 652498.7599791	total: 94.3ms	remaining: 8.23s
17:	learn: 628941.7746565	total: 95.8ms	re

193:	learn: 163580.1933325	total: 380ms	remaining: 2.56s
194:	learn: 163160.4036292	total: 382ms	remaining: 2.56s
195:	learn: 162862.5516721	total: 383ms	remaining: 2.55s
196:	learn: 162654.3382694	total: 385ms	remaining: 2.54s
197:	learn: 162218.4455363	total: 386ms	remaining: 2.54s
198:	learn: 161967.7330961	total: 388ms	remaining: 2.54s
199:	learn: 161757.4576886	total: 389ms	remaining: 2.53s
200:	learn: 161625.1457381	total: 391ms	remaining: 2.52s
201:	learn: 161223.5761173	total: 392ms	remaining: 2.52s
202:	learn: 160601.1793608	total: 394ms	remaining: 2.52s
203:	learn: 160369.2780388	total: 395ms	remaining: 2.51s
204:	learn: 160258.6928580	total: 397ms	remaining: 2.51s
205:	learn: 160105.6862636	total: 399ms	remaining: 2.51s
206:	learn: 159537.5620161	total: 401ms	remaining: 2.5s
207:	learn: 159514.7839993	total: 402ms	remaining: 2.5s
208:	learn: 159354.2518311	total: 404ms	remaining: 2.5s
209:	learn: 159146.1733253	total: 406ms	remaining: 2.49s
210:	learn: 158922.8507463	total: 

392:	learn: 119281.9119701	total: 698ms	remaining: 1.97s
393:	learn: 119148.9703748	total: 700ms	remaining: 1.96s
394:	learn: 118833.1271921	total: 701ms	remaining: 1.96s
395:	learn: 118763.5996944	total: 703ms	remaining: 1.96s
396:	learn: 118434.3313282	total: 705ms	remaining: 1.96s
397:	learn: 118341.1885451	total: 706ms	remaining: 1.96s
398:	learn: 118302.6134252	total: 708ms	remaining: 1.95s
399:	learn: 118246.4056693	total: 709ms	remaining: 1.95s
400:	learn: 118115.8748031	total: 711ms	remaining: 1.95s
401:	learn: 117959.4537331	total: 713ms	remaining: 1.95s
402:	learn: 117781.0549439	total: 714ms	remaining: 1.94s
403:	learn: 117676.6099116	total: 716ms	remaining: 1.94s
404:	learn: 117577.8001037	total: 717ms	remaining: 1.94s
405:	learn: 117508.1478083	total: 719ms	remaining: 1.94s
406:	learn: 117462.0903051	total: 721ms	remaining: 1.94s
407:	learn: 117354.5749734	total: 722ms	remaining: 1.93s
408:	learn: 117116.9775192	total: 724ms	remaining: 1.93s
409:	learn: 116943.0648774	tota

589:	learn: 100063.1211449	total: 1.02s	remaining: 1.57s
590:	learn: 99933.8034016	total: 1.02s	remaining: 1.57s
591:	learn: 99722.0586448	total: 1.02s	remaining: 1.56s
592:	learn: 99643.6640243	total: 1.02s	remaining: 1.56s
593:	learn: 99573.9457157	total: 1.02s	remaining: 1.56s
594:	learn: 99562.6653645	total: 1.02s	remaining: 1.56s
595:	learn: 99558.6650463	total: 1.03s	remaining: 1.56s
596:	learn: 99441.7261979	total: 1.03s	remaining: 1.56s
597:	learn: 99362.5328705	total: 1.03s	remaining: 1.56s
598:	learn: 99333.8571292	total: 1.03s	remaining: 1.55s
599:	learn: 99206.0677462	total: 1.03s	remaining: 1.55s
600:	learn: 99001.3607996	total: 1.04s	remaining: 1.55s
601:	learn: 98985.1690589	total: 1.04s	remaining: 1.55s
602:	learn: 98922.8411137	total: 1.04s	remaining: 1.55s
603:	learn: 98816.1595440	total: 1.04s	remaining: 1.54s
604:	learn: 98694.1200318	total: 1.04s	remaining: 1.54s
605:	learn: 98650.3204953	total: 1.04s	remaining: 1.54s
606:	learn: 98496.1807573	total: 1.05s	remainin

797:	learn: 85430.1086914	total: 1.34s	remaining: 1.18s
798:	learn: 85301.6489227	total: 1.34s	remaining: 1.18s
799:	learn: 85270.6347431	total: 1.34s	remaining: 1.18s
800:	learn: 85250.6658810	total: 1.34s	remaining: 1.17s
801:	learn: 85218.3322314	total: 1.35s	remaining: 1.17s
802:	learn: 85105.9059901	total: 1.35s	remaining: 1.17s
803:	learn: 85026.9994378	total: 1.35s	remaining: 1.17s
804:	learn: 84946.6497989	total: 1.35s	remaining: 1.17s
805:	learn: 84876.2702823	total: 1.35s	remaining: 1.17s
806:	learn: 84842.1005518	total: 1.35s	remaining: 1.16s
807:	learn: 84825.6793099	total: 1.36s	remaining: 1.16s
808:	learn: 84753.1480259	total: 1.36s	remaining: 1.16s
809:	learn: 84728.4359048	total: 1.36s	remaining: 1.16s
810:	learn: 84691.4331509	total: 1.36s	remaining: 1.16s
811:	learn: 84653.3854625	total: 1.36s	remaining: 1.16s
812:	learn: 84613.2555044	total: 1.36s	remaining: 1.15s
813:	learn: 84592.6494279	total: 1.37s	remaining: 1.15s
814:	learn: 84561.7633248	total: 1.37s	remaining

992:	learn: 77118.3316964	total: 1.66s	remaining: 846ms
993:	learn: 77072.3686544	total: 1.66s	remaining: 845ms
994:	learn: 77042.3299097	total: 1.66s	remaining: 844ms
995:	learn: 77032.6328334	total: 1.66s	remaining: 842ms
996:	learn: 77003.5287369	total: 1.67s	remaining: 840ms
997:	learn: 77003.2803725	total: 1.67s	remaining: 839ms
998:	learn: 76838.2817428	total: 1.67s	remaining: 837ms
999:	learn: 76829.0182214	total: 1.67s	remaining: 835ms
1000:	learn: 76813.5971822	total: 1.67s	remaining: 834ms
1001:	learn: 76756.1526790	total: 1.67s	remaining: 832ms
1002:	learn: 76712.3779608	total: 1.68s	remaining: 830ms
1003:	learn: 76688.1600200	total: 1.68s	remaining: 828ms
1004:	learn: 76684.7183661	total: 1.68s	remaining: 827ms
1005:	learn: 76642.8425486	total: 1.68s	remaining: 825ms
1006:	learn: 76627.5662794	total: 1.68s	remaining: 823ms
1007:	learn: 76540.6459040	total: 1.68s	remaining: 821ms
1008:	learn: 76407.6666680	total: 1.68s	remaining: 820ms
1009:	learn: 76402.0000124	total: 1.69s

1200:	learn: 70834.5714767	total: 1.98s	remaining: 493ms
1201:	learn: 70824.8494677	total: 1.98s	remaining: 492ms
1202:	learn: 70817.1653476	total: 1.99s	remaining: 490ms
1203:	learn: 70797.0782736	total: 1.99s	remaining: 488ms
1204:	learn: 70780.4111546	total: 1.99s	remaining: 487ms
1205:	learn: 70779.2142910	total: 1.99s	remaining: 485ms
1206:	learn: 70771.7562877	total: 1.99s	remaining: 483ms
1207:	learn: 70762.0254744	total: 1.99s	remaining: 482ms
1208:	learn: 70741.0107060	total: 1.99s	remaining: 480ms
1209:	learn: 70707.7851987	total: 2s	remaining: 478ms
1210:	learn: 70671.2625859	total: 2s	remaining: 477ms
1211:	learn: 70651.1049551	total: 2s	remaining: 475ms
1212:	learn: 70588.9816109	total: 2s	remaining: 473ms
1213:	learn: 70576.3846079	total: 2s	remaining: 472ms
1214:	learn: 70562.1204802	total: 2s	remaining: 470ms
1215:	learn: 70549.6870415	total: 2s	remaining: 468ms
1216:	learn: 70545.6346181	total: 2.01s	remaining: 467ms
1217:	learn: 70539.7823673	total: 2.01s	remaining: 4

1412:	learn: 66090.4798355	total: 2.31s	remaining: 142ms
1413:	learn: 66061.6836963	total: 2.31s	remaining: 141ms
1414:	learn: 66050.9716345	total: 2.31s	remaining: 139ms
1415:	learn: 66040.7410796	total: 2.31s	remaining: 137ms
1416:	learn: 66035.8090333	total: 2.31s	remaining: 136ms
1417:	learn: 66015.1613403	total: 2.32s	remaining: 134ms
1418:	learn: 66004.3662032	total: 2.32s	remaining: 132ms
1419:	learn: 65950.4275044	total: 2.32s	remaining: 131ms
1420:	learn: 65928.4883195	total: 2.32s	remaining: 129ms
1421:	learn: 65928.4198625	total: 2.32s	remaining: 127ms
1422:	learn: 65863.8320937	total: 2.32s	remaining: 126ms
1423:	learn: 65858.6896842	total: 2.33s	remaining: 124ms
1424:	learn: 65848.9691259	total: 2.33s	remaining: 123ms
1425:	learn: 65838.5056709	total: 2.33s	remaining: 121ms
1426:	learn: 65832.2307689	total: 2.33s	remaining: 119ms
1427:	learn: 65806.8960820	total: 2.33s	remaining: 118ms
1428:	learn: 65790.8107185	total: 2.33s	remaining: 116ms
1429:	learn: 65780.0727059	tota

In [28]:
sub['value'] = cat_clf.predict(test_X.values)
print(sub)

         id         value
0         1  7.937903e+07
1         2  8.783381e+07
2         4  7.449537e+07
3         5  7.728491e+07
4         6  6.150963e+07
...     ...           ...
3823  16924  6.818762e+04
3824  16929  5.009635e+04
3825  16932  5.017469e+04
3826  16937  6.337740e+04
3827  16943  6.797491e+04

[3828 rows x 2 columns]


In [29]:
sub.to_csv('./sub/pre/cat2.csv', index=False)

In [30]:
# lgb_clf = lgb.LGBMRegressor(**params, early_stoppong_rounds = 100)

# xgb_clf = xgb.XGBRegressor(
#             **params_fx,
#             early_stoppong_rounds = 100         
#     )

# cat_clf = cat.CatBoostRegressor(random_seed=42, n_estimators=3000, early_stopping_rounds=50)

rf1 = RandomForestRegressor(n_estimators=200,
                                max_depth=13, 
                                max_features='sqrt', 
                                random_state=21)

rf2 = RandomForestRegressor(n_estimators=150,
                                max_depth=9, 
                                max_features='sqrt', 
                                random_state=12)

pca = PCA(15, random_state=42)

lr = LinearRegression()
els = ElasticNet(random_state=42, l1_ratio=1)
las = Lasso(random_state=21)

hl = HuberRegressor()

svr = SVR()

adb = AdaBoostRegressor(hl,
                        random_state=12,
                       n_estimators=500,
                       learning_rate=0.01)

pca = PCA(9)

test_sets = [lr, els, las, hl, svr]

In [None]:
for mo in test_sets:
    print(-1*np.mean(cross_val_score(mo, train_X, target, scoring='neg_mean_squared_error', cv=5)))

In [None]:
train_X.shape

In [31]:
_models = [[lgb_clf, xgb_clf, cat_clf, pca],
          [rf1]
         ]

In [32]:
model = StackNetRegressor(_models, 
                           metric="rmse", 
                           folds=4,
                           restacking=False,
                           random_state=42,
                           n_jobs=-1, 
                           verbose=1)

model.fit(train_X.values, target)

Input Dimensionality 12 at Level 0 
4 models included in Level 0 
Fold 1/4 , model 0 , rmse===984184.357971 
Fold 1/4 , model 1 , rmse===994459.407656 
Fold 1/4 , model 2 , rmse===1063365.839201 
Fold 2/4 , model 0 , rmse===759628.534109 
Fold 2/4 , model 1 , rmse===813047.539983 
Fold 2/4 , model 2 , rmse===581047.819310 
Fold 3/4 , model 0 , rmse===671014.185862 
Fold 3/4 , model 1 , rmse===622677.268465 
Fold 3/4 , model 2 , rmse===469304.804220 
Fold 4/4 , model 0 , rmse===814473.872531 
Fold 4/4 , model 1 , rmse===896291.803596 
Fold 4/4 , model 2 , rmse===688581.270281 
Output dimensionality of level 0 is 12 
 level 0 lasted 38.628135 seconds 
Input Dimensionality 12 at Level 1 
1 models included in Level 1 
Fold 1/4 , model 0 , rmse===1326673.748070 
Fold 2/4 , model 0 , rmse===637918.336281 
Fold 3/4 , model 0 , rmse===618550.965952 
Fold 4/4 , model 0 , rmse===641572.290933 
Output dimensionality of level 1 is 1 
 level 1 lasted 31.741575 seconds 
 fit() lasted 70.370738 secon

In [33]:
sub['value'] = model.predict(test_X.values)
print(sub)

1 estimators included in Level 0 
1 estimators included in Level 1 
         id         value
0         1  7.036750e+07
1         2  7.642062e+07
2         4  6.587000e+07
3         5  7.133812e+07
4         6  5.805750e+07
...     ...           ...
3823  16924  7.001611e+04
3824  16929  6.318696e+04
3825  16932  6.579765e+04
3826  16937  7.280371e+04
3827  16943  6.909499e+04

[3828 rows x 2 columns]


In [34]:
sub.to_csv('./sub/pre/stk3.csv', index=False)

In [None]:
##################

In [42]:
km = KMeans(random_state=42, n_clusters=10).fit(train_X)

tr_km = pd.get_dummies(km.predict(train_X))
te_km = pd.get_dummies(km.predict(test_X))

In [68]:
k1 = model.predict_up_to(train_X.values)
k2 = model.predict_up_to(test_X.values)

1 estimators included in Level 0 
(8932, 12)
1 estimators included in Level 1 
(8932, 1)
1 estimators included in Level 0 
(3828, 12)
1 estimators included in Level 1 
(3828, 1)


In [69]:
new_tr_X = k1[0]
new_te_X = k2[0]

In [70]:
new_merge = pd.DataFrame(np.concatenate([new_tr_X, new_te_X]))

k = (new_merge - np.mean(new_merge)) / np.std(new_merge)

In [71]:
new_tr_X = k.values[0:len(new_tr_X)]
new_te_X = k.values[len(new_tr_X):]

In [47]:
t = np.log(target)
# t = (t - np.mean(t)) / np.std(t)

In [89]:
K.clear_session()

inputs1 = layers.Input(shape= (train_X.shape[1], ))
inputs2 = layers.Input(shape = (tr_km.shape[1], ))

x1 = layers.Dense(32, kernel_initializer='he_normal')(inputs1)
x1 = layers.advanced_activations.LeakyReLU(0.1)(x1)
# x1 = layers.Dense(32, kernel_initializer='he_normal')(x1)
# x1 = layers.advanced_activations.LeakyReLU(0.1)(x1)

x2 = layers.Dense(32, kernel_initializer='he_normal')(inputs2)
x2 = layers.advanced_activations.LeakyReLU(0.1)(x2)

x = layers.Concatenate()([x1, x2])

x = layers.Dense(16, kernel_initializer='he_normal')(x)
x1 = layers.advanced_activations.LeakyReLU(0.3)(x1)

outputs = layers.Dense(1, activation = 'linear', kernel_initializer='he_normal')(x)

nn = models.Model([inputs1, inputs2], outputs)

nn.compile(optimizer = optimizers.Adam(),
          loss = Huber(),
          metrics = ['mae'])

In [90]:
nn.fit([train_X.values, tr_km], t.values,
      epochs=100,
      validation_split=0.3,
      callbacks=[es])

Train on 6252 samples, validate on 2680 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100


<keras.callbacks.callbacks.History at 0x1e8b2a15bc8>

In [91]:
sub['value'] = np.exp(nn.predict([test_X.values, te_km]))
print(sub)

         id         value
0         1  2.546610e+08
1         2  2.671213e+08
2         4  1.718574e+08
3         5  1.762082e+08
4         6  7.684925e+07
...     ...           ...
3823  16924  2.438191e+04
3824  16929  1.766639e+05
3825  16932  3.377254e+04
3826  16937  2.598425e+04
3827  16943  2.304784e+04

[3828 rows x 2 columns]


In [92]:
sub.to_csv('./sub/pre/km_nn6.csv', index=False)

In [None]:
new_tr_X.shape

In [None]:
K.clear_session()

nn = models.Sequential()
nn.add(layers.Dense(32, kernel_regularizer=L1L2(l2=0.001)))
nn.add(layers.advanced_activations.LeakyReLU(0.1))
nn.add(layers.Dropout(0.3))
# nn.add(layers.Dense(32, kernel_regularizer=L1L2(l2=0.001)))
# nn.add(layers.advanced_activations.LeakyReLU(0.1))

nn.add(layers.Dense(1, activation='linear'))

nn.compile(optimizer = optimizers.Adam(lr=1e-4),
          loss = 'mse',
          metrics = ['mae'])

In [None]:
nn.fit(new_tr_X, target.values,
      epochs=50,
      validation_split=0.3,
      callbacks = [es])

In [None]:
sub['value'] = nn.predict(new_te_X)
print(sub)

In [None]:
sub.to_csv('./sub/pre/nn2.csv', index=False)

In [None]:
new_te_X