In [5]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from bayes_opt import BayesianOptimization
from time import time
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor

import seaborn as sns
import matplotlib.pyplot as plt

# Single seed for the notebook: it seeds NumPy's global RNG here and is reused
# for the LightGBM seed parameters and the Bayesian optimizer further down.
SEED = 42
np.random.seed(SEED)

from tqdm import tqdm
import warnings
# Suppresses every Python warning for the rest of the session, including
# warnings raised by third-party libraries.
warnings.filterwarnings('ignore')

In [6]:
# Read the train/test feature tables and the training targets from ./data.
train, test, target = map(
    pd.read_csv,
    ('data/train_features.csv', 'data/test_features.csv', 'data/train_target.csv'),
)
# Every column after the first is a prediction target
# (the first column is presumably the sample id -- confirm against the CSV).
target_cols = target.columns[1:]

In [7]:
# Reshape both feature tables from long to wide: one row per `id`, one column
# per (signal, Time) pair for the signals S1-S4.
tr, te = (
    frame.pivot_table(index='id', columns='Time', values=['S1', 'S2', 'S3', 'S4'])
    for frame in (train, test)
)

In [8]:
# Keep only the target columns (drops the first column of `target`).
y = target.loc[:, target_cols]
y

Unnamed: 0,X,Y,M,V
0,0.0,-400.0,50.0,0.4
1,400.0,0.0,100.0,1.0
2,-300.0,-200.0,25.0,0.4
3,200.0,-100.0,150.0,0.4
4,-300.0,-100.0,150.0,0.4
...,...,...,...,...
2795,200.0,200.0,50.0,1.0
2796,-400.0,-400.0,150.0,0.2
2797,-100.0,0.0,50.0,0.2
2798,100.0,100.0,125.0,0.6


In [9]:
# Search space handed to BayesianOptimization: parameter name -> (lower, upper).
# `learning_rate` is intentionally not searched; build_lgb fixes it at 0.05.
bounds_LGB = dict(
    num_leaves=(100, 800),
    min_data_in_leaf=(0, 150),
    bagging_fraction=(0.3, 0.9),
    feature_fraction=(0.3, 0.9),
    min_child_weight=(0.01, 3),
    reg_alpha=(0.1, 3),
    reg_lambda=(0.1, 3),
    max_depth=(6, 26),
    n_estimators=(64, 512),
)

def _make_lgb_params(raw):
    """Convert one point of the search space into ``LGBMRegressor`` keyword arguments.

    Parameters
    ----------
    raw : dict
        Maps every key of ``bounds_LGB`` to the (float) value proposed by the
        optimizer.

    Returns
    -------
    dict
        The searched values -- with ``min_data_in_leaf``, ``num_leaves``,
        ``max_depth`` and ``n_estimators`` truncated to ``int`` -- merged with
        the settings that are held fixed (learning rate 0.05, seeds, objective,
        metric, ...).
    """
    return {
        'min_data_in_leaf': int(raw['min_data_in_leaf']),
        'num_leaves': int(raw['num_leaves']),
        'learning_rate': 0.05,
        'min_child_weight': raw['min_child_weight'],
        'bagging_fraction': raw['bagging_fraction'],
        'feature_fraction': raw['feature_fraction'],
        'reg_lambda': raw['reg_lambda'],
        'reg_alpha': raw['reg_alpha'],
        'max_depth': int(raw['max_depth']),
        'objective': 'regression',
        'save_binary': True,
        'seed': SEED,
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'drop_seed': SEED,
        'data_random_seed': SEED,
        'boosting_type': 'gbdt',  # also consider 'dart'
        'verbose': 1,
        'boost_from_average': True,
        'metric': 'mse',
        'n_estimators': int(raw['n_estimators']),
        'n_jobs': -1,
    }


def build_lgb(x, y, init_points=15, n_iter=0, cv=4, param=True, verbose=2):
    """Tune LightGBM with Bayesian optimisation, then fit on all of ``x``/``y``.

    Each candidate from ``bounds_LGB`` is scored by the mean negated MSE of a
    ``MultiOutputRegressor(LGBMRegressor)`` under ``cv``-fold cross-validation;
    the optimizer maximises that score. The best candidate is then refit on
    the full data.

    Parameters
    ----------
    x, y : objects exposing ``.values`` (e.g. pandas DataFrames)
        Feature matrix and (multi-column) target matrix.
    init_points : int
        Number of random probes before the guided search starts.
    n_iter : int
        Number of guided optimisation steps.
    cv : int
        Passed to ``cross_val_score`` as ``cv``.
    param : bool
        When true, also return the parameter dict of the final model.
    verbose : int
        Verbosity forwarded to ``BayesianOptimization``.

    Returns
    -------
    MultiOutputRegressor, or (MultiOutputRegressor, dict) when ``param`` is true.
    """
    # Use plain arrays for both the CV scoring and the final fit so the two
    # see identical inputs and nothing depends on how the estimators treat
    # DataFrame column labels.
    features, targets = x.values, y.values

    def LGB_bayesian(
        num_leaves,
        bagging_fraction,
        feature_fraction,
        min_child_weight,
        min_data_in_leaf,
        max_depth,
        reg_alpha,
        reg_lambda,
        n_estimators
    ):
        """Objective for the optimizer: mean negated MSE across the CV folds."""
        candidate = _make_lgb_params({
            'num_leaves': num_leaves,
            'bagging_fraction': bagging_fraction,
            'feature_fraction': feature_fraction,
            'min_child_weight': min_child_weight,
            'min_data_in_leaf': min_data_in_leaf,
            'max_depth': max_depth,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda,
            'n_estimators': n_estimators,
        })
        model = MultiOutputRegressor(lgb.LGBMRegressor(**candidate))
        fold_scores = cross_val_score(
            model, features, targets, cv=cv, scoring='neg_mean_squared_error'
        )
        return fold_scores.mean()

    optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=SEED, verbose=verbose)
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    # Rebuild the parameters of the best probe through the same helper, so the
    # final model is configured exactly like the candidate that was scored.
    params = _make_lgb_params(optimizer.max['params'])

    lgb_reg = MultiOutputRegressor(lgb.LGBMRegressor(**params))
    lgb_reg.fit(features, targets)

    if param:
        return lgb_reg, params
    return lgb_reg

In [11]:
# 15 random probes followed by 10 guided steps; keep only the fitted model.
reg = build_lgb(tr, y, init_points=15, n_iter=10, param=False)

|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_da... | n_esti... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-58.8    [0m | [0m 0.5247  [0m | [0m 0.8704  [0m | [0m 20.64   [0m | [0m 1.8     [0m | [0m 23.4    [0m | [0m 133.9   [0m | [0m 140.7   [0m | [0m 2.612   [0m | [0m 1.843   [0m |
| [95m 2       [0m | [95m-43.64   [0m | [95m 0.7248  [0m | [95m 0.3124  [0m | [95m 25.4    [0m | [95m 2.499   [0m | [95m 31.85   [0m | [95m 145.5   [0m | [95m 228.4   [0m | [95m 0.9823  [0m | [95m 1.622   [0m |
| [0m 3       [0m | [0m-44.04   [0m | [0m 0.5592  [0m | [0m 0.4747  [0m | [0m 18.24   [0m | [0m 0.4271  [0m | [0m 43.82   [0m | [0m 228.1   [0m | [0m 419.2   [0m | [0m 2.377   [0m | [0m 0.6791  [0m |
| [0m 4       [0m | [0m-77.21   [0m | [0m 0.6085  

In [12]:
# Predict every target column for the pivoted test features with the tuned
# LightGBM multi-output model.
pred = reg.predict(te)
pred

array([[-2.46824409e+02, -1.35521454e+01,  9.99143954e+01,
         5.13609170e-01],
       [ 2.28287934e+02, -2.98427845e+02,  1.28773802e+02,
         5.17426834e-01],
       [-1.94726747e+02,  1.73651792e+02,  7.41212456e+01,
         3.07479658e-01],
       ...,
       [ 2.70993338e+02, -2.88691376e+02,  8.14073653e+01,
         2.36442357e-01],
       [ 3.65883194e+01, -3.41315857e+02,  1.35553705e+02,
         5.03562191e-01],
       [ 1.82849665e+02,  2.02360973e+02,  8.46993751e+01,
         3.48089802e-01]])

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# In-sample MAE: `reg` was fit on these same rows, so this measures fit to the
# training data, not generalisation.
mean_absolute_error(y_true=y, y_pred=reg.predict(tr))

0.14802344731703979

In [17]:
# Start from the submission template and overwrite its target columns with `pred`.
sub = pd.read_csv('./data/sample_submission.csv')
# Positional assignment: assumes the rows of `pred` are in the template's row
# order -- TODO confirm the ids line up.
sub[target_cols] = pred
sub

Unnamed: 0,id,X,Y,M,V
0,2800,-246.824409,-13.552145,99.914395,0.513609
1,2801,228.287934,-298.427845,128.773802,0.517427
2,2802,-194.726747,173.651792,74.121246,0.307480
3,2803,192.541532,2.926115,87.321333,0.260968
4,2804,-210.371211,279.279320,91.522330,0.534172
...,...,...,...,...,...
695,3495,-301.537880,285.293084,84.917981,0.550003
696,3496,59.751297,-2.795594,60.201667,0.363064
697,3497,270.993338,-288.691376,81.407365,0.236442
698,3498,36.588319,-341.315857,135.553705,0.503562


In [18]:
# Write the submission next to the notebook, without the DataFrame index.
sub.to_csv('./sample1.csv', index=False)

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Depth-limited random forest fit on all target columns at once.
# Uses the notebook-wide SEED instead of a second hard-coded 42, so changing
# the seed in one place affects every model.
rf = RandomForestRegressor(max_depth=13, random_state=SEED, n_jobs=-1)
rf.fit(tr, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=13, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# In-sample MAE of the random forest: `rf` was fit on these same rows.
mean_absolute_error(y_true=y, y_pred=rf.predict(tr))

1.2719291159297055

In [None]:
# Reload a fresh submission template; this discards the predictions that were
# written into `sub` earlier.
sub = pd.read_csv('./data/sample_submission.csv')
# Rebuilds the same wide test matrix that was computed when `tr`/`te` were
# first created.
te = test.pivot_table(index = 'id', columns = 'Time', values = ['S1', 'S2', 'S3', 'S4'])

In [None]:
# Fill the submission one target column at a time.
# The original loop indexed `regs`, a name that is never defined in this
# notebook (NameError on a fresh kernel). The fitted MultiOutputRegressor `reg`
# exposes one fitted model per target in `estimators_`, in the column order of
# `y`, which is what the loop needs.
# NOTE(review): confirm the LightGBM model (not `rf`) is the intended source.
for i, c in enumerate(target_cols):
    # `reg` was fit on plain arrays, so predict on the array view as well.
    sub[c] = reg.estimators_[i].predict(te.values)

In [None]:
# Random-forest predictions for the test features; this rebinds `pred`, which
# previously held the LightGBM predictions.
# NOTE(review): no later visible cell writes this `pred` into `sub`.
pred = rf.predict(te)
pred

In [None]:
# Display the raw target table (first column plus the target columns) for inspection.
target

In [None]:
# Write the current contents of `sub` without the DataFrame index.
# NOTE(review): 'smaple.csv' looks like a typo for 'sample.csv' -- confirm
# before anything depends on this filename.
sub.to_csv('smaple.csv', index=False)