In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from time import time
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputRegressor

import seaborn as sns
import matplotlib.pyplot as plt

SEED = 42
np.random.seed(SEED)
warnings.filterwarnings('ignore')

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA

import lightgbm as lgb
import xgboost as xgb
import catboost as cat

from pystacknet.pystacknet import StackNetRegressor
from bayes_opt import BayesianOptimization

In [3]:
sub = pd.read_csv('./data/sample_submission.csv', index_col='id')

In [21]:
tr = pd.read_csv('./data/train.csv', index_col='id')
te = pd.read_csv('./data/test.csv', index_col='id')

target_cols = ['hhb', 'hbo2', 'ca', 'na']
target = tr[target_cols].copy()
tr = tr.drop(target_cols, axis=1)

In [22]:
src = tr.columns[tr.columns.str.contains('src')]
dst = tr.columns[tr.columns.str.contains('dst')]


In [23]:
train_dst = tr[dst].replace(0, np.NaN) # dst 데이터만 따로 뺀다.
test_dst = te[dst].replace(0, np.NaN) # 보간을 하기위해 결측값을 삭제한다.

train_dst = train_dst.interpolate(methods='linear', axis=1)
test_dst = test_dst.interpolate(methods='linear', axis=1)

tr.update(train_dst) # 보간한 데이터를 기존 데이터프레임에 업데이트 한다.
te.update(test_dst)

In [24]:
alpha = tr[dst]
beta = te[dst]

alpha.loc[alpha['790_dst'].isnull(),'790_dst']=alpha.loc[alpha['790_dst'].isnull(),'800_dst']
alpha.loc[alpha['780_dst'].isnull(),'780_dst']=alpha.loc[alpha['780_dst'].isnull(),'790_dst']
alpha.loc[alpha['770_dst'].isnull(),'770_dst']=alpha.loc[alpha['770_dst'].isnull(),'780_dst']
alpha.loc[alpha['760_dst'].isnull(),'760_dst']=alpha.loc[alpha['760_dst'].isnull(),'770_dst']
alpha.loc[alpha['750_dst'].isnull(),'750_dst']=alpha.loc[alpha['750_dst'].isnull(),'760_dst']
alpha.loc[alpha['740_dst'].isnull(),'740_dst']=alpha.loc[alpha['740_dst'].isnull(),'750_dst']
alpha.loc[alpha['730_dst'].isnull(),'730_dst']=alpha.loc[alpha['730_dst'].isnull(),'740_dst']
alpha.loc[alpha['720_dst'].isnull(),'720_dst']=alpha.loc[alpha['720_dst'].isnull(),'730_dst']
alpha.loc[alpha['710_dst'].isnull(),'710_dst']=alpha.loc[alpha['710_dst'].isnull(),'720_dst']
alpha.loc[alpha['700_dst'].isnull(),'700_dst']=alpha.loc[alpha['700_dst'].isnull(),'710_dst']
alpha.loc[alpha['690_dst'].isnull(),'690_dst']=alpha.loc[alpha['690_dst'].isnull(),'700_dst']
alpha.loc[alpha['680_dst'].isnull(),'680_dst']=alpha.loc[alpha['680_dst'].isnull(),'690_dst']
alpha.loc[alpha['670_dst'].isnull(),'670_dst']=alpha.loc[alpha['670_dst'].isnull(),'680_dst']
alpha.loc[alpha['660_dst'].isnull(),'660_dst']=alpha.loc[alpha['660_dst'].isnull(),'670_dst']
alpha.loc[alpha['650_dst'].isnull(),'650_dst']=alpha.loc[alpha['650_dst'].isnull(),'660_dst']

beta.loc[beta['790_dst'].isnull(),'790_dst']=beta.loc[beta['790_dst'].isnull(),'800_dst']
beta.loc[beta['780_dst'].isnull(),'780_dst']=beta.loc[beta['780_dst'].isnull(),'790_dst']
beta.loc[beta['770_dst'].isnull(),'770_dst']=beta.loc[beta['770_dst'].isnull(),'780_dst']
beta.loc[beta['760_dst'].isnull(),'760_dst']=beta.loc[beta['760_dst'].isnull(),'770_dst']
beta.loc[beta['750_dst'].isnull(),'750_dst']=beta.loc[beta['750_dst'].isnull(),'760_dst']
beta.loc[beta['740_dst'].isnull(),'740_dst']=beta.loc[beta['740_dst'].isnull(),'750_dst']
beta.loc[beta['730_dst'].isnull(),'730_dst']=beta.loc[beta['730_dst'].isnull(),'740_dst']
beta.loc[beta['720_dst'].isnull(),'720_dst']=beta.loc[beta['720_dst'].isnull(),'730_dst']
beta.loc[beta['710_dst'].isnull(),'710_dst']=beta.loc[beta['710_dst'].isnull(),'720_dst']
beta.loc[beta['700_dst'].isnull(),'700_dst']=beta.loc[beta['700_dst'].isnull(),'710_dst']
beta.loc[beta['690_dst'].isnull(),'690_dst']=beta.loc[beta['690_dst'].isnull(),'700_dst']
beta.loc[beta['680_dst'].isnull(),'680_dst']=beta.loc[beta['680_dst'].isnull(),'690_dst']
beta.loc[beta['670_dst'].isnull(),'670_dst']=beta.loc[beta['670_dst'].isnull(),'680_dst']
beta.loc[beta['660_dst'].isnull(),'660_dst']=beta.loc[beta['660_dst'].isnull(),'670_dst']
beta.loc[beta['650_dst'].isnull(),'650_dst']=beta.loc[beta['650_dst'].isnull(),'660_dst']

tr[dst] = alpha
te[dst] = beta

In [None]:
for col in dst:
    tr[col+'_sq'] = tr[col] * (tr['rho'] ** 2)
    te[col+'_sq'] = te[col] * (te['rho']** 2)
    tr[col+'_sq2'] = tr[col] * (np.exp(tr['rho'] ))
    te[col+'_sq2'] = te[col] * (np.exp(te['rho'] ))

In [31]:
tr=pd.concat((tr, alpha), axis=1)
te=pd.concat((te, beta), axis=1)

In [32]:
div = tr[src].values / (tr[dst].values + 1e-10)
df_div = pd.DataFrame(div, columns=[f'{i}_div' for i in range(650, 1000, 10)])

div_ = df_div.columns[df_div.columns.str.contains('div')]
for c in div_:
    res = df_div[c].values / np.exp(tr.rho)
    df_div[c] = res
    
tr = pd.concat([tr, df_div], axis=1)

div = te[src].values / (te[dst].values + 1e-10)
df_div = pd.DataFrame(div, columns=[f'{i}_div' for i in range(650, 1000, 10)], index=te.index)

div_ = df_div.columns[df_div.columns.str.contains('div')]
for c in div_:
    res = df_div[c].values / np.exp(te.rho.values)
    df_div[c] = res
    
te = pd.concat([te, df_div], axis=1)

div = tr.columns[tr.columns.str.contains('div')]

In [33]:
for c in dst:
    new = c.replace('dst', 'dst_mul')
    tr[new] = tr[c].values * 10**tr.rho

for c in dst:
    new = c.replace('dst', 'dst_mul')
    te[new] = te[c].values * 10**te.rho
    
mul = tr.columns[tr.columns.str.contains('dst_mul')]

In [34]:
dst_mul_log = list(map(lambda x: x.replace('dst_mul', 'dst_mul_log'), mul))

tr_mul_log = pd.DataFrame(np.log(tr[mul].values +1e-10), columns=dst_mul_log)
tr = pd.concat([tr, tr_mul_log], axis=1)

te_mul_log = pd.DataFrame(np.log(te[mul].values +1e-10), columns=dst_mul_log, index=te.index)
te = pd.concat([te, te_mul_log], axis=1)


In [35]:
# src_exp = list(map(lambda x: x.replace('src', 'src_exp'), src))

# tr_src_exp = pd.DataFrame(np.exp(tr[src].values), columns=src_exp)
# tr = pd.concat([tr, tr_src_exp], axis=1)

# te_src_exp = pd.DataFrame(np.exp(te[src].values), columns=src_exp, index=te.index)
# te = pd.concat([te, te_src_exp], axis=1)


In [36]:
dst_exp = list(map(lambda x: x.replace('dst', 'dst_exp'), dst))

tr_dst_exp = pd.DataFrame(np.exp(tr[src].values), columns=dst_exp)
tr = pd.concat([tr, tr_dst_exp], axis=1)

te_dst_exp = pd.DataFrame(np.exp(te[src].values), columns=dst_exp, index=te.index)
te = pd.concat([te, te_dst_exp], axis=1)


In [37]:
# div_exp = list(map(lambda x: x.replace('src', 'div_exp'), src))

# divexp = pd.DataFrame(tr[src_exp].values / tr[dst_exp].values, columns=div_exp)
# tr = pd.concat([tr, divexp], axis=1)

# divexp = pd.DataFrame(te[src_exp].values / te[dst_exp].values, columns=div_exp, index=te.index)
# te = pd.concat([te, divexp], axis=1)

In [None]:
alpha_real=tr[dst]
alpha_imag=tr[dst]

beta_real=te[dst]
beta_imag=te[dst]

for i in tqdm(alpha_real.index):
    alpha_real.loc[i]=alpha_real.loc[i] - alpha_real.loc[i].mean()
    alpha_imag.loc[i]=alpha_imag.loc[i] - alpha_real.loc[i].mean()
    
    alpha_real.loc[i] = np.fft.fft(alpha_real.loc[i], norm='ortho').real
    alpha_imag.loc[i] = np.fft.fft(alpha_imag.loc[i], norm='ortho').imag

    
for i in tqdm(beta_real.index):
    beta_real.loc[i]=beta_real.loc[i] - beta_real.loc[i].mean()
    beta_imag.loc[i]=beta_imag.loc[i] - beta_imag.loc[i].mean()
    
    beta_real.loc[i] = np.fft.fft(beta_real.loc[i], norm='ortho').real
    beta_imag.loc[i] = np.fft.fft(beta_imag.loc[i], norm='ortho').imag
    
real_part=[]
imag_part=[]

for col in dst:
    real_part.append(col + '_fft_real')
    imag_part.append(col + '_fft_imag')
    
alpha_real.columns=real_part
alpha_imag.columns=imag_part
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real.columns=real_part
beta_imag.columns=imag_part
beta=pd.concat((beta_real, beta_imag), axis=1)

In [17]:
# sqrt = list(map(lambda x: x.replace('mul', 'sqrt'), mul))

# tr_sqrt = pd.DataFrame(np.sqrt(tr[mul].values), columns=sqrt)
# tr = pd.concat([tr, tr_sqrt], axis=1)

# te_sqrt = pd.DataFrame(np.sqrt(te[mul].values), columns=sqrt, index=te.index)
# te = pd.concat([te, te_sqrt], axis=1)


In [38]:
np.sum(np.sum(te.isna()))

0

In [39]:
tr = tr.drop(src, axis=1)
te = te.drop(src, axis=1)

In [40]:
bounds_LGB = {
    'num_leaves': (100, 800), 
    'min_data_in_leaf': (0, 150),
    'bagging_fraction' : (0.3, 0.9),
    'feature_fraction' : (0.3, 0.9),
#     'learning_rate': (0.01, 1),
    'min_child_weight': (0.01, 3),   
    'reg_alpha': (0.1, 3), 
    'reg_lambda': (0.1, 3),
    'max_depth':(6, 29),
    'n_estimators': (64, 512)
}

def build_lgb(x, y, init_points=15, n_iter=0, cv=2, param=True, verbose=2):
    print(f'training using cv={cv}')
    def LGB_bayesian(
        #learning_rate,
        num_leaves, 
        bagging_fraction,
        feature_fraction,
        min_child_weight, 
        min_data_in_leaf,
        max_depth,
        reg_alpha,
        reg_lambda,
        n_estimators
         ):
        # LightGBM expects next three parameters need to be integer. 
        num_leaves = int(num_leaves)
        min_data_in_leaf = int(min_data_in_leaf)
        max_depth = int(max_depth)

        assert type(num_leaves) == int
        assert type(min_data_in_leaf) == int
        assert type(max_depth) == int


        params = {
                  'num_leaves': num_leaves, 
                  'min_data_in_leaf': min_data_in_leaf,
                  'min_child_weight': min_child_weight,
                  'bagging_fraction' : bagging_fraction,
                  'feature_fraction' : feature_fraction,
                  'learning_rate' : 0.05,
                  'max_depth': max_depth,
                  'reg_alpha': reg_alpha,
                  'reg_lambda': reg_lambda,
                  'objective': 'regression',
                  'save_binary': True,
                  'seed': SEED,
                  'feature_fraction_seed': SEED,
                  'bagging_seed': SEED,
                  'drop_seed': SEED,
                  'data_random_seed': SEED,
                  'boosting': 'gbdt', ## some get better result using 'dart'
                  'verbose': 1,
                  'boost_from_average': True,
                  'metric':'mae',
                  'n_estimators': int(n_estimators),
                  'n_jobs': -1,
#                   'tree_learner ': 'voting'
        }    

        ## set reg options
        reg = lgb.LGBMRegressor(**params)
        m_reg = MultiOutputRegressor(reg)
        score = cross_val_score(m_reg, x, y, cv=cv, scoring='neg_mean_absolute_error').mean()

        return score
    
    optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=SEED, verbose=verbose)
    init_points = init_points
    n_iter = n_iter

    optimizer.maximize(init_points=init_points, n_iter=n_iter)
    
    param_lgb = {
        'min_data_in_leaf': int(optimizer.max['params']['min_data_in_leaf']), 
        'num_leaves': int(optimizer.max['params']['num_leaves']), 
        'learning_rate': 0.05,
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'bagging_fraction': optimizer.max['params']['bagging_fraction'], 
        'feature_fraction': optimizer.max['params']['feature_fraction'],
        'reg_lambda': optimizer.max['params']['reg_lambda'],
        'reg_alpha': optimizer.max['params']['reg_alpha'],
        'max_depth': int(optimizer.max['params']['max_depth']), 
        'objective': 'regression',
        'save_binary': True,
        'seed': SEED,
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'drop_seed': SEED,
        'data_random_seed': SEED,
        'boosting_type': 'gbdt',  # also consider 'dart'
        'verbose': 1,
        'boost_from_average': True,
        'metric':'mae',
        'n_estimators': int(optimizer.max['params']['n_estimators']),
        'n_jobs': -1,
#         'tree_learner ': 'voting'
    }

    params = param_lgb.copy()
    
    reg = lgb.LGBMRegressor(**params)
    lgb_reg = MultiOutputRegressor(reg)
    lgb_reg.fit(x.values, y.values)
    
    if param:
        return lgb_reg, params
    else:
        return lgb_reg

In [41]:
# tr = tr.drop(ctd, axis=1)
# te = te.drop(ctd, axis=1)

In [42]:
tr.shape

(10000, 351)

In [43]:
lgb_reg, params = build_lgb(tr, target, 10, 5)

training using cv=2
|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_da... | n_esti... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.173   [0m | [0m 0.5247  [0m | [0m 0.8704  [0m | [0m 20.64   [0m | [0m 1.8     [0m | [0m 23.4    [0m | [0m 133.9   [0m | [0m 140.7   [0m | [0m 2.612   [0m | [0m 1.843   [0m |
| [95m 2       [0m | [95m-1.162   [0m | [95m 0.7248  [0m | [95m 0.3124  [0m | [95m 25.4    [0m | [95m 2.499   [0m | [95m 31.85   [0m | [95m 145.5   [0m | [95m 228.4   [0m | [95m 0.9823  [0m | [95m 1.622   [0m |
| [95m 3       [0m | [95m-1.154   [0m | [95m 0.5592  [0m | [95m 0.4747  [0m | [95m 18.24   [0m | [95m 0.4271  [0m | [95m 43.82   [0m | [95m 228.1   [0m | [95m 419.2   [0m | [95m 2.377   [0m | [95m 0.6791  [0m |
| [0m 4       [0m | [

In [44]:
params_xg = {
    'min_data_in_leaf': params['min_data_in_leaf'],
    'num_leaves': params['num_leaves'],
    'min_child_weight': params['min_child_weight'],
    'bagging_fraction': params['bagging_fraction'],
    'feature_fraction': params['feature_fraction'],
    'reg_lambda': params['reg_lambda'],
    'reg_alpha': params['reg_alpha'],
    'max_depth': params['max_depth'],
    'eval_metric': 'mae',
    'tree_method': 'gpu_hist',
}

In [46]:
# xgb
reg = xgb.XGBRegressor(**params_xg)
xgb_reg = MultiOutputRegressor(reg)
xgb_reg.fit(tr, target)

MultiOutputRegressor(estimator=XGBRegressor(bagging_fraction=0.4447834108075249,
                                            base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            eval_metric='mae',
                                            feature_fraction=0.3225246510876955,
                                            gamma=None, gpu_id=None,
                                            importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=23,
                                            min_child_weight=1.4748630648089285,
                                            min_data_in_leaf=12

In [48]:
# cat
reg = cat.CatBoostRegressor(eval_metric='MAE',
                            task_type='GPU',
                            early_stopping_rounds = 100,
                            iterations = 10000,
                            metric_period = 10000,
                            grow_policy = 'Lossguide',
                            l2_leaf_reg = params['reg_lambda'],
                            random_seed=SEED)

cat_reg = MultiOutputRegressor(reg)
cat_reg.fit(tr, target)



0:	learn: 2.3291709	total: 28.4ms	remaining: 4m 44s
9999:	learn: 0.0062738	total: 4m 5s	remaining: 0us




0:	learn: 0.7926174	total: 27.1ms	remaining: 4m 30s
9999:	learn: 0.0048831	total: 4m 6s	remaining: 0us




0:	learn: 2.3690480	total: 27.8ms	remaining: 4m 37s
9999:	learn: 0.0166269	total: 4m 5s	remaining: 0us




0:	learn: 1.5304105	total: 27.3ms	remaining: 4m 33s
9999:	learn: 0.0118754	total: 3m 58s	remaining: 0us


MultiOutputRegressor(estimator=<catboost.core.CatBoostRegressor object at 0x000001758B211F88>,
                     n_jobs=None)

In [49]:
lgb_pred = lgb_reg.predict(te)
xgb_pred = xgb_reg.predict(te)
cat_pred = cat_reg.predict(te)

In [50]:
rf = RandomForestRegressor(n_estimators=150,
                                max_depth=9, 
                                max_features='sqrt', 
                                random_state=SEED)

pca = PCA(10, random_state=SEED, whiten=True)

In [51]:
models = [
    [lgb_reg, xgb_reg, pca],
    [rf]
         ]

In [53]:
model = StackNetRegressor(models, 
                           metric="mae", 
                           folds=2,
                           restacking=False,
                           random_state=SEED,
                           n_jobs=6, 
                           verbose=1)

model.fit(tr, target)

Input Dimensionality 351 at Level 0 
3 models included in Level 0 


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

In [None]:
stk_pred = model.predict(te)

In [54]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
# from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVR
from sklearn.cross_decomposition import PLSRegression

In [55]:
lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
svr = SVR()


In [60]:
np.mean(cross_val_score(lasso, tr, target, scoring='neg_mean_absolute_error', cv=4))

-1.6683771891570136

In [61]:
np.mean(cross_val_score(ridge, tr, target, scoring='neg_mean_absolute_error', cv=4))

-1.5382765020868077

In [68]:
svr = MultiOutputRegressor(SVR(), n_jobs=-1)
np.mean(cross_val_score(svr, tr, target, scoring='neg_mean_absolute_error', cv=4))

-1.7312895803531703

In [65]:
for i in range(20, 30):
    pls = PLSRegression(i)
    print(np.mean(cross_val_score(pls, tr, target, scoring='neg_mean_absolute_error', cv=4)))

-1.5319850419338212
-1.5277379349372717
-1.5254928995052255
-1.5245977054880036
-1.521161768383776
-1.5175157308259157
-1.5177311099962791
-1.515335993235752
-1.5100454618246744
-1.5081437211724293


In [69]:
pred = 0.3 * lgb_pred +\
        0.2 * xgb_pred +\
        0.2 * cat_pred

In [70]:
pred

array([[6.19847194, 3.23137698, 5.74628763, 1.47710135],
       [4.95604721, 1.51738617, 7.58946456, 2.68129883],
       [6.56696023, 3.43518601, 8.78011583, 2.22208722],
       ...,
       [5.24124827, 2.678205  , 5.98326419, 2.12571852],
       [9.24690893, 3.22992815, 8.1617763 , 2.94804283],
       [3.22268301, 3.36658708, 8.60570847, 2.21208175]])

In [71]:
sub[target_cols] = pred
sub.to_csv('test.csv')