In [1]:
import pandas as pd
import numpy as np

import librosa
import lightgbm as lgb
from bayes_opt import BayesianOptimization
from time import time
import warnings
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.multioutput import MultiOutputRegressor

import seaborn as sns
import matplotlib.pyplot as plt

SEED = 42
np.random.seed(SEED)

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.decomposition import PCA

import lightgbm as lgb
import xgboost as xgb
import catboost as cat

from pystacknet.pystacknet import StackNetRegressor
from bayes_opt import BayesianOptimization

In [3]:
sub = pd.read_csv('./data/sample_submission.csv', index_col='id')

In [4]:
tr = pd.read_csv('./data/train.csv', index_col='id')
te = pd.read_csv('./data/test.csv', index_col='id')

target_cols = ['hhb', 'hbo2', 'ca', 'na']
target = tr[target_cols].copy()
tr = tr.drop(target_cols, axis=1)

In [14]:
src = tr.columns[tr.columns.str.contains('src')]
dst = tr.columns[tr.columns.str.contains('dst')]
n_mels = 32

In [6]:
stack = np.array([0 for _ in range(35)])
for i in tqdm(tr.index):
    temp = tr[dst].loc[i]#.replace(0, np.nan)
    temp.index = range(35)
    stack = np.vstack([stack, temp.interpolate(method='akima', order=5, limit_direction='both').values.flatten()])
    
stack = stack[1:]
tr[dst] = pd.DataFrame(stack, columns=dst)

100%|██████████| 10000/10000 [00:30<00:00, 325.53it/s]


In [7]:
stack = np.array([0 for _ in range(35)])
for i in tqdm(te.index):
    temp = te[dst].loc[i]#.replace(0, np.nan)
    temp.index = range(35)
    stack = np.vstack([stack, temp.interpolate(method='akima', order=5, limit_direction='both').values.flatten()])
    
stack = stack[1:]
te[dst] = pd.DataFrame(stack, columns=dst, index=te.index)

100%|██████████| 10000/10000 [00:31<00:00, 315.52it/s]


In [None]:
np.sum(te[dst].isna())

In [None]:
# temp = tr[dst].loc[0].replace(0, np.nan)
# temp.index = range(35)
# q = temp.interpolate(method='spline', order=3)

# plt.plot(q)

In [None]:
# temp = tr[dst].loc[0].replace(0, np.nan)
# temp.index = range(35)
# q = temp.interpolate(method='akima')

# plt.plot(q)

In [8]:
tr[dst] = tr[dst].interpolate(axis=1)

te[dst] = te[dst].interpolate(axis=1)


In [9]:
tr_dst=tr[dst]
te_dst=te[dst]

tr_dst.loc[tr_dst['700_dst'].isnull(),'700_dst']=tr_dst.loc[tr_dst['700_dst'].isnull(),'710_dst']
tr_dst.loc[tr_dst['690_dst'].isnull(),'690_dst']=tr_dst.loc[tr_dst['690_dst'].isnull(),'700_dst']
tr_dst.loc[tr_dst['680_dst'].isnull(),'680_dst']=tr_dst.loc[tr_dst['680_dst'].isnull(),'690_dst']
tr_dst.loc[tr_dst['670_dst'].isnull(),'670_dst']=tr_dst.loc[tr_dst['670_dst'].isnull(),'680_dst']
tr_dst.loc[tr_dst['660_dst'].isnull(),'660_dst']=tr_dst.loc[tr_dst['660_dst'].isnull(),'670_dst']
tr_dst.loc[tr_dst['650_dst'].isnull(),'650_dst']=tr_dst.loc[tr_dst['650_dst'].isnull(),'660_dst']

te_dst.loc[te_dst['700_dst'].isnull(),'700_dst']=te_dst.loc[te_dst['700_dst'].isnull(),'710_dst']
te_dst.loc[te_dst['690_dst'].isnull(),'690_dst']=te_dst.loc[te_dst['690_dst'].isnull(),'700_dst']
te_dst.loc[te_dst['680_dst'].isnull(),'680_dst']=te_dst.loc[te_dst['680_dst'].isnull(),'690_dst']
te_dst.loc[te_dst['670_dst'].isnull(),'670_dst']=te_dst.loc[te_dst['670_dst'].isnull(),'680_dst']
te_dst.loc[te_dst['660_dst'].isnull(),'660_dst']=te_dst.loc[te_dst['660_dst'].isnull(),'670_dst']
te_dst.loc[te_dst['650_dst'].isnull(),'650_dst']=te_dst.loc[te_dst['650_dst'].isnull(),'660_dst']

tr[dst] = tr_dst
te[dst] = te_dst

In [10]:
for s, col in zip(src, dst):
    tr[col+'_sq1'] = tr[col] * (tr['rho'] ** 2)
    te[col+'_sq1'] = te[col] * (te['rho']** 2)
    
    tr[col+'_sq2'] = tr[col] * (np.exp(tr['rho']))
    te[col+'_sq2'] = te[col] * (np.exp(te['rho']))
    
    tr[col+'_subt_sq1'] =  tr[s] - tr[col+'_sq1']
    tr[col+'_subt_sq2'] =  tr[s] - tr[col+'_sq2']

    te[col+'_subt_sq1'] =  te[s] - te[col+'_sq1']
    te[col+'_subt_sq2'] =  te[s] - te[col+'_sq2']

    
sq1_dst = [c for c in tr.columns if 'sq1' in c and 'subt' not in c]
sq2_dst = [c for c in tr.columns if 'sq2' in c and 'subt' not in c]
subt1_dst = [c for c in tr.columns if 'subt' in c and 'sq' in c]
subt2_dst = [c for c in tr.columns if 'sq2' in c and 'subt' in c]

In [11]:
tr['sq1_dst_mean'] = tr[sq1_dst].mean(axis=1)
tr['sq1_dst_std'] = tr[sq1_dst].std(axis=1)

tr['sq2_dst_mean'] = tr[sq2_dst].mean(axis=1)
tr['sq2_dst_std'] = tr[sq2_dst].std(axis=1)

tr['subt1_dst_mean'] = tr[subt1_dst].mean(axis=1)
tr['subt1_dst_std'] = tr[subt1_dst].std(axis=1)

tr['subt2_dst_mean'] = tr[subt2_dst].mean(axis=1)
tr['subt2_dst_std'] = tr[subt2_dst].std(axis=1)

# test
te['sq1_dst_mean'] = te[sq1_dst].mean(axis=1)
te['sq1_dst_std'] = te[sq1_dst].std(axis=1)

te['sq2_dst_mean'] = te[sq2_dst].mean(axis=1)
te['sq2_dst_std'] = te[sq2_dst].std(axis=1)

te['subt1_dst_mean'] = te[subt1_dst].mean(axis=1)
te['subt1_dst_std'] = te[subt1_dst].std(axis=1)

te['subt2_dst_mean'] = te[subt2_dst].mean(axis=1)
te['subt2_dst_std'] = te[subt2_dst].std(axis=1)

In [None]:
for col in src:
    tr[col+'_sq1'] = tr[col] * (tr['rho'] ** 2)
    te[col+'_sq1'] = te[col] * (te['rho']** 2)
    
    tr[col+'_sq2'] = tr[col] * (np.exp(tr['rho']))
    te[col+'_sq2'] = te[col] * (np.exp(te['rho']))
    
sq1_src = [c for c in tr.columns if 'sq1' in c and 'src' in c]
sq2_src = [c for c in tr.columns if 'sq2' in c and 'src' in c]

In [None]:
# for k in tqdm([5, 10, 15, 20]):
#     tr_roll_mean = tr[dst].rolling(5, axis=1).mean().dropna(axis=1)
#     newc = ['dst_rolling_mean_'+str(k)+'_'+str(i+1) for i in range(len(tr_roll_mean.columns))]
#     tr_roll_mean.columns = newc
#     tr_roll_std = tr[dst].rolling(5, axis=1).std().dropna(axis=1)
#     newc =['dst_rolling_std_'+str(k)+'_'+str(i+1) for i in range(len(tr_roll_std.columns))]
#     tr_roll_std.columns = newc
        
#     tr = pd.concat([tr, tr_roll_mean, tr_roll_std], axis=1)
    
# rolling_5_mean = [c for c in tr.columns if 'dst_rolling_mean_5' in c]
# rolling_10_mean = [c for c in tr.columns if 'dst_rolling_mean_10' in c]
# rolling_15_mean = [c for c in tr.columns if 'dst_rolling_mean_15' in c]
# rolling_20_mean = [c for c in tr.columns if 'dst_rolling_mean_20' in c]

In [None]:
te= te.fillna(0)

In [None]:
rnn_x = []

## dst

In [None]:
tr_chroma=tr[dst]
te_chroma=te[dst]

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_chroma))):   
    tr_temp.append(librosa.feature.chroma_stft(tr_chroma.values[i,:]).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_chroma))):
    te_temp.append(librosa.feature.chroma_stft(te_chroma.values[i,:]).flatten())
    
chroma_cols = []

for i in range(12):
    chroma_cols.append('dst' + '_chroma_' + str(i+1))
    
tr_chroma = pd.DataFrame(tr_temp, columns=chroma_cols, index=tr_chroma.index)
te_chroma = pd.DataFrame(te_temp, columns=chroma_cols, index=te_chroma.index)

tr = pd.concat([tr, tr_chroma], axis=1)
te = pd.concat([te, te_chroma], axis=1)

tr['chroma'+'_mean'] = tr[chroma_cols].mean(axis=1)
tr['chroma'+'_std'] = tr[chroma_cols].std(axis=1)

te['chroma'+'_mean'] = te[chroma_cols].mean(axis=1)
te['chroma'+'_std'] = te[chroma_cols].std(axis=1)

In [None]:
n_mels = 24

tr_mel=tr[dst]
te_mel=te[dst]

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_mel))):   
    tr_temp.append(librosa.feature.melspectrogram(tr_mel.values[i,:], n_mels=n_mels).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_mel))):
    te_temp.append(librosa.feature.melspectrogram(te_mel.values[i,:], n_mels=n_mels).flatten())
    
mel_cols = []
for i in range(n_mels):
    mel_cols.append('dst_melspec_'+str(i+1))
    
tr_mel = pd.DataFrame(tr_temp, columns=mel_cols, index=tr_mel.index)
te_mel = pd.DataFrame(te_temp, columns=mel_cols, index=te_mel.index)

tr = pd.concat([tr, tr_mel], axis=1)
te = pd.concat([te, te_mel], axis=1)

tr['mel'+'_mean'] = tr[mel_cols].mean(axis=1)
tr['mel'+'_std'] = tr[mel_cols].std(axis=1)

te['mel'+'_mean'] = te[mel_cols].mean(axis=1)
te['mel'+'_std'] = te[mel_cols].std(axis=1)

# rnn_x.append(np.array(list(map(lambda x: x.reshape(-1, 1), tr_mel.values))))

In [None]:
tr_mfcc=tr_chroma
te_mfcc=te_chroma

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_mfcc))):   
    tr_temp.append(librosa.feature.mfcc(tr_mfcc.values[i,:]).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_mfcc))):
    te_temp.append(librosa.feature.mfcc(te_mfcc.values[i,:]).flatten())
    
mfcc_cols = []
for i in range(20):
    mfcc_cols.append('dst_chroma_mfcc_'+str(i+1))
    
tr_mfcc = pd.DataFrame(tr_temp, columns=mfcc_cols, index=tr_mfcc.index)
te_mfcc = pd.DataFrame(te_temp, columns=mfcc_cols, index=te_mfcc.index)

tr = pd.concat([tr, tr_mfcc], axis=1)
te = pd.concat([te, te_mfcc], axis=1)

tr['mfcc'+'_mean'] = tr[mfcc_cols].mean(axis=1)
tr['mfcc'+'_std'] = tr[mfcc_cols].std(axis=1)

te['mfcc'+'_mean'] = te[mfcc_cols].mean(axis=1)
te['mfcc'+'_std'] = te[mfcc_cols].std(axis=1)

In [None]:
tr_mel=tr_chroma
te_mel=te_chroma

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_mel))):   
    tr_temp.append(librosa.feature.melspectrogram(tr_mel.values[i,:], n_mels=n_mels).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_mel))):
    te_temp.append(librosa.feature.melspectrogram(te_mel.values[i,:], n_mels=n_mels).flatten())
    
mel_cols = []
for i in range(n_mels):
    mel_cols.append('dst_chroma_melspec_'+str(i+1))
    
tr_mel = pd.DataFrame(tr_temp, columns=mel_cols, index=tr_mel.index)
te_mel = pd.DataFrame(te_temp, columns=mel_cols, index=te_mel.index)

tr = pd.concat([tr, tr_mel], axis=1)
te = pd.concat([te, te_mel], axis=1)

tr['mel_chroma'+'_mean'] = tr[mel_cols].mean(axis=1)
tr['mel_chroma'+'_std'] = tr[mel_cols].std(axis=1)

te['mel_chroma'+'_mean'] = te[mel_cols].mean(axis=1)
te['mel_chroma'+'_std'] = te[mel_cols].std(axis=1)

# rnn_x.append(np.array(list(map(lambda x: x.reshape(-1, 1), tr_mel.values))))

# dst sq2

In [12]:
tr_chroma=tr[sq2_dst]
te_chroma=te[sq2_dst]

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_chroma))):   
    tr_temp.append(librosa.feature.chroma_stft(tr_chroma.values[i,:]).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_chroma))):
    te_temp.append(librosa.feature.chroma_stft(te_chroma.values[i,:]).flatten())
    
chroma_cols = []

for i in range(12):
    chroma_cols.append('dst' + '_chroma2_' + str(i+1))
    
tr_chroma = pd.DataFrame(tr_temp, columns=chroma_cols, index=tr_chroma.index)
te_chroma = pd.DataFrame(te_temp, columns=chroma_cols, index=te_chroma.index)

tr = pd.concat([tr, tr_chroma], axis=1)
te = pd.concat([te, te_chroma], axis=1)

tr['chroma2'+'_mean'] = tr[chroma_cols].mean(axis=1)
tr['chroma2'+'_std'] = tr[chroma_cols].std(axis=1)

te['chroma2'+'_mean'] = te[chroma_cols].mean(axis=1)
te['chroma2'+'_std'] = te[chroma_cols].std(axis=1)

100%|██████████| 10000/10000 [00:23<00:00, 427.96it/s]
100%|██████████| 10000/10000 [00:22<00:00, 453.64it/s]


In [15]:
tr_mel=tr[sq2_dst]
te_mel=te[sq2_dst]

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_mel))):   
    tr_temp.append(librosa.feature.melspectrogram(tr_mel.values[i,:], n_mels=n_mels).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_mel))):
    te_temp.append(librosa.feature.melspectrogram(te_mel.values[i,:], n_mels=n_mels).flatten())
    
mel_cols = []
for i in range(n_mels):
    mel_cols.append('dst_melspec2_'+str(i+1))
    
tr_mel = pd.DataFrame(tr_temp, columns=mel_cols, index=tr_mel.index)
te_mel = pd.DataFrame(te_temp, columns=mel_cols, index=te_mel.index)

tr = pd.concat([tr, tr_mel], axis=1)
te = pd.concat([te, te_mel], axis=1)

tr['mel2'+'_mean'] = tr[mel_cols].mean(axis=1)
tr['mel2'+'_std'] = tr[mel_cols].std(axis=1)

te['mel2'+'_mean'] = te[mel_cols].mean(axis=1)
te['mel2'+'_std'] = te[mel_cols].std(axis=1)

# rnn_x.append(np.array(list(map(lambda x: x.reshape(-1, 1), tr_mel.values))))

100%|██████████| 10000/10000 [00:18<00:00, 535.94it/s]
100%|██████████| 10000/10000 [00:18<00:00, 551.07it/s]


In [16]:
tr_mfcc=tr_chroma
te_mfcc=te_chroma

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_mfcc))):   
    tr_temp.append(librosa.feature.mfcc(tr_mfcc.values[i,:]).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_mfcc))):
    te_temp.append(librosa.feature.mfcc(te_mfcc.values[i,:]).flatten())
    
mfcc_cols = []
for i in range(20):
    mfcc_cols.append('dst_chroma_mfcc2_'+str(i+1))
    
tr_mfcc = pd.DataFrame(tr_temp, columns=mfcc_cols, index=tr_mfcc.index)
te_mfcc = pd.DataFrame(te_temp, columns=mfcc_cols, index=te_mfcc.index)

tr = pd.concat([tr, tr_mfcc], axis=1)
te = pd.concat([te, te_mfcc], axis=1)

tr['mfcc2'+'_mean'] = tr[mfcc_cols].mean(axis=1)
tr['mfcc2'+'_std'] = tr[mfcc_cols].std(axis=1)

te['mfcc2'+'_mean'] = te[mfcc_cols].mean(axis=1)
te['mfcc2'+'_std'] = te[mfcc_cols].std(axis=1)

100%|██████████| 10000/10000 [00:58<00:00, 170.89it/s]
100%|██████████| 10000/10000 [00:56<00:00, 178.32it/s]


In [17]:
tr_mel=tr_chroma
te_mel=te_chroma

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_mel))):   
    tr_temp.append(librosa.feature.melspectrogram(tr_mel.values[i,:], n_mels=n_mels).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_mel))):
    te_temp.append(librosa.feature.melspectrogram(te_mel.values[i,:], n_mels=n_mels).flatten())
    
mel_cols = []
for i in range(n_mels):
    mel_cols.append('dst_chroma2_melspec_'+str(i+1))
    
tr_mel = pd.DataFrame(tr_temp, columns=mel_cols, index=tr_mel.index)
te_mel = pd.DataFrame(te_temp, columns=mel_cols, index=te_mel.index)

tr = pd.concat([tr, tr_mel], axis=1)
te = pd.concat([te, te_mel], axis=1)

tr['mel_chroma2'+'_mean'] = tr[mel_cols].mean(axis=1)
tr['mel_chroma2'+'_std'] = tr[mel_cols].std(axis=1)

te['mel_chroma2'+'_mean'] = te[mel_cols].mean(axis=1)
te['mel_chroma2'+'_std'] = te[mel_cols].std(axis=1)

# rnn_x.append(np.array(list(map(lambda x: x.reshape(-1, 1), tr_mel.values))))

100%|██████████| 10000/10000 [00:18<00:00, 534.98it/s]
100%|██████████| 10000/10000 [00:17<00:00, 561.29it/s]


# dst subt

In [None]:
tr_chroma=tr[subt1_dst]
te_chroma=te[subt1_dst]

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_chroma))):   
    tr_temp.append(librosa.feature.chroma_stft(tr_chroma.values[i,:]).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_chroma))):
    te_temp.append(librosa.feature.chroma_stft(te_chroma.values[i,:]).flatten())
    
chroma_cols = []

for i in range(12):
    chroma_cols.append('dst_sub_chroma1_' + str(i+1))
    
tr_chroma = pd.DataFrame(tr_temp, columns=chroma_cols, index=tr_chroma.index)
te_chroma = pd.DataFrame(te_temp, columns=chroma_cols, index=te_chroma.index)

tr = pd.concat([tr, tr_chroma], axis=1)
te = pd.concat([te, te_chroma], axis=1)

tr['chroma1_sub'+'_mean'] = tr[chroma_cols].mean(axis=1)
tr['chroma1_sub'+'_std'] = tr[chroma_cols].std(axis=1)

te['chroma1_sub'+'_mean'] = te[chroma_cols].mean(axis=1)
te['chroma1_sub'+'_std'] = te[chroma_cols].std(axis=1)

In [None]:
tr_mfcc=tr_chroma
te_mfcc=te_chroma

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_mfcc))):   
    tr_temp.append(librosa.feature.mfcc(tr_mfcc.values[i,:], htk=True).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_mfcc))):
    te_temp.append(librosa.feature.mfcc(te_mfcc.values[i,:], htk=True).flatten())
    
mfcc_cols = []
for i in range(20):
    mfcc_cols.append('dst_sub_chroma1_mfcc2_'+str(i+1))
    
tr_mfcc = pd.DataFrame(tr_temp, columns=mfcc_cols, index=tr_mfcc.index)
te_mfcc = pd.DataFrame(te_temp, columns=mfcc_cols, index=te_mfcc.index)

tr = pd.concat([tr, tr_mfcc], axis=1)
te = pd.concat([te, te_mfcc], axis=1)

tr['mfcc1_sub'+'_mean'] = tr[mfcc_cols].mean(axis=1)
tr['mfcc1_sub'+'_std'] = tr[mfcc_cols].std(axis=1)

te['mfcc1_sub'+'_mean'] = te[mfcc_cols].mean(axis=1)
te['mfcc1_sub'+'_std'] = te[mfcc_cols].std(axis=1)

In [None]:
tr_chroma=tr[subt2_dst]
te_chroma=te[subt2_dst]

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_chroma))):   
    tr_temp.append(librosa.feature.chroma_stft(tr_chroma.values[i,:]).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_chroma))):
    te_temp.append(librosa.feature.chroma_stft(te_chroma.values[i,:]).flatten())
    
chroma_cols = []

for i in range(12):
    chroma_cols.append('dst_sub_chroma2_' + str(i+1))
    
tr_chroma = pd.DataFrame(tr_temp, columns=chroma_cols, index=tr_chroma.index)
te_chroma = pd.DataFrame(te_temp, columns=chroma_cols, index=te_chroma.index)

tr = pd.concat([tr, tr_chroma], axis=1)
te = pd.concat([te, te_chroma], axis=1)

tr['chroma2_sub'+'_mean'] = tr[chroma_cols].mean(axis=1)
tr['chroma2_sub'+'_std'] = tr[chroma_cols].std(axis=1)

te['chroma2_sub'+'_mean'] = te[chroma_cols].mean(axis=1)
te['chroma2_sub'+'_std'] = te[chroma_cols].std(axis=1)

In [None]:
tr_mfcc=tr_chroma
te_mfcc=te_chroma

tr_temp = []
te_temp = []

for i in tqdm(range(len(tr_mfcc))):   
    tr_temp.append(librosa.feature.mfcc(tr_mfcc.values[i,:]).flatten()) # - np.mean(tr[dst].values[0,:])

for i in tqdm(range(len(te_mfcc))):
    te_temp.append(librosa.feature.mfcc(te_mfcc.values[i,:]).flatten())
    
mfcc_cols = []
for i in range(20):
    mfcc_cols.append('dst_sub_chroma1_mfcc2_'+str(i+1))
    
tr_mfcc = pd.DataFrame(tr_temp, columns=mfcc_cols, index=tr_mfcc.index)
te_mfcc = pd.DataFrame(te_temp, columns=mfcc_cols, index=te_mfcc.index)

tr = pd.concat([tr, tr_mfcc], axis=1)
te = pd.concat([te, te_mfcc], axis=1)

tr['mfcc1_sub'+'_mean'] = tr[mfcc_cols].mean(axis=1)
tr['mfcc1_sub'+'_std'] = tr[mfcc_cols].std(axis=1)

te['mfcc1_sub'+'_mean'] = te[mfcc_cols].mean(axis=1)
te['mfcc1_sub'+'_std'] = te[mfcc_cols].std(axis=1)

In [None]:
# librosa.feature.melspectrogram(tr[dst].values[i,:], n_mels=32)

## FT

In [None]:
alpha_real=tr[dst]
alpha_imag=tr[dst]

beta_real=te[dst]
beta_imag=te[dst]

for i in tqdm(alpha_real.index):
    alpha_real.loc[i]=alpha_real.loc[i] - alpha_real.loc[i].mean()
    alpha_imag.loc[i]=alpha_imag.loc[i] - alpha_real.loc[i].mean()
    
    alpha_real.loc[i] = np.fft.fft(alpha_real.loc[i], norm='ortho').real
    alpha_imag.loc[i] = np.fft.fft(alpha_imag.loc[i], norm='ortho').imag

    
for i in tqdm(beta_real.index):
    beta_real.loc[i]=beta_real.loc[i] - beta_real.loc[i].mean()
    beta_imag.loc[i]=beta_imag.loc[i] - beta_imag.loc[i].mean()
    
    beta_real.loc[i] = np.fft.fft(beta_real.loc[i], norm='ortho').real
    beta_imag.loc[i] = np.fft.fft(beta_imag.loc[i], norm='ortho').imag
    
real_part=[]
imag_part=[]

for col in dst:
    real_part.append(col + '_fft_real')
    imag_part.append(col + '_fft_imag')
    
alpha_real.columns=real_part
alpha_imag.columns=imag_part
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real.columns=real_part
beta_imag.columns=imag_part
beta=pd.concat((beta_real, beta_imag), axis=1)

tr=pd.concat((tr, alpha), axis=1)
te=pd.concat((te, beta), axis=1)

tr['dst_fft'+'_mean'] = tr[alpha.columns].mean(axis=1)
tr['dst_fft'+'_std'] = tr[alpha.columns].std(axis=1)

te['dst_fft'+'_mean'] = te[beta.columns].mean(axis=1)
te['dst_fft'+'_std'] = te[beta.columns].std(axis=1)

# FT sq2

In [19]:
alpha_real=tr[sq2_dst]
alpha_imag=tr[sq2_dst]

beta_real=te[sq2_dst]
beta_imag=te[sq2_dst]

for i in tqdm(alpha_real.index):
    alpha_real.loc[i]=alpha_real.loc[i] - alpha_real.loc[i].mean()
    alpha_imag.loc[i]=alpha_imag.loc[i] - alpha_real.loc[i].mean()
    
    alpha_real.loc[i] = np.fft.fft(alpha_real.loc[i], norm='ortho').real
    alpha_imag.loc[i] = np.fft.fft(alpha_imag.loc[i], norm='ortho').imag

    
for i in tqdm(beta_real.index):
    beta_real.loc[i]=beta_real.loc[i] - beta_real.loc[i].mean()
    beta_imag.loc[i]=beta_imag.loc[i] - beta_imag.loc[i].mean()
    
    beta_real.loc[i] = np.fft.fft(beta_real.loc[i], norm='ortho').real
    beta_imag.loc[i] = np.fft.fft(beta_imag.loc[i], norm='ortho').imag
    
real_part=[]
imag_part=[]

for col in sq2_dst:
    real_part.append(col + '_fft_real')
    imag_part.append(col + '_fft_imag')
    
alpha_real.columns=real_part
alpha_imag.columns=imag_part
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real.columns=real_part
beta_imag.columns=imag_part
beta=pd.concat((beta_real, beta_imag), axis=1)

tr=pd.concat((tr, alpha), axis=1)
te=pd.concat((te, beta), axis=1)

tr['dst_sq2_fft'+'_mean'] = tr[alpha.columns].mean(axis=1)
tr['dst_sq2_fft'+'_std'] = tr[alpha.columns].std(axis=1)

te['dst_sq2_fft'+'_mean'] = te[beta.columns].mean(axis=1)
te['dst_sq2_fft'+'_std'] = te[beta.columns].std(axis=1)

100%|██████████| 10000/10000 [00:20<00:00, 495.38it/s]
100%|██████████| 10000/10000 [00:20<00:00, 492.63it/s]


In [None]:
np.sum(np.sum(tr.isna()))

In [None]:
alpha_real=tr[src]
alpha_imag=tr[src]

beta_real=te[src]
beta_imag=te[src]

for i in tqdm(alpha_real.index):
    alpha_real.loc[i]=alpha_real.loc[i] - alpha_real.loc[i].mean()
    alpha_imag.loc[i]=alpha_imag.loc[i] - alpha_real.loc[i].mean()
    
    alpha_real.loc[i] = np.fft.fft(alpha_real.loc[i], norm='ortho').real
    alpha_imag.loc[i] = np.fft.fft(alpha_imag.loc[i], norm='ortho').imag

    
for i in tqdm(beta_real.index):
    beta_real.loc[i]=beta_real.loc[i] - beta_real.loc[i].mean()
    beta_imag.loc[i]=beta_imag.loc[i] - beta_imag.loc[i].mean()
    
    beta_real.loc[i] = np.fft.fft(beta_real.loc[i], norm='ortho').real
    beta_imag.loc[i] = np.fft.fft(beta_imag.loc[i], norm='ortho').imag
    
real_part=[]
imag_part=[]

for col in src:
    real_part.append(col + '_fft_real')
    imag_part.append(col + '_fft_imag')
    
alpha_real.columns=real_part
alpha_imag.columns=imag_part
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real.columns=real_part
beta_imag.columns=imag_part
beta=pd.concat((beta_real, beta_imag), axis=1)

tr=pd.concat((tr, alpha), axis=1)
te=pd.concat((te, beta), axis=1)

tr['src_fft'+'_mean'] = tr[alpha.columns].mean(axis=1)
tr['src_fft'+'_std'] = tr[alpha.columns].std(axis=1)

te['src_fft'+'_mean'] = te[beta.columns].mean(axis=1)
te['src_fft'+'_std'] = te[beta.columns].std(axis=1)

In [None]:
alpha_real=tr[sq2_src]
alpha_imag=tr[sq2_src]

beta_real=te[sq2_src]
beta_imag=te[sq2_src]

for i in tqdm(alpha_real.index):
    alpha_real.loc[i]=alpha_real.loc[i] - alpha_real.loc[i].mean()
    alpha_imag.loc[i]=alpha_imag.loc[i] - alpha_real.loc[i].mean()
    
    alpha_real.loc[i] = np.fft.fft(alpha_real.loc[i], norm='ortho').real
    alpha_imag.loc[i] = np.fft.fft(alpha_imag.loc[i], norm='ortho').imag

    
for i in tqdm(beta_real.index):
    beta_real.loc[i]=beta_real.loc[i] - beta_real.loc[i].mean()
    beta_imag.loc[i]=beta_imag.loc[i] - beta_imag.loc[i].mean()
    
    beta_real.loc[i] = np.fft.fft(beta_real.loc[i], norm='ortho').real
    beta_imag.loc[i] = np.fft.fft(beta_imag.loc[i], norm='ortho').imag
    
real_part=[]
imag_part=[]

for col in src:
    real_part.append(col + '_fft_real')
    imag_part.append(col + '_fft_imag')
    
alpha_real.columns=real_part
alpha_imag.columns=imag_part
alpha = pd.concat((alpha_real, alpha_imag), axis=1)

beta_real.columns=real_part
beta_imag.columns=imag_part
beta=pd.concat((beta_real, beta_imag), axis=1)

tr=pd.concat((tr, alpha), axis=1)
te=pd.concat((te, beta), axis=1)

tr['sq2_src_fft'+'_mean'] = tr[alpha.columns].mean(axis=1)
tr['sq2_src_fft'+'_std'] = tr[alpha.columns].std(axis=1)

te['sq2_src_fft'+'_mean'] = te[beta.columns].mean(axis=1)
te['sq2_src_fft'+'_std'] = te[beta.columns].std(axis=1)

In [None]:
np.sum(np.sum(te.isna()))

In [None]:
tr.shape

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error



In [21]:
bounds_LGB = {
    'num_leaves': (100, 800), 
    'min_data_in_leaf': (0, 150),
    'bagging_fraction' : (0.3, 0.9),
    'feature_fraction' : (0.3, 0.9),
#     'learning_rate': (0.01, 1),
    'min_child_weight': (0.01, 3),   
    'reg_alpha': (0.01, 3), 
    'reg_lambda': (0.01, 1),
    'max_depth':(6, 29),
    'n_estimators': (64, 512)
}

def build_lgb(x, y, init_points=10, n_iter=10, cv=2, param=True, verbose=2, is_test=False):
    train_X, test_X, train_y, test_y = train_test_split(x.values, y.values, test_size=0.3, random_state=SEED, shuffle=True)
    def LGB_bayesian(
        #learning_rate,
        num_leaves, 
        bagging_fraction,
        feature_fraction,
        min_child_weight, 
        min_data_in_leaf,
        max_depth,
        reg_alpha,
        reg_lambda,
        n_estimators
         ):
        # LightGBM expects next three parameters need to be integer. 
        num_leaves = int(num_leaves)
        min_data_in_leaf = int(min_data_in_leaf)
        max_depth = int(max_depth)

        assert type(num_leaves) == int
        assert type(min_data_in_leaf) == int
        assert type(max_depth) == int


        params = {
                  'num_leaves': num_leaves, 
                  'min_data_in_leaf': min_data_in_leaf,
                  'min_child_weight': min_child_weight,
                  'bagging_fraction' : bagging_fraction,
                  'feature_fraction' : feature_fraction,
                  'learning_rate' : 0.05,
                  'max_depth': max_depth,
                  'reg_alpha': reg_alpha,
                  'reg_lambda': reg_lambda,
                  'objective': 'regression',
                  'save_binary': True,
                  'seed': SEED,
                  'feature_fraction_seed': SEED,
                  'bagging_seed': SEED,
                  'drop_seed': SEED,
                  'data_random_seed': SEED,
                  'boosting': 'gbdt', ## some get better result using 'dart'
                  'verbose': 1,
                  'boost_from_average': True,
                  'metric':'mae',
                  'n_estimators': int(n_estimators),
                  'n_jobs': -1,
                  'tree_learner ': 'voting'
        }    

        ## set reg options
        reg = lgb.LGBMRegressor(**params)
        m_reg = MultiOutputRegressor(reg)
#         m_reg.fit(train_X, train_y)
#         score = mean_absolute_error(test_y, m_reg.predict(test_X))
        score = cross_val_score(m_reg, x, y, cv=cv, scoring='neg_mean_absolute_error').mean()

        return score
    
    optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=SEED, verbose=verbose)
    init_points = init_points
    n_iter = n_iter

    optimizer.maximize(init_points=init_points, n_iter=n_iter)
    
    param_lgb = {
        'min_data_in_leaf': int(optimizer.max['params']['min_data_in_leaf']), 
        'num_leaves': int(optimizer.max['params']['num_leaves']), 
        'learning_rate': 0.05,
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'bagging_fraction': optimizer.max['params']['bagging_fraction'], 
        'feature_fraction': optimizer.max['params']['feature_fraction'],
        'reg_lambda': optimizer.max['params']['reg_lambda'],
        'reg_alpha': optimizer.max['params']['reg_alpha'],
        'max_depth': int(optimizer.max['params']['max_depth']), 
        'objective': 'regression',
        'save_binary': True,
        'seed': SEED,
        'feature_fraction_seed': SEED,
        'bagging_seed': SEED,
        'drop_seed': SEED,
        'data_random_seed': SEED,
        'boosting_type': 'gbdt',  # also consider 'dart'
        'verbose': 1,
        'boost_from_average': True,
        'metric':'mae',
        'n_estimators': int(optimizer.max['params']['n_estimators']),
        'n_jobs': -1,
        'tree_learner ': 'voting'
    }

    params = param_lgb.copy()

    reg = lgb.LGBMRegressor(**params)
    lgb_reg = MultiOutputRegressor(reg)
    lgb_reg.fit(x.values, y.values)

    if param:
        return lgb_reg, params
    else:
        return lgb_reg


In [None]:
# tr = tr.drop(ctd, axis=1)
# te = te.drop(ctd, axis=1)

In [None]:
ttr = tr.drop(list(src)+list(dst), axis=1)
# te = te.drop(src, axis=1)
ttr.shape

In [None]:
# base: -1.135
# base2: -1.132

# rolling: -1.125
# rolling2: -1.126

# mfcc: -1.114
# mfcc2: -1.114

# fft: -1.091
# fft2: -1.089

# mfcc_sq1: -1.092
# mfcc_sq1_2: 

# fft_sq2: -1.079
# fft_sq2_2: -1.078


In [None]:
ttr = tr.drop(list(imps['col'][imps['imp']==0].values), axis=1)
tte = te.drop(list(imps['col'][imps['imp']==0].values), axis=1)

In [22]:
lgb_reg = build_lgb(tr, target, 6, 10, param=False)

|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_da... | n_esti... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-1.094   [0m | [0m 0.5247  [0m | [0m 0.8704  [0m | [0m 22.84   [0m | [0m 1.8     [0m | [0m 23.4    [0m | [0m 133.9   [0m | [0m 140.7   [0m | [0m 2.6     [0m | [0m 0.6051  [0m |
| [95m 2       [0m | [95m-1.076   [0m | [95m 0.7248  [0m | [95m 0.3124  [0m | [95m 28.31   [0m | [95m 2.499   [0m | [95m 31.85   [0m | [95m 145.5   [0m | [95m 228.4   [0m | [95m 0.9197  [0m | [95m 0.5295  [0m |
| [95m 3       [0m | [95m-1.061   [0m | [95m 0.5592  [0m | [95m 0.4747  [0m | [95m 20.07   [0m | [95m 0.4271  [0m | [95m 43.82   [0m | [95m 228.1   [0m | [95m 419.2   [0m | [95m 2.358   [0m | [95m 0.2077  [0m |
| [0m 4       [0m | [0m-1.121   [0m | [

In [None]:
# lgb_reg2 = build_lgb(ttr, target, 3, 5, param=False)

In [None]:
%%time
rf = RandomForestRegressor(max_depth=13, n_jobs=-1, random_state=42)

# np.mean(cross_val_score(rf, tr, target, scoring='neg_mean_absolute_error', cv=3))
rf.fit(ttr, target)

In [None]:
imps = pd.DataFrame()

imps['col'] = ttr.columns
imps['imp'] = 0
for i in range(4):
    imps['imp'] += lgb_reg.estimators_[i].booster_.feature_importance(importance_type='gain')
# imps['imp'] = rf.feature_importances_

imps.sort_values('imp', ascending=1).head(40)

In [None]:
list(imps['col'][imps['imp']==0].values)

In [None]:
lgb_reg.predict(te)

In [None]:
raise('eo')

In [None]:
sc = PowerTransformer().fit(tr)
tr2 = pd.DataFrame(sc.transform(tr), columns=tr.columns)

sc = PowerTransformer().fit(ttr)
ttr2 = pd.DataFrame(sc.transform(ttr), columns=ttr.columns)


In [None]:
lgb_reg3 = build_lgb(tr2, target, 3, 0, param=False)

In [None]:
lgb_reg4 = build_lgb(ttr2, target, 3, 0, param=False)

In [None]:
imps = pd.DataFrame()

imps['col'] = ttr.columns

imps['imp'] = lgb_reg.estimators_[3].booster_.feature_importance(importance_type='gain')

imps.sort_values('imp', ascending=1).head(40)

In [None]:
regs = {}
for tc in target_cols:
    temp = {}
    for u in tr['rho'].unique():
        ttr = tr[tr['rho']==u].drop('rho', axis=1)
        tr_y = target[tr['rho']==u][tc]
    
        temp[u] = build_lgb(ttr, tr_y, 6, 5, param=False)
        
    regs[tc] = temp

In [None]:
for tc in target_cols:
    temp = sub[tc].astype('float32')
    for u in tr['rho'].unique():
        tte = te[te['rho']==u].drop('rho', axis=1)

        pred = regs[tc][u].predict(tte)
        temp[te['rho']==u] = pred
    sub[tc] = temp
    
sub

In [None]:
sub.to_csv('test.csv')

In [None]:
sns.distplot(target[target_cols[2]])

In [None]:
pred = lgb_reg.predict(te)
pred

In [None]:
sub[target_cols] = pred
sub.to_csv('test.csv')

In [None]:
imps = pd.DataFrame()

imps['col'] = tr.columns

imps['imp'] = lgb_reg.estimators_[3].booster_.feature_importance(importance_type='gain')

imps.sort_values('imp', ascending=1).head(40)

In [None]:
tr.groupby('rho').mean()

In [None]:
# cat
reg = cat.CatBoostRegressor(eval_metric='MAE',
                            task_type='CPU',
                            early_stopping_rounds = 100,
                            iterations = 10000,
                            metric_period = 10000,
                            grow_policy = 'Lossguide',
                            l2_leaf_reg = 0.01,
                            random_seed=SEED)

cat_reg = MultiOutputRegressor(reg)
# cat_reg.fit(tr, target)

In [None]:
rf = RandomForestRegressor(n_estimators=150,
                                max_depth=9, 
                                max_features='sqrt', 
                                random_state=SEED)

pca = PCA(10, random_state=SEED, whiten=True)

In [None]:
models = [
    [lgb_reg, pca, cat_reg],
    [rf]
         ]

In [None]:
model = StackNetRegressor(models, 
                           metric="mae", 
                           folds=2,
                           restacking=False,
                           random_state=SEED,
                           n_jobs=-1, 
                           verbose=1)

model.fit(tr, target)

In [None]:
pred = lgb_reg.predict(tte)
pred

In [None]:
sub[target_cols] = pred
sub.to_csv('0618.csv')

In [None]:
np.sum(np.sum(tr))

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.cross_decomposition import PLSRegression
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
tr.shape

In [None]:
lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
svr = SVR()
rf = RandomForestRegressor(max_depth=13, n_jobs=-1, random_state=42)
mlp = MLPRegressor([256, 128], learning_rate='adaptive', random_state=42)

In [None]:
tr['rho'].unique()

In [None]:
ttr = tr[tr['rho']==20].drop('rho', axis=1)
tr_y = target[tr['rho']==20]

In [None]:
np.mean(cross_val_score(mlp, tr, target[target_cols[3]], scoring='neg_mean_absolute_error', cv=4))

In [None]:
sc = PowerTransformer().fit(tr)

ttr = sc.transform(tr)
# tte = sc.transform(te)

In [None]:
np.mean(cross_val_score(lr, tr, target, scoring='neg_mean_absolute_error', cv=4))

In [None]:
np.mean(cross_val_score(rf, tr, target, scoring='neg_mean_absolute_error', cv=4))

In [None]:
np.mean(cross_val_score(rf, ttr, target, scoring='neg_mean_absolute_error', cv=4))

In [None]:
adr = AdaBoostRegressor(lasso)
adr = MultiOutputRegressor(adr)

np.mean(cross_val_score(adr, tr, target, scoring='neg_mean_absolute_error', cv=4))

In [None]:
bgr = BaggingRegressor(lasso)
bgr = MultiOutputRegressor(bgr)

np.mean(cross_val_score(bgr, ttr, target, scoring='neg_mean_absolute_error', cv=4))

In [None]:
import keras
import keras.backend as K
from keras import layers, models, optimizers
from keras.layers import Dense, Concatenate, Activation, BatchNormalization
from keras.models import Input, Model, Sequential

import tensorflow as tf

def mish(x):
    return x*K.tanh(K.softplus(x))

inputs = Input(shape = (tr.shape[-1], ))

x = Dense(1024, kernel_initializer='he_normal')(inputs)
x = BatchNormalization(momentum=0.8)(x)
x = Activation(mish)(x)
x = Dense(512, kernel_initializer='he_normal')(x)
x = Activation(mish)(x)
x = Dense(256, kernel_initializer='he_normal')(x)
x = Activation(mish)(x)

x = Dense(4, kernel_initializer='he_normal')(x)

model = Model(inputs, x)

model.compile(loss='mae', optimizer=optimizers.adam(lr=1e-4))

es = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)



In [None]:
# ttr = sc.transform(tr)
model.fit(tr, target,
         epochs=100,
         validation_split=0.3,
         callbacks=[es])

In [None]:
target