# Libs

In [1]:
import os
import time
import glob
import json
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import BatchNormalization, LeakyReLU
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import mae, AUC, Accuracy
from tensorflow.keras.utils import normalize

import lightgbm as lgb

from utils import *

# Load Data

In [2]:
# Read the feature matrix (X) and the targets (Y) from their CSV files.
X, Y = (pd.read_csv(csv_path) for csv_path in ('../Data/X.csv', '../Data/Y.csv'))

# The JSON payload unpacks into exactly two values, taken here as the
# categorical and the numeric feature column lists (in that order).
with open('../Data/feature_types.json', 'r') as f:
    feature_types = json.load(f)
categorical_features, numeric_features = feature_types

# One-hot encode the categorical columns; `one_hot_encoding` is a helper
# pulled in by `from utils import *`.
X_onehot = one_hot_encoding(X, categorical_features)

In [3]:
# Split configuration. Both splits below share exactly these options so
# that they differ only in the feature matrix they are given.
use_simple_features = False
use_best_features = False
split_mode = 'fix_transfer'

split_kwargs = dict(split_mode=split_mode,
                    use_simple_features=use_simple_features,
                    use_best_features=use_best_features,
                    num_features=0)

# Split of the raw feature matrix.
X_train, X_test, y_train, y_test = dataset_split(X, Y, **split_kwargs)

# Split of the one-hot encoded matrix. The original call passed `X` again,
# which made X_train_oh / X_test_oh plain copies of the raw split and left
# X_onehot unused.
# NOTE(review): this assumes dataset_split selects the same rows for a given
# split_mode, so that y_train / y_test also line up with the *_oh matrices --
# confirm against utils.dataset_split.
X_train_oh, X_test_oh, _, _ = dataset_split(X_onehot, Y, **split_kwargs)

# Distribution Shift

In [5]:
# Selects how X_new is built in the following cell:
# 'svd' -> TruncatedSVD branch, 'ae' -> autoencoder branch.
shift_mode = 'svd'

In [6]:
# Build X_new, a 100-dimensional representation of every sample, using the
# method named by `shift_mode`.
#
# Train and test rows are stacked into X_ so that both parts go through the
# same transform (note: the transform is therefore fit on train AND test
# rows). train_idx / test_idx are the row positions of each part in X_new.
X_ = pd.concat((X_train, X_test), axis=0)
train_idx = range(0, X_train.shape[0])
test_idx = range(X_train.shape[0], X_.shape[0])

# Strings are compared with `==`; `is` tests object identity and only
# happens to work when the interpreter interns both literals.
if shift_mode == 'svd':
    # define transformer
    svd = TruncatedSVD(n_components=100, n_iter=20, random_state=1326)
    # Fitting on the transpose makes the samples play the role of
    # "features", so components_ has shape (n_components, n_samples).
    svd.fit(X_.T)
    print("[Stats] Var. explanation ratio: {:.4f}".format(svd.explained_variance_ratio_.sum()))

    # transform dataset: one row of n_components values per sample
    X_new = svd.components_.T

elif shift_mode == 'ae':
    # Normalise once and reuse for training input, training target and
    # the final encoding pass.
    X_norm = normalize(X_)
    n_features = X_.shape[1]

    # define transformer: n_features -> 128 -> 100 (bottleneck) -> n_features
    x_in = Input(shape=(n_features,))
    h = Dense(units=128,use_bias=True,kernel_initializer=he_normal(),activation=None)(x_in)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = BatchNormalization()(h)
    # Named so the encoder below can look it up without depending on the
    # layer's position in ae.layers.
    h = Dense(units=100,use_bias=True,kernel_initializer=he_normal(),activation=None,
              name='bottleneck')(h)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = Dense(units=n_features,use_bias=True,kernel_initializer=he_normal(),activation='relu')(h)

    ae = Model(inputs=x_in, outputs=h)
    ae.compile(loss='mse',optimizer=Adam(5e-4))
    # An autoencoder reconstructs its own input, so the target is X_norm.
    # (The original referenced the undefined names `X_data` and `y_`.)
    hist = ae.fit(X_norm, X_norm,
                  batch_size=512, epochs=10,
                  shuffle=True,verbose=0)

    # transform dataset: take the (pre-activation) output of the 100-unit
    # bottleneck Dense layer -- the same layer the original picked via
    # ae.layers[5].
    Transformer = Model(inputs=ae.input, outputs=ae.get_layer('bottleneck').output)
    X_new = Transformer.predict(X_norm)

[Stats] Var. explanation ratio: 1.0000


# Train Regressor

In [7]:
# Selects the model trained in the following cell:
# 'gbm' -> LightGBM branch, 'mlp' -> Keras dense-network branch.
regressor = 'gbm'

In [8]:
# Train and evaluate the model named by `regressor`.
# Strings are compared with `==`; `is` tests object identity and only
# happens to work when the interpreter interns both literals.
#
# NOTE(review): `mae` is imported from tensorflow.keras.metrics, but the later
# `from utils import *` may rebind it (as it must provide `mape` / `mspe`) --
# confirm which implementation and argument order the prints below rely on.
if regressor == 'gbm':
    sp = time.time()
    # The 'gbm' branch trains on the transformed features X_new.
    train_data = lgb.Dataset(X_new[train_idx], y_train)
    test_data = lgb.Dataset(X_new[test_idx], y_test, reference=train_data)

    params = {
        'objective':'regression',
        'boosting':'gbdt',
        'metric':'mae',
        'num_rounds':20000,
        'learning_rate':0.001,
        'max_depth':8,
        'num_leaves':100,
        'feature_fraction':0.9,
        'bagging_fraction':0.9,
        'bagging_freq':200,
        'verbose':0
    }

    # The test split doubles as the early-stopping validation set, so the
    # reported test error is selected on the same data it is measured on.
    # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
    # arguments of older LightGBM releases; newer releases expect
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)] -- confirm
    # against the installed version before upgrading.
    gbm = lgb.train(params, train_data,
                    valid_sets=[test_data, train_data],
                    valid_names=['test','train'],
                    verbose_eval=200,
                    early_stopping_rounds=100)
    print("[Duration] {:.2f} sec.".format(time.time() - sp))


    # evaluate at the best iteration found by early stopping
    y_pred = gbm.predict(X_new[test_idx],num_iteration=gbm.best_iteration)
    print("[LightGBM] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
        mae(y_pred, y_test),
        100 * mape(y_pred,y_test), 100 * mspe(y_pred, y_test)))

elif regressor == 'mlp':
    sp = time.time()

    # The 'mlp' branch trains on the *_oh split, not on X_new.
    # Architecture: 256 -> 128 -> 64 -> 32 -> 1, each hidden layer followed
    # by LeakyReLU(0.1) and Dropout(0.5); the output uses ReLU.
    x_in = Input(shape=(X_train_oh.shape[1],))
    h = Dense(units=256,use_bias=True,kernel_initializer=he_normal(),activation=None)(x_in)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = Dense(units=128,use_bias=True,kernel_initializer=he_normal(),activation=None)(h)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = Dense(units=64,use_bias=True,kernel_initializer=he_normal(),activation=None)(h)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = Dense(units=32,use_bias=True,kernel_initializer=he_normal(),activation=None)(h)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = Dense(units=1,use_bias=True,kernel_initializer=he_normal(),activation='relu')(h)

    mlp = Model(inputs=x_in, outputs=h)
    mlp.compile(loss='mse',optimizer=Adam(3e-4),metrics=['mae'])
    # validation_data is passed as a tuple, the form Keras documents
    # (the original passed a list).
    hist = mlp.fit(X_train_oh, y_train, batch_size=128, epochs=200,
                   shuffle=True, verbose=1,
                   validation_data=(X_test_oh, y_test))

    print("[Duration] {:.2f} sec.".format(time.time() - sp))

    # evaluate; predictions are flattened to a 1-D array
    y_pred = mlp.predict(X_test_oh).reshape(-1)
    print("[MLP] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
        mae(y_pred, y_test),
        100 * mape(y_pred,y_test),
        100 * mspe(y_pred, y_test)))

Training until validation scores don't improve for 100 rounds
[200]	train's l1: 53.4869	test's l1: 50.9033
[400]	train's l1: 46.3937	test's l1: 45.9228
[600]	train's l1: 40.6122	test's l1: 42.1669
[800]	train's l1: 35.9312	test's l1: 39.3858
[1000]	train's l1: 32.1493	test's l1: 37.3665
[1200]	train's l1: 29.0833	test's l1: 35.619
[1400]	train's l1: 26.6438	test's l1: 34.2893
[1600]	train's l1: 24.6704	test's l1: 33.2119
[1800]	train's l1: 23.0947	test's l1: 32.3905
[2000]	train's l1: 21.8099	test's l1: 31.7867
[2200]	train's l1: 20.7289	test's l1: 31.2858
[2400]	train's l1: 19.8641	test's l1: 30.9307
[2600]	train's l1: 19.1456	test's l1: 30.6374
[2800]	train's l1: 18.5366	test's l1: 30.3915
[3000]	train's l1: 18.0323	test's l1: 30.2302
[3200]	train's l1: 17.5909	test's l1: 30.1251
[3400]	train's l1: 17.2009	test's l1: 30.0333
[3600]	train's l1: 16.8538	test's l1: 29.922
[3800]	train's l1: 16.5602	test's l1: 29.825
[4000]	train's l1: 16.2959	test's l1: 29.7188
[4200]	train's l1: 16.037