# Libs

In [None]:
import os
import time
import glob
import json
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import BatchNormalization, LeakyReLU
from tensorflow.keras.initializers import he_normal, constant
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import mae, AUC, Accuracy
from tensorflow.keras.utils import normalize

import lightgbm as lgb

from utils import *

# Load Data

In [None]:
# Feature-type metadata: the JSON payload is unpacked into two names,
# the categorical column list first and the numeric column list second.
with open('../Data/feature_types.json', 'r') as f:
    categorical_features, numeric_features = json.load(f)

# Raw feature table and target table, stored as separate CSV files.
X = pd.read_csv('../Data/X.csv')
Y = pd.read_csv('../Data/Y.csv')

# One-hot encoded copy of the features; the raw frame `X` is kept as well.
# `one_hot_encoding` is not defined in this notebook -- presumably it comes
# from the `utils` star import.
X_onehot = one_hot_encoding(X, categorical_features)

In [None]:
# Run configuration (hard-coded stand-in for parsed arguments).
use_simple_features = False
use_best_features = False
split_mode = 'fix_transfer'

# Both splits below are produced with exactly the same options.
# NOTE(review): later cells use weights derived from the one-hot split
# together with the raw split, which assumes `dataset_split` returns the
# same rows in the same order for both calls -- confirm it is deterministic.
split_kwargs = dict(
    split_mode=split_mode,
    use_simple_features=use_simple_features,
    use_best_features=use_best_features,
    num_features=0,
)

# Raw (label-encoded) features plus targets.
X_train, X_test, y_train, y_test = dataset_split(X, Y, **split_kwargs)

# One-hot features; the targets from this call are discarded.
X_train_oh, X_test_oh, _, _ = dataset_split(X_onehot, Y, **split_kwargs)

# Select and Weight Samples

In [None]:
# Classifier used to derive per-sample training weights.
# Values handled by the next cell: 'logit' (logistic regression) or 'mlp'.
weight_machine = 'mlp'

In [None]:
# Estimate a weight for every training sample with a domain classifier:
# train rows are labelled 0, test rows are labelled 1, and the classifier's
# predicted probability of "test" for each training row becomes its weight.
#
# Strings are compared with `==`; `is` checks object identity, only works
# by accident of string interning, and raises a SyntaxWarning on
# Python >= 3.8.
if weight_machine == 'logit':
    X_data = pd.concat((X_train, X_test), axis=0)
    y_data = np.concatenate((np.zeros((X_train.shape[0], 1)),
                             np.ones((X_test.shape[0], 1))))

    # define classifier (scikit-learn expects a 1-D target, hence ravel())
    lr = LogisticRegression(penalty='l2', random_state=1819)
    lr = lr.fit(X_data, y_data.ravel())

    # weight samples: column 1 holds the probability of label 1 (test)
    sample_weights = lr.predict_proba(X_train)[:, 1]

elif weight_machine == 'mlp':
    X_data = pd.concat((X_train_oh, X_test_oh), axis=0)
    y_data = np.concatenate((np.zeros((X_train_oh.shape[0], 1)),
                             np.ones((X_test_oh.shape[0], 1))))

    # define classifier: two hidden blocks (Dense -> LeakyReLU -> Dropout
    # -> BatchNorm) followed by a single sigmoid unit
    x_in = Input(shape=(X_data.shape[1],))
    h = Dense(units=128, use_bias=True, kernel_initializer=he_normal(), activation=None)(x_in)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = BatchNormalization()(h)
    h = Dense(units=32, use_bias=True, kernel_initializer=he_normal(), activation=None)(h)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = BatchNormalization()(h)
    h = Dense(units=1, use_bias=True, kernel_initializer=he_normal(), activation='sigmoid')(h)

    mlp_clf = Model(inputs=x_in, outputs=h)
    mlp_clf.compile(loss='binary_crossentropy', optimizer=Adam(5e-4), metrics=['accuracy'])
    hist = mlp_clf.fit(X_data, y_data, batch_size=512, epochs=10, shuffle=True, verbose=0)

    # weight samples: flatten the (n, 1) prediction to a 1-D vector
    sample_weights = mlp_clf.predict(X_train_oh).reshape(-1)

else:
    # Fail loudly instead of leaving `sample_weights` undefined (or stale
    # from a previous run of this cell).
    raise ValueError(
        "Unknown weight_machine {!r}; expected 'logit' or 'mlp'".format(weight_machine))

# Train Regressor

In [None]:
# Regression model to train with the sample weights.
# Values handled by the next cell: 'gbm' (LightGBM) or 'mlp' (Keras network).
regressor = 'mlp'

In [None]:
# Train the selected regressor on the weighted training set, then report
# error metrics on the test set.
#
# Strings are compared with `==`; `is` checks object identity, only works
# by accident of string interning, and raises a SyntaxWarning on
# Python >= 3.8.
#
# NOTE(review): `mae` is imported from tensorflow.keras.metrics at the top
# of the notebook, but `from utils import *` runs afterwards and may
# rebind it; `mape` and `mspe` are presumably provided by `utils` too.
# NOTE(review): in both branches the test split is also used as the
# validation set during training, so the reported test metrics are not
# from fully held-out data.
if regressor == 'gbm':
    sp = time.time()

    # LightGBM consumes the raw (non one-hot) features and is told which
    # columns are categorical.
    train_data = lgb.Dataset(X_train, y_train,
                             categorical_feature=categorical_features)
    train_data.set_weight(sample_weights.reshape(-1))
    test_data = lgb.Dataset(X_test, y_test, reference=train_data)

    params = {
        'objective': 'regression',
        'boosting': 'gbdt',
        'metric': 'mae',
        'num_rounds': 20000,
        'learning_rate': 0.001,
        'max_depth': 8,
        'num_leaves': 100,
        'feature_fraction': 0.5,
        'bagging_fraction': 0.5,
        'bagging_freq': 200,
        'verbose': 0
    }

    # NOTE(review): the `verbose_eval` and `early_stopping_rounds` keyword
    # arguments are deprecated in recent LightGBM releases in favour of the
    # log_evaluation / early_stopping callbacks -- check before upgrading.
    gbm = lgb.train(params, train_data,
                    valid_sets=[test_data, train_data],
                    valid_names=['test', 'train'],
                    verbose_eval=1000,
                    early_stopping_rounds=100)
    print("[Duration] {:.2f} sec.".format(time.time() - sp))

    # evaluate with the iteration picked by early stopping
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    print("[LightGBM] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
        mae(y_pred, y_test),
        100 * mape(y_pred, y_test), 100 * mspe(y_pred, y_test)))

elif regressor == 'mlp':
    sp = time.time()

    # The network consumes the one-hot encoded features.
    x_in = Input(shape=(X_train_oh.shape[1],))

    def dense_block(h, units):
        """Apply Dense -> BatchNorm -> LeakyReLU(0.2) -> Dropout(0.5) to `h`.

        Args:
            h: input Keras tensor.
            units: width of the Dense layer.

        Returns:
            The output tensor of the block.
        """
        h = Dense(units=units, use_bias=True,
                  activation=None,
                  kernel_initializer=he_normal(),
                  bias_initializer=constant(0.0))(h)
        h = BatchNormalization()(h)
        h = LeakyReLU(0.2)(h)
        h = Dropout(rate=0.5)(h)
        return h

    h = dense_block(x_in, units=256)
    h = dense_block(h, units=128)
    h = dense_block(h, units=64)
    # ReLU on the single output unit keeps predictions non-negative.
    h = Dense(units=1, use_bias=False, activation='relu', kernel_initializer=he_normal())(h)

    # Build and compile once (the original repeated these two statements).
    mlp = Model(inputs=x_in, outputs=h)
    mlp.compile(loss='mse', optimizer=Adam(3e-4), metrics=['mae'])

    # validation_data is passed as a tuple, the form documented by Keras;
    # a list may be interpreted as inputs only by some versions.
    hist = mlp.fit(X_train_oh, y_train, batch_size=128, epochs=200,
                   shuffle=True, verbose=1, sample_weight=sample_weights,
                   validation_data=(X_test_oh, y_test))

    print("[Duration] {:.2f} sec.".format(time.time() - sp))

    # evaluate: flatten the (n, 1) prediction to a 1-D vector
    y_pred = mlp.predict(X_test_oh).reshape(-1)
    print("[MLP] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
        mae(y_pred, y_test),
        100 * mape(y_pred, y_test),
        100 * mspe(y_pred, y_test)))

else:
    # Fail loudly rather than silently skipping training.
    raise ValueError(
        "Unknown regressor {!r}; expected 'gbm' or 'mlp'".format(regressor))