# Libs

In [1]:
import os
import time
import glob
import json
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import tensorflow.keras.backend as K
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, LeakyReLU, BatchNormalization
from tensorflow.keras.layers import Dense, Reshape, Activation, Dropout, Flatten
from tensorflow.keras.layers import Embedding, Concatenate, Add, Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import he_normal, constant
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical, normalize

from sklearn.neighbors import KernelDensity
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor

import lightgbm as lgb

from utils import *
from feat_eng import engineering

# Load Data

In [2]:
X = pd.read_csv('../Data/X.csv')
Y = pd.read_csv('../Data/Y.csv')

# load feature type columns
with open('../Data/feature_types.json', 'r') as f:
    categorical_features, numeric_features = json.load(f)

# one-hot encoding cat features
X_onehot = one_hot_encoding(X, categorical_features)

In [3]:
# split the dataset
X_train, X_test, y_train, y_test = dataset_split(X, Y,
                                                 split_mode='fix_transfer',
                                                 use_features=[],
                                                 use_best_features=False,
                                                 num_features=40)
current_cat_feats = list(set(X_train.columns).intersection(set(categorical_features)))

# Linear Regression

In [4]:
lr = LinearRegression()
lr = lr.fit(X_train,y_train)

print("[Linear-Regression] train score: {:.3f} | test score: {:.3f}".format(
    lr.score(X_train,y_train),
    lr.score(X_test,y_test)))

y_pred = lr.predict(X_test)
print("[Linear-Regression] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
    mae(y_pred, y_test),
    100 * mape(y_pred,y_test), 100 * mspe(y_pred, y_test)))

[Linear-Regression] train score: 0.762 | test score: 0.425
[Linear-Regression] mae: 53.66 | mape: 42.99% | mspe: 22.67%


# $k$-NN

In [8]:
knn = KNeighborsRegressor(n_neighbors=20, weights='distance', n_jobs=-1)
knn = knn.fit(X_train,y_train)

print("[k-NN] train score: {:.3f} | test score: {:.3f}".format(
    knn.score(X_train,y_train),
    knn.score(X_test,y_test)))

y_pred = knn.predict(X_test)
print("[k-NN] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
    mae(y_pred, y_test),
    100 * mape(y_pred,y_test), 100 * mspe(y_pred, y_test)))

[k-NN] train score: 0.999 | test score: 0.669
[k-NN] mae: 36.74 | mape: 33.18% | mspe: 13.04%


# Random Forest

In [6]:
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf = rf.fit(X_train, y_train)

print("[Random-Forest] train score: {:.3f} | test score: {:.3f}".format(
    rf.score(X_train,y_train),
    rf.score(X_test,y_test)))

y_pred = rf.predict(X_test)
print("[Random-Forest] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
    mae(y_pred, y_test),
    100 * mape(y_pred,y_test), 100 * mspe(y_pred, y_test)))

[Random-Forest] train score: 0.989 | test score: 0.615
[Random-Forest] mae: 37.79 | mape: 35.91% | mspe: 15.17%


# LightGBM

In [10]:
params = {
    'objective':'regression',
    'boosting':'gbdt',
    'metric':'mae',
    'num_rounds':5000,
    'learning_rate':0.001,
    'max_depth':8,
    'num_leaves':100,
    'verbose':0
}

train_data = lgb.Dataset(X_train, y_train,
                         categorical_feature=current_cat_feats)
gbm = lgb.train(params, train_data,
                valid_sets=[train_data],
                valid_names=['train'],
                verbose_eval=500,
                early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[500]	train's l1: 40.4365
[1000]	train's l1: 27.7863
[1500]	train's l1: 20.6135
[2000]	train's l1: 16.6234
[2500]	train's l1: 14.4424
[3000]	train's l1: 13.2413
[3500]	train's l1: 12.5756
[4000]	train's l1: 12.1656
[4500]	train's l1: 11.8814
[5000]	train's l1: 11.639
Did not meet early stopping. Best iteration is:
[5000]	train's l1: 11.639


In [11]:
y_pred = gbm.predict(X_test)
print("[LightGBM] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
    mae(y_pred, y_test),
    100 * mape(y_pred,y_test), 100 * mspe(y_pred, y_test)))

[LightGBM] mae: 36.82 | mape: 33.20% | mspe: 12.96%


# MLP

In [17]:
def build_mlp(input_shape):
    x_in = Input(shape=(input_shape,))

    def dense_block(h, units):
        h = Dense(units=units, use_bias=True,
                  activation=None,
                  kernel_initializer=he_normal(),
                  bias_initializer=constant(0.0))(h)
        h = BatchNormalization()(h)
        h = LeakyReLU(0.2)(h)
        h = Dropout(rate=0.5)(h)
        return h

    h = dense_block(x_in, units=128)
    h = dense_block(h, units=64)
    h = Dense(units=1, use_bias=False,
              activation='relu',
              kernel_initializer=he_normal(),
              bias_initializer=constant(1.0))(h)

    mlp = Model(inputs=x_in, outputs=h)
    mlp.compile(loss='mse', optimizer=Adam(3e-4), metrics=['mae'])

    return mlp

X_train, X_test, y_train, y_test = dataset_split(X_onehot, Y,
                                                 split_mode='fix_transfer',
                                                 use_features=[],
                                                 use_best_features=False,
                                                 num_features=40)

mlp = build_mlp(input_shape=X_train.shape[1])
hist = mlp.fit(X_train, y_train, batch_size=256, epochs=50, shuffle=True, verbose=0)

y_pred = mlp.predict(X_test).reshape(-1)
print("[MLP] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
    mae(y_pred, y_test),
    100 * mape(y_pred,y_test), 100 * mspe(y_pred, y_test)))

[MLP] mae: 34.35 | mape: 36.20% | mspe: 13.42%
