# Libs

In [None]:
import os
import time
import glob
import json
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import tensorflow.keras.backend as K
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, LeakyReLU, BatchNormalization
from tensorflow.keras.layers import Dense, Reshape, Activation, Dropout, Flatten
from tensorflow.keras.layers import Embedding, Concatenate, Add, Conv1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import he_normal, constant
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical, normalize

from sklearn.neighbors import KernelDensity
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor

import lightgbm as lgb

from utils import *
from feat_eng import engineering

# Load Data

In [None]:
# Raw feature table and target table.
X = pd.read_csv('../Data/X.csv')
Y = pd.read_csv('../Data/Y.csv')

# Column-type metadata: the JSON payload is unpacked into exactly two
# values, the categorical column names and the numeric column names.
with open('../Data/feature_types.json', 'r') as type_file:
    feature_types = json.load(type_file)
categorical_features, numeric_features = feature_types

# One-hot encode the categorical columns
# (`one_hot_encoding` is presumably provided by `from utils import *`).
X_onehot = one_hot_encoding(X, categorical_features)

In [None]:
# Columns handed to the splitter, in the order listed.
use_features = [
    'volume', 'volume_es_p6', 'volume_es_p5', 'volume_es_p7',
    'through', 'left', 'right', 'thr_left', 'thr_right', 'u_turn',
    'num_lanes',
    'weekday', 'interval', 'holiday', 'peak',
]

# Split into train/test parts using the project helper `dataset_split`.
split_options = dict(split_mode='fix_transfer',
                     use_features=use_features,
                     use_best_features=False,
                     num_features=0)
X_train, X_test, y_train, y_test = dataset_split(X, Y, **split_options)

# Categorical columns that are still present after the split.
current_cat_feats = list(set(X_train.columns) & set(categorical_features))

# Weight Samples

## KDE

Fit a kernel density estimate on the test features and score each training sample under it.

# Fit a Gaussian kernel density estimate on the test-set features, then
# score every training row under that estimate.
kde = KernelDensity(kernel='gaussian', bandwidth=0.5)
kde = kde.fit(X_test.values)
kde_weights = kde.score_samples(X_train.values)
# NOTE(review): scikit-learn documents `score_samples` as returning
# log-density values, so this is the negative reciprocal of a log-density;
# it turns negative wherever the log-density is positive -- confirm intended.
trunc_kde_weights = -1 / kde_weights

# Persist the weights for later reuse (a stray trailing `$` on this line
# previously made the cell a syntax error).
np.save('../Data/fix_kde_weights.npy', trunc_kde_weights)

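## Logistic

Train a logistic-regression classifier to separate training rows (label 0) from test rows (label 1), and use its predicted test-probability on each training row as the sample weight.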

# Domain-classification set: training rows are labelled 0, test rows 1.
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
X_data = np.vstack((X_train.values, X_test.values))
y_data = np.vstack((np.zeros((n_train_rows, 1)), np.ones((n_test_rows, 1))))

# L2-penalised logistic regression; report its accuracy on the data it
# was fitted on.
lr = LogisticRegression(penalty='l2', n_jobs=-1).fit(X_data, y_data)
fit_score = lr.score(X_data, y_data)
print("[Logistic] fit score: {:.3f}".format(fit_score))

# Probability of class 1 (the test label) for each training row.
logit_weights = lr.predict_proba(X_train.values)[:, 1]

np.save('../Data/fix_logit_weights.npy', logit_weights)

## Classification Features
Features that make it easiest for a classifier to tell training samples apart from test samples (domain classification).

In [None]:
# Stack both splits; label 0 marks training rows, label 1 marks test rows.
X_data = pd.concat([X_train, X_test], axis=0)
domain_labels = [np.zeros((X_train.shape[0], 1)), np.ones((X_test.shape[0], 1))]
y_data = np.concatenate(domain_labels, axis=0)

# LightGBM settings for the binary train-vs-test classifier.
params = dict(
    objective='binary',
    boosting='gbdt',
    num_rounds=1000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=20,
    bagging_fraction=0.8,
    bagging_freq=100,
    verbose=2,
)

# LightGBM takes a flat label vector, hence the ravel.
train_data = lgb.Dataset(X_data, y_data.ravel(),
                         categorical_feature=current_cat_feats)
gbm = lgb.train(params, train_data)

# Ask the project helper for 10 features of the fitted classifier,
# without their scores.
clf_features = get_gbm_best_k_features(gbm, k=10, return_score=False)

In [None]:
# Keep every column that is NOT among the domain-classification features.
# This is an ordered list rather than a set: pandas deprecated set indexers
# in 1.5 and raises on them from 2.0, and set iteration order is not stable
# across interpreter runs, which would shuffle the column order of every
# frame indexed with `robust_features`.
excluded_features = set(clf_features)
robust_features = [col for col in X_data.columns if col not in excluded_features]
X_data = X_data[robust_features]

## MLP for sample weights

In [None]:
def build_mlp_clf(input_shape):
    """Build and compile a small MLP binary classifier.

    The network is two hidden blocks (32 then 16 units) followed by a
    single sigmoid output unit without bias. It is compiled with binary
    cross-entropy loss, Adam at learning rate 5e-4 and an accuracy metric.

    Args:
        input_shape: number of input features (length of one input row).

    Returns:
        The compiled Keras ``Model``.
    """
    def dense_block(tensor, units):
        """Apply Dense -> BatchNorm -> LeakyReLU(0.2) -> Dropout(0.5)."""
        tensor = Dense(units=units, use_bias=True, activation=None,
                       kernel_initializer=he_normal(),
                       bias_initializer=constant(0.0))(tensor)
        tensor = BatchNormalization()(tensor)
        tensor = LeakyReLU(0.2)(tensor)
        return Dropout(rate=0.5)(tensor)

    x_in = Input(shape=(input_shape,))

    # Hidden stack: widths are applied in order.
    hidden = x_in
    for width in (32, 16):
        hidden = dense_block(hidden, units=width)

    # Output head: one sigmoid unit.
    prob = Dense(units=1, use_bias=False, activation='sigmoid',
                 kernel_initializer='normal',
                 bias_initializer=constant(0.0))(hidden)

    mlp_clf = Model(inputs=x_in, outputs=prob)
    mlp_clf.compile(loss='binary_crossentropy',
                    optimizer=Adam(5e-4),
                    metrics=['accuracy'])
    return mlp_clf

# Build the classifier with one input per column of X_data and train it.
n_inputs = X_data.shape[1]
mlp_clf = build_mlp_clf(input_shape=n_inputs)
fit_options = dict(batch_size=512, epochs=20, shuffle=True, verbose=1)
hist = mlp_clf.fit(X_data, y_data, **fit_options)

# Classifier output for each training row (robust features only),
# saved to disk as the sample weights.
train_matrix = X_train[robust_features].values
mlp_weights = mlp_clf.predict(train_matrix)
np.save('../Data/fix_nn_weights.npy', mlp_weights)

# Train Regressor

In [None]:
# Narrow the categorical columns to those that are also robust features.
current_cat_feats = list(set(current_cat_feats) & set(robust_features))

In [None]:
# Training and validation sets restricted to the robust feature subset.
train_data = lgb.Dataset(X_train[robust_features], y_train,
                         categorical_feature=current_cat_feats)
# Per-row sample weights from the MLP classifier, flattened to 1-D.
train_data.set_weight(mlp_weights.reshape(-1))
test_data = lgb.Dataset(X_test[robust_features], y_test, reference=train_data)

# LightGBM settings for the regressor; MAE is the evaluation metric.
params = {
    'objective':'regression',
    'boosting':'gbdt',
    'metric':'mae',
    'num_rounds':20000,
    'learning_rate':0.001,
    'max_depth':8,
    'num_leaves':100,
    'feature_fraction':0.5,
    'bagging_fraction':0.5,
    'extra_trees':True,
    'bagging_freq':200,
    'verbose':2
}

# `early_stopping_rounds=` and `verbose_eval=` were removed from `lgb.train`
# in LightGBM 4.0, so the same behaviour is requested through callbacks.
# `log_evaluation` was called `print_evaluation` before LightGBM 3.3; fall
# back to the old name so older installations keep working.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

gbm = lgb.train(params, train_data,
                valid_sets=[test_data, train_data],
                valid_names=['test','train'],
                callbacks=[lgb.early_stopping(100),   # stop after 100 rounds without improvement
                           _log_evaluation(500)])     # log metrics every 500 rounds

In [None]:
# Bar chart of the regressor's feature importances, limited to 15 features.
lgb.plot_importance(gbm, max_num_features=15, height=0.5, grid=False)

In [None]:
# Predict on the test split using the best iteration found during training.
y_pred = gbm.predict(X_test[robust_features], num_iteration=gbm.best_iteration)

# NOTE(review): each metric is called as (prediction, truth). MAE is
# symmetric, but confirm that `mape` and `mspe` expect this argument order.
mae_value = mae(y_pred, y_test)
mape_pct = 100 * mape(y_pred, y_test)
mspe_pct = 100 * mspe(y_pred, y_test)

print("[LightGBM] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
    mae_value, mape_pct, mspe_pct))

In [None]:
# Scatter of predictions (x) against test targets (y) with a dotted red
# y = x reference line; both axes share the same fixed range.
axis_max = 420
plt.figure(figsize=(8,7))
# Hollow black markers. The previous `c=''` trick is rejected by current
# Matplotlib as an invalid colour; `facecolors='none'` is the supported way
# to draw unfilled markers and also works on older releases.
plt.scatter(y_pred,y_test,marker='o',facecolors='none',edgecolors='k')
plt.plot(np.arange(axis_max),np.arange(axis_max),'r:')
plt.xlim([0,axis_max])
plt.ylim([0,axis_max])