# Libs

In [200]:
import os
import time
import glob
import json
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import Flatten, Embedding, Concatenate
from tensorflow.keras.layers import BatchNormalization, LeakyReLU
from tensorflow.keras.initializers import he_normal, constant
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import mae
from tensorflow.keras.utils import normalize, to_categorical

import lightgbm as lgb
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK

from utils import *
from intra_alignment import *
from label_prop_v2 import *
from EasyTL import *

# Load Data

In [111]:
# Feature table and target table, read from the shared data folder.
X = pd.read_csv('../Data/X.csv')
Y = pd.read_csv('../Data/Y.csv')

# Binned copy of the targets: `volume` is integer-divided by 5 and stored as
# int. `assign` returns a new frame, so Y keeps the raw values.
Y_cat = Y.assign(volume=(Y['volume'] // 5).astype(int))

# The JSON payload unpacks into exactly two items: the categorical feature
# list first, the numeric feature list second.
with open('../Data/feature_types.json', 'r') as f:
    _feature_groups = json.load(f)
categorical_features, numeric_features = _feature_groups

# One-hot encoded variant of X, built from the categorical feature list.
X_onehot = one_hot_encoding(X, categorical_features)

In [112]:
# Experiment switches for the split (edit here, then re-run the cell).
split_mode = 'fix_transfer'
use_simple_features = False
use_best_features = False

# Every dataset_split call below shares the same options, so they are
# collected once and unpacked into each call.
_split_options = dict(split_mode=split_mode,
                      use_simple_features=use_simple_features,
                      use_best_features=use_best_features,
                      num_features=0)

# Raw targets: only the last two return values (the label halves) are kept.
_, _, y_train, y_test = dataset_split(X, Y, **_split_options)

# Binned targets together with the feature frames fed to the models.
X_train, X_test, y_train_cat, y_test_cat = dataset_split(X, Y_cat,
                                                         **_split_options)

# One-hot features: only the first two return values are kept.
X_train_oh, X_test_oh, _, _ = dataset_split(X_onehot, Y_cat,
                                            **_split_options)

# Well-regularized regressor (trained as a softmax classifier over volume bins)

In [212]:
def build_wrr(input_shape, name, num_classes=88):
    """Build the multi-input softmax network used as the ``wrr`` model.

    The dense numeric input is concatenated with learned embeddings of four
    integer-coded inputs (weekday, holiday, peak, interval). The result goes
    through five hidden stages of 384, 320, 256, 192 and 128 units and ends in
    a softmax layer. Although the notebook calls it a "regressor", the head is
    a classifier over ``num_classes`` classes.

    Args:
        input_shape: int, number of columns of the numeric input.
        name: name given to the returned Keras ``Model``.
        num_classes: width of the softmax output. Defaults to 88, the value
            that used to be hard-coded, so existing callers are unaffected.

    Returns:
        An uncompiled ``Model`` whose inputs are, in order,
        ``[input, weekday, holiday, peak, interval]``.
    """
    x_in = Input(shape=(input_shape,), name='input')

    weekday = Input(shape=(1,), name='weekday')
    holiday = Input(shape=(1,), name='holiday')
    peak = Input(shape=(1,), name='peak')
    interval = Input(shape=(1,), name='interval')

    # Embedding vocabularies are fixed (7 / 2 / 4 / 144), so each of these
    # inputs must hold integer codes in [0, input_dim).
    v_weekday = Flatten()(Embedding(input_dim=7, output_dim=20)(weekday))
    v_holiday = Flatten()(Embedding(input_dim=2, output_dim=5)(holiday))
    v_peak = Flatten()(Embedding(input_dim=4, output_dim=5)(peak))
    v_interval = Flatten()(Embedding(input_dim=144, output_dim=30)(interval))
    v = Concatenate()([x_in, v_weekday, v_holiday, v_peak, v_interval])

    def build_dense(h, units, drop=True, norm=True):
        """One hidden stage: Dense -> (BatchNorm) -> LeakyReLU(0.2) -> (Dropout 0.5)."""
        # The Dense layer is linear; the non-linearity is applied after the
        # optional batch normalisation.
        h = Dense(units=units, use_bias=True, activation=None,
                  kernel_initializer=he_normal(),
                  bias_initializer=constant(0.1),
                  kernel_regularizer=l2())(h)
        if norm:
            h = BatchNormalization()(h)
        h = LeakyReLU(0.2)(h)
        if drop:
            h = Dropout(rate=0.5)(h)
        return h

    # Hidden stack, widest first. Layers are created in the same order as
    # before, so positional lookups such as ``model.layers[i]`` are unchanged.
    h = v
    for units in (384, 320, 256, 192, 128):
        h = build_dense(h, units)

    o = Dense(units=num_classes,
              use_bias=True,
              activation='softmax',
              kernel_initializer='uniform',
              bias_initializer=constant(0.0),
              kernel_regularizer=l2())(h)

    model = Model(inputs=[x_in, weekday,
                          holiday, peak, interval],
                  outputs=o, name=name)

    return model

In [213]:
# Build and compile the network; the numeric block width is taken from the
# training frame.
wrr = build_wrr(input_shape=X_train[numeric_features].shape[1],
                name='wrr')
wrr.compile(loss='categorical_crossentropy',
            optimizer=Adam(3e-4),
            metrics=['categorical_accuracy'])

# Pin the one-hot width to the model's softmax width. Left to itself,
# to_categorical infers the width from the largest label in the training
# split, which only matches the output layer by coincidence.
y_train_onehot = to_categorical(y_train_cat,
                                num_classes=wrr.output_shape[-1])

# Inputs are passed in the order the model declares them: numeric block
# first, then the four integer-coded columns.
wrr_hist = wrr.fit([X_train[numeric_features],
                    X_train['weekday'], X_train['holiday'],
                    X_train['peak'], X_train['interval']],
                   y_train_onehot,
                   batch_size=512, epochs=500, shuffle=True, verbose=2)

Epoch 1/500
77760/77760 - 3s - loss: 21.6998 - categorical_accuracy: 0.0486
Epoch 2/500
77760/77760 - 2s - loss: 11.3522 - categorical_accuracy: 0.0784
Epoch 3/500
77760/77760 - 2s - loss: 6.7396 - categorical_accuracy: 0.0951
Epoch 4/500
77760/77760 - 2s - loss: 4.7164 - categorical_accuracy: 0.1069
Epoch 5/500
77760/77760 - 2s - loss: 3.8469 - categorical_accuracy: 0.1164
Epoch 6/500
77760/77760 - 2s - loss: 3.4852 - categorical_accuracy: 0.1215
Epoch 7/500
77760/77760 - 2s - loss: 3.3232 - categorical_accuracy: 0.1255
Epoch 8/500
77760/77760 - 2s - loss: 3.2419 - categorical_accuracy: 0.1288
Epoch 9/500
77760/77760 - 2s - loss: 3.2085 - categorical_accuracy: 0.1303
Epoch 10/500
77760/77760 - 2s - loss: 3.1841 - categorical_accuracy: 0.1322
Epoch 11/500
77760/77760 - 2s - loss: 3.1697 - categorical_accuracy: 0.1324
Epoch 12/500
77760/77760 - 2s - loss: 3.1516 - categorical_accuracy: 0.1344
Epoch 13/500
77760/77760 - 2s - loss: 3.1281 - categorical_accuracy: 0.1358
Epoch 14/500
77760/

Epoch 109/500
77760/77760 - 2s - loss: 2.8979 - categorical_accuracy: 0.1532
Epoch 110/500
77760/77760 - 2s - loss: 2.8853 - categorical_accuracy: 0.1541
Epoch 111/500
77760/77760 - 2s - loss: 2.8906 - categorical_accuracy: 0.1544
Epoch 112/500
77760/77760 - 2s - loss: 2.8933 - categorical_accuracy: 0.1538
Epoch 113/500
77760/77760 - 2s - loss: 2.8875 - categorical_accuracy: 0.1531
Epoch 114/500
77760/77760 - 2s - loss: 2.8872 - categorical_accuracy: 0.1532
Epoch 115/500
77760/77760 - 2s - loss: 2.8932 - categorical_accuracy: 0.1529
Epoch 116/500
77760/77760 - 3s - loss: 2.8988 - categorical_accuracy: 0.1510
Epoch 117/500
77760/77760 - 2s - loss: 2.8893 - categorical_accuracy: 0.1538
Epoch 118/500
77760/77760 - 2s - loss: 2.8829 - categorical_accuracy: 0.1544
Epoch 119/500
77760/77760 - 2s - loss: 2.8914 - categorical_accuracy: 0.1551
Epoch 120/500
77760/77760 - 2s - loss: 2.8834 - categorical_accuracy: 0.1532
Epoch 121/500
77760/77760 - 2s - loss: 2.8774 - categorical_accuracy: 0.1544

Epoch 216/500
77760/77760 - 2s - loss: 2.8325 - categorical_accuracy: 0.1577
Epoch 217/500
77760/77760 - 2s - loss: 2.8469 - categorical_accuracy: 0.1533
Epoch 218/500
77760/77760 - 2s - loss: 2.8449 - categorical_accuracy: 0.1543
Epoch 219/500
77760/77760 - 2s - loss: 2.8427 - categorical_accuracy: 0.1565
Epoch 220/500
77760/77760 - 2s - loss: 2.8413 - categorical_accuracy: 0.1559
Epoch 221/500
77760/77760 - 2s - loss: 2.8343 - categorical_accuracy: 0.1569
Epoch 222/500
77760/77760 - 2s - loss: 2.8294 - categorical_accuracy: 0.1589
Epoch 223/500
77760/77760 - 2s - loss: 2.8316 - categorical_accuracy: 0.1569
Epoch 224/500
77760/77760 - 2s - loss: 2.8317 - categorical_accuracy: 0.1553
Epoch 225/500
77760/77760 - 2s - loss: 2.8321 - categorical_accuracy: 0.1572
Epoch 226/500
77760/77760 - 2s - loss: 2.8470 - categorical_accuracy: 0.1551
Epoch 227/500
77760/77760 - 2s - loss: 2.8352 - categorical_accuracy: 0.1564
Epoch 228/500
77760/77760 - 2s - loss: 2.8320 - categorical_accuracy: 0.1570

Epoch 323/500
77760/77760 - 2s - loss: 2.8087 - categorical_accuracy: 0.1578
Epoch 324/500
77760/77760 - 2s - loss: 2.8142 - categorical_accuracy: 0.1570
Epoch 325/500
77760/77760 - 2s - loss: 2.8110 - categorical_accuracy: 0.1583
Epoch 326/500
77760/77760 - 2s - loss: 2.8178 - categorical_accuracy: 0.1574
Epoch 327/500
77760/77760 - 2s - loss: 2.8118 - categorical_accuracy: 0.1582
Epoch 328/500
77760/77760 - 2s - loss: 2.8056 - categorical_accuracy: 0.1581
Epoch 329/500
77760/77760 - 2s - loss: 2.8101 - categorical_accuracy: 0.1583
Epoch 330/500
77760/77760 - 2s - loss: 2.8180 - categorical_accuracy: 0.1572
Epoch 331/500
77760/77760 - 2s - loss: 2.8138 - categorical_accuracy: 0.1579
Epoch 332/500
77760/77760 - 2s - loss: 2.8133 - categorical_accuracy: 0.1568
Epoch 333/500
77760/77760 - 2s - loss: 2.8071 - categorical_accuracy: 0.1581
Epoch 334/500
77760/77760 - 2s - loss: 2.8103 - categorical_accuracy: 0.1582
Epoch 335/500
77760/77760 - 2s - loss: 2.8148 - categorical_accuracy: 0.1575

Epoch 430/500
77760/77760 - 2s - loss: 2.7986 - categorical_accuracy: 0.1560
Epoch 431/500
77760/77760 - 2s - loss: 2.7989 - categorical_accuracy: 0.1563
Epoch 432/500
77760/77760 - 2s - loss: 2.7929 - categorical_accuracy: 0.1585
Epoch 433/500
77760/77760 - 2s - loss: 2.7939 - categorical_accuracy: 0.1591
Epoch 434/500
77760/77760 - 2s - loss: 2.7974 - categorical_accuracy: 0.1598
Epoch 435/500
77760/77760 - 2s - loss: 2.7952 - categorical_accuracy: 0.1594
Epoch 436/500
77760/77760 - 2s - loss: 2.7976 - categorical_accuracy: 0.1578
Epoch 437/500
77760/77760 - 2s - loss: 2.7909 - categorical_accuracy: 0.1593
Epoch 438/500
77760/77760 - 2s - loss: 2.7900 - categorical_accuracy: 0.1598
Epoch 439/500
77760/77760 - 2s - loss: 2.8006 - categorical_accuracy: 0.1595
Epoch 440/500
77760/77760 - 2s - loss: 2.7990 - categorical_accuracy: 0.1586
Epoch 441/500
77760/77760 - 2s - loss: 2.7916 - categorical_accuracy: 0.1607
Epoch 442/500
77760/77760 - 2s - loss: 2.8077 - categorical_accuracy: 0.1606

In [214]:
# Sub-model that maps the wrr inputs to the output of the layer at position 16.
# NOTE(review): the index is positional and breaks silently if the
# architecture changes; presumably it is an early hidden activation. Confirm
# with wrr.summary().
embedder = Model(wrr.input, wrr.layers[16].output)

# Same input order as used for training: numeric block, then the four
# integer-coded columns.
_code_columns = ['weekday', 'holiday', 'peak', 'interval']
X_train_new, X_test_new = [
    embedder.predict([frame[numeric_features]]
                     + [frame[column] for column in _code_columns])
    for frame in (X_train, X_test)
]

# Independent copies handed to the transfer step: features come from the
# embedder, labels are the binned targets as plain arrays.
Xs = X_train_new.copy()
Xt = X_test_new.copy()
Ys = y_train_cat.copy().values
Yt = y_test_cat.copy().values

# EasyTL

In [220]:
def easytl(Xs, Xt, Ys, Yt, class_step=5, norm=False, alignment='coral',
           y_true=None):
    """Predict target values with EasyTL and report the mean absolute error.

    Args:
        Xs: 2-D feature array of the labelled (source) samples, one row each.
        Xt: 2-D feature array of the samples to predict (target).
        Ys: class labels for ``Xs``; also used to count the classes.
        Yt: class labels for ``Xt``. Only its length is used here.
        class_step: bin width used to turn a predicted class index back into
            a value (``index * class_step + class_step / 2``, the bin midpoint).
        norm: if True, scale every row to sum to 1, z-score both arrays and
            replace NaNs by 0 before alignment.
        alignment: ``'coral'`` to pass ``Xs`` through ``CORAL_map(Xs, Xt)``,
            or ``'raw'`` to use the features as they are.
        y_true: ground-truth values the MAE is computed against. When omitted,
            the notebook-level ``y_test`` is used, as before.

    Returns:
        1-D array with the predicted value for every row of ``Xt``.

    Raises:
        ValueError: if ``alignment`` is neither ``'coral'`` nor ``'raw'``.
    """
    # Fail early on a misspelt option rather than silently skipping alignment.
    if alignment not in ('coral', 'raw'):
        raise ValueError(
            "alignment must be 'coral' or 'raw', got {!r}".format(alignment))

    if norm:
        # Imported here because the notebook does not import scipy itself.
        from scipy.stats import mstats

        # Row-normalise (broadcasting replaces the explicit np.tile), then
        # z-score.
        Xs = Xs / np.sum(Xs, axis=1, keepdims=True)
        Xs = mstats.zscore(Xs)
        Xt = Xt / np.sum(Xt, axis=1, keepdims=True)
        Xt = mstats.zscore(Xt)

        # Zero-sum rows or constant columns yield NaN; neutralise them.
        Xs[np.isnan(Xs)] = 0
        Xt[np.isnan(Xt)] = 0

    C = len(np.unique(Ys))
    m = len(Yt)

    # Compare by value: `is` tests object identity and can be False for an
    # equal string that is not interned.
    if alignment == 'coral':
        Xs = CORAL_map(Xs, Xt)

    # get class distance
    _, Dct = get_class_center(Xs, Ys, Xt, dist='euclidean')

    # solve LP
    Mcj = label_prop(C, m, Dct, lp='linear')

    # evaluate solution: best class per sample, mapped to its bin midpoint
    y_pred_cat = np.argmax(Mcj, axis=1)
    y_pred = y_pred_cat * class_step + class_step / 2

    if y_true is None:
        # Backward-compatible fallback to the notebook-level test targets.
        y_true = y_test
    # ravel() makes an (m, 1) target compare element-wise with the (m,)
    # prediction vector.
    abs_err = np.abs(y_pred - np.asarray(y_true, dtype=float).ravel())
    print("[MAE] {:.2f}".format(np.mean(abs_err)))

    return y_pred

In [221]:
# Run EasyTL with its defaults (class_step=5, norm=False, alignment='coral').
y_pred = easytl(Xs=Xs, Xt=Xt, Ys=Ys, Yt=Yt)

[MAE] 59.73
