# Libs

In [4]:
import os
import time
import glob
import json
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import BatchNormalization, LeakyReLU
from tensorflow.keras.initializers import he_normal
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import mae
from tensorflow.keras.utils import normalize

import lightgbm as lgb

from utils import *

from hyperopt import fmin, hp, tpe, Trials, STATUS_OK

# Load Data

In [6]:
# Read the feature table and the target table from the shared data folder.
X, Y = (pd.read_csv(csv_path) for csv_path in ('../Data/X.csv', '../Data/Y.csv'))

# The JSON payload unpacks into exactly two entries:
# the categorical feature columns first, the numeric feature columns second.
with open('../Data/feature_types.json', 'r') as fh:
    feature_types = json.load(fh)
categorical_features, numeric_features = feature_types

# One-hot encode the categorical features (helper comes from `utils`).
X_onehot = one_hot_encoding(X, categorical_features)

In [7]:
# Split configuration (kept as plain variables in place of parsed arguments).
use_simple_features = False
use_best_features = False
split_mode = 'fix_transfer'

# Keyword arguments shared by both calls, so the raw and the one-hot encoded
# frames are split with exactly the same settings.
split_options = dict(split_mode=split_mode,
                     use_simple_features=use_simple_features,
                     use_best_features=use_best_features,
                     num_features=0)

# Raw features: keep both the feature splits and the target splits.
X_train, X_test, y_train, y_test = dataset_split(X, Y, **split_options)

# One-hot features: only the feature splits are kept; the targets are discarded.
X_train_oh, X_test_oh, _, _ = dataset_split(X_onehot, Y, **split_options)

# Distribution Shift

In [89]:
# Feature transformation applied in the next cell:
# 'svd' (TruncatedSVD) or 'ae' (dense autoencoder on the one-hot features).
shift_mode = 'svd'

In [90]:
# Build `X_new`, a transformed representation of the stacked train+test rows,
# together with `train_idx` / `test_idx`, the positional row ranges that
# recover each split from `X_new`.
#
# Strings are compared with `==`: `is` tests object identity and only matched
# here by accident of CPython string interning.
if shift_mode == 'svd':
    X_ = pd.concat((X_train, X_test), axis=0)
    train_idx = range(0, X_train.shape[0])
    test_idx = range(X_train.shape[0], X_.shape[0])

    # define transformer
    svd = TruncatedSVD(n_components=150, n_iter=20, random_state=1326)
    # The SVD is fitted on the TRANSPOSED matrix, so the samples play the role
    # of "features" and `components_` has one column per sample.
    svd.fit(X_.T)
    print("[Stats] Var. explanation ratio: {:.4f}".format(svd.explained_variance_ratio_.sum()))

    # transform dataset: one 150-dimensional row per sample
    X_new = svd.components_.T

elif shift_mode == 'ae':
    X_ = pd.concat((X_train_oh, X_test_oh), axis=0)

    train_idx = range(0, X_train.shape[0])
    test_idx = range(X_train.shape[0], X_.shape[0])

    # Normalise once and reuse for fitting (input and target) and for encoding.
    X_norm = normalize(X_.values)

    # define transformer: a single 512-unit hidden stage followed by a
    # reconstruction layer with as many units as there are input columns
    x_in = Input(shape=(X_train_oh.shape[1],))
    h = Dense(units=512, use_bias=True, kernel_initializer=he_normal(), activation=None)(x_in)
    h = LeakyReLU(0.1)(h)
    h = Dropout(0.5)(h)
    h = BatchNormalization()(h)
    h = Dense(units=X_train_oh.shape[1], use_bias=True,
              kernel_initializer=he_normal(), activation='relu')(h)

    ae = Model(inputs=x_in, outputs=h)
    ae.compile(loss='mse', optimizer=Adam(5e-4))
    hist = ae.fit(X_norm, X_norm,
                  batch_size=512, epochs=50,
                  shuffle=True, verbose=1)

    # transform dataset: use the output of the second-to-last layer
    # (the BatchNormalization after the hidden stage) as the new features
    Transformer = Model(inputs=ae.input, outputs=ae.layers[-2].output)
    X_new = Transformer.predict(X_norm)

else:
    # Fail loudly instead of leaving `X_new` undefined or stale from an earlier run.
    raise ValueError("Unknown shift_mode: {!r} (expected 'svd' or 'ae')".format(shift_mode))

[Stats] Var. explanation ratio: 1.0000


# Train Regressor

In [92]:
# Model trained in the next cell: 'gbm' (LightGBM) or 'mlp' (Keras dense network).
regressor = 'mlp'

In [None]:
# Train the selected regressor on the transformed training rows and print
# MAE / MAPE / MSPE on the transformed test rows.
#
# Strings are compared with `==`: `is` tests object identity and only matched
# here by accident of CPython string interning.
#
# NOTE(review): in both branches the test split is also passed as the
# validation set (and to early stopping for LightGBM), so the reported test
# metrics are likely optimistic - confirm this is intended.
if regressor == 'gbm':
    sp = time.time()
    train_data = lgb.Dataset(X_new[train_idx], y_train)
    test_data = lgb.Dataset(X_new[test_idx], y_test, reference=train_data)

    params = {
        'objective':'regression',
        'boosting':'gbdt',
        'metric':'mae',
        'num_rounds':20000,
        'learning_rate':0.002,
        'max_depth':10,
        'num_leaves':200,
        'feature_fraction':0.5,
        'bagging_fraction':0.9,
        'bagging_freq':200,
        'verbose':0
    }

    gbm = lgb.train(params, train_data,
                    valid_sets=[test_data, train_data],
                    valid_names=['test','train'],
                    verbose_eval=200,
                    early_stopping_rounds=100)
    print("[Duration] {:.2f} sec.".format(time.time() - sp))

    # evaluate with the best iteration found during training
    # (`mape` / `mspe` are presumably provided by `utils` - verify)
    y_pred = gbm.predict(X_new[test_idx], num_iteration=gbm.best_iteration)
    print("[LightGBM] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
        mae(y_pred, y_test),
        100 * mape(y_pred, y_test), 100 * mspe(y_pred, y_test)))

elif regressor == 'mlp':
    sp = time.time()

    x_in = Input(shape=(X_new.shape[1],))
    h = x_in
    # Four hidden stages of Dense -> LeakyReLU(0.1) -> Dropout(0.5).
    for units in (512, 256, 128, 64):
        h = Dense(units=units, use_bias=True, kernel_initializer=he_normal(), activation=None)(h)
        h = LeakyReLU(0.1)(h)
        h = Dropout(0.5)(h)
    # Single output unit; the ReLU keeps predictions non-negative.
    h = Dense(units=1, use_bias=True, kernel_initializer=he_normal(), activation='relu')(h)

    mlp = Model(inputs=x_in, outputs=h)
    mlp.compile(loss='mse', optimizer=Adam(3e-4), metrics=['mae'])
    print("[Num. Params.] {:d}".format(mlp.count_params()))
    # `validation_data` is passed as the documented (x_val, y_val) tuple.
    hist = mlp.fit(X_new[train_idx], y_train, batch_size=512, epochs=200,
                   shuffle=True, verbose=2,
                   validation_data=(X_new[test_idx], y_test))

    print("[Duration] {:.2f} sec.".format(time.time() - sp))

    # evaluate (predictions flattened to 1-D before computing the metrics;
    # `mape` / `mspe` are presumably provided by `utils` - verify)
    y_pred = mlp.predict(X_new[test_idx]).reshape(-1)
    print("[MLP] mae: {:.2f} | mape: {:.2f}% | mspe: {:.2f}%".format(
        mae(y_pred, y_test),
        100 * mape(y_pred, y_test),
        100 * mspe(y_pred, y_test)))

else:
    # Fail loudly instead of silently skipping training and evaluation.
    raise ValueError("Unknown regressor: {!r} (expected 'gbm' or 'mlp')".format(regressor))

# MMD

In [123]:
import torch

def gaussian_kernel(source, target, kernel_mul=2.0, kernel_num=5, fix_sigma=None):
    '''
    Build a multi-bandwidth Gaussian (RBF) kernel matrix over the stacked
    source and target samples.

    Params:
        source: source-domain tensor of shape (n, d)
        target: target-domain tensor of shape (m, d)
        kernel_mul: ratio between two consecutive bandwidths
        kernel_num: number of Gaussian kernels that are summed
        fix_sigma: optional base bandwidth (number or scalar tensor). When it
            is falsy (None or 0) the base bandwidth is the mean squared
            distance over all off-diagonal pairs. It is never modified.
    Return:
        tensor of shape (n + m, n + m): element-wise sum of the `kernel_num`
        kernel matrices (not divided by `kernel_num`)
    '''
    n_samples = int(source.size()[0]) + int(target.size()[0])
    # Stack source rows on top of target rows (concatenation along dim 0).
    total = torch.cat([source, target], dim=0)
    n_total, n_dim = int(total.size(0)), int(total.size(1))
    # Two (n+m, n+m, d) views of the same data, so that every pair of rows
    # (i, j) can be subtracted element-wise.
    total0 = total.unsqueeze(0).expand(n_total, n_total, n_dim)
    total1 = total.unsqueeze(1).expand(n_total, n_total, n_dim)
    # Entry (i, j) is the squared Euclidean distance between rows i and j
    # of `total` (zero on the diagonal).
    L2_distance = ((total0 - total1) ** 2).sum(2)
    # Choose the base bandwidth.
    if fix_sigma:
        bandwidth = fix_sigma
    else:
        # The diagonal contributes zeros, so dividing by the number of
        # off-diagonal pairs gives the mean pairwise squared distance.
        bandwidth = torch.sum(L2_distance.data) / (n_samples**2 - n_samples)
    # Centre the bandwidth ladder on the base value, e.g. base 1 with
    # kernel_mul=2 and kernel_num=5 gives [0.25, 0.5, 1, 2, 4].
    # Out-of-place division on purpose: `bandwidth /= ...` would modify a
    # caller-supplied `fix_sigma` tensor in place.
    bandwidth = bandwidth / kernel_mul ** (kernel_num // 2)
    bandwidth_list = [bandwidth * (kernel_mul**i) for i in range(kernel_num)]
    # One Gaussian kernel matrix per bandwidth.
    kernel_val = [torch.exp(-L2_distance / bandwidth_temp) for bandwidth_temp in bandwidth_list]
    # Final kernel matrix: the sum over all bandwidths.
    return sum(kernel_val)


def mmd_rbf(source, target, kernel_mul=2.0, batch_size=32, kernel_num=5, fix_sigma=None):
    '''
    Compute the MMD (maximum mean discrepancy) between source and target
    samples using a sum of Gaussian kernels.

    Params:
        source: source-domain tensor of shape (n, d)
        target: target-domain tensor of shape (m, d)
        kernel_mul: ratio between two consecutive kernel bandwidths
        batch_size: ignored; kept only for backward compatibility. The split
            point between source and target is always the number of source rows.
        kernel_num: number of Gaussian kernels that are summed
        fix_sigma: optional fixed base bandwidth forwarded to `gaussian_kernel`
    Return:
        loss: scalar tensor, mean(K_XX) + mean(K_YY) - mean(K_XY) - mean(K_YX)
    '''
    n_source = int(source.size()[0])
    kernels = gaussian_kernel(source, target,
        kernel_mul=kernel_mul, kernel_num=kernel_num, fix_sigma=fix_sigma)
    # Split the (n+m, n+m) kernel matrix into its four blocks.
    XX = kernels[:n_source, :n_source]
    YY = kernels[n_source:, n_source:]
    XY = kernels[:n_source, n_source:]
    YX = kernels[n_source:, :n_source]
    # Averaging each block separately equals mean(XX + YY - XY - YX) when
    # n == m (up to floating-point rounding) and also works when the blocks
    # have different shapes (n != m).
    loss = XX.mean() + YY.mean() - XY.mean() - YX.mean()
    return loss


In [126]:
mmd_rbf(torch.Tensor(X_new[128:256]), torch.Tensor(X_new[-128:]))

tensor(1.0162)