# Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import tensorflow as tf
from datetime import datetime
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras import layers, losses, metrics
from sklearn.preprocessing import OneHotEncoder

# Prepare data

## opt_people dataset for fit into Autoencoders

In [None]:
data_opt = pd.read_csv('/content/drive/MyDrive/Embedding2/Cleaned_data/opt_people_cleaned.csv')
data_opt = data_opt.drop(365, axis=0)
data_opt = data_opt.reset_index(drop=True)
data_opt_index = list(data_opt['ASK_ID'])
data_opt = data_opt.drop(
    ["manager_ownership_level", "ASK_ID", "manager_history", "auditor_\nfee", "managers_count", "length_in_days",
     'superannuation'], axis=1)
data_opt.info()
enc = OneHotEncoder()
enc.fit(data_opt)
df_encoded_before = enc.transform(data_opt).toarray()
pd_opt_people = pd.DataFrame(df_encoded_before)
pd_opt_people['id'] = data_opt_index
pd_opt_people = pd_opt_people.set_index('id')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1329 entries, 0 to 1328
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   administrator                1329 non-null   object
 1   advisor                      1329 non-null   object
 2   auditor                      1329 non-null   object
 3   custodian                    1329 non-null   object
 4   distributor                  1329 non-null   object
 5   manager_of_managers          1329 non-null   object
 6   subadvised                   1329 non-null   object
 7   subadvisor                   1329 non-null   object
 8   managers                     1329 non-null   object
 9   more_than_one_administrator  1329 non-null   bool  
 10  more_than_one_custodian      1329 non-null   bool  
dtypes: bool(2), object(9)
memory usage: 96.2+ KB


In [None]:
pd_opt_people.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1329 entries, B42 to B26072
Columns: 2108 entries, 0 to 2107
dtypes: float64(2108)
memory usage: 21.4+ MB


## Excess return dataset for predicting

In [None]:
# these codes are from TS forest group 2
excess_return = pd.read_parquet('/content/drive/MyDrive/Embedding2/Data/MF_LargeCap_ExcessReturn_3Y.parquet', engine='pyarrow')

# window length between two consecutive features: number of days
feat_window = 90
# performance measures window: number of years
pm_window = 3
lb_window = int(3 * pm_window * 365.25) + 1
# window length between training samples: number of days
sample_window = 30
# test period start
test_start_date = '2020-06-30'

def prepare_data(data):

    data_dict = {ticker: data[ticker].dropna() for ticker in data.columns}

    tickers_to_remove = []

    label_dict = {}
    for ticker, series in tqdm(data_dict.items()):
        if series.isna().sum() == series.shape[0]:
            tickers_to_remove += [ticker]
            continue

        last_date = series.index[-1] - relativedelta(years=pm_window)
        if last_date <= series.index[0]:
            tickers_to_remove.append(ticker)
            continue

        index = series.loc[:series.index[-1] - relativedelta(years=pm_window)].index
        label_dict[ticker] = pd.Series([
            series[date + relativedelta(years=pm_window)] for date in index
        ], index=index)

    _ = [data_dict.pop(ticker) for ticker in tickers_to_remove]

    return data_dict, label_dict

def alt_dataset(data_dict, label_dict, one_hot_alt_data):
    tickers = list(data_dict.keys())

    mean_labels = {}

    for ticker in tqdm(tickers):
        try:
            label = label_dict[ticker]
            if label.shape[0] == 0:
                continue
            ts = data_dict[ticker].loc[:label.index[-1]]

            indices = [np.arange(i, i + lb_window, feat_window) for i in
                       range(0, ts.shape[0] - lb_window + 1, sample_window)]

            temp_data = np.array(
                [np.concatenate([ts.iloc[sub_indices].values, one_hot_alt_data.loc[ticker]]) for sub_indices in
                 indices])
            if temp_data.shape[0] == 0:
                continue
            temp_labels = np.array([label.loc[ts.index[sub_indices[-1]]] for sub_indices in indices])

            mean_labels[ticker] = temp_labels.mean()
        except:
            continue

    one_hot_alt_data_labeled = one_hot_alt_data.loc[list(mean_labels.keys())]
    one_hot_alt_data_labeled['Y'] = list(mean_labels.values())

    return one_hot_alt_data_labeled

In [None]:
data_dict, label_dict = prepare_data(excess_return)


100%|██████████| 1330/1330 [04:35<00:00,  4.83it/s]


In [None]:
one_hot_opt_people_labeled_before = alt_dataset(data_dict, label_dict, pd_opt_people)

100%|██████████| 1130/1130 [00:27<00:00, 41.48it/s]


In [None]:
one_hot_opt_people_labeled_before.info()

<class 'pandas.core.frame.DataFrame'>
Index: 831 entries, B42 to B26061
Columns: 2109 entries, 0 to Y
dtypes: float64(2109)
memory usage: 13.4+ MB


# Predicting using orginal one-hot encoded dataset - for benchmark

In [None]:
dataX = one_hot_opt_people_labeled_before.drop('Y', axis=1)
dataY = one_hot_opt_people_labeled_before['Y']
dataX.reset_index(drop=True, inplace=True)
dataY.reset_index(drop=True, inplace=True)

train_len = int(dataX.shape[0] * 0.8)
val_len = int(dataX.shape[0] * 0.9)
trainX, trainY = np.array(dataX.iloc[:train_len]), np.array(dataY.iloc[:train_len])
valX, valY = np.array(dataX.iloc[train_len:val_len]), np.array(dataY.iloc[train_len:val_len])
testX, testY = np.array(dataX.iloc[val_len:]), np.array(dataY.iloc[val_len:])


## using SVM to predict

In [None]:
from sklearn.svm import SVR
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for c in range(1, 51, 10):
        svr = SVR(kernel=kernel, C=c)
        svr.fit(trainX, trainY)
        val_pred = svr.predict(valX)
        test_pred = svr.predict(testX)
        print('kernel:{} c:{} val set mse:{:.5f}, test set mse:{:.5f}'.format(kernel, c, mean_squared_error(val_pred, valY), mean_squared_error(test_pred, testY)))

kernel:linear c:1 val set mse:0.00182, test set mse:0.00142
kernel:linear c:11 val set mse:0.00182, test set mse:0.00142
kernel:linear c:21 val set mse:0.00182, test set mse:0.00142
kernel:linear c:31 val set mse:0.00182, test set mse:0.00142
kernel:linear c:41 val set mse:0.00182, test set mse:0.00142
kernel:poly c:1 val set mse:0.00189, test set mse:0.00142
kernel:poly c:11 val set mse:0.00189, test set mse:0.00142
kernel:poly c:21 val set mse:0.00189, test set mse:0.00142
kernel:poly c:31 val set mse:0.00189, test set mse:0.00142
kernel:poly c:41 val set mse:0.00189, test set mse:0.00142
kernel:rbf c:1 val set mse:0.00186, test set mse:0.00141
kernel:rbf c:11 val set mse:0.00186, test set mse:0.00141
kernel:rbf c:21 val set mse:0.00186, test set mse:0.00141
kernel:rbf c:31 val set mse:0.00186, test set mse:0.00141
kernel:rbf c:41 val set mse:0.00186, test set mse:0.00141
kernel:sigmoid c:1 val set mse:0.00181, test set mse:0.00144
kernel:sigmoid c:11 val set mse:0.00181, test set ms

# Normal Autoencoder with different activation function

In [None]:
input_size = 2108
output_size = 2108
train_val_arr, test_arr = train_test_split(df_encoded_before, test_size=0.1)
train_arr, val_arr = train_test_split(train_val_arr, test_size=0.1)

## Linear

In [None]:
class AutoencoderLinear(Model):
    def __init__(self):
        super(AutoencoderLinear, self).__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(2048, input_shape=(input_size,)),
            layers.Dense(1024),
            layers.Dense(512),
            layers.Dense(32),
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32),
            layers.Dense(512),
            layers.Dense(1024),
            layers.Dense(2048),
            layers.Dense(output_size),
        ])
 
    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


autoencoder_linear = AutoencoderLinear()
autoencoder_linear.compile(optimizer=Adam(0.0001), loss=losses.MeanSquaredError())
autoencoder_linear.fit(train_arr, train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr, val_arr), verbose=2, callbacks=[
    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
])

Epoch 1/500
17/17 - 3s - loss: 0.0053 - val_loss: 0.0043 - lr: 1.0000e-04 - 3s/epoch - 152ms/step
Epoch 2/500
17/17 - 0s - loss: 0.0036 - val_loss: 0.0033 - lr: 1.0000e-04 - 237ms/epoch - 14ms/step
Epoch 3/500
17/17 - 0s - loss: 0.0031 - val_loss: 0.0030 - lr: 1.0000e-04 - 230ms/epoch - 14ms/step
Epoch 4/500
17/17 - 0s - loss: 0.0028 - val_loss: 0.0028 - lr: 1.0000e-04 - 206ms/epoch - 12ms/step
Epoch 5/500
17/17 - 0s - loss: 0.0026 - val_loss: 0.0026 - lr: 1.0000e-04 - 209ms/epoch - 12ms/step
Epoch 6/500
17/17 - 0s - loss: 0.0024 - val_loss: 0.0024 - lr: 1.0000e-04 - 205ms/epoch - 12ms/step
Epoch 7/500
17/17 - 0s - loss: 0.0022 - val_loss: 0.0023 - lr: 1.0000e-04 - 203ms/epoch - 12ms/step
Epoch 8/500
17/17 - 0s - loss: 0.0021 - val_loss: 0.0022 - lr: 1.0000e-04 - 200ms/epoch - 12ms/step
Epoch 9/500
17/17 - 0s - loss: 0.0020 - val_loss: 0.0022 - lr: 1.0000e-04 - 210ms/epoch - 12ms/step
Epoch 10/500
17/17 - 0s - loss: 0.0020 - val_loss: 0.0021 - lr: 1.0000e-04 - 201ms/epoch - 12ms/step
E

<keras.callbacks.History at 0x7fc590470110>

In [None]:
test_pred = autoencoder_linear.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00176


In [None]:
linear_reconstruct = autoencoder_linear.call(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, linear_reconstruct)))

reconstruction loss:0.00148


## ReLu

In [None]:
class AutoencoderRelu(Model):
    def __init__(self, act='relu'):
        super(AutoencoderRelu, self).__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(2048, activation=act, input_shape=(input_size,)),
            layers.Dense(1024, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(32, activation=act),
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(1024, activation=act),
            layers.Dense(2048, activation=act),
            layers.Dense(output_size),
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


autoencoder_relu = AutoencoderRelu()
autoencoder_relu.compile(optimizer=Adam(0.0001), loss=losses.MeanSquaredError())
autoencoder_relu.fit(train_arr, train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr, val_arr), verbose=2, callbacks=[
    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
])

Epoch 1/500
17/17 - 1s - loss: 0.0052 - val_loss: 0.0051 - lr: 1.0000e-04 - 1s/epoch - 68ms/step
Epoch 2/500
17/17 - 0s - loss: 0.0048 - val_loss: 0.0043 - lr: 1.0000e-04 - 242ms/epoch - 14ms/step
Epoch 3/500
17/17 - 0s - loss: 0.0037 - val_loss: 0.0033 - lr: 1.0000e-04 - 228ms/epoch - 13ms/step
Epoch 4/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0032 - lr: 1.0000e-04 - 220ms/epoch - 13ms/step
Epoch 5/500
17/17 - 0s - loss: 0.0032 - val_loss: 0.0032 - lr: 1.0000e-04 - 211ms/epoch - 12ms/step
Epoch 6/500
17/17 - 0s - loss: 0.0032 - val_loss: 0.0031 - lr: 1.0000e-04 - 219ms/epoch - 13ms/step
Epoch 7/500
17/17 - 0s - loss: 0.0030 - val_loss: 0.0030 - lr: 1.0000e-04 - 207ms/epoch - 12ms/step
Epoch 8/500
17/17 - 0s - loss: 0.0029 - val_loss: 0.0029 - lr: 1.0000e-04 - 211ms/epoch - 12ms/step
Epoch 9/500
17/17 - 0s - loss: 0.0029 - val_loss: 0.0029 - lr: 1.0000e-04 - 214ms/epoch - 13ms/step
Epoch 10/500
17/17 - 0s - loss: 0.0028 - val_loss: 0.0028 - lr: 1.0000e-04 - 218ms/epoch - 13ms/step
Ep

<keras.callbacks.History at 0x7fc521a38ed0>

In [None]:
test_pred_relu = autoencoder_relu.predict(test_arr)
print('relu test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred_relu)))

relu test set mse:0.00121


In [None]:
relu_reduced = autoencoder_relu.encoder(df_encoded_before)
relu_reconstruct = autoencoder_relu.decoder(relu_reduced)
print('relu reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, relu_reconstruct)))

relu reconstruction loss:0.00077


## tanh

In [None]:
class AutoencoderTanh(Model):
    def __init__(self, act='tanh'):
        super(AutoencoderTanh, self).__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(2048, activation=act, input_shape=(input_size,)),
            layers.Dense(1024, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(32, activation=act),
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(1024, activation=act),
            layers.Dense(2048, activation=act),
            layers.Dense(output_size),
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


autoencoder_tanh = AutoencoderTanh()
autoencoder_tanh.compile(optimizer=Adam(0.0001), loss=losses.MeanSquaredError())
autoencoder_tanh.fit(train_arr, train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr, val_arr), verbose=2, callbacks=[
    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
])

Epoch 1/500
17/17 - 1s - loss: 0.0053 - val_loss: 0.0044 - lr: 1.0000e-04 - 1s/epoch - 67ms/step
Epoch 2/500
17/17 - 0s - loss: 0.0036 - val_loss: 0.0033 - lr: 1.0000e-04 - 240ms/epoch - 14ms/step
Epoch 3/500
17/17 - 0s - loss: 0.0031 - val_loss: 0.0030 - lr: 1.0000e-04 - 228ms/epoch - 13ms/step
Epoch 4/500
17/17 - 0s - loss: 0.0028 - val_loss: 0.0028 - lr: 1.0000e-04 - 218ms/epoch - 13ms/step
Epoch 5/500
17/17 - 0s - loss: 0.0026 - val_loss: 0.0026 - lr: 1.0000e-04 - 206ms/epoch - 12ms/step
Epoch 6/500
17/17 - 0s - loss: 0.0024 - val_loss: 0.0024 - lr: 1.0000e-04 - 205ms/epoch - 12ms/step
Epoch 7/500
17/17 - 0s - loss: 0.0023 - val_loss: 0.0023 - lr: 1.0000e-04 - 215ms/epoch - 13ms/step
Epoch 8/500
17/17 - 0s - loss: 0.0022 - val_loss: 0.0022 - lr: 1.0000e-04 - 205ms/epoch - 12ms/step
Epoch 9/500
17/17 - 0s - loss: 0.0021 - val_loss: 0.0022 - lr: 1.0000e-04 - 208ms/epoch - 12ms/step
Epoch 10/500
17/17 - 0s - loss: 0.0020 - val_loss: 0.0021 - lr: 1.0000e-04 - 204ms/epoch - 12ms/step
Ep

<keras.callbacks.History at 0x7fc52669b150>

In [None]:
test_pred = autoencoder_tanh.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00177


In [None]:
tanh_reconstruct = autoencoder_tanh.call(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, tanh_reconstruct)))

reconstruction loss:0.00148


## sigmoid

In [None]:
class AutoencoderSigmoid(Model):
    def __init__(self, act='sigmoid'):
        super(AutoencoderSigmoid, self).__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(2048, activation=act, input_shape=(input_size,)),
            layers.Dense(1024, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(32, activation=act),
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(1024, activation=act),
            layers.Dense(2048, activation=act),
            layers.Dense(output_size),
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


autoencoder_sigmoid = AutoencoderSigmoid()
autoencoder_sigmoid.compile(optimizer=Adam(0.0001), loss=losses.MeanSquaredError())
autoencoder_sigmoid.fit(train_arr, train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr, val_arr), verbose=2, callbacks=[
    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
])

Epoch 1/500
17/17 - 1s - loss: 0.0587 - val_loss: 0.0162 - lr: 1.0000e-04 - 1s/epoch - 67ms/step
Epoch 2/500
17/17 - 0s - loss: 0.0090 - val_loss: 0.0050 - lr: 1.0000e-04 - 237ms/epoch - 14ms/step
Epoch 3/500
17/17 - 0s - loss: 0.0042 - val_loss: 0.0036 - lr: 1.0000e-04 - 231ms/epoch - 14ms/step
Epoch 4/500
17/17 - 0s - loss: 0.0035 - val_loss: 0.0034 - lr: 1.0000e-04 - 233ms/epoch - 14ms/step
Epoch 5/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0033 - lr: 1.0000e-04 - 216ms/epoch - 13ms/step
Epoch 6/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0033 - lr: 1.0000e-04 - 215ms/epoch - 13ms/step
Epoch 7/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0033 - lr: 1.0000e-04 - 207ms/epoch - 12ms/step
Epoch 8/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0033 - lr: 1.0000e-04 - 208ms/epoch - 12ms/step
Epoch 9/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0033 - lr: 1.0000e-04 - 214ms/epoch - 13ms/step
Epoch 10/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0033 - lr: 1.0000e-04 - 205ms/epoch - 12ms/step
Ep

<keras.callbacks.History at 0x7fc525c8d050>

In [None]:
test_pred = autoencoder_sigmoid.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00329


In [None]:
sig_reconstruct = autoencoder_sigmoid.call(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, sig_reconstruct)))

reconstruction loss:0.00328


## selu

In [None]:
class AutoencoderSelu(Model):
    def __init__(self, act='selu'):
        super(AutoencoderSelu, self).__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(2048, activation=act, input_shape=(input_size,)),
            layers.Dense(1024, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(32, activation=act),
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(1024, activation=act),
            layers.Dense(2048, activation=act),
            layers.Dense(output_size),
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


autoencoder_selu = AutoencoderSelu()
autoencoder_selu.compile(optimizer=Adam(0.0001), loss=losses.MeanSquaredError())
autoencoder_selu.fit(train_arr, train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr, val_arr), verbose=2, callbacks=[
    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
])

Epoch 1/500
17/17 - 1s - loss: 0.0235 - val_loss: 0.0089 - lr: 1.0000e-04 - 1s/epoch - 65ms/step
Epoch 2/500
17/17 - 0s - loss: 0.0059 - val_loss: 0.0048 - lr: 1.0000e-04 - 239ms/epoch - 14ms/step
Epoch 3/500
17/17 - 0s - loss: 0.0039 - val_loss: 0.0040 - lr: 1.0000e-04 - 227ms/epoch - 13ms/step
Epoch 4/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0036 - lr: 1.0000e-04 - 221ms/epoch - 13ms/step
Epoch 5/500
17/17 - 0s - loss: 0.0030 - val_loss: 0.0033 - lr: 1.0000e-04 - 211ms/epoch - 12ms/step
Epoch 6/500
17/17 - 0s - loss: 0.0029 - val_loss: 0.0032 - lr: 1.0000e-04 - 215ms/epoch - 13ms/step
Epoch 7/500
17/17 - 0s - loss: 0.0027 - val_loss: 0.0031 - lr: 1.0000e-04 - 208ms/epoch - 12ms/step
Epoch 8/500
17/17 - 0s - loss: 0.0026 - val_loss: 0.0030 - lr: 1.0000e-04 - 207ms/epoch - 12ms/step
Epoch 9/500
17/17 - 0s - loss: 0.0025 - val_loss: 0.0029 - lr: 1.0000e-04 - 211ms/epoch - 12ms/step
Epoch 10/500
17/17 - 0s - loss: 0.0024 - val_loss: 0.0029 - lr: 1.0000e-04 - 204ms/epoch - 12ms/step
Ep

<keras.callbacks.History at 0x7fc525a00b50>

In [None]:
test_pred = autoencoder_selu.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00178


In [None]:
selu_reconstruct = autoencoder_selu.call(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, selu_reconstruct)))

reconstruction loss:0.00111


## elu

In [None]:
class AutoencoderElu(Model):
    def __init__(self, act='elu'):
        super(AutoencoderElu, self).__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(2048, activation=act, input_shape=(input_size,)),
            layers.Dense(1024, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(32, activation=act),
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(1024, activation=act),
            layers.Dense(2048, activation=act),
            layers.Dense(output_size),
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


autoencoder_elu = AutoencoderElu()
autoencoder_elu.compile(optimizer=Adam(0.0001), loss=losses.MeanSquaredError())
autoencoder_elu.fit(train_arr, train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr, val_arr), verbose=2, callbacks=[
    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
])

Epoch 1/500
17/17 - 1s - loss: 0.0050 - val_loss: 0.0039 - lr: 1.0000e-04 - 1s/epoch - 66ms/step
Epoch 2/500
17/17 - 0s - loss: 0.0034 - val_loss: 0.0032 - lr: 1.0000e-04 - 248ms/epoch - 15ms/step
Epoch 3/500
17/17 - 0s - loss: 0.0031 - val_loss: 0.0030 - lr: 1.0000e-04 - 221ms/epoch - 13ms/step
Epoch 4/500
17/17 - 0s - loss: 0.0028 - val_loss: 0.0027 - lr: 1.0000e-04 - 220ms/epoch - 13ms/step
Epoch 5/500
17/17 - 0s - loss: 0.0026 - val_loss: 0.0026 - lr: 1.0000e-04 - 211ms/epoch - 12ms/step
Epoch 6/500
17/17 - 0s - loss: 0.0024 - val_loss: 0.0024 - lr: 1.0000e-04 - 214ms/epoch - 13ms/step
Epoch 7/500
17/17 - 0s - loss: 0.0023 - val_loss: 0.0023 - lr: 1.0000e-04 - 206ms/epoch - 12ms/step
Epoch 8/500
17/17 - 0s - loss: 0.0022 - val_loss: 0.0022 - lr: 1.0000e-04 - 204ms/epoch - 12ms/step
Epoch 9/500
17/17 - 0s - loss: 0.0021 - val_loss: 0.0022 - lr: 1.0000e-04 - 204ms/epoch - 12ms/step
Epoch 10/500
17/17 - 0s - loss: 0.0020 - val_loss: 0.0021 - lr: 1.0000e-04 - 206ms/epoch - 12ms/step
Ep

<keras.callbacks.History at 0x7fc525788610>

In [None]:
test_pred = autoencoder_elu.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00160


In [None]:
elu_reconstruct = autoencoder_elu.call(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, elu_reconstruct)))

reconstruction loss:0.00129


# Predicting using best normal autoencoder - relu

In [None]:
pd_opt_people = pd.DataFrame(relu_reduced)
pd_opt_people['id'] = data_opt_index
pd_opt_people = pd_opt_people.set_index('id')
one_hot_opt_people_labeled = alt_dataset(data_dict, label_dict, pd_opt_people)

dataX = one_hot_opt_people_labeled.drop('Y', axis=1)
dataY = one_hot_opt_people_labeled['Y']
dataX.reset_index(drop=True, inplace=True)
dataY.reset_index(drop=True, inplace=True)

train_len = int(dataX.shape[0] * 0.8)
val_len = int(dataX.shape[0] * 0.9)
trainX, trainY = np.array(dataX.iloc[:train_len]), np.array(dataY.iloc[:train_len])
valX, valY = np.array(dataX.iloc[train_len:val_len]), np.array(dataY.iloc[train_len:val_len])
testX, testY = np.array(dataX.iloc[val_len:]), np.array(dataY.iloc[val_len:])

#norm = Normalizer()
#trainX = norm.fit_transform(trainX)
#valX = norm.transform(valX)
#testX = norm.transform(testX)

100%|██████████| 1130/1130 [00:25<00:00, 43.75it/s]


In [None]:
from sklearn.svm import SVR
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for c in range(1, 51, 10):
        svr = SVR(kernel=kernel, C=c)
        svr.fit(trainX, trainY)
        val_pred = svr.predict(valX)
        test_pred = svr.predict(testX)
        print('kernel:{} c:{} val set mse:{:.5f}, test set mse:{:.5f}'.format(kernel, c, mean_squared_error(val_pred, valY), mean_squared_error(test_pred, testY)))

kernel:linear c:1 val set mse:0.00385, test set mse:0.00263
kernel:linear c:11 val set mse:0.00390, test set mse:0.00261
kernel:linear c:21 val set mse:0.00388, test set mse:0.00254
kernel:linear c:31 val set mse:0.00389, test set mse:0.00257
kernel:linear c:41 val set mse:0.00388, test set mse:0.00256
kernel:poly c:1 val set mse:0.00251, test set mse:0.00169
kernel:poly c:11 val set mse:0.00240, test set mse:0.00165
kernel:poly c:21 val set mse:0.00240, test set mse:0.00165
kernel:poly c:31 val set mse:0.00240, test set mse:0.00165
kernel:poly c:41 val set mse:0.00240, test set mse:0.00165
kernel:rbf c:1 val set mse:0.00284, test set mse:0.00181
kernel:rbf c:11 val set mse:0.00236, test set mse:0.00179
kernel:rbf c:21 val set mse:0.00236, test set mse:0.00179
kernel:rbf c:31 val set mse:0.00236, test set mse:0.00179
kernel:rbf c:41 val set mse:0.00236, test set mse:0.00179
kernel:sigmoid c:1 val set mse:0.00233, test set mse:0.00249
kernel:sigmoid c:11 val set mse:730.49866, test set 

# Denoising Autoencoder

## Gaussian Noise

In [None]:
class AutoencoderGaussian(Model):
    def __init__(self, act='relu'):
        super(AutoencoderGaussian, self).__init__()
        self.encoder = tf.keras.Sequential([
            layers.Dense(2048, activation=act, input_shape=(input_size,)),
            layers.Dense(1024, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(32, activation=act)
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32, activation=act),
            layers.Dense(1024, activation=act),
            layers.Dense(512, activation=act),
            layers.Dense(2048, activation=act),
            layers.Dense(output_size),
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded
        
# partially corrupted our data with gaussian noise
train_arr_noise = train_arr + np.random.normal(loc=0.0, scale=1.0, size=train_arr.shape) * 0.1
val_arr_noise = val_arr + np.random.normal(loc=0.0, scale=1.0, size=val_arr.shape) * 0.1
test_arr_noise = test_arr + np.random.normal(loc=0.0, scale=1.0, size=test_arr.shape) * 0.1

autoencoder_G = AutoencoderGaussian()
autoencoder_G.compile(optimizer=Adam(0.0001), loss=losses.MeanSquaredError())
autoencoder_G.fit(train_arr_noise, train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr_noise, val_arr), verbose=2, callbacks=[
    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
])

Epoch 1/500
17/17 - 1s - loss: 0.0052 - val_loss: 0.0051 - lr: 1.0000e-04 - 1s/epoch - 69ms/step
Epoch 2/500
17/17 - 0s - loss: 0.0048 - val_loss: 0.0042 - lr: 1.0000e-04 - 231ms/epoch - 14ms/step
Epoch 3/500
17/17 - 0s - loss: 0.0036 - val_loss: 0.0033 - lr: 1.0000e-04 - 227ms/epoch - 13ms/step
Epoch 4/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0033 - lr: 1.0000e-04 - 219ms/epoch - 13ms/step
Epoch 5/500
17/17 - 0s - loss: 0.0033 - val_loss: 0.0032 - lr: 1.0000e-04 - 203ms/epoch - 12ms/step
Epoch 6/500
17/17 - 0s - loss: 0.0032 - val_loss: 0.0032 - lr: 1.0000e-04 - 213ms/epoch - 13ms/step
Epoch 7/500
17/17 - 0s - loss: 0.0032 - val_loss: 0.0032 - lr: 1.0000e-04 - 225ms/epoch - 13ms/step
Epoch 8/500
17/17 - 0s - loss: 0.0032 - val_loss: 0.0032 - lr: 1.0000e-04 - 203ms/epoch - 12ms/step
Epoch 9/500
17/17 - 0s - loss: 0.0032 - val_loss: 0.0031 - lr: 1.0000e-04 - 208ms/epoch - 12ms/step
Epoch 10/500
17/17 - 0s - loss: 0.0031 - val_loss: 0.0030 - lr: 1.0000e-04 - 205ms/epoch - 12ms/step
Ep

<keras.callbacks.History at 0x7fc5219b5f90>

In [None]:
test_pred = autoencoder_G.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00170


In [None]:
g_reconstruct = autoencoder_G.call(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, g_reconstruct)))

reconstruction loss:0.00137


# K-Sparse Autoencoder

In [None]:
from tensorflow.keras.layers import *
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers

p = 0.05
beta = 3
lambda_val = 0.001

# define regularized function and calculate KL divergence
def sparse_reg(activity_matrix):
    # use softmax function to make the activations are between 0 and 1
    activity_matrix = K.softmax(activity_matrix, axis=0) 
    # calculate average activations for a hidden unit
    p_hat = K.mean(activity_matrix, axis=0)  
    # calculate K-L divergence 
    KLD = p*(K.log(p/p_hat))+(1-p)*(K.log((1-p)/(1-p_hat)))  
    # sum of all K-L divergence and multiply by beta
    return beta*K.sum(KLD)  

class AutoencoderSparse(Model):
    def __init__(self, act='linear'):
        super(AutoencoderSparse, self).__init__()
        self.encoder = tf.keras.Sequential([
            # add activity_regularizer parameter
            layers.Dense(2048, activation=act, activity_regularizer=sparse_reg, input_shape=(input_size,)),
            layers.Dense(1024, activation=act, activity_regularizer=sparse_reg),
            layers.Dense(512, activation=act, activity_regularizer=sparse_reg),
            layers.Dense(32, activation=act, activity_regularizer=sparse_reg),
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32, activation=act, activity_regularizer=sparse_reg),
            layers.Dense(512, activation=act, activity_regularizer=sparse_reg),
            layers.Dense(1024, activation=act, activity_regularizer=sparse_reg),
            layers.Dense(2048, activation=act, activity_regularizer=sparse_reg),
            layers.Dense(output_size),
        ])

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


autoencoder_sparse = AutoencoderSparse()
autoencoder_sparse.compile(optimizer=Adam(0.0001), loss=losses.MeanSquaredError())
autoencoder_sparse.fit(train_arr, train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr, val_arr), verbose=2, callbacks=[
    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
])

Epoch 1/500
17/17 - 4s - loss: 8.2265 - val_loss: 8.0073 - lr: 1.0000e-04 - 4s/epoch - 233ms/step
Epoch 2/500
17/17 - 0s - loss: 8.2262 - val_loss: 8.0066 - lr: 1.0000e-04 - 385ms/epoch - 23ms/step
Epoch 3/500
17/17 - 0s - loss: 8.2250 - val_loss: 8.0055 - lr: 1.0000e-04 - 346ms/epoch - 20ms/step
Epoch 4/500
17/17 - 0s - loss: 8.2246 - val_loss: 8.0054 - lr: 1.0000e-04 - 334ms/epoch - 20ms/step
Epoch 5/500
17/17 - 0s - loss: 8.2245 - val_loss: 8.0053 - lr: 1.0000e-04 - 336ms/epoch - 20ms/step
Epoch 6/500
17/17 - 0s - loss: 8.2245 - val_loss: 8.0053 - lr: 1.0000e-04 - 343ms/epoch - 20ms/step
Epoch 7/500
17/17 - 0s - loss: 8.2244 - val_loss: 8.0051 - lr: 1.0000e-04 - 322ms/epoch - 19ms/step
Epoch 8/500
17/17 - 0s - loss: 8.2242 - val_loss: 8.0051 - lr: 1.0000e-04 - 334ms/epoch - 20ms/step
Epoch 9/500
17/17 - 0s - loss: 8.2242 - val_loss: 8.0050 - lr: 1.0000e-04 - 336ms/epoch - 20ms/step
Epoch 10/500
17/17 - 0s - loss: 8.2241 - val_loss: 8.0050 - lr: 1.0000e-04 - 334ms/epoch - 20ms/step
E

<keras.callbacks.History at 0x7fc5c2087790>

In [None]:
test_pred = autoencoder_sparse.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00121


In [None]:
sparse_reconstruct = autoencoder_sparse.call(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, sparse_reconstruct)))

reconstruction loss:0.00079


## Predicting - using K-sparse

In [None]:
sparse_encoded = autoencoder_sparse.encoder(df_encoded_before)
pd_opt_people = pd.DataFrame(sparse_encoded)
pd_opt_people['id'] = data_opt_index
pd_opt_people = pd_opt_people.set_index('id')
one_hot_opt_people_labeled = alt_dataset(data_dict, label_dict, pd_opt_people)

dataX = one_hot_opt_people_labeled.drop('Y', axis=1)
dataY = one_hot_opt_people_labeled['Y']
dataX.reset_index(drop=True, inplace=True)
dataY.reset_index(drop=True, inplace=True)

train_len = int(dataX.shape[0] * 0.8)
val_len = int(dataX.shape[0] * 0.9)
trainX, trainY = np.array(dataX.iloc[:train_len]), np.array(dataY.iloc[:train_len])
valX, valY = np.array(dataX.iloc[train_len:val_len]), np.array(dataY.iloc[train_len:val_len])
testX, testY = np.array(dataX.iloc[val_len:]), np.array(dataY.iloc[val_len:])


100%|██████████| 1130/1130 [00:26<00:00, 43.16it/s]


In [None]:
from sklearn.svm import SVR
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for c in range(1, 51, 10):
        svr = SVR(kernel=kernel, C=c)
        svr.fit(trainX, trainY)
        val_pred = svr.predict(valX)
        test_pred = svr.predict(testX)
        print('kernel:{} c:{} val set mse:{:.5f}, test set mse:{:.5f}'.format(kernel, c, mean_squared_error(val_pred, valY), mean_squared_error(test_pred, testY)))

kernel:linear c:1 val set mse:0.00532, test set mse:0.00355
kernel:linear c:11 val set mse:0.00532, test set mse:0.00355
kernel:linear c:21 val set mse:0.00532, test set mse:0.00355
kernel:linear c:31 val set mse:0.00532, test set mse:0.00355
kernel:linear c:41 val set mse:0.00532, test set mse:0.00355
kernel:poly c:1 val set mse:0.00375, test set mse:0.00278
kernel:poly c:11 val set mse:0.00375, test set mse:0.00278
kernel:poly c:21 val set mse:0.00375, test set mse:0.00278
kernel:poly c:31 val set mse:0.00375, test set mse:0.00278
kernel:poly c:41 val set mse:0.00375, test set mse:0.00278
kernel:rbf c:1 val set mse:0.00288, test set mse:0.00197
kernel:rbf c:11 val set mse:0.00298, test set mse:0.00205
kernel:rbf c:21 val set mse:0.00298, test set mse:0.00205
kernel:rbf c:31 val set mse:0.00298, test set mse:0.00205
kernel:rbf c:41 val set mse:0.00298, test set mse:0.00205
kernel:sigmoid c:1 val set mse:9.55113, test set mse:8.65457
kernel:sigmoid c:11 val set mse:1180.19167, test set

# Stacked AE

In [None]:
class AutoEncoderLayer():
    def __init__(self, input_dim, output_dim, act='sigmoid'):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.act = act
        self.build()

    def build(self):
        self.input = Input(shape=(self.input_dim,))
        self.encode_layer = Dense(self.output_dim, activation=self.act)
        self.encoded = self.encode_layer(self.input)
        self.encoder = Model(self.input, self.encoded)

        self.decode_layer = Dense(self.input_dim, activation=self.act)
        self.decoded = self.decode_layer(self.encoded)

        self.autoencoder = Model(self.input, self.decoded)

# build stacked AE
class StackedAutoEncoder():
    def __init__(self, layer_list):
        self.layer_list = layer_list
        self.build()

    def build(self):
        out = self.layer_list[0].encoded
        for i in range(1, num_layers - 1):
            out = self.layer_list[i].encode_layer(out)
        self.model = Model(self.layer_list[0].input, out)

class StackedEncoder():
    def __init__(self, layer_list):
        self.layer_list = layer_list
        self.build()

    def build(self):
        out = self.layer_list[0].output
        for i in range(1, len(self.layer_list)):
            out = self.layer_list[i](out)
        self.model = Model(self.layer_list[0].input, out)

# pretraining for autoencoder layers
def train_layers(encoder_list=None, layer=None, epochs=None, batch_size=None):
    '''
    :param encoder_list:
    :param layer:
    :param epochs:
    :param batch_size:
    :return:
    '''
    
    # when training the ith layer, we use the parameters from trained (i-1)th layer
    out = train_arr
    origin = train_arr
    if layer != 0:
        for i in range(layer):
            out = encoder_list[i].encoder.predict(out)

    encoder_list[layer].autoencoder.summary()
    encoder_list[layer].autoencoder.compile(optimizer=Adam(0.0001), loss=tf.keras.losses.MeanSquaredError())
    
    # training ith layer of autoencoder 
    encoder_list[layer].autoencoder.fit(
        out,
        origin if layer == 0 else out,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        verbose=2
    )

# used the pretrained parameter initialize the parameter for stacked ae
# training the whole stacked ae and optimize
def train_whole(sae=None, epochs=None, batch_size=None):
    '''
    :param model:
    :param epochs:
    :param batch_size:
    :return:
    '''
    sae.model.summary()
    sae.model.compile(optimizer=Adam(0.0001), loss=tf.keras.losses.MeanSquaredError())
    sae.model.fit(
        train_arr,
        train_arr,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_data=(val_arr, val_arr),
        verbose=2
    )


num_layers = 7
encoder_1 = AutoEncoderLayer(input_size, 2048)
encoder_2 = AutoEncoderLayer(2048, 1024)
encoder_3 = AutoEncoderLayer(1024, 32)
decoder_1 = AutoEncoderLayer(32, 1024)
decoder_2 = AutoEncoderLayer(1024, 2048)
decoder_3 = AutoEncoderLayer(2048, input_size)
autoencoder_list = [encoder_1, encoder_2, encoder_3, decoder_1, decoder_2, decoder_3]

# pretraining
print("Pre training:")
for level in range(num_layers - 1):
    print("level:", level)
    train_layers(encoder_list=autoencoder_list, layer=level, epochs=300, batch_size=32)


# stacked ae training and optimize
stacked_ae = StackedAutoEncoder(autoencoder_list)
print("Whole training:")
train_whole(sae=stacked_ae, epochs=500, batch_size=32)

Pre training:
level: 0
Model: "model_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 2108)]            0         
                                                                 
 dense_136 (Dense)           (None, 2048)              4319232   
                                                                 
 dense_137 (Dense)           (None, 2108)              4319292   
                                                                 
Total params: 8,638,524
Trainable params: 8,638,524
Non-trainable params: 0
_________________________________________________________________
Epoch 1/300
34/34 - 1s - loss: 0.0778 - 603ms/epoch - 18ms/step
Epoch 2/300
34/34 - 0s - loss: 0.0074 - 221ms/epoch - 7ms/step
Epoch 3/300
34/34 - 0s - loss: 0.0052 - 211ms/epoch - 6ms/step
Epoch 4/300
34/34 - 0s - loss: 0.0045 - 214ms/epoch - 6ms/step
Epoch 5/300
34/34 - 0s - loss: 0.004

In [None]:
test_pred = stacked_ae.model.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00103


In [None]:
stacked_reconstruct = stacked_ae.model.predict(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, stacked_reconstruct)))

reconstruction loss:0.00078


## Predicting - using stacked

In [None]:
enc_model = StackedEncoder(stacked_ae.model.layers[:num_layers // 2 + 1])

In [None]:
stacked_encoded = enc_model.model.predict(df_encoded_before)
pd_opt_people = pd.DataFrame(stacked_encoded)
pd_opt_people['id'] = data_opt_index
pd_opt_people = pd_opt_people.set_index('id')
one_hot_opt_people_labeled = alt_dataset(data_dict, label_dict, pd_opt_people)

dataX = one_hot_opt_people_labeled.drop('Y', axis=1)
dataY = one_hot_opt_people_labeled['Y']
dataX.reset_index(drop=True, inplace=True)
dataY.reset_index(drop=True, inplace=True)

train_len = int(dataX.shape[0] * 0.8)
val_len = int(dataX.shape[0] * 0.9)
trainX, trainY = np.array(dataX.iloc[:train_len]), np.array(dataY.iloc[:train_len])
valX, valY = np.array(dataX.iloc[train_len:val_len]), np.array(dataY.iloc[train_len:val_len])
testX, testY = np.array(dataX.iloc[val_len:]), np.array(dataY.iloc[val_len:])


100%|██████████| 1130/1130 [00:26<00:00, 42.68it/s]


In [None]:
from sklearn.svm import SVR
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for c in range(1, 51, 10):
        svr = SVR(kernel=kernel, C=c)
        svr.fit(trainX, trainY)
        val_pred = svr.predict(valX)
        test_pred = svr.predict(testX)
        print('kernel:{} c:{} val set mse:{:.5f}, test set mse:{:.5f}'.format(kernel, c, mean_squared_error(val_pred, valY), mean_squared_error(test_pred, testY)))

kernel:linear c:1 val set mse:0.00215, test set mse:0.00179
kernel:linear c:11 val set mse:0.00242, test set mse:0.00241
kernel:linear c:21 val set mse:0.00254, test set mse:0.00301
kernel:linear c:31 val set mse:0.00248, test set mse:0.00301
kernel:linear c:41 val set mse:0.00254, test set mse:0.00298
kernel:poly c:1 val set mse:0.00185, test set mse:0.00146
kernel:poly c:11 val set mse:0.00186, test set mse:0.00154
kernel:poly c:21 val set mse:0.00186, test set mse:0.00154
kernel:poly c:31 val set mse:0.00186, test set mse:0.00154
kernel:poly c:41 val set mse:0.00186, test set mse:0.00154
kernel:rbf c:1 val set mse:0.00171, test set mse:0.00135
kernel:rbf c:11 val set mse:0.00178, test set mse:0.00141
kernel:rbf c:21 val set mse:0.00186, test set mse:0.00147
kernel:rbf c:31 val set mse:0.00186, test set mse:0.00147
kernel:rbf c:41 val set mse:0.00186, test set mse:0.00147
kernel:sigmoid c:1 val set mse:1.69279, test set mse:4.98335
kernel:sigmoid c:11 val set mse:107.43401, test set 

# Stacked AE - with sparse layer

In [None]:
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K

p = 0.05
beta = 3
lambda_val = 0.001

# define regularized function and calculate KL divergence
def sparse_reg(activity_matrix):
    # use softmax function to make the activations are between 0 and 1
    activity_matrix = K.softmax(activity_matrix, axis=0)  
    # calculate average activations for a hidden unit
    p_hat = K.mean(activity_matrix, axis=0)  
    # calculate K-L divergence
    KLD = p*(K.log(p/p_hat))+(1-p)*(K.log((1-p)/(1-p_hat))) 
    # sum of all K-L divergence and multiply by beta
    return beta*K.sum(KLD)  

class AutoEncoderLayer():
    def __init__(self, input_dim, output_dim, act='sigmoid'):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.act = act
        self.build()

    def build(self):
        self.input = Input(shape=(self.input_dim,))
        self.encode_layer = Dense(self.output_dim, activation=self.act, activity_regularizer=sparse_reg)
        self.encoded = self.encode_layer(self.input)
        self.encoder = Model(self.input, self.encoded)

        self.decode_layer = Dense(self.input_dim, activation=self.act, activity_regularizer=sparse_reg)
        self.decoded = self.decode_layer(self.encoded)

        self.autoencoder = Model(self.input, self.decoded)

# build stacked AE
class StackedAutoEncoder():
    def __init__(self, layer_list):
        self.layer_list = layer_list
        self.build()

    def build(self):
        out = self.layer_list[0].encoded
        for i in range(1, num_layers - 1):
            out = self.layer_list[i].encode_layer(out)
        self.model = Model(self.layer_list[0].input, out)

class StackedEncoder():
    def __init__(self, layer_list):
        self.layer_list = layer_list
        self.build()

    def build(self):
        out = self.layer_list[0].output
        for i in range(1, len(self.layer_list)):
            out = self.layer_list[i](out)
        self.model = Model(self.layer_list[0].input, out)
# pretraining for autoencoder layers
def train_layers(encoder_list=None, layer=None, epochs=None, batch_size=None):
    '''
    :param encoder_list:
    :param layer:
    :param epochs:
    :param batch_size:
    :return:
    '''
    # when training the ith layer, we use the parameters from trained (i-1)th layer
    out = train_arr
    origin = train_arr
    if layer != 0:
        for i in range(layer):
            out = encoder_list[i].encoder.predict(out)

    encoder_list[layer].autoencoder.summary()
    encoder_list[layer].autoencoder.compile(optimizer=Adam(0.0001), loss=tf.keras.losses.MeanSquaredError())

    # training ith layer of autoencoder
    encoder_list[layer].autoencoder.fit(
        out,
        origin if layer == 0 else out,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        verbose=2
    )

# used the pretrained parameter initialize the parameter for stacked ae
# training the whole stacked ae and optimize
def train_whole(sae=None, epochs=None, batch_size=None):
    '''
    :param model:
    :param epochs:
    :param batch_size:
    :return:
    '''

    sae.model.summary()
    sae.model.compile(optimizer=Adam(0.0001), loss=tf.keras.losses.MeanSquaredError())
    sae.model.fit(
        train_arr,
        train_arr,
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        validation_data=(val_arr, val_arr),
        verbose=2
    )


num_layers = 7
encoder_1 = AutoEncoderLayer(input_size, 2048)
encoder_2 = AutoEncoderLayer(2048, 512)
encoder_3 = AutoEncoderLayer(512, 32)
decoder_1 = AutoEncoderLayer(32, 512)
decoder_2 = AutoEncoderLayer(512, 2048)
decoder_3 = AutoEncoderLayer(2048, input_size)
autoencoder_list = [encoder_1, encoder_2, encoder_3, decoder_1, decoder_2, decoder_3]

print("Pre training:")
for level in range(num_layers - 1):
    print("level:", level)
    train_layers(encoder_list=autoencoder_list, layer=level, epochs=300, batch_size=32)


stacked_ae = StackedAutoEncoder(autoencoder_list)
print("Whole training:")

train_whole(sae=stacked_ae, epochs=500, batch_size=32)

Pre training:
level: 0
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2108)]            0         
                                                                 
 dense (Dense)               (None, 2048)              4319232   
                                                                 
 dense_1 (Dense)             (None, 2108)              4319292   
                                                                 
Total params: 8,638,524
Trainable params: 8,638,524
Non-trainable params: 0
_________________________________________________________________
Epoch 1/300
34/34 - 3s - loss: 1.9635 - 3s/epoch - 81ms/step
Epoch 2/300
34/34 - 0s - loss: 1.8936 - 292ms/epoch - 9ms/step
Epoch 3/300
34/34 - 0s - loss: 1.8914 - 282ms/epoch - 8ms/step
Epoch 4/300
34/34 - 0s - loss: 1.8907 - 278ms/epoch - 8ms/step
Epoch 5/300
34/34 - 0s - loss: 1.8904 - 

In [None]:
test_pred = stacked_ae.model.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00107


In [None]:
stacked_reconstruct = stacked_ae.model.predict(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, stacked_reconstruct)))

reconstruction loss:0.00080


## Predicting use stacked with sparse

In [None]:
enc_model = StackedEncoder(stacked_ae.model.layers[:num_layers // 2 + 1])

In [None]:
stacked_encoded = enc_model.model.predict(df_encoded_before)
pd_opt_people = pd.DataFrame(stacked_encoded)
pd_opt_people['id'] = data_opt_index
pd_opt_people = pd_opt_people.set_index('id')
one_hot_opt_people_labeled = alt_dataset(data_dict, label_dict, pd_opt_people)

dataX = one_hot_opt_people_labeled.drop('Y', axis=1)
dataY = one_hot_opt_people_labeled['Y']
dataX.reset_index(drop=True, inplace=True)
dataY.reset_index(drop=True, inplace=True)

train_len = int(dataX.shape[0] * 0.8)
val_len = int(dataX.shape[0] * 0.9)
trainX, trainY = np.array(dataX.iloc[:train_len]), np.array(dataY.iloc[:train_len])
valX, valY = np.array(dataX.iloc[train_len:val_len]), np.array(dataY.iloc[train_len:val_len])
testX, testY = np.array(dataX.iloc[val_len:]), np.array(dataY.iloc[val_len:])

100%|██████████| 1130/1130 [00:26<00:00, 41.92it/s]


In [None]:
from sklearn.svm import SVR
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for c in range(1, 51, 10):
        svr = SVR(kernel=kernel, C=c)
        svr.fit(trainX, trainY)
        val_pred = svr.predict(valX)
        test_pred = svr.predict(testX)
        print('kernel:{} c:{} val set mse:{:.5f}, test set mse:{:.5f}'.format(kernel, c, mean_squared_error(val_pred, valY), mean_squared_error(test_pred, testY)))

kernel:linear c:1 val set mse:0.00273, test set mse:0.00245
kernel:linear c:11 val set mse:0.00329, test set mse:0.00239
kernel:linear c:21 val set mse:0.00358, test set mse:0.00263
kernel:linear c:31 val set mse:0.00379, test set mse:0.00296
kernel:linear c:41 val set mse:0.00425, test set mse:0.00279
kernel:poly c:1 val set mse:0.00305, test set mse:0.00142
kernel:poly c:11 val set mse:0.00408, test set mse:0.00156
kernel:poly c:21 val set mse:0.00408, test set mse:0.00156
kernel:poly c:31 val set mse:0.00408, test set mse:0.00156
kernel:poly c:41 val set mse:0.00408, test set mse:0.00156
kernel:rbf c:1 val set mse:0.00223, test set mse:0.00138
kernel:rbf c:11 val set mse:0.00245, test set mse:0.00135
kernel:rbf c:21 val set mse:0.00271, test set mse:0.00137
kernel:rbf c:31 val set mse:0.00277, test set mse:0.00137
kernel:rbf c:41 val set mse:0.00277, test set mse:0.00137
kernel:sigmoid c:1 val set mse:1.84381, test set mse:3.43884
kernel:sigmoid c:11 val set mse:224.80648, test set 

# Variational AE

In [None]:
from tensorflow.keras.layers import*
import tensorflow.keras.backend as K

epsilon_std = 1.0

# sampling process
def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(
        shape=(K.shape(z_mean)),
        mean=0.,
        stddev=epsilon_std
    )
    return z_mean + epsilon * K.exp(z_log_var / 2)


act = 'relu'
# encoder
encoder = tf.keras.Sequential([
    layers.Dense(2048, activation=act, input_shape=(input_size,)),
    layers.Dense(1024, activation=act),
    layers.Dense(32, activation=act),
])
# mean
mean = tf.keras.Sequential([
    layers.Dense(32, activation=act),
])
# variance
log_var = tf.keras.Sequential([
    layers.Dense(32, activation=act),
])

# sampling from distribution
z = Lambda(sampling, output_shape=(32,))

# decoder
decoder_h = tf.keras.Sequential([
    layers.Dense(32, activation=act),
])
decoder_mean = tf.keras.Sequential([
    layers.Dense(32, activation=act),
    layers.Dense(1024, activation=act),
    layers.Dense(2048, activation=act),
    layers.Dense(output_size),
])

# define the encoder network for calculating distribution mean and variance
x = Input(shape=(input_size,))
h = encoder(x)
# distribution mean
z_mean = mean(h)
# distribution variance
z_log_var = log_var(h)
# distribution z
z = z([z_mean, z_log_var])

# decoding
h_decoded = decoder_h(z)
decoded = decoder_mean(h_decoded)

# autoenoder model
autoencoder = Model(x, decoded)
# reconstruction loss
xent_loss = input_size * metrics.mean_squared_error(x, decoded)
# KL loss
kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
autoencoder.add_loss(K.mean(xent_loss + kl_loss))
autoencoder.compile(optimizer=Adam(0.0001), loss=None)
autoencoder.fit(train_arr, epochs=500, shuffle=True, batch_size=64, validation_data=(val_arr, None), verbose=2,
                callbacks=[
                    ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
                ])

Epoch 1/500
17/17 - 3s - loss: 11.2678 - val_loss: 9.1336 - lr: 1.0000e-04 - 3s/epoch - 174ms/step
Epoch 2/500
17/17 - 0s - loss: 8.3195 - val_loss: 7.9207 - lr: 1.0000e-04 - 254ms/epoch - 15ms/step
Epoch 3/500
17/17 - 0s - loss: 7.7275 - val_loss: 7.6788 - lr: 1.0000e-04 - 259ms/epoch - 15ms/step
Epoch 4/500
17/17 - 0s - loss: 7.5695 - val_loss: 7.4977 - lr: 1.0000e-04 - 224ms/epoch - 13ms/step
Epoch 5/500
17/17 - 0s - loss: 7.4739 - val_loss: 7.4472 - lr: 1.0000e-04 - 225ms/epoch - 13ms/step
Epoch 6/500
17/17 - 0s - loss: 7.4143 - val_loss: 7.3756 - lr: 1.0000e-04 - 215ms/epoch - 13ms/step
Epoch 7/500
17/17 - 0s - loss: 7.3948 - val_loss: 7.2978 - lr: 1.0000e-04 - 205ms/epoch - 12ms/step
Epoch 8/500
17/17 - 0s - loss: 7.3353 - val_loss: 7.3101 - lr: 1.0000e-04 - 205ms/epoch - 12ms/step
Epoch 9/500
17/17 - 0s - loss: 7.3158 - val_loss: 7.2555 - lr: 1.0000e-04 - 217ms/epoch - 13ms/step
Epoch 10/500
17/17 - 0s - loss: 7.2804 - val_loss: 7.2325 - lr: 1.0000e-04 - 203ms/epoch - 12ms/step


<keras.callbacks.History at 0x7f2a40636550>

In [None]:
test_pred = autoencoder.predict(test_arr)
print('test set mse:{:.5f}'.format(mean_squared_error(test_arr, test_pred)))

test set mse:0.00319


In [None]:
vae_reconstruct = autoencoder.predict(df_encoded_before)
print('reconstruction loss:{:.5f}'.format(mean_squared_error(df_encoded_before, vae_reconstruct)))

reconstruction loss:0.00328


# Autoencoder separately for each category

In [None]:
data_columns = ['administrator', 'advisor', 'auditor', 'custodian', 'distributor',
                'manager_of_managers', 'subadvised', 'subadvisor', 'managers',
                'more_than_one_administrator', 'more_than_one_custodian']

class AutoencoderRelu(Model):
    def __init__(self, size, act='relu'):
        super(AutoencoderRelu, self).__init__()

        if size < 100:
            self.encoder = tf.keras.Sequential([
                layers.Dense(64, activation=act, input_shape=(size,)),
                layers.Dense(8, activation=act),
            ])
            self.decoder = tf.keras.Sequential([
                layers.Dense(8, activation=act),
                layers.Dense(64, activation=act),
                layers.Dense(size),
            ])
        else:
            self.encoder = tf.keras.Sequential([
                layers.Dense(512, activation=act, input_shape=(size,)),
                layers.Dense(16, activation=act),
            ])
            self.decoder = tf.keras.Sequential([
                layers.Dense(16, activation=act),
                layers.Dense(512, activation=act),
                layers.Dense(size),
            ])


    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

single_data_onehot_list, single_data_enc_list, res1, res2 = [], [], [], []
for col in data_columns:
    single_data = np.array(data_opt[col]).reshape((-1, 1))
    enc = OneHotEncoder()
    enc.fit(single_data)
    single_data = enc.transform(single_data).toarray()
    single_data_onehot_list.append(single_data)

    train_val_arr, test_arr = train_test_split(single_data, test_size=0.1)
    train_arr, val_arr = train_test_split(train_val_arr, test_size=0.1)

    single_autorelu = AutoencoderRelu(single_data.shape[1])
    single_autorelu.compile(optimizer=Adam(0.001), loss=losses.MeanSquaredError())
    single_autorelu.fit(train_arr, train_arr, epochs=500, shuffle=True, batch_size=64,
                        validation_data=(val_arr, val_arr), verbose=2, callbacks=[
            ReduceLROnPlateau(factor=0.9, patience=10, verbose=2)
        ])

    test_pred_relu = single_autorelu.predict(test_arr)
    reconstruct = single_autorelu.predict(single_data)
    res1.append('col:{} relu test set mse:{:.10f}'.format(col, mean_squared_error(test_arr, test_pred_relu)))
    res2.append('col:{} relu test set mse:{:.10f}'.format(col, mean_squared_error(single_data, reconstruct)))

    single_data_enc_list.append(single_autorelu.encoder.predict(single_data))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 223/500
17/17 - 0s - loss: 3.9388e-15 - val_loss: 2.1002e-15 - lr: 1.0942e-04 - 92ms/epoch - 5ms/step
Epoch 224/500
17/17 - 0s - loss: 4.2025e-15 - val_loss: 4.6872e-15 - lr: 1.0942e-04 - 95ms/epoch - 6ms/step
Epoch 225/500
17/17 - 0s - loss: 4.3262e-15 - val_loss: 2.1002e-15 - lr: 1.0942e-04 - 88ms/epoch - 5ms/step
Epoch 226/500
17/17 - 0s - loss: 3.9635e-15 - val_loss: 2.1002e-15 - lr: 1.0942e-04 - 99ms/epoch - 6ms/step
Epoch 227/500
17/17 - 0s - loss: 4.1531e-15 - val_loss: 2.1002e-15 - lr: 1.0942e-04 - 92ms/epoch - 5ms/step
Epoch 228/500

Epoch 00228: ReduceLROnPlateau reducing learning rate to 9.847709443420172e-05.
17/17 - 0s - loss: 4.1366e-15 - val_loss: 4.6872e-15 - lr: 1.0942e-04 - 90ms/epoch - 5ms/step
Epoch 229/500
17/17 - 0s - loss: 4.3509e-15 - val_loss: 2.1002e-15 - lr: 9.8477e-05 - 80ms/epoch - 5ms/step
Epoch 230/500
17/17 - 0s - loss: 3.9058e-15 - val_loss: 2.1002e-15 - lr: 9.8477e-05 - 91ms/epoch -

In [None]:
print('\n'.join(res1))

col:administrator relu test set mse:0.0004890300
col:advisor relu test set mse:0.0005750144
col:auditor relu test set mse:0.0000000000
col:custodian relu test set mse:0.0010802322
col:distributor relu test set mse:0.0001765421
col:manager_of_managers relu test set mse:0.0000000000
col:subadvised relu test set mse:0.0000000000
col:subadvisor relu test set mse:0.0004577379
col:managers relu test set mse:0.0008186290
col:more_than_one_administrator relu test set mse:0.0000000000
col:more_than_one_custodian relu test set mse:0.0000000000


In [None]:
print('\n'.join(res2))

col:administrator relu test set mse:0.0000983995
col:advisor relu test set mse:0.0003841856
col:auditor relu test set mse:0.0000000000
col:custodian relu test set mse:0.0002616260
col:distributor relu test set mse:0.0000176677
col:manager_of_managers relu test set mse:0.0000000000
col:subadvised relu test set mse:0.0000000000
col:subadvisor relu test set mse:0.0002573954
col:managers relu test set mse:0.0006243719
col:more_than_one_administrator relu test set mse:0.0000000000
col:more_than_one_custodian relu test set mse:0.0000000000


## Predicting on excess return

In [None]:
relu_reduced = np.concatenate(single_data_enc_list, axis=-1)
pd_opt_people = pd.DataFrame(relu_reduced)
pd_opt_people['id'] = data_opt_index
pd_opt_people = pd_opt_people.set_index('id')
one_hot_opt_people_labeled = alt_dataset(data_dict, label_dict, pd_opt_people)

dataX = one_hot_opt_people_labeled.drop('Y', axis=1)
dataY = one_hot_opt_people_labeled['Y']
dataX.reset_index(drop=True, inplace=True)
dataY.reset_index(drop=True, inplace=True)

train_len = int(dataX.shape[0] * 0.8)
val_len = int(dataX.shape[0] * 0.9)
trainX, trainY = np.array(dataX.iloc[:train_len]), np.array(dataY.iloc[:train_len])
valX, valY = np.array(dataX.iloc[train_len:val_len]), np.array(dataY.iloc[train_len:val_len])
testX, testY = np.array(dataX.iloc[val_len:]), np.array(dataY.iloc[val_len:])

100%|██████████| 1130/1130 [00:26<00:00, 42.06it/s]


In [None]:
from sklearn.svm import SVR
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for c in range(1, 51, 10):
        svr = SVR(kernel=kernel, C=c)
        svr.fit(trainX, trainY)
        val_pred = svr.predict(valX)
        test_pred = svr.predict(testX)
        print('kernel:{} c:{} val set mse:{:.5f}, test set mse:{:.5f}'.format(kernel, c, mean_squared_error(val_pred, valY), mean_squared_error(test_pred, testY)))

kernel:linear c:1 val set mse:0.00173, test set mse:0.00163
kernel:linear c:11 val set mse:0.00173, test set mse:0.00163
kernel:linear c:21 val set mse:0.00173, test set mse:0.00163
kernel:linear c:31 val set mse:0.00173, test set mse:0.00163
kernel:linear c:41 val set mse:0.00173, test set mse:0.00163
kernel:poly c:1 val set mse:0.00188, test set mse:0.00150
kernel:poly c:11 val set mse:0.00188, test set mse:0.00150
kernel:poly c:21 val set mse:0.00188, test set mse:0.00150
kernel:poly c:31 val set mse:0.00188, test set mse:0.00150
kernel:poly c:41 val set mse:0.00188, test set mse:0.00150
kernel:rbf c:1 val set mse:0.00188, test set mse:0.00149
kernel:rbf c:11 val set mse:0.00188, test set mse:0.00149
kernel:rbf c:21 val set mse:0.00188, test set mse:0.00149
kernel:rbf c:31 val set mse:0.00188, test set mse:0.00149
kernel:rbf c:41 val set mse:0.00188, test set mse:0.00149
kernel:sigmoid c:1 val set mse:0.00177, test set mse:0.00209
kernel:sigmoid c:11 val set mse:325.14699, test set 