In [1]:
!pip install keras-transformer
import h5py
import numpy as np
import tensorflow as tf
# Base folder for every input file. Later cells build paths as data_dir + filename,
# so the trailing slash is required. The path is relative to the working directory.
data_dir = "drive/My Drive/CS4995 Deep Learning/Competitions_Project/Final_Project/data/3000/new/"



In [0]:
import pickle
# Load the pickled dataset. The context manager closes the file even when
# unpickling raises, which the manual open()/close() pair did not guarantee.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(data_dir+'data_250.pickle', 'rb') as pickle_in:
    data = pickle.load(pickle_in)

In [0]:
with open(data_dir+'stocks_emb.w2vec', "r") as f:
    lines = f.read().split('\n')  # raw text lines of the embedding file
# The first line is skipped (presumably the word2vec header -- confirm).
# Blank lines, e.g. the empty string left after a trailing newline, are dropped
# so that no '' key with an empty vector ends up in word2vec. Splitting on any
# whitespace also tolerates repeated or trailing spaces.
f1 = [item.split() for item in lines[1:] if item.strip()]
# token -> embedding vector (the first field is the token, the rest are floats)
word2vec = {item[0]: np.array([float(value) for value in item[1:]]) for item in f1}

In [4]:
import numpy as np
from keras_transformer import get_model, decode

# Every record holds two comma-separated strings: field 0 is the source
# sequence and field 1 is the target sequence. Split both into token lists,
# e.g. source_tokens[0] = ['AAAA', 'BBBB', ..., 'ZZZZ'].
source_tokens = [record[0].split(',') for record in data]
target_tokens = [record[1].split(',') for record in data]

# Generate dictionaries
def build_token_dict(token_list):
    """Map every distinct token to an integer id.

    Ids 0, 1 and 2 are reserved for '<PAD>', '<START>' and '<END>'. All other
    tokens receive consecutive ids, starting at 3, in order of first appearance.

    token_list: iterable of token sequences, e.g. [['AAAA', 'BBBB'], ['ZZZZ']].
    Returns: dict from token to id.
    """
    vocab = {'<PAD>': 0, '<START>': 1, '<END>': 2}
    for sequence in token_list:
        for tok in sequence:
            # setdefault leaves known tokens untouched; len(vocab) is evaluated
            # before any insertion, so a new token gets the next free id.
            vocab.setdefault(tok, len(vocab))
    return vocab

# Separate vocabularies for the encoder side and the decoder side.
source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
# Reverse lookup (id -> token), used to turn decoded ids back into tokens.
target_token_dict_inv = dict(zip(target_token_dict.values(), target_token_dict.keys()))

# Position of every embedded token: the index of its key in word2vec's
# iteration order.
tensor_dict = {key: row for row, key in enumerate(word2vec)}

# Embedding positions of the target-vocabulary tokens, in target_token_dict
# order. Tokens without an embedding (e.g. the special tokens) are skipped
# through an explicit membership test instead of a bare `except`, which would
# also have hidden unrelated errors.
tensor_list = [tensor_dict[key] for key in target_token_dict if key in tensor_dict]

# Read the 'weights' dataset of the HDF5 file into a NumPy array.
with h5py.File(data_dir + "stocks_emb.gaussian_1_0_0.h5", 'r') as f:
    dist_data = np.array(f.get('weights'))
print(len(dist_data))
# NOTE(review): indexing with the same list on both axes pairs the indices
# element-wise. Assuming 'weights' is a 2-D matrix, this keeps only the entries
# weights[i, i] for i in tensor_list (a 1-D vector), not the
# len(tensor_list) x len(tensor_list) sub-matrix. If the sub-matrix was
# intended, np.ix_(tensor_list, tensor_list) would select it -- confirm before
# changing, because the loss gathers from dist_data and its shapes depend on this.
# NOTE(review): positions in the result follow tensor_list order, whereas the
# loss indexes dist_data with target_token_dict ids (real tokens start at id 3),
# so the two index spaces may not line up -- verify.
dist_data = dist_data[tensor_list, tensor_list]
print(len(dist_data))

# Convert to a TensorFlow tensor so the loss can call tf.gather on it.
dist_data = tf.convert_to_tensor(dist_data)

# No visible effect: a bare expression that is not the last statement of its cell.
len(tensor_list)
# Add special tokens
# Encoder input and decoder input are wrapped in <START> ... <END>. The decoder
# ground truth is the target followed by <END> and one <PAD>, so it has the same
# length as the decoder input but starts one token later.
encode_tokens = [['<START>', *seq, '<END>'] for seq in source_tokens]
decode_tokens = [['<START>', *seq, '<END>'] for seq in target_tokens]
output_tokens = [[*seq, '<END>', '<PAD>'] for seq in target_tokens]

# Padding
# Longest sequence on each side, special tokens included.
source_max_len = max(len(seq) for seq in encode_tokens)
target_max_len = max(len(seq) for seq in decode_tokens)

# Right-pad every sequence with <PAD> up to the maximum length of its side.
encode_tokens = [seq + ['<PAD>'] * (source_max_len - len(seq)) for seq in encode_tokens]
decode_tokens = [seq + ['<PAD>'] * (target_max_len - len(seq)) for seq in decode_tokens]
output_tokens = [seq + ['<PAD>'] * (target_max_len - len(seq)) for seq in output_tokens]

# Replace tokens by their integer ids.
encode_input = [[source_token_dict[tok] for tok in seq] for seq in encode_tokens]
decode_input = [[target_token_dict[tok] for tok in seq] for seq in decode_tokens]
# Each target id is wrapped in a one-element list, which gives the ground truth
# a trailing axis of size 1 (presumably the layout the sparse cross-entropy
# loss expects -- confirm).
decode_output = [[[target_token_dict[tok]] for tok in seq] for seq in output_tokens]

# Build & fit model
# A single vocabulary size is passed for both sides, so it has to cover the
# larger of the two dictionaries.
shared_vocab_size = max(len(source_token_dict), len(target_token_dict))
model = get_model(
    token_num=shared_vocab_size,
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # separate embeddings for encoder and decoder inputs
)

#handcrafted loss term with d2gpo
import keras.backend as K
import tensorflow as tf
from keras.losses import kullback_leibler_divergence, sparse_categorical_crossentropy
import numpy as np
from keras.optimizers import Adam

# Handcrafted D2GPO loss term
def wrapper_loss(topo_dict, lmbda):
    """Build a loss that blends sparse cross-entropy with a KL prior term.

    topo_dict: tensor of prior values, indexed with the true token ids through
        tf.gather. The returned loss closes over this argument; it no longer
        reads the module-level `dist_data`, so the parameter is actually used.
    lmbda: weight of the KL term; the cross-entropy term is weighted 1 - lmbda.

    Returns: a function model_loss(y_true, y_pred) suitable for model.compile.
    """
    def model_loss(y_true, y_pred):
        ce_loss = sparse_categorical_crossentropy(y_true, y_pred)
        # y_true carries token ids; cast them to integers before gathering.
        y_true_idx = tf.cast(y_true, tf.int64)
        # NOTE(review): the gathered prior keeps the shape of y_true_idx (plus
        # any trailing axes of topo_dict) -- confirm that it broadcasts against
        # y_pred the way the KL term is meant to be computed.
        prior = tf.cast(tf.gather(topo_dict, y_true_idx), tf.float32)
        kl_loss = kullback_leibler_divergence(prior, y_pred)
        return (1 - lmbda) * ce_loss + lmbda * kl_loss
    return model_loss


Using TensorFlow backend.


3209
1268


In [0]:
#optimizer and compile
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)
# Pass the configured optimizer instance. The string 'adam' would create a
# fresh default Adam and silently ignore the settings above.
model.compile(optimizer, wrapper_loss(dist_data, 0.95))

In [6]:
#model fitting
# Samples [train_start, train_end) are used for training.
train_start, train_end = 0, 200
train_window = slice(train_start, train_end)
encoder_batch = np.array(encode_input)[train_window]
decoder_batch = np.array(decode_input)[train_window]
target_batch = np.array(decode_output)[train_window]
model.fit(
    x=[encoder_batch, decoder_batch],
    y=target_batch,
    epochs=16,
    batch_size=1,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<keras.callbacks.callbacks.History at 0x7f8055829ac8>

In [0]:
#define the function that calculate our handcrafted metric
def accuracy(i):
    """Share of the tokens decoded for sample ``i`` that occur in its target.

    The source sequence encode_input[i] is decoded with top_k=1. The first and
    last decoded ids are dropped and the rest are mapped back to tokens. The
    score is (number of predicted tokens found in target_tokens[i]) divided by
    (number of predicted tokens). Order and repetition are not taken into
    account, and every predicted token counts in the denominator.
    """
    token_ids = decode(
        model,
        encode_input[i],
        start_token=target_token_dict['<START>'],
        end_token=target_token_dict['<END>'],
        pad_token=target_token_dict['<PAD>'],
        top_k=1,
        temperature=1,
    )
    # An empty prediction is treated as one empty-string token, so the division
    # below is always defined.
    predicted = [target_token_dict_inv[tid] for tid in token_ids[1:-1]] or ['']
    reference = set(target_tokens[i])
    hits = sum(1 for tok in predicted if tok in reference)
    return hits / len(predicted)

In [8]:
# Mean per-sample accuracy over all 250 samples. Indices 0-199 were used for
# training, so this is not a held-out validation score.
import tqdm
per_sample_scores = [accuracy(idx) for idx in tqdm.tqdm(range(250))]
np.mean(per_sample_scores)


#on all 250 data
#0.011 | lmbda = 1.00
#0.112 | lmbda = 0.95
#0.117 | lmbda = 0.80
#0.108 | lmbda = 0.30
#0.094 | lmbda = 0.20
#0.088 | lmbda = 0.10
#0.050 | lmbda = 0.00
#

100%|██████████| 250/250 [00:29<00:00,  8.34it/s]


0.0994

In [9]:
#example of inferences
# Decode sample 3 with top_k=3, then show the decoded tokens without the first
# and last decoded ids.
decoded = decode(
    model,
    encode_input[3],
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
    top_k=3,
    temperature=1,
)
','.join(target_token_dict_inv[token_id] for token_id in decoded[1:-1]).split(',')

['TBLTW',
 'PAVMW',
 'PAVMW',
 'TBLTW',
 'RBZ',
 'OCGN',
 'RBZ',
 'AMRS',
 'AMRS',
 'SNSS',
 'PHUN',
 'MRAM',
 'IPDN',
 'PHUN',
 'SNSS',
 'SNSS',
 'MICT',
 'RBZ',
 'ENLV',
 'RETO',
 'FRSX',
 '<PAD>',
 '<PAD>',
 'TMDI',
 'TBLTW',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 'CFMS',
 'RBZ',
 'RETO',
 'ICON',
 '<PAD>',
 'SES',
 'PHUN',
 'CFMS',
 'IPWR',
 'DGLY',
 'IFMK',
 'SLRX',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 'BOXL',
 'PHUN',
 'CFMS',
 'CFMS',
 'CLDX',
 'ICON',
 'CFMS',
 'YVR',
 'YVR',
 'NAKD',
 'CETXP']