In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math
from os import listdir
from os.path import isfile, join, splitext
import random
from datetime import datetime, timedelta

from process_raw_prices import *

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
tf.enable_eager_execution()

In [3]:
# update batch size
batch_size = 2

# number of batch in training
num_of_batch = 500

# fixed number of time steps in one episode (not used)
trading_period = 200

# 1 is zscore
num_features = 1

# 0 is no position. 1 is long the spread. 2 is short the spread.
a_num = position_num = 3

# RNN hidden state dimension
h_dim = 30

# number of RNN layer
num_layers = 1

# number of layer1 output
layer1_out_num = 30

# learning rate
lr = 3e-3

# discount factor in reinforcement learning
gamma = 1

# random action probability
rand_action_prob = 0.05

batch_per_print = 50

# dummy initial cash
initial_cash = 10000

# processed dataset folder path
dataset_folder_path = '../../dataset/nyse-daily-transformed'
os.makedirs(dataset_folder_path, exist_ok=True)

# raw dataset files pattern
raw_files_path_pattern = "../../dataset/nyse-daily/*.csv"

df_columns = ['close1', 'close2', 'spread', 'logClose1', 'logClose2', 'zscore']
ind = {'y_close': 0, 'x_close': 1, 'spread': 2}

# checkpoint folder
checkpoint_dir = '../../model_checkpoint/'
os.makedirs(checkpoint_dir, exist_ok=True)

In [4]:
# # compute dataset for training
# all_pairs_slices = [splitext(f)[0] for f in listdir(dataset_folder_path) if isfile(join(dataset_folder_path, f))]
# if len(all_pairs_slices) == 0:
#     generate_pairs_training_data(raw_files_path_pattern=raw_files_path_pattern,
#                                  result_path=dataset_folder_path,
#                                  min_size=252*4,
#                                  training_period=52,
#                                  points_per_cut=252
#                                 )
#     all_pairs_slices = [splitext(f)[0] for f in listdir(dataset_folder_path) if isfile(join(dataset_folder_path, f))]
# print("Total number of pair slices: %d" % len(all_pairs_slices))

# # split for training and testing
# all_pairs = list(set(['-'.join(p.split('-')[0:2]) for p in all_pairs_slices]))[:2]
# all_pairs = ["VMW-WUBA"]
all_pairs = ["TWTR-UIS"]
all_pairs_slices_train = []
all_pairs_slices_test = []
for p in all_pairs:
    all_pairs_slices_train += [p+'-0', p+'-1']
    all_pairs_slices_test += [p+'-2', p+'-3']
print("Total number of pair slices for training: %d" % len(all_pairs_slices_train))
print("Total number of pair slices for testing: %d" % len(all_pairs_slices_test))

Total number of pair slices for training: 2
Total number of pair slices for testing: 2


In [5]:
# functions
def get_random_history(batch_size, training):
    """Sample some pairs and get the history of those pairs. The history should have
    three dimension. The first dimension is for time. The second dimension is indexed
    by features name. The third dimension is the index of training instance.
    """
    sample_pair_slices = random.sample(all_pairs_slices_train if training else all_pairs_slices_test, batch_size)
    history = []
    for s in sample_pair_slices:
        df = pd.read_csv(join(dataset_folder_path, s+".csv"))
        df_val = df[df_columns].values
        history.append(df_val)
    
    history = np.array(history)
    return np.transpose(history, (1, 2, 0))

def compute_input_history(history):
    """Slicing history in its second dimension."""
    # no slicing for now
    return history

def sample_action(logits, random=False):
    if random:
        dist = tf.distributions.Categorical(logits=tf.zeros([batch_size, a_num]))
    else:
        dist = tf.distributions.Categorical(logits=logits)
    
    # 1-D Tensor where the i-th element correspond to a sample from
    # the i-th categorical distribution
    return dist.sample()

def long_portfolio_value(q, p):
    return q*p

def short_portfolio_value(q, p, init_p):
    return q*(3.0*init_p/2 - p)

# def discount_rewards(r, all_actions):
#     """
#     r is a numpy array in the shape of (n, batch_size).
#     all_actions is a numpy array in the same shape as r.
    
#     return the discounted and cumulative rewards"""
    
#     result = np.zeros_like(r, dtype=float)
#     n = r.shape[0]
#     sum_ = np.zeros_like(r[0], dtype=float)
#     pre_action = all_actions[n-1]
#     for i in range(n-1,-1,-1):
#         sum_ *= gamma
        
#         # when the previous action(position) not equal to the current one,
#         # set the previous sum of reward to be zero.
#         sum_ = sum_*(all_actions[i]==pre_action) + r[i]
#         result[i] = sum_
        
#         # update pre_action
#         pre_action = all_actions[i]
    
#     return result

def discount_rewards(r, all_actions):
    """
    r is a numpy array in the shape of (n, batch_size).
    all_actions is a numpy array in the same shape as r.
    
    return the discounted and cumulative rewards"""
    
    result = np.zeros_like(r, dtype=float)
    n = r.shape[0]
    sum_ = np.zeros_like(r[0], dtype=float)
    pre_action = all_actions[n-1]
    for i in range(n-1,-1,-1):
        sum_ *= gamma
        sum_ += r[i]
        result[i] = sum_
    
    return result

def loss(all_logits, all_actions, all_advantages):
    neg_log_select_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=all_logits, labels=all_actions)
    
    # 0 axis is the time axis. 1 axis is the batch axis
    return tf.reduce_mean(neg_log_select_prob * all_advantages, 0)

def save_model():
    hkg_time = datetime.now() + timedelta(hours=16)
    checkpoint_name = hkg_time.strftime("%Y%m%d_%H%M%S")
    checkpoint_prefix = os.path.join(checkpoint_dir, checkpoint_name)
    root.save(checkpoint_prefix)
    tf.train.latest_checkpoint(checkpoint_dir)
    
def restore_model(checkpoint_name):
    root.restore(join(checkpoint_dir, checkpoint_name))


myLeakyReLU = tf.keras.layers.LeakyReLU()
myLeakyReLU.__name__ = "myLeakyReLU"

# classes
class TradingPolicyModel(tf.keras.Model):
    def __init__(self):
        super(TradingPolicyModel, self).__init__()
        self.dense1 = tf.layers.Dense(units=layer1_out_num,
                                      activation=myLeakyReLU,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer()
                                     )
        self.dense2 = tf.layers.Dense(units=layer1_out_num,
                                      activation=myLeakyReLU,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer()
                                     )
        self.dense3 = tf.layers.Dense(units=layer1_out_num,
                                      activation=myLeakyReLU,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer()
                                     )
#         self.dense4 = tf.layers.Dense(units=layer1_out_num,
#                                       activation=tf.keras.layers.LeakyReLU(),
#                                       kernel_initializer=tf.contrib.layers.xavier_initializer()
#                                      )
        self.logits = tf.layers.Dense(units=a_num,
                                      activation=myLeakyReLU,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer()
                                     )

    def call(self, inputs):
        # Forward pass
        inputs = self.dense1(inputs)
        inputs = self.dense2(inputs)
        inputs = self.dense3(inputs)
#         inputs = self.dense4(inputs)
        logits = self.logits(inputs)
        return logits


class StateEncodingModel(tf.keras.Model):
    def __init__(self):
        super(StateEncodingModel, self).__init__()
        self.cell_layer = tf.contrib.rnn.LSTMCell(h_dim)
        self.cell = tf.contrib.rnn.MultiRNNCell([self.cell_layer] * num_layers)
        self.state = self.cell.zero_state(batch_size, tf.float32)
    
    def call(self, inputs):
        oberservation, self.state = self.cell(inputs, self.state)
        return oberservation
        
    def reset_state(self):
        self.state = self.cell.zero_state(batch_size, tf.float32)


class TradingEnvironment():
    """Trading environment for reinforcement learning training.
    
    Arguments:
        state_encoding_model: the model that encode past input_history data into a state
        vector which will be fed as input to the policy network.
    """
    def __init__(self, state_encoding_model):
        # do some initialization
        self.state_encoding_model = state_encoding_model
        self._reset_env()
        
    def _reset_env(self, training=True):
        self.t = 0
        self.state_encoding_model.reset_state()

        # 0 is no position. 1 is long the spread. 2 is short the spread
        self.position = np.zeros(batch_size, dtype=int)
        
        # initialize the cash each agent has
        self.total_portfolio_value = np.ones(batch_size)*initial_cash
        
        # only useful when there is a postion on the spread
        self.quantity = {'x': np.zeros(batch_size), 'y': np.zeros(batch_size)}
        
        # for compute current portfolio value of the short side
        self.short_side_init_price = np.zeros(batch_size)

        # prepare a batch of history and input_history
        self.history = get_random_history(batch_size, training)
        self.input_history = compute_input_history(self.history)
        
        # create or update self.state variable
        self.update_state()
    
    def reset(self, training=True):
        """Return an initial state for the trading environment"""
        if self.t == 0 and True:
            return self.state
        else:
            self._reset_env(training=training)
            return self.state
    
    def compute_reward(self, action):
        """Compute the reward at time t which is the change in total portfolio value
        from time t to t+1. It also update the position for time t+1. Exit trade when
        the short side portfolio value <= 0."""
        
        r = np.zeros_like(action, dtype=float)
        cur_his = self.history[self.t]
        nex_his = self.history[self.t+1]
        
        # compute for each training instance in a batch
        for i, a in enumerate(action):
            y_p = cur_his[ind["y_close"], i]
            x_p = cur_his[ind["x_close"], i]
            nex_y_p = nex_his[ind["y_close"], i]
            nex_x_p = nex_his[ind["x_close"], i]
            
            
            if a == 0: # take no position on the spread
                # no change in portfolio value
                r[i] = 0
                self.position[i] = 0
                self.quantity['y'][i] = 0.0
                self.quantity['x'][i] = 0.0
            elif a == 1: # long the spread: long Y and short X
                # quantity of each stock will change when the current position is not previous position
                if self.position[i] == 0 or self.position[i] == 2:
                    # compute quantity from cash
                    self.quantity['y'][i] = 2.0*self.total_portfolio_value[i]/3.0/y_p
                    self.quantity['x'][i] = 2.0*self.total_portfolio_value[i]/3.0/x_p
                    self.short_side_init_price[i] = x_p

                lpv = long_portfolio_value(self.quantity['y'][i], nex_y_p)
                spv = short_portfolio_value(self.quantity['x'][i], nex_x_p, self.short_side_init_price[i])
                
                # the zero here can be changed to other positive threshold ...
                if spv <= 0:
                    # we loss all the money in the short side
                    nex_portfolio_value = lpv

                    # forced to take position 0
                    self.position[i] = 0
                else:
                    nex_portfolio_value = lpv + spv
                    self.position[i] = 1
                
                r[i] = nex_portfolio_value - self.total_portfolio_value[i]
                self.total_portfolio_value[i] = nex_portfolio_value
            elif a == 2: # short the spread: short Y and long X
                # quantity will change when the current position is not previous position
                if self.position[i] == 0 or self.position[i] == 1:
                    # compute quantity from cash
                    self.quantity['y'][i] = 2.0*self.total_portfolio_value[i]/3.0/y_p
                    self.quantity['x'][i] = 2.0*self.total_portfolio_value[i]/3.0/x_p
                    self.short_side_init_price[i] = y_p

                lpv = long_portfolio_value(self.quantity['x'][i], nex_x_p)
                spv = short_portfolio_value(self.quantity['y'][i], nex_y_p, self.short_side_init_price[i])
                
                if spv <= 0:
                    # we loss all the money in the short side
                    nex_portfolio_value = lpv

                    # forced to take position 0
                    self.position[i] = 0
                else:
                    nex_portfolio_value = lpv + spv
                    self.position[i] = 2
                
                r[i] = nex_portfolio_value - self.total_portfolio_value[i]
                self.total_portfolio_value[i] = nex_portfolio_value
        return r
    
    def update_state(self):
#         # concate next_input_history and next position to form next partial state
#         partial_state = tf.concat([self.input_history[self.t].T, tf.one_hot(self.position, position_num)], 1)
        
#         # update state
#         self.state = self.state_encoding_model(partial_state)

        partial_state = self.input_history[self.t].T
        self.state = tf.concat([
            partial_state,
            np.array([self.total_portfolio_value,
                      self.quantity['y'],
                      self.quantity['x']]).T,
            tf.one_hot(self.position, position_num)
        ], 1)
    
    def step(self, action):
        """Given the current state and action, return the reward, next state and done.
        This function should be called after reset.
        
        reward is of type numpy array. state is of type tensor. done is of type boolean.
        
        
        Arguments:
            action: a numpy array containing the current action for each training pair.

        Note that we follow the convention where the trajectory is indexed as s_0, a_0, r_0,
        s_1, ... . Therefore t is updated just after computing the reward is computed and
        before computing next state.
        """
        # r_t
        r = self.compute_reward(action) # also update the position for time t+1

        # t = t+1
        self.t += 1
        
        # compute s_(t+1)
        self.update_state()

        return r, self.state, (self.t+1) == trading_period

In [6]:
# create objects
pi = TradingPolicyModel()
state_encoding_model = StateEncodingModel()
env = TradingEnvironment(state_encoding_model)
optimizer = tf.train.AdamOptimizer(learning_rate=lr)

# create checkpoint object
root = tf.train.Checkpoint(pi=pi, state_encoding_model=state_encoding_model, optimizer=optimizer)

In [7]:
# for training reference only
average_total_r = np.zeros(batch_size)

for batch in range(num_of_batch):
    
    # saving for update
    all_logits = []
    all_actions = []
    all_rewards = []
    with tf.GradientTape() as gt:
        done = False
        s = env.reset(training=True)

        # internally the episode length is fixed by trading_period
        while not done:
            logits = pi(s)
            a = sample_action(logits, random=np.random.rand() <= rand_action_prob)
            r, next_s, done = env.step(a.numpy())

            # save the episode
            all_logits.append(logits)
            all_actions.append(a)
            all_rewards.append(r)
            
            average_total_r += r
            
            # debugging
#             print(env.t)
#             print(env.t+1==200)
#             print(r[0])
#             print(env.total_portfolio_value[0])
#             print(done)

        all_logits_stack = tf.stack(all_logits)
        all_actions_stack = tf.stack(all_actions)
        all_rewards_stack = np.array(all_rewards)
        
        # compute cummulative rewards for each action
        all_cum_rewards = discount_rewards(all_rewards_stack, all_actions_stack.numpy())
        all_cum_rewards -= np.mean(all_cum_rewards)
        all_cum_rewards /= np.std(all_cum_rewards)
        all_cum_rewards = tf.convert_to_tensor(all_cum_rewards, dtype=tf.float32)

        loss_value = loss(all_logits_stack, all_actions_stack, all_cum_rewards)
    
    if (batch+1) % batch_per_print == 0:
        print("batch id: {}, average_total_r_per_ep: {}".format(batch, np.mean(average_total_r/batch_per_print)))
        average_total_r = np.zeros(batch_size)
    
    grads = gt.gradient(loss_value, state_encoding_model.variables + pi.variables)
    optimizer.apply_gradients(zip(grads, state_encoding_model.variables + pi.variables))

batch id: 49, average_total_r_per_ep: -136.60943314153099
batch id: 99, average_total_r_per_ep: 84.78978871643503
batch id: 149, average_total_r_per_ep: 94.96325589298998
batch id: 199, average_total_r_per_ep: 34.30157620589861
batch id: 249, average_total_r_per_ep: -129.67120068977732
batch id: 299, average_total_r_per_ep: 13745.563335705961
batch id: 349, average_total_r_per_ep: 19227.10060887947
batch id: 399, average_total_r_per_ep: 18229.183469552598
batch id: 449, average_total_r_per_ep: 18583.84062468518
batch id: 499, average_total_r_per_ep: 19358.416424155956


In [22]:
# test time
average_total_r = np.zeros(batch_size)
done = False
s = env.reset()

# internally the episode length is fixed by trading_period
while not done:
    logits = pi(s)
    a = sample_action(logits)
    r, next_s, done = env.step(a.numpy())
    
#     print(r[0])
#     print(env.total_portfolio_value[0])
#     print(done)

    average_total_r += r

print("At test time, average_total_r_per_ep: {}".format(np.mean(average_total_r)))

65.58472873919163
10065.584728739192
False
231.73679497384728
10297.321523713039
False
-11.104921338475833
10286.216602374563
False
-132.14359293915368
10154.07300943541
False
-19.123817203524595
10134.949192231885
False
-511.3982681888883
9623.550924042996
False
-96.72462945416919
9526.826294588827
False
166.3945969855322
9693.22089157436
False
-990.1806086711549
8703.040282903205
False
35.5760066044586
8738.616289507663
False
210.21152209094907
8948.827811598612
False
-166.60123853521873
8782.226573063394
False
-31.921103855082038
8750.305469208311
False
-92.15600101744894
8658.149468190863
False
146.78114853725128
8804.930616728114
False
64.80608911345553
8869.73670584157
False
-424.67758637966654
8445.059119461903
False
-84.50903765040675
8360.550081811496
False
167.19062392612796
8527.740705737624
False
90.21140818174354
8617.952113919368
False
75.36665374822587
8693.318767667593
False
-2.421406508605287
8690.897361158988
False
220.12284884711698
8911.020210006105
False
-1412.3090