In [1]:
# Colab-only step: opens the browser upload widget. The captured output below
# shows the uploaded file being saved as 'aapl_msi_sbux.csv', which is the
# relative path get_data() reads later.
from google.colab import files
files.upload()

Saving aapl_msi_sbux.csv to aapl_msi_sbux.csv


{'aapl_msi_sbux.csv': b'AAPL,MSI,SBUX\n67.8542,60.3,28.185\n68.5614,60.9,28.07\n66.8428,60.83,28.13\n66.7156,60.81,27.915\n66.6556,61.12,27.775\n65.7371,61.43,27.17\n65.7128,62.03,27.225\n64.1214,61.26,26.655\n63.7228,60.88,26.675\n64.4014,61.9,27.085\n63.2571,60.28,26.605\n64.1385,60.63,26.64\n63.5099,62.09,27.285\n63.0571,62.21,27.425\n61.4957,62.03,27.435\n60.0071,62.5,27.85\n61.5919,62.97,28.255\n60.8088,63.11,28.55\n61.5117,62.64,29.125\n61.6742,62.75,29.335\n62.5528,62.56,29.305\n61.2042,62.13,29.14\n61.1928,62.22,29.2925\n61.7857,62.34,28.84\n63.3799,62.07,28.83\n65.1028,61.64,28.465\n64.9271,61.67,28.415\n64.5828,62.4,28.715\n64.6756,62.43,28.525\n65.9871,63.61,28.69\n66.2256,63.29,28.345\n65.8765,63.46,28.525\n64.5828,63.56,28.455\n63.2371,64.03,28.475\n61.2728,63.7,28.435\n61.3988,63.7,29.13\n61.7128,62.8,28.85\n61.1028,62.99,29.055\n60.4571,62.67,28.9\n60.8871,63.17,29.06\n60.9971,63.64,28.705\n62.2414,64.69,28.9\n62.0471,64.63,29.2875\n61.3999,63.87,29.545\n59.9785,61.83,28

In [None]:
import numpy as np
import pandas as pd

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

from datetime import datetime
import itertools
import argparse
import re
import os
import pickle

from sklearn.preprocessing import StandardScaler


def get_data():
  """Read 'aapl_msi_sbux.csv' (relative to the working directory) into an array.

  Returns:
    The `.values` of the parsed DataFrame: one row per CSV data row and one
    column per CSV column (the header row becomes the column labels and is
    not part of the returned array).
  """
  return pd.read_csv('aapl_msi_sbux.csv').values



class ReplayBuffer:
  """Fixed-capacity circular store of (obs, action, reward, next_obs, done) transitions.

  Once `size` transitions have been stored, each new one overwrites the oldest.
  """

  def __init__(self, obs_dim, act_dim, size):
    """Allocate the backing arrays.

    Args:
      obs_dim: length of one observation vector.
      act_dim: accepted for interface compatibility; not used for allocation
        because actions are stored as a single discrete index per transition.
      size: maximum number of transitions kept.
    """
    self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
    # int32 rather than uint8: a uint8 slot silently wraps any action index
    # >= 256 (e.g. 3**6 = 729 actions for six stocks), which would make the
    # agent train the wrong Q-value column.
    self.acts_buf = np.zeros(size, dtype=np.int32)
    self.rews_buf = np.zeros(size, dtype=np.float32)
    self.done_buf = np.zeros(size, dtype=np.uint8)
    # ptr: next slot to write; size: number of valid slots; max_size: capacity
    self.ptr, self.size, self.max_size = 0, 0, size

  def store(self, obs, act, rew, next_obs, done):
    """Write one transition at the current slot and advance the write pointer."""
    self.obs1_buf[self.ptr] = obs
    self.obs2_buf[self.ptr] = next_obs
    self.acts_buf[self.ptr] = act
    self.rews_buf[self.ptr] = rew
    self.done_buf[self.ptr] = done
    # wrap around so the oldest entry is overwritten once the buffer is full
    self.ptr = (self.ptr+1) % self.max_size
    self.size = min(self.size+1, self.max_size)

  def sample_batch(self, batch_size=32):
    """Sample `batch_size` stored transitions uniformly at random, with replacement.

    Returns:
      dict with keys 's' (observations), 's2' (next observations),
      'a' (action indices), 'r' (rewards) and 'd' (done flags as 0/1).

    Raises:
      ValueError: from np.random.randint when the buffer is empty.
    """
    idxs = np.random.randint(0, self.size, size=batch_size)
    return dict(s=self.obs1_buf[idxs],
                s2=self.obs2_buf[idxs],
                a=self.acts_buf[idxs],
                r=self.rews_buf[idxs],
                d=self.done_buf[idxs])





def get_scaler(env):
  """Fit a scikit-learn StandardScaler on states visited by a random policy.

  The environment is stepped from whatever state it is currently in (it is
  not reset here) with uniformly random actions, for at most `env.n_step`
  steps or until the environment reports `done`. The visited states are
  used to fit the scaler, which is returned.

  Note: you could also populate the replay buffer here.
  """
  visited = []
  remaining = env.n_step
  finished = False
  while remaining > 0 and not finished:
    random_action = np.random.choice(env.action_space)
    observation, _, finished, _ = env.step(random_action)
    visited.append(observation)
    remaining -= 1

  # fit() returns the fitted scaler itself
  return StandardScaler().fit(visited)




def maybe_make_dir(directory):
  """Create `directory` (and any missing parents) unless the path already exists."""
  if os.path.exists(directory):
    return
  os.makedirs(directory)




def mlp(input_dim, n_action, n_hidden_layers=1, hidden_dim=32):
  """Build and compile a multi-layer perceptron.

  Args:
    input_dim: length of the input vector.
    n_action: number of output units (one per action); no output activation.
    n_hidden_layers: number of ReLU hidden layers.
    hidden_dim: number of units in each hidden layer.

  Returns:
    A Keras Model compiled with MSE loss and the 'adam' optimizer.
  """

  # input layer
  inputs = Input(shape=(input_dim,))
  x = inputs

  # hidden layers
  for _ in range(n_hidden_layers):
    x = Dense(hidden_dim, activation='relu')(x)

  # final layer
  x = Dense(n_action)(x)

  # make the model
  model = Model(inputs, x)

  model.compile(loss='mse', optimizer='adam')
  # summary() prints the table itself and returns None, so wrapping it in
  # print() only added a stray "None" line to the output.
  model.summary()
  return model




class MultiStockEnv:
  """
  A multi-stock trading environment driven by a price-history array.

  State: a vector of length 2 * n_stock + 1 laid out as
    - number of shares owned of each stock
    - current price of each stock
    - cash in hand

  Action: an integer index into `action_list`; each entry of `action_list`
  holds one code per stock:
    - 0 = sell
    - 1 = hold
    - 2 = buy
  """
  def __init__(self, data, initial_investment=20000):
    # price history: one row per time step, one column per stock
    self.stock_price_history = data
    self.n_step, self.n_stock = self.stock_price_history.shape

    # episode state, filled in by reset()
    self.initial_investment = initial_investment
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None

    # one action index per combination of per-stock codes
    self.action_space = np.arange(3**self.n_stock)
    self.action_list = [
      list(combo) for combo in itertools.product([0, 1, 2], repeat=self.n_stock)
    ]

    self.state_dim = self.n_stock * 2 + 1

    self.reset()


  def reset(self):
    """Return to the first time step with no shares and the initial cash; returns the state."""
    self.cur_step = 0
    self.stock_owned = np.zeros(self.n_stock)
    self.stock_price = self.stock_price_history[self.cur_step]
    self.cash_in_hand = self.initial_investment
    return self._get_obs()


  def step(self, action):
    """Advance one time step, then execute `action` at the new prices.

    Returns:
      (state, reward, done, info) where reward is the change in portfolio
      value, done is True on the last row of the price history, and info
      is {'cur_val': <portfolio value after the trade>}.
    """
    assert action in self.action_space

    # value at the old prices, before anything changes
    value_before = self._get_val()

    # move to the next day's prices, then trade at them
    self.cur_step += 1
    self.stock_price = self.stock_price_history[self.cur_step]
    self._trade(action)

    value_after = self._get_val()
    finished = self.cur_step == self.n_step - 1

    return self._get_obs(), value_after - value_before, finished, {'cur_val': value_after}


  def _get_obs(self):
    """Assemble the state vector: [shares owned..., prices..., cash]."""
    k = self.n_stock
    state = np.empty(self.state_dim)
    state[0:k] = self.stock_owned
    state[k:k + k] = self.stock_price
    state[2 * k] = self.cash_in_hand  # last slot, since state_dim == 2*k + 1
    return state



  def _get_val(self):
    """Portfolio value: shares valued at current prices plus cash."""
    holdings_value = self.stock_owned.dot(self.stock_price)
    return holdings_value + self.cash_in_hand


  def _trade(self, action):
    """Apply the per-stock sell/hold/buy codes for `action` to the holdings."""
    codes = self.action_list[action]

    to_sell = [idx for idx, code in enumerate(codes) if code == 0]
    to_buy = [idx for idx, code in enumerate(codes) if code == 2]

    # NOTE: to simplify the problem, a sell liquidates ALL shares of that stock
    for idx in to_sell:
      self.cash_in_hand += self.stock_price[idx] * self.stock_owned[idx]
      self.stock_owned[idx] = 0

    # NOTE: buying goes round-robin over the requested stocks, one share at a
    #       time; the rounds stop after the first round in which some stock
    #       could not be afforded
    keep_buying = len(to_buy) > 0
    while keep_buying:
      for idx in to_buy:
        if self.cash_in_hand > self.stock_price[idx]:
          self.stock_owned[idx] += 1 # buy one share
          self.cash_in_hand -= self.stock_price[idx]
        else:
          keep_buying = False





class DQNAgent(object):
  """Epsilon-greedy DQN agent with a replay buffer and a separate target network."""

  def __init__(self, state_size, action_size):
    self.state_size = state_size
    self.action_size = action_size
    self.memory = ReplayBuffer(state_size, action_size, size=500)
    self.gamma = 0.95  # discount rate
    self.epsilon = 1.0  # exploration rate
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    # online network (trained) and target network (used for bootstrap targets)
    self.model = mlp(state_size, action_size)
    self.target_model = mlp(state_size, action_size)
    self.target_model.set_weights(self.model.get_weights())


  def update_target_network(self):
    """Copy the online network's weights into the target network."""
    self.target_model.set_weights(self.model.get_weights())
    print("Updated")


  def update_replay_memory(self, state, action, reward, next_state, done):
    """Store one transition in the replay buffer."""
    self.memory.store(state, action, reward, next_state, done)


  def act(self, state):
    """Pick an action: random with probability epsilon, otherwise greedy.

    `state` is passed to the model as-is, so it must already be a batch of
    one (the caller in this file passes a 1 x state_size array).
    """
    if np.random.rand() <= self.epsilon:
      return np.random.choice(self.action_size)
    # verbose=0: predict() runs once per environment step; its per-call
    # progress output otherwise floods the cell output.
    act_values = self.model.predict(state, verbose=0)
    return np.argmax(act_values[0])  # returns action


  def replay(self, batch_size=32):
    """Run one training step on a sampled minibatch, then decay epsilon.

    Does nothing until the replay buffer holds at least `batch_size`
    transitions.
    """
    # first check if replay buffer contains enough data
    if self.memory.size < batch_size:
      return

    # sample a batch of data from the replay memory
    minibatch = self.memory.sample_batch(batch_size)
    states = minibatch['s']
    actions = minibatch['a']
    rewards = minibatch['r']
    next_states = minibatch['s2']
    done = minibatch['d']

    # Tentative target r + gamma * max_a' Q_target(s', a'); the bootstrap
    # term is zeroed for terminal transitions.
    next_q = self.target_model.predict(next_states, verbose=0)
    target = rewards + (1 - done) * self.gamma * np.amax(next_q, axis=1)

    # With the Keras API, the target (usually) must have the same
    # shape as the predictions.
    # However, we only need to update the network for the actions
    # which were actually taken.
    # We can accomplish this by setting the target to be equal to
    # the prediction for all values.
    # Then, only change the targets for the actions taken.
    # Q(s,a)
    target_full = self.model.predict(states, verbose=0)
    target_full[np.arange(batch_size), actions] = target

    # Run one training step
    self.model.train_on_batch(states, target_full)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay


  def load(self, name):
    """Load online-network weights from the file `name`."""
    self.model.load_weights(name)


  def save(self, name):
    """Save online-network weights to the file `name`."""
    self.model.save_weights(name)


def play_one_episode(agent, env, is_train):
  """Run one full episode and return the final portfolio value.

  Relies on the module-level `scaler` (state normalisation) and
  `batch_size` (minibatch size for replay). When `is_train` equals the
  string 'train', every transition is stored and one replay step is run
  after each environment step; for any other value the agent only acts.

  Returns:
    info['cur_val'] from the last environment step.
  """
  learning = is_train == 'train'

  # scaler.transform takes a batch, so states are wrapped and stay 1xD
  current = scaler.transform([env.reset()])

  while True:
    chosen = agent.act(current)
    raw_next, reward, done, info = env.step(chosen)
    upcoming = scaler.transform([raw_next])
    if learning:
      agent.update_replay_memory(current, chosen, reward, upcoming, done)
      agent.replay(batch_size)
    current = upcoming
    if done:
      break

  return info['cur_val']





# config
# NOTE: batch_size (and scaler, defined below) are read as module-level
# globals inside play_one_episode.
models_folder = 'rl_trader_models'
rewards_folder = 'rl_trader_rewards'
num_episodes = 100
batch_size = 32
initial_investment = 20000




# output folders for the saved weights/scaler and the per-episode values
maybe_make_dir(models_folder)
maybe_make_dir(rewards_folder)

data = get_data()
n_timesteps, n_stocks = data.shape

# first half of the rows is used for training, the remainder for testing
n_train = n_timesteps // 2

train_data = data[:n_train]
test_data = data[n_train:]

env = MultiStockEnv(train_data, initial_investment)
state_size = env.state_dim
action_size = len(env.action_space)
agent = DQNAgent(state_size, action_size)
# get_scaler steps the env with random actions; play_one_episode resets the
# env at the start of every episode, so the leftover env state is not reused.
scaler = get_scaler(env)

# store the final value of the portfolio (end of episode)
portfolio_value = []



# play the game num_episodes times
# Training the Q-network
for e in range(num_episodes):
  t0 = datetime.now()
  val = play_one_episode(agent, env, 'train')
  dt = datetime.now() - t0
  print(f"Episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
  # sync the target network with the online network every 20 episodes
  if (e+1)%20==0:
    agent.update_target_network()

  portfolio_value.append(val) # append episode end portfolio value



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode: 13/100, episode end value: 34386.88, duration: 0:02:52.109747
Episode: 14/100, episode end value: 43761.47, duration: 0:02:51.486855
Episode: 15/100, episode end value: 54956.43, duration: 0:02:53.054026

In [None]:
 # save the weights when we are done

    # save the DQN
# Persist the training artifacts: network weights, the fitted scaler and the
# final portfolio value of every training episode.
mode = 'train'
agent.save(f'{models_folder}/dqn.h5')

# save the scaler
with open(f'{models_folder}/scaler.pkl', 'wb') as f:
  pickle.dump(scaler, f)

# save portfolio value for each episode
# (moved out of the `with` block above: it does not use the scaler file handle)
np.save(f'{rewards_folder}/{mode}.npy', portfolio_value)

In [None]:

# Testing
mode = 'test'

# load the scaler fitted during training
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook's own save cell.
with open(f'{models_folder}/scaler.pkl', 'rb') as f:
  scaler = pickle.load(f)

# remake the env with test data
env = MultiStockEnv(test_data, initial_investment)

# make sure epsilon is not 1!
# no need to run multiple episodes if epsilon = 0, it's deterministic
agent.epsilon = 0.01

# load trained weights
agent.load(f'{models_folder}/dqn.h5')

# start a fresh list so test results are not mixed with the training values
# (those were already written to train.npy by the save cell)
portfolio_value = []

for e in range(num_episodes):
  t0 = datetime.now()
  val = play_one_episode(agent, env, 'test')
  dt = datetime.now() - t0
  print(f"episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
  portfolio_value.append(val) # append episode end portfolio value

# save portfolio value for each test episode, mirroring the training cell
np.save(f'{rewards_folder}/{mode}.npy', portfolio_value)
