# Episode 53. Reinforcement Learning: Robo Trader

In [None]:
!pip install yfinance
import yfinance as yf

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import math
from scipy.stats import norm
#import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

from tensorflow.keras.layers import Input, Dense, Lambda, Reshape
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dropout, BatchNormalization, LeakyReLU
from tensorflow.keras.layers import Reshape, Flatten, Conv2D, Conv2DTranspose
from tensorflow.keras import backend as K
from tensorflow.keras import metrics
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.initializers import RandomNormal
from tensorflow.keras.constraints import Constraint
#from tensorflow.keras.layers.merge import concatenate
from tensorflow.keras.models import load_model


# changed to yfinance
import yfinance as yf
import random
import pandas_datareader.data as web

## Yahoo Finance API

In [None]:
def get_prices(share_symbol, start_date, end_date,
               cache_filename='stock_prices.npy', force=False):
    """Return the opening prices of a symbol as a float NumPy array.

    Prices are read from the on-disk cache when it exists; otherwise they
    are downloaded with ``yf.download`` and written to the cache.

    Note that the cache is identified by `cache_filename` only: it is not
    keyed on the symbol or the date range, so pass ``force=True`` (or a
    different file name) when any of those change.

    Args:
        share_symbol: ticker symbol passed to ``yf.download``.
        start_date: start of the download range.
        end_date: end of the download range.
        cache_filename: file used to cache the price array.
        force: when true, skip the cache and download again.

    Returns:
        NumPy array of opening prices with dtype float, ordered by date.
    """
    import os  # local import: only needed to normalise the cache path

    # np.save appends '.npy' to path names that lack it, so always read from
    # the name it will write to; otherwise such a cache would never be hit.
    cache_path = cache_filename
    if isinstance(cache_filename, (str, os.PathLike)):
        cache_path = os.fspath(cache_filename)
        if isinstance(cache_path, str) and not cache_path.endswith('.npy'):
            cache_path += '.npy'

    if not force:
        try:
            return np.load(cache_path).astype(float)
        except IOError:
            pass  # no readable cache: fall through to a fresh download

    stock_hist = yf.download(share_symbol, start=start_date, end=end_date)
    stock_hist = stock_hist.sort_values(by=['Date'])
    # take the whole 'Open' column at once instead of looping over rows
    stock_prices = np.asarray(stock_hist['Open'], dtype=float)
    np.save(cache_path, stock_prices)
    return stock_prices

In [None]:
def plot_prices(prices):
    """Draw `prices` as a line chart with fixed title and axis labels, then show it."""
    plt.plot(prices)
    plt.ylabel('price ($)')
    plt.xlabel('day')
    plt.title('Opening stock prices')
    plt.show()

## Deep Q-Network

In [None]:
# Sizes of the state, the action set and the Q-network layers
hist = 30       # number of past prices included in each state
Na = 3          # number of actions: 0=hold, 1=sell, 2=buy
Ns = hist + 2   # state size: budget, shares, and `hist` past prices

# layer widths of the Q-network
input_dim = hist + 2  # budget, shares, and `hist` past prices
intermediate_dim1 = intermediate_dim2 = intermediate_dim3 = 10
output_dim = Na       # one Q-value per action (0=hold, 1=sell, 2=buy)

In [None]:
# Deep Q-network
# input: state vector of length `input_dim`
# output: one Q-value per action (0=hold, 1=sell, 2=buy)
def build_dqn():
    """Build and compile the Q-network.

    The model is a stack of two Dense layers ("hidden1", "hidden2"), each
    followed by a LeakyReLU with slope 0.1, and a linear Dense output layer
    with `output_dim` units. Layer widths come from the module-level
    `input_dim`, `intermediate_dim1`, `intermediate_dim2` and `output_dim`.

    Returns:
        The Keras model, compiled with mean-squared-error loss and the
        'adam' optimizer.
    """
    inputs = Input(shape=(input_dim,), name="dqn_input")

    # hidden layers: Dense followed by a leaky ReLU activation
    x = inputs
    for layer_name, width in (("hidden1", intermediate_dim1),
                              ("hidden2", intermediate_dim2)):
        x = Dense(width, name=layer_name)(x)
        x = LeakyReLU(alpha=0.1)(x)

    # linear output: unbounded Q-value estimates
    q_values = Dense(output_dim, activation='linear', name="dqn_output")(x)

    model = Model(inputs, q_values, name="DQN")
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [None]:
# agent's action choice (0=hold,1=sell,2=buy)
def chooseaction(q,explore):
    """Pick an action from Q-values `q` with probability-`explore` exploration.

    Returns the index of the largest Q-value, except that with probability
    `explore` a different action is drawn at random from the remaining
    indices in ``range(Na)``.
    """
    greedy = np.argmax(q)
    # exploitation: keep the greedy action
    if not (np.random.rand(1) < explore):
        return greedy
    # exploration: redraw until the pick differs from the greedy action
    pick = greedy
    while pick == greedy:
        pick = np.random.randint(0,Na)
    return pick

## Simulation: multiple episodes

In [None]:
# run simulation for one episode
def run_episode(dqn, initial_budget, initial_num_stocks, prices, hist):
    """Trade through `prices` once while training `dqn` with Q-learning.

    At each step the state is ``[budget, num_stocks, hist past prices]``.
    An action is chosen with `chooseaction`, one share is bought or sold at
    the next price when that is affordable/possible, and the network is
    fitted towards ``r + gamma * max Q(next_state)`` for the chosen action.
    Uses the module-level `explore`, `gamma` and `chooseaction`.

    Args:
        dqn: model exposing ``predict(state)`` and ``fit(state, target, ...)``.
        initial_budget: cash available at the start of the episode.
        initial_num_stocks: shares held at the start of the episode.
        prices: one-dimensional sequence of prices (array, list, Series or
            single-column DataFrame).
        hist: number of past prices included in each state.

    Returns:
        Final portfolio value: cash plus shares valued at the last price used.

    Raises:
        ValueError: if `prices` is not one-dimensional after squeezing.
    """
    # Work on a flat float array so the integer indexing below is always
    # positional; indexing a date-indexed pandas Series with an integer
    # relies on a deprecated fallback, and a one-column DataFrame fails.
    prices = np.atleast_1d(np.squeeze(np.asarray(prices, dtype=float)))
    if prices.ndim != 1:
        raise ValueError('prices must be one-dimensional')

    budget = initial_budget
    num_stocks = initial_num_stocks
    share_value = 0
    num_steps = len(prices) - hist - 1
    for i in range(num_steps):
        if i % 1000 == 0:
            print('progress {:.2f}%'.format(float(100*i) / num_steps))
        # current state: budget, shares, and the last `hist` prices
        current_state = np.asmatrix(np.hstack((budget, num_stocks, prices[i:i+hist])))
        # valued at the previous step's price (0 on the first step)
        current_portfolio = budget + num_stocks * share_value

        # Q-values for the three actions (0=hold, 1=sell, 2=buy)
        Q1 = dqn.predict(current_state)
        # choose action (epsilon-greedy)
        a = chooseaction(Q1,explore)
        # price at which this step's trade is executed
        share_value = prices[i+hist]

        # update budget and num_stocks based on action choice
        # NOTE(review): an infeasible buy/sell leaves the position unchanged,
        # but the Q update below is still applied to the chosen action `a`
        # rather than to hold - confirm that is intended.
        if a == 2 and budget >= share_value: # buy
            budget -= share_value
            num_stocks += 1
        elif a == 1 and num_stocks > 0: # sell
            budget += share_value
            num_stocks -= 1

        # next state
        next_state = np.asmatrix(np.hstack((budget, num_stocks, prices[i+1:i+hist+1])))
        next_portfolio = budget + num_stocks * share_value

        # reward: change in portfolio value over this step
        r = next_portfolio-current_portfolio

        # get max of Q(s(t+1))
        Q2 = dqn.predict(next_state)
        maxQ2 = np.max(Q2)
        # target equals the current prediction except for the chosen action
        targetQ1 = Q1
        targetQ1[0,a] = r+gamma*maxQ2
        # one Q-learning update of the network
        dqn.fit(current_state,targetQ1,verbose=0)
    portfolio = budget + num_stocks * share_value
    return portfolio

In [None]:
def test_episode(dqn, initial_budget, initial_num_stocks, prices, hist):
    """Trade through `prices` once with `dqn`, without training it.

    At each step the state is ``[budget, num_stocks, hist past prices]``.
    An action is chosen with `chooseaction`, and one share is bought or
    sold at the next price when that is affordable/possible.
    Uses the module-level `explore` and `chooseaction`; actions are
    therefore still chosen with the `explore` exploration rate, so the
    evaluation is not purely greedy.

    Args:
        dqn: model exposing ``predict(state)``.
        initial_budget: cash available at the start of the episode.
        initial_num_stocks: shares held at the start of the episode.
        prices: one-dimensional sequence of prices (array, list, Series or
            single-column DataFrame).
        hist: number of past prices included in each state.

    Returns:
        Final portfolio value: cash plus shares valued at the last price used.

    Raises:
        ValueError: if `prices` is not one-dimensional after squeezing.
    """
    # Work on a flat float array so the integer indexing below is always
    # positional; indexing a date-indexed pandas Series with an integer
    # relies on a deprecated fallback, and a one-column DataFrame fails.
    prices = np.atleast_1d(np.squeeze(np.asarray(prices, dtype=float)))
    if prices.ndim != 1:
        raise ValueError('prices must be one-dimensional')

    budget = initial_budget
    num_stocks = initial_num_stocks
    share_value = 0
    num_steps = len(prices) - hist - 1
    for i in range(num_steps):
        if i % 1000 == 0:
            print('progress {:.2f}%'.format(float(100*i) / num_steps))
        # current state: budget, shares, and the last `hist` prices
        current_state = np.asmatrix(np.hstack((budget, num_stocks, prices[i:i+hist])))

        # Q-values for the three actions (0=hold, 1=sell, 2=buy)
        Q1 = dqn.predict(current_state)
        # choose action
        a = chooseaction(Q1,explore)
        # price at which this step's trade is executed
        share_value = float(prices[i + hist])

        # update budget and num_stocks based on action choice;
        # an infeasible buy/sell leaves the position unchanged
        if a == 2 and budget >= share_value: # buy
            budget -= share_value
            num_stocks += 1
        elif a == 1 and num_stocks > 0: # sell
            budget += share_value
            num_stocks -= 1

    portfolio = budget + num_stocks * share_value
    return portfolio

## Set initial parameters

In [None]:
# exploration rate for epsilon-greedy search: needs to be sufficiently large (>0.2)
explore = 0.3
# discount rate: option value becomes more distinct for a large discount
gamma = 0.99
# learning rate: slow learning (<0.005) makes the Q-curve smoother
alpha = 0.01

## Build Deep Q-network

In [None]:
# create the compiled Q-network used by the training and test cells below
dqn = build_dqn()

## Run simulation (July, 1992 ~ July, 2016) 

In [None]:
# Training data: daily '^GSPC' closing prices for the in-sample period
stock_hist = yf.download('^GSPC', start='1992-07-22', end='2016-07-22')
close = stock_hist['Close']
plot_prices(close)
# Flat float array: the episode functions index prices by integer position,
# and the 'Close' selection may be a Series or a one-column DataFrame.
prices = np.asarray(close, dtype=float).ravel()
# labels indexed by action code: 0=hold, 1=sell, 2=buy
actions = ['Hold', 'Sell', 'Buy']
budget = 10000
num_stocks = 0

num_episodes = 20
# portfolio value after each episode; entry 0 is the starting budget
final_portfolios = [budget]
for i in range(num_episodes):
    # reset budget & num_shares for each episode
    budget = 10000
    num_stocks = 0
    # run one training episode (the network keeps learning across episodes)
    final_portfolio = run_episode(dqn, budget, num_stocks, prices, hist)
    final_portfolios.append(final_portfolio)
    print('Final portfolio: ${}'.format(final_portfolio))
    # redraw the learning curve after every episode
    plt.title('Final Portfolio Value')
    plt.xlabel('Simulation #')
    plt.ylabel('Net worth')
    plt.plot(final_portfolios)
    plt.show()

## Out-Of-Sample Test (July, 2016 ~ July, 2019)

In [None]:
# reset budget & num_shares for the out-of-sample run
budget = 10000
num_stocks = 0
stock_hist = yf.download('^GSPC', start='2016-07-23', end='2019-07-22')
# Flat float array: test_episode indexes prices by integer position,
# and the 'Close' selection may be a Series or a one-column DataFrame.
prices = np.asarray(stock_hist['Close'], dtype=float).ravel()
test_portfolio = test_episode(dqn, budget, num_stocks, prices, hist)
# Return is the gain relative to the initial budget; subtract the budget so
# that a 28% gain prints 28 rather than 128.
print("Return(%): ", 100*(test_portfolio - budget)/budget)

[*********************100%***********************]  1 of 1 completed
progress 0.00%
Return(%):  128.0311083984375
