In [62]:
import argparse
import importlib
import logging
import sys
import time
from agents.DQN import Agent
from utils import *


In [73]:
import random
from collections import deque

import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

from utils import Portfolio

class Agent(Portfolio):
    def __init__(self,state_dim,balance,is_eval=False,model_name=""):
        super().__init__(balance=balance)
        self.model_type="DQN"
        self.state_dim = state_dim
        self.action_dim = 3
        self.memory=deque(maxlen=100)
        self.buffer_size=60

        self.gamma=0.95
        self.epsilon=1.0
        self.epsilon_min= 0.01
        self.epsilon_decay = 0.995
        self.is_eval = is_eval
        self.model = load_model("save_models/{}.h5".format(model_name)) if is_eval else self.model()
        self.print_f()
        
        
    def print_f(self):
        print("hello world.")
        return
    
    
    def model(self):
        print("test")
        model=Sequential()
        model.add(Dense(units=64,input_dim = self.state_dim,activation="relu"))
        model.add(Dense(units =32,activation="relu"))
        model.add(Dense(units = 8,activation='relu'))
        model.add(Dense(self.action_dim,activation='softmax'))
        model.compile(loss='mse',optimizer=Adam(lr=0.01))
        return model

    def reset(self):
        self.reset_portfolio()
        self.epsilon=1.0

    def remember(self,state,actions,reward,next_state,done):
        self.memory.append((state,actions,reward,next_state,done))

    def act(self,state):
        if not self.is_eval and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_dim)
        options = self.model.predict(state)
        return np.argmax(options[0])

    def experience_replay(self):

        mini_batch = [self.memory[i] for i in range(len(self.memory)-self.buffer_size+1,len(self.memory))]

        for state, actions, reward, next_state, done in mini_batch:
            if not done:
                Q_target_value = reward+self.gamma*np.amax(self.model.predict(next_state)[0])
            else:
                Q_target_value = reward

            next_actions = self.model.predict(state)
            next_actions[0][np.argmax(actions)]=Q_target_value
            history = self.model.fit(state,next_actions,epochs=1,verbose =1)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        return history.history['loss'][0]









In [74]:
# parser = argparse.ArgumentParser(description="command line options")
# parser.add_argument("--model_name",action="store",dest="model_name",default="DQN",help='model_name')
# parser.add_argument('--stock_name',action="store",dest='stock_name',default='^GSPC_2010-2015',help="stock_name")
# parser.add_argument('--window_size',action="store",dest='window_size',default='10',type =int ,help="span (days) of observation")
# parser.add_argument('--num_episode',action="store",dest='num_episode',default='10',type =int ,help="episode number")
# parser.add_argument('--initial_balance',action="store",dest='initial_balance',default='50000',help="initial balance")

# inputs = parser.parse_args()

# model_name = inputs.model_name
# stock_name = inputs.stock_name
# window_size= inputs.window_size
# num_episode =inputs.num_episode
# initial_balance=inputs.initial_balance


model_name = "DQN"
stock_name = '^GSPC_2010-2015'
window_size= 10
num_episode =10
initial_balance=50000


stock_prices = stock_close_prices(stock_name)
trading_period = len(stock_prices) -1
returns_across_episodes=[]
num_experience_replay=0
action_dict={0:"Hold",1:"Buy",2:"Sell"}


agent = Agent(state_dim=window_size +3, balance= initial_balance)


def hold(actions):
    # encourage selling for profit and liquidity
    next_probable_action = np.argsort(actions)[1]
    if next_probable_action==2 and len(agent.inventory) >0:
        max_profit = stock_prices[t]  - agent.inventory[0]
        if max_profit >0:
            sell(t)
            actions[next_probable_action] = 1 #reset this action's value to the highest
            return "Hold",actions


def buy(t):
    if agent.balance > stock_prices[t]:
        agent.balance-=stock_prices[t]
        agent.inventory.add(stock_prices[t])
        return 'Buy: ${:.2f}'.format(stock_prices[t])

def sell(t):
    if len(agent.inventory)>0:
        agent.balance+=stock_prices[t]

        #应该pop 最便宜的inventory
        bought_price = agent.inventory.pop(0)
        profit = stock_prices[t] - bought_price
        global reward

        reward = profit

        return "Sell: ${:.2f} | Profit: ${:.2f}".format(stock_prices[t],profit)


logging.basicConfig(filename=f'log/{model_name}_training_{stock_name}.log',filemode="w",
format='[%(asctime)s.%(msecs)03d %(filename)s:%(lineno)3s] %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',level = logging.INFO)

logging.info(f'Trading Object:              {stock_name}')
logging.info(f'Trading Period:              {trading_period} days')
logging.info(f'Window Size:              {window_size} days')
logging.info(f'Training Episode:              {num_episode}')
logging.info(f'Model Name:              {model_name}')
logging.info(f'Initial Portfolio Value: ${initial_balance}')



test
hello world.


In [76]:
start_time = time.time()
for e in range(1,num_episode +1):
    logging.info(f'n\nEpisode: {e}/{num_episode}')

    agent.reset()
    state = generate_combined_state(0,window_size,stock_prices,agent.balance,len(agent.inventory))

    for t in range(1,trading_period+1):
        if t % 100 ==0:
            logging.info(f"\n -----------------------------Period: {t}/{trading_period}---------------")

            reward = 0
            next_state= generate_combined_state(t,window_size,stock_prices,agent.balance,len(agent.inventory))
            previous_portfolio_vlaue = len(agent.inventory) * stock_prices[t] +agent.balance

            if model_name == "DDPG":
                actions = agent.act(state,t)
                action = np.argmax(actions)
            else:
                actions = agent.model.predict(state)[0]
                action = agent.act(state)
            print(t,actions)
            logging.info("Step: 『』\tHold signal: {:.4} \t Buy signal: {:.4} \tSell signal: {:.4}".format(t,actions[0],actions[1],actions[2]))
            if action != np.argmax(actions): logging.info(f"\t\t'{action_dict[action]}' is an exploration.")
            if action ==0:
                execution_result = hold(actions)
            if action ==1:
                execution_result =buy(t)
            if action ==2:
                execution_result = sell(t)

            #checking execution result
            if execution_result is None:
                reward -=treasury_bond_daily_return_rate() * agent.balance #missing opportunity
            else:
                if isinstance(execution_result,tuple): #if execution_result is 'Hold'
                    actions = execution_result[1]
                    execution_result = execution_result[0]

                logging.info(execution_result)

            #calculate reward
            current_portfolio_value = len(agent.inventory) * stock_prices[t] + agent.balance    
            unrealized_profit = current_portfolio_value - agent.initial_portfolio_value
            reward += unrealized_profit

            agent.portfolio_values.append(current_portfolio_value)
            agent.return_rate.append((current_portfolio_value-previous_portfolio_vlaue)/previous_portfolio_vlaue)

            done = True if t == trading_period else False   
            agent.remember(state,actions,reward,next_state,done)

            #udpate state
            state = next_state
            
            #experience replay
            if len(agent.memory) >agent.buffer_size:
                num_experience_replay += 1
                loss = agent.experience.replay()
                #{:.2f} with 2 decimal place
                logging.info("Episode: {}\t Loss: {:.2f}\tAction: {}\t Reward:{:.2f}\t Number of Stocks: {}".format(e,loss,agent.balance,len(agent.inventory)))
                agent.tensorboard.on_batch_end(num_experience_replay,{'loss':loss,"portfolio value": current_portfolio_value})
            
            if done:
                portfolio_retunr = evaluate_portfolio_performance(agent,logging)
                returns_across_episodes.append(portfolio_return)

        if e%5 == 0:
            if model_name =="DQN":
                agent.model.save("saved_models/DQN_ep"+str(e)+".h5")
            elif model_name == "DDPG":
                agent.actor.model.save_weights("saved_models/DDPG_ep{}_actor.h5".format(str(e)))
                agent.critic.model.save_weights("saved_models/DDPG_ep{}_critic".format(str(e)))

            logging.info("model saved")

    logging.info('total training time: {0:.2f} min'.format((time.time()-start_time)/60))

    plot_portfolio_returns_across_episodes(model_name, returns_across_episodes)
    


100 [0.04918071 0.51252866 0.43829057]


ValueError: Precision not allowed in integer format specifier