### Import Package

In [302]:
import math
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import linear_model

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, layers, models

## Constants

In [303]:
ACTION_SELL = 0
ACTION_HOLD = 1
ACTION_BUY = 2

CMS_RATE = 0.02 # 2% commission rate

### Custom Function

In [304]:
def round_down_precision(val,precision):
    return(math.floor(val*(10**precision)))/(10**precision)


In [305]:
round_down_precision(12.3895,2)

12.38

### Set the data source path

In [306]:
# Set the data source path
interval = "daily"
region = "us"
ex_product = "nasdaq stocks"
section = "1"
stock = "aapl"
data_path = "test_data/"+interval+"/"+region+"/"+ex_product+"/"+section+"/"+stock+"."+region+".txt"

column_to_use = ["OPEN","LOW","HIGH","CLOSE"]

### Load the stock data

In [307]:
# Load the data
ori_data = pd.read_csv(data_path, sep=",")

# Rename the column names
ori_data.columns = [colname[1:-1] for colname in ori_data.columns]

# Drop the unnecessary
ori_data = ori_data.drop(columns=['PER','TIME', 'VOL', 'OPENINT'])
ori_data = ori_data.drop(columns=['TICKER'])

In [308]:
ori_data

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE
0,19840907,0.10150,0.10274,0.10028,0.10150
1,19840910,0.10150,0.10181,0.09905,0.10090
2,19840911,0.10181,0.10456,0.10181,0.10274
3,19840912,0.10274,0.10334,0.09966,0.09966
4,19840913,0.10518,0.10548,0.10518,0.10518
...,...,...,...,...,...
9357,20211021,148.81000,149.64000,147.87000,149.48000
9358,20211022,149.69000,150.18000,148.64000,148.69000
9359,20211025,148.68000,149.37000,147.62110,148.64000
9360,20211026,149.33000,150.84000,149.01010,149.32000


In [309]:

def generate_indicator(data,BB = (True, 20,2),SMA = (True, 10),EMA = (True,[12,26]), MACD = (True,12,26,9)):
    bb_true, bb_day,bb_sd = BB
    sma_true,sma_day = SMA
    ema_true, ema_list = EMA
    macd_true, macd_n1, macd_n2, macd_n3 = MACD
    if macd_n1 not in ema_list:
        ema_list,append(macd_n1)
    if macd_n2 not in ema_list:
        ema_list,append(macd_n2)
            
    for i, row in data.iterrows():
        ### SMA Section
        if sma_true:
            if i >= (sma_day-1):
                data.loc[i,"SMA_"+str(sma_day)] = np.mean(data.loc[i-(sma_day-1):i,"CLOSE"])
            else:
                data.loc[i,"SMA_"+str(sma_day)] = 0
                
        ### BB
        if bb_true:
            if i >= (bb_day-1):
                mb = data.loc[i,"MB"] = np.mean(data.loc[i-(bb_day-1):i,"CLOSE"])
                sd = np.std(data.loc[i-(bb_day-1):i,"CLOSE"])
                data.loc[i,"UB"] = mb+bb_sd*sd
                data.loc[i,"LB"] = mb-bb_sd*sd
            else:
                data.loc[i,"UB"] = data.loc[i,"LB"] = data.loc[i,"MB"] = 0
                
        
        
        ### EMA Section    
        if ema_true:
            for ema in ema_list:
                weight = 2/(1+ema)
                if i == 0:
                    data.loc[i,"EMA_"+str(ema)] = data.loc[i,"CLOSE"] * weight
                else:
                    data.loc[i,"EMA_"+str(ema)] = data.loc[i,"CLOSE"] * weight + data.loc[i-1,"EMA_"+str(ema)]*(1-weight)
                    
        ### MACD Section
        if macd_true:
            data.loc[i,"MACD"] = data.loc[i,"EMA_"+str(macd_n1)]-data.loc[i,"EMA_"+str(macd_n2)]
            ### MACD Signal
            weight = 2/(1+macd_n3)
            if i == 0:
                data.loc[i,"MACD_sig"] = data.loc[i,"MACD"] * weight
            else:
                data.loc[i,"MACD_sig"] = data.loc[i,"MACD"] * weight + data.loc[i-1,"MACD_sig"]*(1-weight)
            ### MACD Diff
            data.loc[i,"MACD_diff"] = data.loc[i,"MACD"]-data.loc[i,"MACD_sig"]
            
    return data
            

In [310]:
ori_data = generate_indicator(ori_data)

In [311]:
ori_data

Unnamed: 0,DATE,OPEN,HIGH,LOW,CLOSE,SMA_10,UB,LB,MB,EMA_12,EMA_26,MACD,MACD_sig,MACD_diff
0,19840907,0.10150,0.10274,0.10028,0.10150,0.000,0.000000,0.000000,0.0000,0.015615,0.007519,0.008097,0.001619,0.006477
1,19840910,0.10150,0.10181,0.09905,0.10090,0.000,0.000000,0.000000,0.0000,0.028736,0.014436,0.014300,0.004156,0.010145
2,19840911,0.10181,0.10456,0.10181,0.10274,0.000,0.000000,0.000000,0.0000,0.040121,0.020977,0.019145,0.007153,0.011991
3,19840912,0.10274,0.10334,0.09966,0.09966,0.000,0.000000,0.000000,0.0000,0.049281,0.026805,0.022476,0.010218,0.012258
4,19840913,0.10518,0.10548,0.10518,0.10518,0.000,0.000000,0.000000,0.0000,0.057881,0.032611,0.025270,0.013228,0.012042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9357,20211021,148.81000,149.64000,147.87000,149.48000,145.078,149.643391,138.106609,143.8750,145.816724,145.441107,0.375617,-0.693396,1.069014
9358,20211022,149.69000,150.18000,148.64000,148.69000,145.657,149.965621,137.961379,143.9635,146.258767,145.681766,0.577001,-0.439317,1.016318
9359,20211025,148.68000,149.37000,147.62110,148.64000,146.240,150.443392,137.810608,144.1270,146.625110,145.900894,0.724216,-0.206610,0.930826
9360,20211026,149.33000,150.84000,149.01010,149.32000,147.021,151.112495,137.882505,144.4975,147.039709,146.154161,0.885547,0.011821,0.873726


In [192]:
ori_data.iloc[0,]["OPEN"]

0.1015

### Split the train and test data

In [312]:
def custom_split_test_train(data,date_col_name,train_start,train_end):
    train = (data[date_col_name] >= train_start) & (data[date_col_name] <= train_end)
    test = (data[date_col_name] > train_end)
    train_X = data[train]
    test_X = data[test]
    
    return (train_X,test_X)

In [313]:
train_X, test_X = custom_split_test_train(ori_data,
                                          "DATE",
                                          train_start = 20180101,
                                          train_end = 20210101)
## Drop the date column
train_X = train_X.drop(columns=['DATE'])
test_X = test_X.drop(columns=['DATE'])

In [314]:
train_X

Unnamed: 0,OPEN,HIGH,LOW,CLOSE,SMA_10,UB,LB,MB,EMA_12,EMA_26,MACD,MACD_sig,MACD_diff
8399,40.746,41.256,40.529,41.247,41.4025,42.244820,40.104380,41.17460,41.189792,41.088170,0.101621,0.223031,-0.121410
8400,41.312,41.793,41.173,41.240,41.3018,42.247637,40.159463,41.20355,41.197516,41.099417,0.098099,0.198045,-0.099946
8401,41.314,41.541,41.203,41.436,41.2665,42.257636,40.230664,41.24415,41.234206,41.124349,0.109857,0.180407,-0.070550
8402,41.533,41.995,41.442,41.902,41.2818,42.303174,40.327626,41.31540,41.336943,41.181953,0.154991,0.175324,-0.020333
8403,41.749,42.051,41.651,41.759,41.2672,42.314915,40.437385,41.37615,41.401875,41.224697,0.177178,0.175695,0.001484
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9150,130.700,132.830,130.480,131.350,127.2290,132.445780,116.193220,124.31950,127.100116,123.779447,3.320669,2.671578,0.649091
9151,133.360,136.710,132.880,136.060,128.6510,134.033401,116.606599,125.32000,128.478560,124.689118,3.789442,2.895151,0.894291
9152,137.400,138.140,133.710,134.240,129.9540,135.053238,117.161762,126.10750,129.364935,125.396590,3.968345,3.109790,0.858555
9153,134.910,135.360,132.770,133.090,130.5350,135.898314,117.410686,126.65450,129.938022,125.966473,3.971549,3.282142,0.689408


### User Account

In [315]:
class User:
    def __init__(self,capital,stock_acc):
        self.wallet = capital
        self.stock_account = stock_acc

### Define the state

In [316]:
class state:
    def __init__(self,cash_owned,share_num_owned,open_price,lb,mb,ub,macd,macd_sig,sma):
        self.state_vector = [cash_owned,share_num_owned,open_price,lb,mb,ub,macd,macd_sig,sma]
        self.cash_owned = cash_owned
        self.share_num_owned = share_num_owned
        self.open_price = open_price
        self.lb = lb
        self.mb = mb
        self.ub = ub
        self.macd = macd
        self.macd_sig = macd_sig
        self.sma = sma
        
    def get_state(self):
        return self.state_vector
    
    def get_cash_owned(self):
        return self.cash_owned
    
    def get_share_num_owned(self):
        return self.share_num_owned
    
    def get_open_price(self):
        return self.open_price
    
    def get_lb(self):
        return self.lb
    
    def get_mb(self):
        return self.mb
    
    def get_ub(self):
        return self.ub
    
    def get_macd(self):
        return self.macd
    
    def get_macd_sig(self):
        return self.macd_sig
    
    def get_sma(self):
        return self.sma
    
    def update_state(self,cash_owned,share_num_owned,open_price,lb,mb,ub,macd,macd_sig,sma):
        self.state_vector = [cash_owned,share_num_owned,open_price,lb,mb,ub,macd,macd_sig,sma]
        self.cash_owned = cash_owned
        self.share_num_owned = share_num_owned
        self.open_price = open_price
        self.lb = lb
        self.mb = mb
        self.ub = ub
        self.macd = macd
        self.macd_sig = macd_sig
        self.sma = sma
        
    def update_cash_owned(self,cash_in_hand):
        self.cash_owned = cash_in_hand
        self.state_vector[0] = cash_in_hand
        
    def update_share_num_owned(self,share_in_hand):
        self.share_num_owned = share_in_hand
        self.state_vector[1] = share_in_hand
    
    def get_value(self):
        return round_down_precision(self.cash_owned + self.share_num_owned * self.open_price,2)
    
    def print_state(self):
        print(self.state_vector)

### Define the actions

In [317]:


class action:
    def __init__(self,action_id):
        self.action_id = action_id
        self.cash_owned = 0
        self.commission = 0 # in percentage
        self.price = 0
        self.share_num = 0 # Integer
        self.taken = False
        self.final_amount = 0
        
        if self.action_id == ACTION_SELL:
            self.action = "SELL"
        elif self.action_id == ACTION_HOLD:
            self.action = "HOLD"
        elif self.action_id == ACTION_BUY:
            self.action = "BUY"
            
    def get_action_id(self):
        return self.action_id
    
    def get_action(self):
        return self.action
        
    def take_action(self,cash_owned,price,share_num_owned,commission = CMS_RATE):
        self.cash_owned = cash_owned
        self.commission = commission # in percentage
        self.price = price
        self.share_num_owned = share_num_owned # Integer
        # Calculate the maximum share that can buy
        max_share_buy = math.floor(cash_owned/price)
        
        if self.action_id == ACTION_SELL:
            if share_num_owned == 0:
                self.action_id = ACTION_HOLD
                self.final_amount = cash_owned
            else:
                self.final_amount = cash_owned + price*share_num_owned*(1-r)
        elif self.action_id == ACTION_HOLD:
            self.final_amount = cash_owned
        elif self.action_id == ACTION_BUY:
            if max_share_buy == 0:
                self.action_id = ACTION_HOLD
                self.final_amount = cash_owned
            else:
                self.final_amount = cash_owned - price*max_share_buy*(1+r)
        self.taken = True
            
    def get_final_amount(self):
        if not self.taken:
            print("This action have not been taken yet.")
        return self.final_amount


        

### Define the reward machanism

In [318]:
class reward:
    def __init__(self,old_state_vector,cur_state_vector,last_action_made):
        self.old_state_vector = old_state_vector
        self.cur_state_vector = cur_state_vector
        self.last_action_made = last_action_made
        self.calculated = False
        self.reward = 0
        
    def calculate_reward(self):
        self.calculated = True
        print(self.old_state_vector)
        print(self.cur_state_vector)
        cur_val = self.cur_state_vector[0]+self.cur_state_vector[1]*self.cur_state_vector[2]
        old_val = self.old_state_vector[0]+self.old_state_vector[1]*self.old_state_vector[2]
        self.reward = cur_val - old_val
        
    def get_reward(self):
        return self.reward
        

### Define the Agent

In [319]:
class agent:    
    def __init__(self,selection = ("EpsilonGreedy",0.3)):
        (self.method, self.para) = selection
        
    def make_action(self,curr_state,qa,total_action_made,action_count):
        
        # Define 3 actions to choose
        a1 = action(0) # SELL Action
        a2 = action(1) # HOLD Action
        a3 = action(2) # BUY Action
        actions = [a1,a2,a3] # Define all three action

        # Retrieve state
        cash_in_hand = curr_state.get_cash_owned()
        stock_price = curr_state.get_open_price()
        share_in_hand = curr_state.get_share_num_owned()

        # Action-Value Selection
        j = -1 # Index of the action with largest return
        
        # Upper-Confidence Bounce (UCB) Action Selection
        if self.method == "UpperConfidenceBounce":
            j = np.argmax([qa[a.get_action_id()] + self.para*math.sqrt((0 if np.log(total_action_made) < 0 else np.log(total_action_made))/(0.01 if action_count[a.get_action_id()] == 0 else action_count[a.get_action_id()])) for a in actions])
        
        ## Apply Epsilon-Greedy Algorithm to choose the action with maximum return
        #elif method == "EpsilonGreedy":
        #    prob = np.random.random() # Set the prob to try new actions
        #    if prob < self.para:
        #        j = np.random.choice(3)
        #    else:
        #        j = np.argmax([a.take_action(cash_in_hand,stock_price,share_in_hand).get_final_amount() for a in actions])
                        
        return j

### Define the environment

In [328]:
class envir:
    
    def __init__(self,data,capital,num_share):
        self.data = data
        # Timestamp
        self.t = 0
        self.max_t = len(data)
        
        # Initial Capital
        self.capital = capital
        # Initial Number of Share
        self.num_share = num_share
        
        #self.user = User(capital,num_share)
        
        # Initiate the first state
        self.curr_state = state(self.capital,self.num_share,
                                self.data.iloc[self.t,]["OPEN"],self.data.iloc[self.t,]["LB"],
                                self.data.iloc[self.t,]["MB"],self.data.iloc[self.t,]["UB"],
                                self.data.iloc[self.t,]["MACD"],self.data.iloc[self.t,]["MACD_sig"],
                                self.data.iloc[self.t,]["SMA_10"])
        
        # Initiate the agent
        self.agent = agent(selection = ("UpperConfidenceBounce",0.2))
        self.agent_record = pd.DataFrame(data=[],index = data.index,columns = ["Action","Rewards_0","Rewards_1","Rewards_2"])
        
        self.num_action_made = 0
        self.action_count = [0,0,0]
        self.total_rewards = 0
        self.rewards_count = [0,0,0]
        self.qa = [0,0,0]
        self.last_action = -1
        
        self.last_state_vector = self.curr_state.get_state()
        
    def make_decision_by_agent(self):
        action = self.agent.make_action(self.curr_state,self.qa,self.num_action_made,self.action_count)
        if action == -1:
            print("Agent make no decision~~~")
        else:
            self.num_action_made += 1
            self.action_count[action] += 1
            new_cap = cap = self.curr_state.get_cash_owned()
            new_share = share = self.curr_state.get_share_num_owned()
            price = self.curr_state.get_open_price()
            max_share_buy = math.floor(cap/price)
            # If SELL then
            if action == 0:
                if share == 0:
                    print("Agent said HOLD (nothing to sell)!!")
                    self.agent_record.iloc[self.t,]["Action"] = "HOLD"
                else:
                    new_cap = cap + price*share*(1-CMS_RATE)
                    new_share = share - share
                    print("Agent said SELL!!")
                    self.agent_record.iloc[self.t,]["Action"]  = "SELL"

            # If HOLD then
            elif action == 1:
                print("Agent said HOLD!!")
                self.agent_record.iloc[self.t,]["Action"]  = "HOLD"
                
            # If BUY then
            elif action == 2:
                if max_share_buy == 0:
                    print("Agent said HOLD (no money buy)!!")
                    self.agent_record.iloc[self.t,]["Action"]  = "HOLD"
                else:
                    new_cap = cap - price*max_share_buy*(1+CMS_RATE)
                    new_share = share + max_share_buy
                    print("Agent said BUY!!")
                    self.agent_record.iloc[self.t,]["Action"]  = "BUY"

            self.num_share = new_share
            self.capital = new_cap   
            self.last_action = action
            self.last_state_vector = self.curr_state.get_state()
            print("Old State: ")
            self.curr_state.print_state()    
    
    def next_state(self):
        self.t += 1
        if self.t >= self.max_t:
            return True
        self.curr_state.update_state(self.capital,self.num_share,
                                       self.data.iloc[self.t,]["OPEN"],self.data.iloc[self.t,]["LB"],
                                       self.data.iloc[self.t,]["MB"],self.data.iloc[self.t,]["UB"],
                                       self.data.iloc[self.t,]["MACD"],self.data.iloc[self.t,]["MACD_sig"],
                                       self.data.iloc[self.t,]["SMA_10"])
        
        print("New State: ")
        self.curr_state.print_state()    
        return False
        
    def calculate_reward(self):
        if not self.last_action == -1:
            rew = reward(self.last_state_vector,self.curr_state.get_state(),self.last_action) # Initiate a reward if last action exists
            rew.calculate_reward() # calculate the rewards
            self.total_rewards += rew.get_reward() # add the rewards to the agent
            self.rewards_count[self.last_action] += rew.get_reward()
            print("Last action is wher?", self.last_action)
            
        # If SELL then
        if self.last_action == 0:
            print("123", rew.get_reward())
            self.agent_record.iloc[self.t,]["Rewards_0"] = rew.get_reward()
            if self.t > 0:
                self.agent_record.iloc[self.t,]["Rewards_1"] = self.agent_record.iloc[self.t-1,]["Rewards_1"]
                self.agent_record.iloc[self.t,]["Rewards_2"] = self.agent_record.iloc[self.t-1,]["Rewards_2"]

        # If HOLD then
        elif self.last_action == 1:
            print("456", rew.get_reward())
            self.agent_record.iloc[self.t,]["Rewards_1"] = rew.get_reward()
            if self.t > 0:
                self.agent_record.iloc[self.t,]["Rewards_0"] = self.agent_record.iloc[self.t-1,]["Rewards_0"]
                self.agent_record.iloc[self.t,]["Rewards_2"] = self.agent_record.iloc[self.t-1,]["Rewards_2"]

        # If BUY then
        elif self.last_action == 2:
            print("789", rew.get_reward())
            self.agent_record.iloc[self.t,]["Rewards_2"] = rew.get_reward()
            if self.t > 0:
                self.agent_record.iloc[self.t,]["Rewards_0"] = self.agent_record.iloc[self.t-1,]["Rewards_0"]
                self.agent_record.iloc[self.t,]["Rewards_1"] = self.agent_record.iloc[self.t-1,]["Rewards_1"]
            
    def update_qa(self):
        for i in range(len(self.qa)):
            if self.action_count[i] == 0:
                self.qa[i] = 0
            else:
                self.qa[i] = self.rewards_count[i]/self.action_count[i]
            print(self.qa)
        
    def iterate(self):
        while self.t < self.max_t:
            print("Running at time ", self.t,"/",self.max_t) 
            print("make decision")
            self.make_decision_by_agent()
            print("Import next state")
            terminate = self.next_state()
            if terminate:
                break
            print("Calculate rewards")
            self.calculate_reward()
            print("update Qa")
            self.update_qa()
        print("Completed")
        

In [329]:
user_cap = 100000 # Initial Captial
user_port = 0 # num of share in single stock portfolio
env = envir(train_X,user_cap,user_port)

In [330]:
env.iterate()

Running at time  0 / 756
make decision
Agent said HOLD (nothing to sell)!!
Old State: 
[100000, 0, 40.746, 40.10437994786118, 41.1746, 42.24482005213881, 0.10162129979634216, 0.2230309753105267, 41.4025]
Import next state
New State: 
[100000, 0, 41.312, 40.15946332256369, 41.20355, 42.24763667743631, 0.09809901307920654, 0.19804458286426269, 41.30180000000001]
Calculate rewards
[100000, 0, 40.746, 40.10437994786118, 41.1746, 42.24482005213881, 0.10162129979634216, 0.2230309753105267, 41.4025]
[100000, 0, 41.312, 40.15946332256369, 41.20355, 42.24763667743631, 0.09809901307920654, 0.19804458286426269, 41.30180000000001]
Last action is wher? 0
123 0.0
update Qa
[0.0, 0, 0]
[0.0, 0, 0]
[0.0, 0, 0]
Running at time  1 / 756
make decision
Agent said HOLD (nothing to sell)!!
Old State: 
[100000, 0, 41.312, 40.15946332256369, 41.20355, 42.24763667743631, 0.09809901307920654, 0.19804458286426269, 41.30180000000001]
Import next state
New State: 
[100000, 0, 41.314, 40.23066368534154, 41.24415000




[95840.80028, 0, 42.064, 39.410836717241146, 41.19049999999999, 42.97016328275884, 0.19746928400677888, 0.045980350905072916, 41.780100000000004]
Calculate rewards
[95840.80028, 0, 42.753, 39.39976170931017, 41.17185, 42.94393829068983, 0.24060816027511578, 0.008108117629646416, 41.780100000000004]
[95840.80028, 0, 42.064, 39.410836717241146, 41.19049999999999, 42.97016328275884, 0.19746928400677888, 0.045980350905072916, 41.780100000000004]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -9.907072463768165, -1479.486619999996]
[-665.375033333335, -9.765542857142906, -1479.486619999996]
[-665.375033333335, -9.765542857142906, -1479.486619999996]
Running at time  74 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 42.064, 39.410836717241146, 41.19049999999999, 42.97016328275884, 0.19746928400677888, 0.045980350905072916, 41.780100000000004]
Import next state
New State: 
[95840.80028, 0, 41.019, 39.29402766739266, 41.152699999999996, 43.01137233260733, 0.0

Running at time  124 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 44.962, 43.76168846686917, 45.50580000000001, 47.24991153313085, -0.1036654675347819, 0.09847646609054261, 44.715799999999994]
Import next state
New State: 
[95840.80028, 0, 44.365, 43.73966279572636, 45.4497, 47.15973720427364, -0.07306485765091963, 0.06416820134225017, 44.67809999999999]
Calculate rewards
[95840.80028, 0, 44.962, 43.76168846686917, 45.50580000000001, 47.24991153313085, -0.1036654675347819, 0.09847646609054261, 44.715799999999994]
[95840.80028, 0, 44.365, 43.73966279572636, 45.4497, 47.15973720427364, -0.07306485765091963, 0.06416820134225017, 44.67809999999999]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -5.696566666666695, -1479.486619999996]
[-665.375033333335, -5.649487603305813, -1479.486619999996]
[-665.375033333335, -5.649487603305813, -1479.486619999996]
Running at time  125 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 44.365, 43.73966

[95840.80028, 0, 55.464, 49.14184829817731, 52.4082, 55.67455170182269, 1.7477639813555825, 1.6015547663310226, 53.652499999999996]
Calculate rewards
[95840.80028, 0, 55.326, 48.94978338633088, 52.1601, 55.37041661366912, 1.7474984672584597, 1.5650024625748826, 53.366]
[95840.80028, 0, 55.464, 49.14184829817731, 52.4082, 55.67455170182269, 1.7477639813555825, 1.6015547663310226, 53.652499999999996]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -4.142957575757596, -1479.486619999996]
[-665.375033333335, -4.118000000000021, -1479.486619999996]
[-665.375033333335, -4.118000000000021, -1479.486619999996]
Running at time  170 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 55.464, 49.14184829817731, 52.4082, 55.67455170182269, 1.7477639813555825, 1.6015547663310226, 53.652499999999996]
Import next state
New State: 
[95840.80028, 0, 54.79, 49.46312874957507, 52.60889999999999, 55.75467125042491, 1.6551427860600967, 1.6122723702768373, 53.84740000000001]
Calc

New State: 
[95840.80028, 0, 51.034, 49.19867584907355, 52.211149999999996, 55.22362415092644, -0.8267061956739639, -0.5716694340383625, 51.22499999999999]
Calculate rewards
[95840.80028, 0, 49.887, 49.34185753086966, 52.27445, 55.20704246913034, -0.8275317118449408, -0.507910243629462, 51.4813]
[95840.80028, 0, 51.034, 49.19867584907355, 52.211149999999996, 55.22362415092644, -0.8267061956739639, -0.5716694340383625, 51.22499999999999]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -3.2397535545023857, -1479.486619999996]
[-665.375033333335, -3.2244716981132235, -1479.486619999996]
[-665.375033333335, -3.2244716981132235, -1479.486619999996]
Running at time  216 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 51.034, 49.19867584907355, 52.211149999999996, 55.22362415092644, -0.8267061956739639, -0.5716694340383625, 51.22499999999999]
Import next state
New State: 
[95840.80028, 0, 49.956, 48.89673040983452, 52.006400000000006, 55.116069590165495, -0.894

New State: 
[95840.80028, 0, 42.015, 34.98413967661052, 38.318200000000004, 41.65226032338949, 0.5169612052607135, -0.19317014505729774, 39.3495]
Calculate rewards
[95840.80028, 0, 40.688, 35.06580459771945, 37.99905, 40.932295402280545, 0.2835815454192314, -0.37070298263680057, 38.841899999999995]
[95840.80028, 0, 42.015, 34.98413967661052, 38.318200000000004, 41.65226032338949, 0.5169612052607135, -0.19317014505729774, 39.3495]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -2.5412193308550313, -1479.486619999996]
[-665.375033333335, -2.53180740740742, -1479.486619999996]
[-665.375033333335, -2.53180740740742, -1479.486619999996]
Running at time  274 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 42.015, 34.98413967661052, 38.318200000000004, 41.65226032338949, 0.5169612052607135, -0.19317014505729774, 39.3495]
Import next state
New State: 
[95840.80028, 0, 42.444, 34.93327989305477, 38.60385000000001, 42.27442010694525, 0.6951138025124592, -0.015513

Old State: 
[95840.80028, 0, 48.687, 45.16927457240581, 47.43705, 49.70482542759419, 1.2628445391094303, 1.312172775361913, 48.44160000000001]
Import next state
New State: 
[95840.80028, 0, 48.707, 45.279896495429576, 47.619699999999995, 49.959503504570414, 1.2887034551380339, 1.3074789113171372, 48.631600000000006]
Calculate rewards
[95840.80028, 0, 48.687, 45.16927457240581, 47.43705, 49.70482542759419, 1.2628445391094303, 1.312172775361913, 48.44160000000001]
[95840.80028, 0, 48.707, 45.279896495429576, 47.619699999999995, 49.959503504570414, 1.2887034551380339, 1.3074789113171372, 48.631600000000006]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -2.1429090909091015, -1479.486619999996]
[-665.375033333335, -2.1362125000000107, -1479.486619999996]
[-665.375033333335, -2.1362125000000107, -1479.486619999996]
Running at time  324 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 48.707, 45.279896495429576, 47.619699999999995, 49.959503504570414, 1.288703

[-665.375033333335, -1.8475351351351443, -1479.486619999996]
[-665.375033333335, -1.8475351351351443, -1479.486619999996]
Running at time  374 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 48.685, 43.75868499038493, 47.2831, 50.80751500961507, 0.6773827468386671, 0.5031971352435254, 48.517999999999994]
Import next state
New State: 
[95840.80028, 0, 49.781, 44.769932259318175, 47.6294, 50.48886774068182, 0.7249961923087795, 0.5475569466565763, 48.70569999999999]
Calculate rewards
[95840.80028, 0, 48.685, 43.75868499038493, 47.2831, 50.80751500961507, 0.6773827468386671, 0.5031971352435254, 48.517999999999994]
[95840.80028, 0, 49.781, 44.769932259318175, 47.6294, 50.48886774068182, 0.7249961923087795, 0.5475569466565763, 48.70569999999999]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -1.8475351351351443, -1479.486619999996]
[-665.375033333335, -1.8425552560646992, -1479.486619999996]
[-665.375033333335, -1.8425552560646992, -1479.486619999996]
Running

update Qa
[-665.375033333335, -1.6122358490566118, -1479.486619999996]
[-665.375033333335, -1.6084423529411844, -1479.486619999996]
[-665.375033333335, -1.6084423529411844, -1479.486619999996]
Running at time  429 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 54.102, 49.24384022430064, 52.271249999999995, 55.29865977569935, 1.0497290826623882, 0.8248253762122506, 53.4385]
Import next state
New State: 
[95840.80028, 0, 54.376, 49.21698048129488, 52.42385, 55.63071951870512, 1.094401850299704, 0.8787406710297413, 53.7721]
Calculate rewards
[95840.80028, 0, 54.102, 49.24384022430064, 52.271249999999995, 55.29865977569935, 1.0497290826623882, 0.8248253762122506, 53.4385]
[95840.80028, 0, 54.376, 49.21698048129488, 52.42385, 55.63071951870512, 1.094401850299704, 0.8787406710297413, 53.7721]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -1.6084423529411844, -1479.486619999996]
[-665.375033333335, -1.6046666666666747, -1479.486619999996]
[-665.375033333335,

Calculate rewards
[95840.80028, 0, 66.61, 63.953543202162514, 65.2729, 66.5922567978375, 1.1431673152050763, 1.3539271717726287, 65.48179999999999]
[95840.80028, 0, 66.264, 63.99686412681452, 65.35040000000001, 66.7039358731855, 1.1288386248926798, 1.3089094623966389, 65.53399999999999]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -1.415296066252595, -1479.486619999996]
[-665.375033333335, -1.4123719008264533, -1479.486619999996]
[-665.375033333335, -1.4123719008264533, -1479.486619999996]
Running at time  488 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 66.264, 63.99686412681452, 65.35040000000001, 66.7039358731855, 1.1288386248926798, 1.3089094623966389, 65.53399999999999]
Import next state
New State: 
[95840.80028, 0, 66.316, 64.01025340333054, 65.4592, 66.90814659666945, 1.149899182341457, 1.2771074063856025, 65.69399999999999]
Calculate rewards
[95840.80028, 0, 66.264, 63.99686412681452, 65.35040000000001, 66.7039358731855, 1.1288386248926798,

New State: 
[95840.80028, 0, 73.306, 67.835848658777, 76.36165, 84.887451341223, -1.3719519728827123, -0.7501689692846797, 72.94949999999999]
Calculate rewards
[95840.80028, 0, 75.091, 67.99849373980855, 76.55154999999999, 85.10460626019143, -1.5940547734912087, -0.5947232183851715, 73.46579999999999]
[95840.80028, 0, 73.306, 67.835848658777, 76.36165, 84.887451341223, -1.3719519728827123, -0.7501689692846797, 72.94949999999999]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -1.26590370370371, -1479.486619999996]
[-665.375033333335, -1.2635637707948306, -1479.486619999996]
[-665.375033333335, -1.2635637707948306, -1479.486619999996]
Running at time  545 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 73.306, 67.835848658777, 76.36165, 84.887451341223, -1.3719519728827123, -0.7501689692846797, 72.94949999999999]
Import next state
New State: 
[95840.80028, 0, 73.078, 67.44054915027255, 76.0183, 84.59605084972745, -1.3762309527719765, -0.875381365982139, 7

Running at time  601 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 78.297, 69.01836300446894, 75.19085, 81.36333699553106, 2.6763885611860445, 2.5570847460006765, 77.7191]
Import next state
New State: 
[95840.80028, 0, 80.215, 69.76192607617176, 75.6163, 81.47067392382823, 2.5982153906846293, 2.5653108749374676, 77.76169999999999]
Calculate rewards
[95840.80028, 0, 78.297, 69.01836300446894, 75.19085, 81.36333699553106, 2.6763885611860445, 2.5570847460006765, 77.7191]
[95840.80028, 0, 80.215, 69.76192607617176, 75.6163, 81.47067392382823, 2.5982153906846293, 2.5653108749374676, 77.76169999999999]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -1.1450385259631548, -1479.486619999996]
[-665.375033333335, -1.1431237458194037, -1479.486619999996]
[-665.375033333335, -1.1431237458194037, -1479.486619999996]
Running at time  602 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 80.215, 69.76192607617176, 75.6163, 81.47067392382823, 2.598215

[-665.375033333335, -1.0452415902140724, -1479.486619999996]
Running at time  658 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 113.7, 86.1573702601384, 102.39565000000002, 118.63392973986163, 5.64652197087608, 4.788464257956824, 110.2]
Import next state
New State: 
[95840.80028, 0, 114.1, 86.63689128417164, 103.32764999999999, 120.01840871582834, 5.728115774307767, 4.976394561227012, 111.079]
Calculate rewards
[95840.80028, 0, 113.7, 86.1573702601384, 102.39565000000002, 118.63392973986163, 5.64652197087608, 4.788464257956824, 110.2]
[95840.80028, 0, 114.1, 86.63689128417164, 103.32764999999999, 120.01840871582834, 5.728115774307767, 4.976394561227012, 111.079]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -1.0452415902140724, -1479.486619999996]
[-665.375033333335, -1.0436458015267227, -1479.486619999996]
[-665.375033333335, -1.0436458015267227, -1479.486619999996]
Running at time  659 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028,

update Qa
[-665.375033333335, -0.9655197740113042, -1479.486619999996]
[-665.375033333335, -0.9641579689703855, -1479.486619999996]
[-665.375033333335, -0.9641579689703855, -1479.486619999996]
Running at time  713 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 110.34, 108.94792231641216, 115.82199999999997, 122.69607768358779, -0.6904745883074668, 0.19088465562345777, 114.08700000000002]
Import next state
New State: 
[95840.80028, 0, 108.42, 107.7796670221255, 115.43799999999999, 123.09633297787448, -1.1094642539475075, -0.06918512629073525, 113.37100000000001]
Calculate rewards
[95840.80028, 0, 110.34, 108.94792231641216, 115.82199999999997, 122.69607768358779, -0.6904745883074668, 0.19088465562345777, 114.08700000000002]
[95840.80028, 0, 108.42, 107.7796670221255, 115.43799999999999, 123.09633297787448, -1.1094642539475075, -0.06918512629073525, 113.37100000000001]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -0.9641579689703855, -1479.486619999996

New State: 
[95840.80028, 0, 130.99, 114.17921303698816, 122.74100000000003, 131.3027869630119, 2.997340840473271, 2.3482298494059815, 125.44800000000002]
Calculate rewards
[95840.80028, 0, 124.43, 113.28116542259517, 121.84400000000001, 130.40683457740485, 2.660618079647179, 2.1859521016391588, 124.70100000000002]
[95840.80028, 0, 130.99, 114.17921303698816, 122.74100000000003, 131.3027869630119, 2.997340840473271, 2.3482298494059815, 125.44800000000002]
Last action is wher? 1
456 0.0
update Qa
[-665.375033333335, -0.9188010752688217, -1479.486619999996]
[-665.375033333335, -0.9175677852349039, -1479.486619999996]
[-665.375033333335, -0.9175677852349039, -1479.486619999996]
Running at time  749 / 756
make decision
Agent said HOLD!!
Old State: 
[95840.80028, 0, 130.99, 114.17921303698816, 122.74100000000003, 131.3027869630119, 2.997340840473271, 2.3482298494059815, 125.44800000000002]
Import next state
New State: 
[95840.80028, 0, 131.54, 115.206196459864, 123.52650000000003, 131.84680

In [327]:
env.agent_record

Unnamed: 0,Action,Rewards_0,Rewards_1,Rewards_2
8399,HOLD,,,
8400,HOLD,0.0,,
8401,HOLD,0.0,,
8402,BUY,0.0,0.0,
8403,HOLD,0.0,0.0,-1479.48662
...,...,...,...,...
9150,HOLD,-1996.1251,0.0,-1479.48662
9151,HOLD,-1996.1251,0.0,-1479.48662
9152,HOLD,-1996.1251,0.0,-1479.48662
9153,HOLD,-1996.1251,0.0,-1479.48662
