In [None]:
!pip install tensorboardX #Install tensorboardX module

Collecting tensorboardX
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [None]:
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Softmax, Concatenate, Reshape
import numpy as np
import random
from collections import deque
import time
import os
import json
import copy
from datetime import datetime
from collections import deque
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorboardX import SummaryWriter
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Conv1D,Conv2D, MaxPooling1D, Activation, Concatenate, LSTM, ZeroPadding2D #, LSTM
#from tensorflow.compat.v1.keras.layers import CuDNNLSTM as LSTM #This line was removed, because CuDNNLSTM is not supported in TensorFlow 2.x
import pandas as pd

In [None]:
# Conv layers built later in the notebook interpret their inputs as (channels, rows, cols).
K.set_image_data_format("channels_first")

# One fixed seed per random number generator used by the notebook.
PYTHON_SEED, NUMPY_SEED, TF_SEED = 2002, 32, 100
random.seed(PYTHON_SEED)
np.random.seed(NUMPY_SEED)
tf.random.set_seed(TF_SEED)

In [None]:
class CustomEnv:
    """Multi-asset portfolio environment driven by two pre-computed arrays.

    ``df`` (raw values) and ``df_normalized`` (model features) are both indexed
    as ``[time step, feature, asset]``. Feature row 2 of ``df`` is read as the
    price of each asset. Asset 0 plays the role of cash: the initial weights put
    everything in it and ``self.cash`` tracks the value held in it.
    """

    def __init__(self, df, df_normalized, initial_balance=1000, stocks=['USDCUSDT','BTCUSDT','BNBBTC','BNBBTC'],lookback_window_size=50, model=''):
        """Store the data arrays and set up empty portfolio/history containers.

        Args:
            df: raw array, ``[step, feature, asset]``; ``df[:, 2, :]`` is used as prices.
            df_normalized: feature array with the same step/asset axes, fed to the model.
            initial_balance: starting portfolio value (also the buy-and-hold budget).
            stocks: asset names; its length drives every per-asset loop.
            lookback_window_size: number of past steps kept in the state.
            model: ``"EIIE"`` stacks per-asset windows; anything else concatenates
                them with the order history into one 2-D state.
        """
        self.xarray = df_normalized
        self.df = df
        self.df_total_steps = self.xarray.shape[0]
        self.initial_balance = initial_balance
        self.lookback_window_size = lookback_window_size		# Historical data window
        self.normalize_value = 40000		# Value used to scale transaction data
        self.model = model

        n_assets = self.xarray.shape[2]
        self.weights = [1] + [0] * (n_assets - 1)		# Initial weights: everything in asset 0
        self.quants = [0] * n_assets		# Initial quantities
        self.quants_ubah = [0] * n_assets		# Initial quantities for buy and hold

        self.cash = 0  # Amount of cash

        # Copy so later changes to the caller's list (or the shared default) cannot leak in.
        self.stocks = list(stocks)
        # Placeholder entries keyed by asset name; reset() adds the per-asset
        # windows actually used, under the keys "0", "1", ...
        self.market_state = dict.fromkeys(self.stocks)

        # Initial amount of money for buy and hold
        self.ubah = initial_balance

        # Rolling windows of the last lookback_window_size entries
        self.orders_history = deque(maxlen=self.lookback_window_size)
        self.market_history = deque(maxlen=self.lookback_window_size)

    def _build_state(self):
        """Assemble the model input from the per-asset windows (and order history)."""
        windows = [self.market_state[str(x)] for x in range(0, len(self.stocks))]
        if self.model == "EIIE":
            # EIIE keeps one window per asset; the order history is fed to the network separately.
            return np.stack(windows)
        state = np.concatenate(windows, axis=1)
        return np.concatenate((state, self.orders_history), axis=1)

    # Reset the state of the environment to an initial state
    def reset(self, env_steps_size = 0):
        """Start a new episode at a random step and return ``(state, orders_history)``.

        Args:
            env_steps_size: number of ``step()`` calls the caller intends to make.

        Raises:
            ValueError: if the data is too short for the lookback window plus
                ``env_steps_size`` steps.
        """
        self.balance = self.initial_balance
        self.net_worth = self.initial_balance
        self.prev_net_worth = self.initial_balance
        self.weights = [1]+[0]*(self.xarray.shape[2]-1)
        self.quants = [0]*self.xarray.shape[2]
        self.quants_ubah = [0]*self.xarray.shape[2]
        self.short_sell = [1,1,1]
        self.cash = self.initial_balance
        self.ubah = self.initial_balance

        # step() increments current_step before reading prices, so the last row
        # touched is start_step + env_steps_size. random.randint is inclusive on
        # both ends, hence the "- 1": without it the latest possible start reads
        # one row past the end of the data.
        last_start = self.df_total_steps - env_steps_size - 1
        if last_start < self.lookback_window_size:
            raise ValueError(
                f"not enough data: {self.df_total_steps} rows cannot hold a lookback of "
                f"{self.lookback_window_size} plus {env_steps_size} steps")
        self.start_step = random.randint(self.lookback_window_size, last_start)
        self.end_step = self.start_step + env_steps_size

        self.current_step = self.start_step  # Define initial step
        # Buy and hold: split the initial balance equally and buy at the starting prices.
        # Kept as a one-element list wrapping the array, which is the shape callers index into.
        self.quants_ubah = [(self.initial_balance/len(self.weights))/ np.array([self.df[self.current_step,2,x] for x in range(0,len(self.stocks))])]

        # Pre-fill the order history with the untouched starting portfolio.
        for _ in range(self.lookback_window_size):
            self.orders_history.append([self.net_worth/self.normalize_value,
                          self.cash/self.normalize_value] +
                          [number for number in self.quants] +
                          [number for number in self.weights])

        # Pre-fill each asset's window with the lookback_window_size rows ending at current_step.
        for j in range(0,len(self.stocks)):
            self.market_state[str(j)] = deque(maxlen=self.lookback_window_size)
            for i in reversed(range(self.lookback_window_size)):
                self.market_state[str(j)].append(self.xarray[self.current_step - i, :, j])

        return self._build_state(), self.orders_history

    # Get the data points for the given current_step
    def _next_observation(self):
        """Push the row at ``current_step`` into every asset window and rebuild the state."""
        for j in range(0, len(self.stocks)):
            self.market_state[str(j)].append(self.xarray[self.current_step, :, j])
        return self._build_state()

    def get_portfolio_percentages(self, prices):
        """Calculates the percentage of each stock in the portfolio."""
        percentages = []
        total_value = self.net_worth
        for i in range(len(self.stocks)):
            stock_value = self.quants[i] * prices[i]
            percentage = (stock_value / total_value) * 100 if total_value else 0  # Avoid division by zero
            percentages.append((self.stocks[i],percentage))
        return percentages

    # Execute one time step within the environment
    def step(self, prediction):
        """Advance one step and rebalance the portfolio to ``prediction``.

        Args:
            prediction: numpy array of target weights, one per asset.

        Returns:
            ``(obs, orders_history, reward, done, prices, percentages)`` where
            ``reward`` is the log return of net worth over this step and ``done``
            is True once net worth falls to half the initial balance or less.
        """
        n_assets = len(self.stocks)

        # Prices before the step, used in the fee calculation
        prices_ant =  np.array([self.df[self.current_step,2,x] for x in range(0,n_assets)])
        self.current_step += 1

        # Prices at the new step
        prices = np.array([self.df[self.current_step,2,x] for x in range(0,n_assets)])

        # Value of the holdings bought in the previous step at the current prices.
        # Asset 0 enters through self.cash, i.e. at its value from the previous step.
        self.balance = self.cash + np.dot(prices[1:],self.quants[1:])

        quants_ant = self.quants

        # New quantities implied by the target weights
        self.quants = [self.balance*prediction[x]/prices[x] for x in range(0,n_assets)]

        # 0.1% fee on the absolute change in total portfolio value.
        # NOTE(review): np.dot collapses to a scalar before abs/sum are applied, so this
        # is not a per-asset turnover fee — confirm which of the two is intended.
        tax = np.sum(abs(np.dot(np.array(self.quants),prices) - np.dot(np.array(quants_ant),prices_ant)))*0.001

        # Value held in asset 0 (the cash-like asset)
        self.cash = self.quants[0]*prices[0]

        self.prev_net_worth = self.net_worth

        # NOTE(review): the fee lowers net_worth for this step only; it is not taken
        # out of the holdings, so the next step's balance does not reflect it.
        self.net_worth = np.dot(self.quants,prices) - tax

        percentages = self.get_portfolio_percentages(prices)

        self.orders_history.append([self.net_worth/self.normalize_value,
                      self.cash/self.normalize_value] +
                      [number/self.normalize_value for number in self.quants] + prediction.tolist())

        reward = np.log(self.net_worth/self.prev_net_worth)

        done = bool(self.net_worth <= self.initial_balance/2)
        obs = self._next_observation()

        return obs, self.orders_history, reward, done, prices, percentages

    # render environment
    def render(self):
        """Print the current step and net worth."""
        print(f'Step: {self.current_step}, Net Worth: {self.net_worth}')

In [None]:
class CustomAgent:
    """Actor-critic trading agent trained with a clipped (PPO-style) objective.

    Both ``self.Actor`` and ``self.Critic`` refer to the same ``Shared_Model``
    instance; its ``.Actor`` and ``.Critic`` attributes are the two Keras models.
    """

    def __init__(self, lookback_window_size=50, lr=0.00005, epochs=1, stocks=[], optimizer=Adam, batch_size=32, model='', shape = [],depth=0, comment=""):
        """Build the networks and remember the training hyper-parameters.

        Args:
            lookback_window_size: number of past steps in each state.
            lr: learning rate handed to ``optimizer``.
            epochs: Keras epochs per ``replay`` call.
            stocks: asset names; the action space has one entry per asset.
            optimizer: optimizer class, instantiated with ``learning_rate=lr``.
            batch_size: Keras batch size used in ``replay``.
            model: network variant forwarded to ``Shared_Model`` (e.g. ``"EIIE"``).
            shape: shape of the feature array; ``shape[1]`` is read as the number
                of features and ``shape[2]`` as the number of assets.
            depth, comment: stored and written to ``Parameters.json`` only.
        """
        self.lookback_window_size = lookback_window_size
        self.comment = comment
        self.depth = depth
        self.stocks = stocks
        self.shape = shape
        self.model = model

        # One action (portfolio weight) per asset
        self.action_space = np.array(range(0,len(self.stocks)))

        # Folder name for saved models and logs
        self.log_name = datetime.now().strftime("%Y_%m_%d_%H_%M")+"_Crypto_trader"

        if self.model =="EIIE":
            # One (lookback, features) window per asset
            self.state_size = (len(stocks), lookback_window_size, self.shape[1])
        else:
            # All asset features side by side, plus the order-history columns
            # (net worth, cash, one quantity and one weight per asset)
            self.state_size = (lookback_window_size, self.shape[1]*self.shape[2]+2+2*self.shape[2])

        # Neural network settings
        self.lr = lr
        self.epochs = epochs
        self.optimizer = optimizer
        self.batch_size = batch_size

        # Shared Actor-Critic network model
        self.Actor = self.Critic = Shared_Model(input_shape=self.state_size, action_space = self.action_space.shape[0], lr=self.lr, optimizer = self.optimizer, model=self.model)

    # create tensorboard writer
    def create_writer(self, initial_balance, normalize_value, train_episodes):
        """Open the TensorBoard writer, create the model folder and log the parameters."""
        self.replay_count = 0
        self.writer = SummaryWriter('runs/'+self.log_name)

        # Folder that save() writes into
        os.makedirs(self.log_name, exist_ok=True)

        self.start_training_log(initial_balance, normalize_value, train_episodes)

    def start_training_log(self, initial_balance, normalize_value, train_episodes):
        """Write the training parameters to ``<log_name>/Parameters.json``."""
        current_date = datetime.now().strftime('%Y-%m-%d %H:%M')
        params = {
          "training start": current_date,
          "initial balance": initial_balance,
          "training episodes": train_episodes,
          "lookback window size": self.lookback_window_size,
          "depth": self.depth,
          "lr": self.lr,
          "epochs": self.epochs,
          "batch size": self.batch_size,
          "normalize value": normalize_value,
          "model": self.model,
          "comment": self.comment,
          "saving time": "",
          "Actor name": "",
          "Critic name": "",
        }
        with open(self.log_name+"/Parameters.json", "w") as write_file:
          json.dump(params, write_file, indent=4)

    def get_gaes(self, rewards, dones, values, next_values, gamma = 0.99, lamda = 0.95, normalize=True):
        """Compute generalized advantage estimates and critic targets.

        Returns:
            ``(advantages, targets)`` as column vectors. ``targets`` is
            ``advantages + values`` taken before normalisation; ``advantages`` is
            standardised when ``normalize`` is True.
        """
        # One-step TD errors; a finished step contributes no bootstrap value
        deltas = [r + gamma * (1 - d) * nv - v for r, d, nv, v in zip(rewards, dones, next_values, values)]
        deltas = np.stack(deltas)
        gaes = copy.deepcopy(deltas)
        # Accumulate discounted TD errors backwards through the rollout
        for t in reversed(range(len(deltas) - 1)):
          gaes[t] = gaes[t] + (1 - dones[t]) * gamma * lamda * gaes[t + 1]

        target = gaes + values
        if normalize:
          gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)
        return np.vstack(gaes), np.vstack(target)

    def replay(self, states,orders,  rewards, predictions, dones, next_states, orders_history):
        """Fit actor and critic on one rollout and return ``(actor_loss, critic_loss)``.

        Each loss is the sum of the per-epoch Keras losses; both are also written
        to TensorBoard under the current replay index.
        """
        # Stack the per-step records into batch arrays
        states = np.vstack(states)
        order = np.vstack(orders)
        next_states = np.vstack(next_states)
        orders_history =  np.vstack(orders_history)

        if self.model == "EIIE":
            values = self.Critic.critic_predict(states, np.expand_dims(order, axis=1))
        else:
            values = self.Critic.critic_predict(states, np.expand_dims(np.expand_dims(order, axis=0), axis=0))

        predictions = np.vstack(predictions)
        next_values = self.Critic.critic_predict(next_states, np.expand_dims(orders_history, axis=1))

        # Compute advantages
        advantages, target = self.get_gaes(rewards, dones, np.squeeze(values), np.squeeze(next_values))

        # The actor loss unpacks [advantage | old prediction] from y_true
        y_true = np.hstack([advantages, predictions])

        # Train the Actor and Critic networks
        if self.model == "EIIE":
            a_loss = self.Actor.Actor.fit([states,np.expand_dims(order, axis=1)], y_true, epochs=self.epochs, verbose=0, shuffle=True, batch_size=self.batch_size)
            c_loss = self.Critic.Critic.fit([states,np.expand_dims(order, axis=1)], target, epochs=self.epochs, verbose=0, shuffle=True, batch_size=self.batch_size)
        else:
            a_loss = self.Actor.Actor.fit(states, y_true, epochs=self.epochs, verbose=0, shuffle=True, batch_size=self.batch_size)
            c_loss = self.Critic.Critic.fit(states, target, epochs=self.epochs, verbose=0, shuffle=True, batch_size=self.batch_size)

        self.writer.add_scalar('Data/actor_loss_per_replay', np.sum(a_loss.history['loss']), self.replay_count)
        self.writer.add_scalar('Data/critic_loss_per_replay', np.sum(c_loss.history['loss']), self.replay_count)
        self.replay_count += 1

        return np.sum(a_loss.history['loss']), np.sum(c_loss.history['loss'])

    def act(self, state, order):
        """Return the actor's output for a single ``state`` / ``order`` pair."""
        # Add the batch axis to the state, and batch + channel axes to the order history
        prediction = self.Actor.actor_predict(np.expand_dims(state, axis=0), np.expand_dims(np.expand_dims(order, axis=0), axis=0))[0]

        return prediction

    def save(self, name="Crypto_trader", score="", args=[]):
        """Save both networks' weights under ``self.log_name``.

        Args:
            name: base file name.
            score: prefix for the file name; when non-empty, ``Parameters.json``
                is updated with the saving time and the weight file names.
            args: values appended as one line to ``log.txt`` when non-empty.
        """
        actor_file = f"{score}_{name}_Actor.weights.h5"
        critic_file = f"{score}_{name}_Critic.weights.h5"

        # save keras model weights
        self.Actor.Actor.save_weights(f"{self.log_name}/{actor_file}")
        self.Critic.Critic.save_weights(f"{self.log_name}/{critic_file}")

        # update json file settings; record the names of the files actually written
        if score != "":
          with open(self.log_name+"/Parameters.json", "r") as json_file:
            params = json.load(json_file)
          params["saving time"] = datetime.now().strftime('%Y-%m-%d %H:%M')
          params["Actor name"] = actor_file
          params["Critic name"] = critic_file
          with open(self.log_name+"/Parameters.json", "w") as write_file:
            json.dump(params, write_file, indent=4)

        # log saved model arguments to file
        if len(args) > 0:
          with open(f"{self.log_name}/log.txt", "a+") as log:
            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            arguments = "".join(f", {arg}" for arg in args)
            log.write(f"{current_time}{arguments}\n")

    def _weights_path(self, folder, name, role):
        """Pick the weight file for ``role`` ("Actor"/"Critic"), preferring save()'s naming."""
        candidates = [
            os.path.join(folder, f"{name}_{role}.weights.h5"),  # what save() writes
            os.path.join(folder, f"{name}_{role}.h5"),          # older naming
        ]
        for path in candidates:
            if os.path.exists(path):
                return path
        # Nothing found: return the current naming so the loader reports that path.
        return candidates[0]

    def load(self, folder, name):
        """Load actor and critic weights saved as ``<folder>/<name>_{Actor,Critic}``."""
        self.Actor.Actor.load_weights(self._weights_path(folder, name, "Actor"))
        self.Critic.Critic.load_weights(self._weights_path(folder, name, "Critic"))

In [None]:
#tf.config.experimental_run_functions_eagerly(True) # used for debuging and development
# tf.compat.v1.disable_eager_execution() # usually using this for fastest performance
tf.keras.utils.disable_interactive_logging()
np.random.seed(32)
tf.random.set_seed(100)

gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    print(f'GPUs {gpus}')
    # Apply the same setting to every visible GPU rather than only the first one.
    for gpu in gpus:
        try:
            tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as err:
            # Still best-effort, but report the failure instead of hiding it.
            print(f'Could not enable memory growth on {gpu}: {err}')

GPUs [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
class Shared_Model:
    """Actor and critic Keras models that share one feature-extraction trunk.

    ``model`` selects the trunk: ``"CNN"``, ``"EIIE"``, ``"LSTM"`` or, for any
    other value, a single dense layer. Only the EIIE variant takes a second
    input (the order history); the others are single-input models.
    """

    def __init__(self, input_shape, action_space, lr, optimizer, model="Dense"):
        """Build and compile ``self.Actor`` and ``self.Critic``.

        Args:
            input_shape: shape of one state, without the batch axis. For EIIE,
                ``input_shape[1]`` is used as the lookback length.
            action_space: number of outputs of the actor's softmax.
            lr: learning rate passed to ``optimizer``.
            optimizer: optimizer class, instantiated once per model.
            model: trunk variant, see the class docstring.
        """
        X_input = Input(input_shape)
        self.action_space = action_space

        self.model = model

        # Shared CNN layers:
        if model=="CNN":
          X = Conv1D(filters=64, kernel_size=6, padding="same", activation="tanh")(X_input)
          X = MaxPooling1D(pool_size=2)(X)
          X = Conv1D(filters=32, kernel_size=3, padding="same", activation="tanh")(X)
          X = MaxPooling1D(pool_size=2)(X)
          X = Flatten()(X)
        # EIIE layers
        elif model=="EIIE":
          # Sizes follow the inputs instead of being fixed at 50 steps / 4 assets:
          # the second convolution spans whatever the first (height-3) one leaves.
          lookback = input_shape[1]
          # Order-history width: net worth + cash + one quantity and one weight per
          # asset — assumes one action per asset; TODO confirm against the environment.
          order_features = 2 + 2 * action_space

          X = Conv2D(2, (3, 1))(X_input)
          X = Activation('relu')(X)
          X = Conv2D(20, (lookback - 2, 1))(X)
          X = Activation('relu')(X)

          inputB = Input(shape=(1, lookback, order_features))
          modelB = Conv2D(filters=2, kernel_size=(3, 1), activation='relu')(inputB)
          modelB = Conv2D(filters=20, kernel_size=(lookback - 2, 1), activation='relu')(modelB)
          modelB = ZeroPadding2D(padding=((0, 0), (0, 4)))(modelB)

          merged = Concatenate(axis=3)([X, modelB])
          X = Conv2D(filters=1, kernel_size=(1, 1))(merged)
          # Dense layers act on the last axis only, so without flattening the heads
          # would emit rank-4 tensors and the losses would broadcast them against
          # the rank-2 targets, mixing samples within a batch.
          X = Flatten()(X)

        # Shared LSTM layers:
        elif model=="LSTM":
          X = LSTM(512, return_sequences=True)(X_input)
          X = LSTM(256)(X)

        # Shared Dense layers:
        else:
          X = Flatten()(X_input)
          X = Dense(512, activation="relu")(X)

        # Critic model
        V = Dense(512, activation="relu")(X)
        V = Dense(256, activation="relu")(V)
        V = Dense(64, activation="relu")(V)
        value = Dense(1, activation=None)(V)
        if model == "EIIE":
          self.Critic = Model(inputs=[X_input,inputB], outputs = value)
        else:
          self.Critic = Model(inputs=X_input, outputs = value)
        self.Critic.compile(loss=self.critic_PPO2_loss, optimizer=optimizer(learning_rate=lr))

        # Actor model
        A = Dense(512, activation="relu")(X)
        A = Dense(256, activation="relu")(A)
        A = Dense(64, activation="relu")(A)
        output = Dense(self.action_space, activation="softmax")(A)
        if model == "EIIE":
          self.Actor = Model(inputs = [X_input,inputB], outputs = output)
        else:
          self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        """Clipped surrogate loss with an entropy bonus.

        ``y_true`` packs ``[advantage | previous prediction]`` column-wise;
        ``y_pred`` is the current actor output.
        """
        # Defined in https://arxiv.org/abs/1707.06347
        advantages, prediction_picks = y_true[:, :1], y_true[:, 1:1+self.action_space]
        LOSS_CLIPPING = 0.2
        ENTROPY_LOSS = 0.001
        # Ratio pi_theta(a_t | s_t) / pi_theta_k(a_t | s_t), computed as the
        # exponential of a difference of logs. Background on using logs:
        # https://cs.stackexchange.com/questions/70518/why-do-we-use-the-log-in-gradient-based-reinforcement-algorithms
        prob = y_pred
        old_prob = prediction_picks

        # Keep both away from zero before taking logs
        prob = K.clip(prob, 1e-10, 1.0)
        old_prob = K.clip(old_prob, 1e-10, 1.0)

        ratio = K.exp(K.log(prob) - K.log(old_prob))

        p1 = ratio * advantages
        p2 = K.clip(ratio, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantages

        actor_loss = -K.mean(K.minimum(p1, p2))

        entropy = -(y_pred * K.log(y_pred + 1e-10))
        entropy = ENTROPY_LOSS * K.mean(entropy)

        total_loss = actor_loss - entropy

        return total_loss

    def actor_predict(self, state, order):
        """Run the actor; ``order`` is only consumed by the two-input EIIE model."""
        if self.model == "EIIE":
          return self.Actor.predict([state, order])
        # Single-input variants take the state alone.
        return self.Actor.predict(state)

    def critic_PPO2_loss(self, y_true, y_pred):
        """Mean squared error between target and predicted value."""
        value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
        return value_loss

    def critic_predict(self, state, order):
        """Run the critic; ``order`` is only consumed by the two-input EIIE model."""
        if self.model == "EIIE":
          return self.Critic.predict([state, order])
        # Single-input variants take the state alone.
        return self.Critic.predict(state)


class Actor_Model:
    """Stand-alone dense actor network with a clipped (PPO-style) loss."""

    def __init__(self, input_shape, action_space, lr, optimizer):
        """Build and compile ``self.Actor``.

        Args:
            input_shape: shape of one state, without the batch axis.
            action_space: number of softmax outputs.
            lr: learning rate passed to ``optimizer``.
            optimizer: optimizer class.
        """
        X_input = Input(input_shape)
        self.action_space = action_space

        # The Input layer already fixes the shape, so Flatten needs no input_shape argument.
        X = Flatten()(X_input)
        X = Dense(512, activation="relu")(X)
        X = Dense(256, activation="relu")(X)
        X = Dense(64, activation="relu")(X)
        output = Dense(self.action_space, activation="softmax")(X)

        self.Actor = Model(inputs = X_input, outputs = output)
        self.Actor.compile(loss=self.ppo_loss, optimizer=optimizer(learning_rate=lr))

    def ppo_loss(self, y_true, y_pred):
        """Clipped surrogate loss with an entropy bonus.

        ``y_true`` packs three column groups: the advantage (1 column), the
        previous prediction (``action_space`` columns) and an action mask
        (remaining columns — presumably one-hot; confirm against the caller).
        """
        # Defined in https://arxiv.org/abs/1707.06347
        advantages, prediction_picks, actions = y_true[:, :1], y_true[:, 1:1+self.action_space], y_true[:, 1+self.action_space:]
        LOSS_CLIPPING = 0.2
        ENTROPY_LOSS = 0.001

        # Mask both distributions with the action columns
        prob = actions * y_pred
        old_prob = actions * prediction_picks

        # Keep both away from zero before taking logs
        prob = K.clip(prob, 1e-10, 1.0)
        old_prob = K.clip(old_prob, 1e-10, 1.0)

        ratio = K.exp(K.log(prob) - K.log(old_prob))

        p1 = ratio * advantages
        p2 = K.clip(ratio, min_value=1 - LOSS_CLIPPING, max_value=1 + LOSS_CLIPPING) * advantages

        actor_loss = -K.mean(K.minimum(p1, p2))

        entropy = -(y_pred * K.log(y_pred + 1e-10))
        entropy = ENTROPY_LOSS * K.mean(entropy)

        total_loss = actor_loss - entropy

        return total_loss

    def actor_predict(self, state):
        """Return the actor's output for a batch of states."""
        return self.Actor.predict(state)

class Critic_Model:
    """Stand-alone dense critic network trained with a mean-squared-error loss."""

    def __init__(self, input_shape, action_space, lr, optimizer):
        """Build and compile ``self.Critic``.

        Args:
            input_shape: shape of one state, without the batch axis.
            action_space: accepted for a signature parallel to ``Actor_Model``; not used.
            lr: learning rate passed to ``optimizer``.
            optimizer: optimizer class.
        """
        X_input = Input(input_shape)

        # The Input layer already fixes the shape, so Flatten needs no input_shape argument.
        V = Flatten()(X_input)
        V = Dense(512, activation="relu")(V)
        V = Dense(256, activation="relu")(V)
        V = Dense(64, activation="relu")(V)
        value = Dense(1, activation=None)(V)

        self.Critic = Model(inputs=X_input, outputs = value)
        self.Critic.compile(loss=self.critic_PPO2_loss, optimizer=optimizer(learning_rate=lr))

    def critic_PPO2_loss(self, y_true, y_pred):
        """Mean squared error between target and predicted value."""
        value_loss = K.mean((y_true - y_pred) ** 2) # standard PPO loss
        return value_loss

    def critic_predict(self, state):
        """Return the critic's value estimate for a batch of states."""
        # The model has a single input, so it is given the state alone.
        return self.Critic.predict(state)


In [None]:
def train_agent(env, agent, visualize=False, train_episodes = 50, training_batch_size=500):
    """Train ``agent`` on ``env`` for ``train_episodes`` rollouts.

    Each episode collects up to ``training_batch_size`` transitions (fewer if the
    environment reports ``done``), fits the agent on them once, logs averages to
    TensorBoard and saves the weights. An extra, score-named checkpoint is saved
    whenever the rolling average net worth over a full window improves.

    Args:
        env: environment providing ``reset``, ``step``, ``net_worth``,
            ``initial_balance``, ``normalize_value`` and ``quants_ubah``.
        agent: agent providing ``create_writer``, ``act``, ``replay``, ``save``
            and a ``writer`` once ``create_writer`` has run.
        visualize: accepted for interface compatibility; not used.
        train_episodes: number of episodes.
        training_batch_size: maximum number of steps per episode.
    """
    agent.create_writer(env.initial_balance, env.normalize_value, train_episodes) # create TensorBoard writer
    total_average = deque(maxlen=100) # save recent 100 episodes net worth
    best_average = 0 # used to track best average net worth
    initial_balance = env.initial_balance

    for episode in range(train_episodes):
        # Reset the env
        state, order = env.reset(env_steps_size = training_batch_size)
        states, orders, rewards, predictions, dones, next_states, next_orders = [], [], [], [], [], [], []
        for t in range(training_batch_size):
            # `order` may be a live container that the environment mutates inside
            # step(); copy it now so the stored transition keeps what the agent saw.
            order_seen = np.array(order)

            # Gets the action to be taken by the agent
            prediction = np.squeeze(agent.act(state, order_seen))

            # Perform an action on env
            next_state, next_order, reward, done, prices, percentages = env.step( prediction)
            states.append(np.expand_dims(state, axis=0))
            orders.append(np.expand_dims(order_seen, axis=0))

            next_states.append(np.expand_dims(next_state, axis=0))
            next_orders.append(np.expand_dims(next_order, axis=0))

            rewards.append(reward)
            dones.append(done)
            predictions.append(prediction)
            state = next_state
            order = next_order

            # Stop the rollout once the environment says the episode is over.
            if done:
                break

        # Train the Model
        a_loss, c_loss = agent.replay(states, orders, rewards, predictions, dones, next_states, next_orders)
        total_average.append(env.net_worth)
        average = np.average(total_average)
        rewardFull = np.average(rewards)
        # Percentage return relative to the environment's own starting balance
        average_percent = (average - initial_balance) / initial_balance * 100

        agent.writer.add_scalar('Data/average reward', rewardFull, episode)
        agent.writer.add_scalar('Data/average net_worth', average, episode)
        agent.writer.add_scalar('Data/average net_worth_percent',  round(average_percent), episode)

        ubah_value = np.dot(env.quants_ubah, prices).item()  # Extract scalar
        diff_value = (env.net_worth - ubah_value).item()  # Ensure scalar

        # Include portfolio percentages in the print statement
        print(len(predictions))
        print(f"Step: {t}, Portfolio Percentages: {percentages}")
        print(f"net worth {episode} {env.net_worth:.2f} {average:.2f} {average_percent:.2f} % UBAH {ubah_value:.2f} diff {diff_value:.2f}")
        if len(total_average) == total_average.maxlen:
          if best_average < average:
            best_average = average
            print("Saving model")
            agent.save(score="{:.2f}".format(best_average), args=[episode, average,  a_loss, c_loss])
        agent.save()

def test_agent(env, visualize=True, test_episodes=10, testing_batch_size=500):

    average_net_worth = 0
    average_UBAH = 0
    for episode in range(test_episodes):
        state, order = env.reset(env_steps_size=testing_batch_size)
        old = 0
        for t in range(testing_batch_size):
            prediction = agent.act(state, np.array(order))
            prediction = np.squeeze(prediction)
            old = env.net_worth
            state, order, reward, done, prices, percentages = env.step(prediction)

        average_net_worth += env.net_worth
        ubah_value = np.dot(env.quants_ubah, prices).item()  # Extract scalar
        diff_value = (env.net_worth - ubah_value).item()  # Ensure scalar
        average_UBAH += ubah_value


        # Include portfolio percentages in the print statement
        print(f"Step: {t}, Portfolio Percentages: {percentages}")
        print("net_worth: {:.2f} % {:.2f} UBAH {:.2f} diff {:.2f}".format(
            env.net_worth, (env.net_worth - 1000) / 10, ubah_value, diff_value
        ))

    print("average: {:.2f} % {:.2f}".format(
        average_net_worth / test_episodes, average_UBAH / test_episodes
    ))

In [None]:
# Raw per-ticker tables, then their normalised counterparts, read in the same order.
raw_paths = ['/content/AAPL.csv', '/content/COST.csv', '/content/PEP.csv', '/content/C.csv']
norm_paths = ['/content/AAPL_norm.csv', '/content/COST_norm.csv', '/content/PEP_norm.csv', '/content/C_norm.csv']
AAPL, COST, PEP, C = [pd.read_csv(path, index_col=0) for path in raw_paths]
AAPL_norm, COST_norm, PEP_norm, C_norm = [pd.read_csv(path, index_col=0) for path in norm_paths]

# Stack the tickers on a leading axis, then move that axis last: [row, column, ticker].
xa = np.copy(np.moveaxis(np.array([AAPL.values, COST.values, PEP.values, C.values]), 0, -1))
x_norm = np.copy(np.moveaxis(np.array([AAPL_norm.values, COST_norm.values, PEP_norm.values, C_norm.values]), 0, -1))
lookback_window_size = 50
print('shapedp',x_norm.shape, xa.shape)

# Last 10000 rows are held out for testing; training stops one lookback window before them.
test_rows = 10000
train_df, train_df_norm = xa[:-test_rows-lookback_window_size], x_norm[:-test_rows-lookback_window_size]
test_df, test_df_norm = xa[-test_rows:], x_norm[-test_rows:]  # ideally 30 days

print('shape12345',train_df.shape)
model = 'EIIE'
tickers = ['AAPL','COST','PEP','C']
train_env = CustomEnv(train_df, train_df_norm, lookback_window_size=lookback_window_size, model=model, stocks=list(tickers))
test_env = CustomEnv(test_df, test_df_norm,lookback_window_size=lookback_window_size, model=model, stocks=list(tickers))
agent = CustomAgent(lookback_window_size=lookback_window_size, lr=0.00001, epochs=5, stocks=list(tickers), optimizer=Adam, batch_size = 32, model=model, shape = x_norm.shape)
train_agent(train_env, agent, visualize=False, train_episodes=100, training_batch_size=500)
test_agent(test_env, visualize=False, test_episodes=15, testing_batch_size=500)

shapedp (15096, 15, 4) (15096, 14, 4)
shape12345 (12046, 14, 4)
500
Step: 499, Portfolio Percentages: [('AAPL', 24.99766133466246), ('COST', 24.968004973740175), ('PEP', 24.980778279228357), ('C', 25.05379265109205)]
net worth 0 968.94 968.94 -3.11 % UBAH 1007.53 diff -38.59
500
Step: 499, Portfolio Percentages: [('AAPL', 24.95456578960754), ('COST', 24.919038441900536), ('PEP', 24.993055488092214), ('C', 25.133340976831736)]
net worth 1 1064.19 1016.56 1.66 % UBAH 1110.84 diff -46.66
500
Step: 499, Portfolio Percentages: [('AAPL', 24.893570196054778), ('COST', 24.952231604938056), ('PEP', 24.92275859438855), ('C', 25.23145108973386)]
net worth 2 1059.56 1030.90 3.09 % UBAH 1150.78 diff -91.22
500
Step: 499, Portfolio Percentages: [('AAPL', 24.977213417969647), ('COST', 24.913086197379727), ('PEP', 24.89807475041277), ('C', 25.211741572295832)]
net worth 3 985.19 1019.47 1.95 % UBAH 1009.16 diff -23.96
500
Step: 499, Portfolio Percentages: [('AAPL', 25.127015286445488), ('COST', 24.833

In [None]:
# Same setup as the first experiment, with lr=0.0001 and 400-step episodes.
train_env1 = CustomEnv(train_df, train_df_norm, lookback_window_size=lookback_window_size, model=model, stocks=['AAPL','COST','PEP','C'])
test_env1 = CustomEnv(test_df, test_df_norm,lookback_window_size=lookback_window_size, model=model, stocks=['AAPL','COST','PEP','C'])
agent1 = CustomAgent(lookback_window_size=lookback_window_size, lr=0.0001, epochs=5, stocks=['AAPL','COST','PEP','C'], optimizer=Adam, batch_size = 32, model=model, shape = x_norm.shape)
train_agent(train_env1, agent1, visualize=False, train_episodes=100, training_batch_size=400)
# test_agent() looks up the notebook-level name `agent` rather than taking the
# agent as an argument, so point that name at the agent trained in this cell;
# otherwise the agent from the previous experiment is the one evaluated here.
agent = agent1
test_agent(test_env1, visualize=False, test_episodes=15, testing_batch_size=400)

400
Step: 399, Portfolio Percentages: [('AAPL', 24.92718165126963), ('COST', 24.748534082033366), ('PEP', 25.112566055389436), ('C', 25.21201073859835)]
net worth 0 913.94 913.94 -8.61 % UBAH 938.69 diff -24.75
400
Step: 399, Portfolio Percentages: [('AAPL', 28.19927881594445), ('COST', 27.59174903032955), ('PEP', 21.44979971508807), ('C', 22.75925524041011)]
net worth 1 1011.33 962.63 -3.74 % UBAH 935.40 diff 75.93
400
Step: 399, Portfolio Percentages: [('AAPL', 28.907925495744724), ('COST', 27.893540637589858), ('PEP', 21.305551032751076), ('C', 21.893587225081735)]
net worth 2 1095.28 1006.85 0.68 % UBAH 1152.25 diff -56.97
400
Step: 399, Portfolio Percentages: [('AAPL', 30.852383467444056), ('COST', 28.278847121873422), ('PEP', 20.828512524117045), ('C', 20.040318123822384)]
net worth 3 920.87 985.35 -1.46 % UBAH 936.03 diff -15.16
400
Step: 399, Portfolio Percentages: [('AAPL', 30.419330473082418), ('COST', 24.51632588398982), ('PEP', 24.6161190441801), ('C', 20.44830397697747)]
n

In [None]:
# Same setup as the first experiment, with lr=0.001 and 400-step episodes.
train_env1 = CustomEnv(train_df, train_df_norm, lookback_window_size=lookback_window_size, model=model, stocks=['AAPL','COST','PEP','C'])
test_env1 = CustomEnv(test_df, test_df_norm,lookback_window_size=lookback_window_size, model=model, stocks=['AAPL','COST','PEP','C'])
agent1 = CustomAgent(lookback_window_size=lookback_window_size, lr=0.001, epochs=5, stocks=['AAPL','COST','PEP','C'], optimizer=Adam, batch_size = 32, model=model, shape = x_norm.shape)
train_agent(train_env1, agent1, visualize=False, train_episodes=100, training_batch_size=400)
# test_agent() looks up the notebook-level name `agent` rather than taking the
# agent as an argument, so point that name at the agent trained in this cell;
# otherwise an agent from an earlier experiment is the one evaluated here.
agent = agent1
test_agent(test_env1, visualize=False, test_episodes=15, testing_batch_size=400)