# Step 0: Global Libraries Import

In [123]:
# !pip install exchange_calendars
# !pip install gymnasium
# !pip install yfinance stockstats
# !pip install optuna
# !pip install stable_baselines3
# !pip install pyportfolioopt

In [124]:
#global variables
Tickers = ["AAPL", "COST", "CVX", "MSFT", "JNJ", "NVDA", "PG", "UNH", "V", "WMT"]
#randomly chosen from top 30 tickers.
start_date = '2022-01-01'
end_date = '2024-03-13'
time_interval = "1H"
# INDICATORS = [
#     "macd",
#     "boll_ub",
#     "boll_lb",
#     "rsi_30",
#     "cci_30",
#     "dx_30",
#     "close_30_sma",
#     "close_60_sma",
# ]
#only indicator we used is s&p 500 rn
INDICATORS = ['sp']

In [125]:
#global libraries
import datetime
from datetime import date
from datetime import timedelta
from sqlite3 import Timestamp
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Type
from typing import TypeVar
from typing import Union

import exchange_calendars as tc
import numpy as np
import pandas as pd
import pytz
import yfinance as yf
from stockstats import StockDataFrame as Sdf
from __future__ import annotations

from typing import List

import gymnasium as gym
import matplotlib
import matplotlib.pyplot as plt
from gymnasium import spaces
from gymnasium.utils import seeding
from stable_baselines3.common.vec_env import DummyVecEnv

matplotlib.use("Agg")

from stable_baselines3.common.logger import Logger, KVWriter, CSVOutputFormat


# Step 1: Data Preprocessing
We used hourly US historical stock data from Kaggle dataset, from 2022/1/1 to 2024/3/15. Since the author of dataset is located in Moscow, when cleaning the data, we converted the timezone to New York, then added S&P indexes as a benchmark.

In [126]:
def import_data(tickers: list[str]) -> dict[str, pd.DataFrame]:
  dataframes = {}
  for ticker in tickers:
    filename = f"{ticker}.US_H1.csv"
    df = pd.read_csv(filename)
    dataframes[ticker] = df
  return dataframes

def process_datetime(row):
    dt = pd.to_datetime(row['datetime'])
    dt_moscow = pytz.timezone('Europe/Moscow').localize(dt)
    dt_new_york = dt_moscow.astimezone(pytz.timezone('America/New_York'))
    return dt_new_york


def clean_data(tickers: list[str], start_date: date, end_date: date) -> pd.DataFrame:
  # tic_list = tickers
  dataframes = import_data(tickers)

  # trading_days = get_trading_days(start=start_date, end=end_date)
  new_df = pd.DataFrame()
  for ticker, df in dataframes.items():
      # Add "tic" column
      df["tic"] = ticker
      df = df.sort_values(by='datetime', ascending=True)
      # Rename "datetime" to "timestamp"
      # df.rename(columns={"datetime": "timestamp"}, inplace=True)
      date_formats = ["%Y-%m-%d %H:%M", "%m/%d/%Y %I:%M:%S %p"]

  # Iterate over each format and try to convert datetime
      for date_format in date_formats:
          try:
              df['datetime'] = pd.to_datetime(df['datetime'], format=date_format)
              break
          except ValueError:
              pass
      df['datetime'] = df.apply(lambda row: process_datetime(row), axis=1)


      all_days = pd.date_range(start=start_date, end=end_date, freq='H', tz='America/New_York')
      complete_df = pd.DataFrame(all_days, columns=['datetime'])
      complete_df['key'] = 1
      df['key'] = 1

      # Merge to align existing data with the complete timeframe
      merged_df = pd.merge(complete_df, df, on='datetime', how='left')
      # merged_df.drop(['key'], axis=1, inplace=True)

      # Forward fill missing data
      merged_df['tic'].fillna(method='ffill', inplace=True)
      merged_df.fillna(method='ffill', inplace=True)

      # Filter rows to include only 9 AM to 4 PM and weekdays
      merged_df = merged_df[merged_df['datetime'].dt.hour.between(9, 16)]
      merged_df = merged_df[merged_df['datetime'].dt.weekday < 5]

      # Concatenate the prepared data for this ticker to the new DataFrame
      new_df = pd.concat([new_df, merged_df], ignore_index=True)

  new_df = new_df.reset_index()
  new_df = new_df.rename(columns={"index": "timestamp"})
  return new_df

def add_sp(data):
  sp = pd.read_csv('S&P500.csv')
  #convert to same format
  data['date'] = pd.to_datetime(data['datetime']).dt.normalize()
  sp['Date'] = pd.to_datetime(sp['Date']).dt.tz_localize('America/New_York')

  combined_df = pd.merge(data, sp, left_on='date', right_on='Date', how='left')
  combined_df.fillna(method='ffill', inplace=True)
  #merge by same date
  combined_df['sp'] = combined_df['Close/Last']
  combined_df = combined_df.drop(columns=['Date', 'Close/Last', 'Open', 'High', 'Low', 'date'])
  combined_df['date'] = combined_df['datetime']
  return combined_df

In [127]:
new_df = clean_data(Tickers, start_date, end_date)
new_df = add_sp(new_df)
new_df.head(10)

  all_days = pd.date_range(start=start_date, end=end_date, freq='H', tz='America/New_York')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['tic'].fillna(method='ffill', inplace=True)
  merged_df['tic'].fillna(method='ffill', inplace=True)
  merged_df.fillna(method='ffill', inplace=True)
  all_days = pd.date_range(start=start_date, end=end_date, freq='H', tz='America/New_York')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=T

Unnamed: 0,timestamp,datetime,key_x,open,high,low,close,volume,tic,key_y,sp,date
0,0,2022-01-03 09:00:00-05:00,1,177.83,180.93,177.71,180.45,12571329.0,AAPL,1.0,4796.56,2022-01-03 09:00:00-05:00
1,1,2022-01-03 10:00:00-05:00,1,180.46,181.43,180.27,180.96,12817320.0,AAPL,1.0,4796.56,2022-01-03 10:00:00-05:00
2,2,2022-01-03 11:00:00-05:00,1,180.95,181.77,180.39,181.47,9409836.0,AAPL,1.0,4796.56,2022-01-03 11:00:00-05:00
3,3,2022-01-03 12:00:00-05:00,1,181.47,182.17,181.08,181.69,7256820.0,AAPL,1.0,4796.56,2022-01-03 12:00:00-05:00
4,4,2022-01-03 13:00:00-05:00,1,181.69,182.88,181.64,182.37,9391830.0,AAPL,1.0,4796.56,2022-01-03 13:00:00-05:00
5,5,2022-01-03 14:00:00-05:00,1,182.37,182.47,181.66,181.88,7665642.0,AAPL,1.0,4796.56,2022-01-03 14:00:00-05:00
6,6,2022-01-03 15:00:00-05:00,1,181.87,182.2,181.2,182.0,12634063.0,AAPL,1.0,4796.56,2022-01-03 15:00:00-05:00
7,7,2022-01-03 16:00:00-05:00,1,182.01,182.01,182.0,182.01,7016928.0,AAPL,1.0,4796.56,2022-01-03 16:00:00-05:00
8,8,2022-01-04 09:00:00-05:00,1,182.63,182.94,181.49,182.26,11553605.0,AAPL,1.0,4793.54,2022-01-04 09:00:00-05:00
9,9,2022-01-04 10:00:00-05:00,1,182.27,182.68,180.97,181.03,14797301.0,AAPL,1.0,4793.54,2022-01-04 10:00:00-05:00


# Step 2: Data Splittng
Split the data into two parts - 19 months for training the agents, and 9 for testing

In [128]:
def data_split(df, start, end, target_date_col="datetime"):
    """
    split the dataset into training or testing using date
    :param data: (df) pandas dataframe, start, end
    :return: (df) pandas dataframe
    """
    data = df[(df[target_date_col] >= start) & (df[target_date_col] < end)]

    data = data.sort_values([target_date_col, "tic"], ignore_index=True)
    data.index = data[target_date_col].factorize()[0]
    return data


In [129]:
TRAIN_START_DATE = '2022-01-01'
TRAIN_END_DATE = '2023-07-01'
TEST_START_DATE = '2023-07-01'
TEST_END_DATE = '2024-03-01'

In [130]:
train = data_split(new_df, TRAIN_START_DATE,TRAIN_END_DATE)
test = data_split(new_df, TEST_START_DATE, TEST_END_DATE)
train_length = len(train)
test_length = len(test)
print(train_length)
print(test_length)

31200
13920


In [131]:
train.tail()

Unnamed: 0,timestamp,datetime,key_x,open,high,low,close,volume,tic,key_y,sp,date
3119,25999,2023-06-30 16:00:00-04:00,1,423.13,423.16,422.78,422.81,15705.0,NVDA,1.0,4450.38,2023-06-30 16:00:00-04:00
3119,30575,2023-06-30 16:00:00-04:00,1,151.78,151.8,151.67,151.74,3664992.0,PG,1.0,4450.38,2023-06-30 16:00:00-04:00
3119,35151,2023-06-30 16:00:00-04:00,1,480.96,481.04,480.47,480.64,895538.0,UNH,1.0,4450.38,2023-06-30 16:00:00-04:00
3119,39727,2023-06-30 16:00:00-04:00,1,237.49,237.53,237.32,237.48,1415378.0,V,1.0,4450.38,2023-06-30 16:00:00-04:00
3119,44303,2023-06-30 16:00:00-04:00,1,157.27,157.28,157.16,157.18,1349958.0,WMT,1.0,4450.38,2023-06-30 16:00:00-04:00


# Step 3: Construct Environment
Build training environment mimicing openAI gym, so that it works with stable baseline3

In [132]:
stock_dimension = len(train.tic.unique())
state_space = 1 + 2*stock_dimension + len(INDICATORS)*stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")
buy_cost_list = sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension
INDICATORS = ['sp']
env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}

Stock Dimension: 10, State Space: 31


In [133]:
#class make it easier to store variables
class TradingEnvironment(gym.Env):
    metadata = {"render.modes": ["human"]}
    def __init__(
        self,
        df: pd.DataFrame,
        stock_dim: int,
        hmax: int,
        initial_amount: int,
        num_stock_shares: list[int],
        buy_cost_pct: list[float],
        sell_cost_pct: list[float],
        reward_scaling: float,
        state_space: int,
        action_space: int,
        tech_indicator_list: list[str],
        print_verbosity=10,
        totalhour = 0,
        day=0,
        hour=9,
        initial=True,
        previous_state=[],
        iteration="",
    ):
        self.day = day
        self.hour = hour
        self.totalhour = totalhour
        self.df = df
        self.stock_dim = stock_dim
        self.hmax = hmax
        self.num_stock_shares = num_stock_shares
        self.initial_amount = initial_amount  # get the initial cash
        self.buy_cost_pct = buy_cost_pct
        self.sell_cost_pct = sell_cost_pct
        self.reward_scaling = reward_scaling
        self.state_space = state_space
        self.action_space = action_space
        self.tech_indicator_list = tech_indicator_list
        self.action_space = spaces.Box(low=-1, high=1, shape=(self.action_space,))
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.state_space,)
        )
        self.data = self.df.loc[self.day, :]
        self.terminal = False
        self.print_verbosity = print_verbosity
        self.initial = initial
        self.previous_state = previous_state
        self.iteration = iteration
        # initalize state
        self.state = self._initiate_state()

        # initialize reward
        self.reward = 0
        self.turbulence = 0
        self.cost = 0
        self.tests = 0
        self.episode = 0
        #store all the total balance change
        self.balance_memory = [
            self.initial_amount
            + np.sum(
                np.array(self.num_stock_shares)
                * np.array(self.state[1 : 1 + self.stock_dim])
            )
        ]
        #cash + current stock (should be 0)
        self.rewards_memory = []
        self.actions_memory = []
        self.state_memory = ([])
        #store the state in middle of training
        self.date_memory = [self._get_date()]
        self._seed()

    def _sell_stock(self, index, action):
        if (self.state[index + 2 * self.stock_dim + 1] != True):  #if price > 0 (correct data:)
            if self.state[index + self.stock_dim + 1] > 0:
                #if holding more than 0 shares
                sell_num_shares = min(abs(action), self.state[index + self.stock_dim + 1])
                sell_amount = (
                    self.state[index + 1]
                    * sell_num_shares
                    * (1 - self.sell_cost_pct[index]))
                # update balance and do sell
                self.state[0] += sell_amount

                self.state[index + self.stock_dim + 1] -= sell_num_shares
                self.cost += (
                    self.state[index + 1]
                    * sell_num_shares
                    * self.sell_cost_pct[index])
                self.tests += 1
            else:
                sell_num_shares = 0
        else:
            sell_num_shares = 0

        return sell_num_shares


    def _buy_stock(self, index, action):
        if (self.state[index + 2 * self.stock_dim + 1] != True):
            available_amount = self.state[0] // (self.state[index + 1] * (1 + self.buy_cost_pct[index])) 
            #check cost of trading and get max available

            #get as much as possible
            buy_num_shares = min(available_amount, action)
            #   print(available_amount)
            buy_amount = (
                self.state[index + 1]
                * buy_num_shares
                * (1 + self.buy_cost_pct[index])
            )
            self.state[0] -= buy_amount

            self.state[index + self.stock_dim + 1] += buy_num_shares
            self.cost += (
                self.state[index + 1] * buy_num_shares * self.buy_cost_pct[index]
            )
            self.tests += 1
        else:
            buy_num_shares = 0

        return buy_num_shares


    def step(self, actions):
        # Adjust the terminal condition to account for daily trading hours
        self.terminal = self.totalhour >= len(self.df.index.unique()) - 1 // 8 - 1

        if self.terminal:
            end_total_asset = self.state[0] + sum(
                np.array(self.state[1 : (self.stock_dim + 1)])
                * np.array(self.state[(self.stock_dim + 1) : (self.stock_dim * 2 + 1)]))
            df_total_value = pd.DataFrame(self.balance_memory)
            tot_reward = (
                self.state[0]
                + sum(
                    np.array(self.state[1 : (self.stock_dim + 1)])
                    * np.array(
                        self.state[(self.stock_dim + 1) : (self.stock_dim * 2 + 1)]
                    )
                )
                - self.balance_memory[0]
            )
            # initial_amount is only cash part of our initial asset
            df_total_value.columns = ["account_value"]
            df_total_value["date"] = self.date_memory
            df_total_value["daily_return"] = df_total_value["account_value"].pct_change(1)
            if df_total_value["daily_return"].std() != 0:
                sharpe = (
                    (252**0.5)
                    * df_total_value["daily_return"].mean()
                    / df_total_value["daily_return"].std()
                )
            df_rewards = pd.DataFrame(self.rewards_memory)
            df_rewards.columns = ["account_rewards"]
            df_rewards["date"] = self.date_memory[:-1]
            if self.episode % self.print_verbosity == 0:
                print(f"day: {self.day}, episode: {self.episode}")
                print(f"begin_total_asset: {self.balance_memory[0]:0.2f}")
                print(f"end_total_asset: {end_total_asset:0.2f}")
                print(f"total_reward: {tot_reward:0.2f}")
                print(f"total_cost: {self.cost:0.2f}")
                print(f"total_tests: {self.tests}")
                if df_total_value["daily_return"].std() != 0:
                    print(f"Sharpe: {sharpe:0.3f}")
                print("=================================")

            return self.state, self.reward, self.terminal, False, {}

        else:
            actions = actions * self.hmax #scale actions
            actions = actions.astype(int)
            begin_total_asset = self.state[0] + sum(np.array(self.state[1 : (self.stock_dim + 1)]) * np.array(self.state[(self.stock_dim + 1) : (self.stock_dim * 2 + 1)]))
            argsort_actions = np.argsort(actions)
            sell_index = argsort_actions[:np.count_nonzero(actions < 0)]
            buy_index = argsort_actions[::-1][:np.count_nonzero(actions > 0)]
            for index in sell_index:
                actions[index] = self._sell_stock(index, actions[index]) * (-1)

            for index in buy_index:
                actions[index] = self._buy_stock(index, actions[index])

            self.actions_memory.append(actions)

            # Transition to the next state
            self.totalhour += 1
            self.hour += 1
            if self.hour > 16:  #Count for time and day
                self.hour = 9
                self.day += 1  # Move to the next day

            # Ensure that data fetching considers both day and hour
            self.data = self.df.loc[self.totalhour, :]
            self.state = self._update_state()

            end_total_asset = self.state[0] + sum(
                np.array(self.state[1 : (self.stock_dim + 1)])
                * np.array(self.state[(self.stock_dim + 1) : (self.stock_dim * 2 + 1)])
            )

            self.state = self._update_state()
            self.balance_memory.append(end_total_asset)
            self.date_memory.append(self._get_date())
            self.reward = end_total_asset - begin_total_asset
            self.rewards_memory.append(self.reward)
            self.state_memory.append(self.state)

            return self.state, self.reward, self.terminal, False, {}

    def reset(
        self,
        *,
        seed=None,
        options=None,
    ):
        # initiate state
        self.day = 0
        self.totalhour = 0
        self.hour = 9
        self.data = self.df.loc[self.totalhour, :]
        self.state = self._initiate_state()

        if self.initial:
            self.balance_memory = [
                self.initial_amount
                + np.sum(
                    np.array(self.num_stock_shares)
                    * np.array(self.state[1 : 1 + self.stock_dim])
                )
            ]
        else:
            previous_total_asset = self.previous_state[0] + sum(
                np.array(self.state[1 : (self.stock_dim + 1)])
                * np.array(
                    self.previous_state[(self.stock_dim + 1) : (self.stock_dim * 2 + 1)]
                )
            )
            self.balance_memory = [previous_total_asset]

        self.turbulence = 0
        self.cost = 0
        self.tests = 0
        self.terminal = False
        self.rewards_memory = []
        self.actions_memory = []
        self.date_memory = [self._get_date()]

        self.episode += 1
        return self.state, {}

    def render(self, mode="human", close=False):
        return self.state

    def _initiate_state(self):
        if self.initial:
            # For Initial State
            state = (
                [self.initial_amount]
                + self.data.close.values.tolist()
                + self.num_stock_shares
                + sum((self.data[tech].values.tolist() for tech in self.tech_indicator_list),
                    [],
                )
            ) 
            
        else:
            # Using Previous State
            state = (
                [self.previous_state[0]]
                + self.data.close.values.tolist()
                + self.previous_state[
                    (self.stock_dim + 1) : (self.stock_dim * 2 + 1)
                ]
                + sum(
                    (self.data[tech].values.tolist() for tech in self.tech_indicator_list),
                    [],
                )
            )
            
        return state

    def _update_state(self):
        state = (
            [self.state[0]]
            + self.data.close.values.tolist()
            + list(self.state[(self.stock_dim + 1) : (self.stock_dim * 2 + 1)])
            + sum(
                (self.data[tech].values.tolist() for tech in self.tech_indicator_list),
                [],
            )
        )

        return state

    def _get_date(self):
        date = self.data.date.unique()[0]

    def save_balance_memory(self):
        date_list = self.date_memory
        asset_list = self.balance_memory
        df_account_value = pd.DataFrame(
            {"date": date_list, "account_value": asset_list}
        )
        return df_account_value
    
    # add save_state_memory to preserve state in the trading process
    def save_state_memory(self):
        if len(self.df.tic.unique()) > 1:
            # date and close price length must match actions length
            date_list = self.date_memory[:-1]
            df_date = pd.DataFrame(date_list)
            df_date.columns = ["date"]

            state_list = self.state_memory
            df_states = pd.DataFrame(
                state_list,
                columns=["cash"],
            )
            df_states.index = df_date.date
        return df_states

    def save_balance_memory(self):
        date_list = self.date_memory
        asset_list = self.balance_memory
        df_account_value = pd.DataFrame({"date": date_list, "account_value": asset_list})
        return df_account_value

    def save_action_memory(self):
        # date and close price length must match actions length
        date_list = self.date_memory[:-1]
        df_date = pd.DataFrame(date_list)
        df_date.columns = ["date"]

        action_list = self.actions_memory
        df_actions = pd.DataFrame(action_list)
        df_actions.columns = self.data.tic.values
        df_actions.index = df_date.date
        # df_actions = pd.DataFrame({'date':date_list,'actions':action_list})
        return df_actions

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def get_sb_env(self):
        e = DummyVecEnv([lambda: self])
        obs = e.reset()
        return e, obs


# Step 4: Stable Baselines3 Libraries

In [134]:
from __future__ import annotations

DATA_SAVE_DIR = "datasets"
TRAINED_MODEL_DIR = "trained_models"
TENSORBOARD_LOG_DIR = "tensorboard_log"
RESULTS_DIR = "results"

# Model Parameters
A2C_PARAMS = {"n_steps": 5, "ent_coef": 0.01, "learning_rate": 0.0007}
PPO_PARAMS = {"n_steps": 2048, "ent_coef": 0.01, "learning_rate": 0.00025, "batch_size": 64}
DDPG_PARAMS = {"batch_size": 128, "buffer_size": 50000, "learning_rate": 0.001}
TD3_PARAMS = {"batch_size": 100, "buffer_size": 1000000, "learning_rate": 0.001}

In [135]:
#DRL

# DRL models from Stable Baselines 3
from __future__ import annotations

import time

from stable_baselines3 import A2C
from stable_baselines3 import DDPG
from stable_baselines3 import PPO
from stable_baselines3 import TD3
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv


MODELS = {"a2c": A2C, "ddpg": DDPG, "td3": TD3, "ppo": PPO}
MODEL_KWARGS = {
    "a2c": A2C_PARAMS,
    "ppo": PPO_PARAMS,
    "ddpg": DDPG_PARAMS,
    "td3": TD3_PARAMS,
}

NOISE = {
    "normal": NormalActionNoise,
    "ornstein_uhlenbeck": OrnsteinUhlenbeckActionNoise,
}


class TensorboardCallback(BaseCallback):
    
    #Custom callback for plotting additional values in tensorboard.
    
    def __init__(self, verbose=0):
        super().__init__(verbose)

    def _on_step(self) -> bool:
        try:
            self.logger.record(key="train/reward", value=self.locals["rewards"][0])

        except BaseException as error:
            try:
                self.logger.record(key="train/reward", value=self.locals["reward"][0])

            except BaseException as inner_error:
                # Handle the case where neither "rewards" nor "reward" is found
                self.logger.record(key="train/reward", value=None)
                # Print the original error and the inner error for debugging
                print("Original Error:", error)
                print("Inner Error:", inner_error)
        return True


class DRLAgent:
    #help call agents with given gym environment
    def __init__(self, env):
        self.env = env

    def get_model(
        self,
        model_name,
        policy="MlpPolicy",
        policy_kwargs=None,
        model_kwargs=None,
        verbose=1,
        seed=None,
        tensorboard_log=None,
    ):
        if model_name not in MODELS:
            raise ValueError(
                f"Model '{model_name}' not found in MODELS."
            )  # this is more informative than NotImplementedError("NotImplementedError")

        if model_kwargs is None:
            model_kwargs = MODEL_KWARGS[model_name]

        if "action_noise" in model_kwargs:
            n_actions = self.env.action_space.shape[-1]
            model_kwargs["action_noise"] = NOISE[model_kwargs["action_noise"]](
                mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)
            )
        print(model_kwargs)
        return MODELS[model_name](
            policy=policy,
            env=self.env,
            tensorboard_log=tensorboard_log,
            verbose=verbose,
            policy_kwargs=policy_kwargs,
            seed=seed,
            **model_kwargs,
        )

    @staticmethod
    def train_model(model, tb_log_name, total_timesteps=5000):  
        model = model.learn(
            total_timesteps=total_timesteps,
            tb_log_name=tb_log_name,
            callback=TensorboardCallback(),
        )
        return model

    @staticmethod
    def DRL_prediction(model, environment, deterministic=True):
        #make prediction
        test_env, test_obs = environment.get_sb_env()
        account_memory = None  #avoid unnecessary list creation
        actions_memory = None

        test_env.reset()
        max_steps = len(environment.df.index.unique()) - 1

        for i in range(len(environment.df.index.unique())):
            action, _states = model.predict(test_obs, deterministic=deterministic)
            test_obs, rewards, dones, info = test_env.step(action)

            if (i == max_steps - 1):
                account_memory = test_env.env_method(method_name="save_balance_memory")
                actions_memory = test_env.env_method(method_name="save_action_memory")
            # add current state to state memory
            if dones[0]:
                print("Prediction Complete")
                break
        return account_memory[0], actions_memory[0]


# Step 5: Train Agents
Train three agents baised on different algorithms

In [136]:
#Environment Construction
e_train_gym = TradingEnvironment(df = train, **env_kwargs)
env_train, _ = e_train_gym.get_sb_env()

In [137]:
agent = DRLAgent(env = env_train)
#select algorithm in use
if_using_a2c = True
if_using_ddpg = True
if_using_ppo = True

In [138]:
import os
from stable_baselines3.common.logger import Logger, configure

def configure_logger(log_dir, formats):
    os.makedirs(log_dir, exist_ok=True)
    logger = Logger(
        folder=log_dir,
        output_formats=[format_ for format_ in formats if format_ in ["stdout", "csv", "tensorboard"]]
    )
    return logger

In [139]:
agent = DRLAgent(env = env_train)
model_a2c = agent.get_model("a2c")

if if_using_a2c:
  # set up logger
  tmp_path = RESULTS_DIR + '/a2c'
  new_logger_a2c = configure(tmp_path, ["stdout", "csv", "tensorboard"])
  # Set new logger
  model_a2c.set_logger(new_logger_a2c)

{'n_steps': 5, 'ent_coef': 0.01, 'learning_rate': 0.0007}
Using cpu device
Logging to results/a2c


In [140]:
trained_a2c = agent.train_model(model=model_a2c, tb_log_name='a2c', total_timesteps=5000) if if_using_a2c else None

------------------------------------
| time/                 |          |
|    fps                | 413      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -14.3    |
|    explained_variance | 1.79e-07 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 2.07e+04 |
|    reward             | 2030.15  |
|    std                | 1.01     |
|    value_loss         | 6.78e+06 |
------------------------------------
-------------------------------------
| time/                 |           |
|    fps                | 438       |
|    iterations         | 200       |
|    time_elapsed       | 2         |
|    total_timesteps    | 1000      |
| train/                |           |
|    entropy_loss       | -14.3     |
|    explained_variance | 1.79e-07  |
|    learning_rate      | 0.0007    |
|    n_updates          | 19

In [142]:
agent = DRLAgent(env = env_train)
model_ddpg = agent.get_model("ddpg")

if if_using_ddpg:
  # set up logger
  tmp_path = RESULTS_DIR + '/ddpg'
  new_logger_ddpg = configure(tmp_path, ["stdout", "csv", "tensorboard"])
  # Set new logger
  model_ddpg.set_logger(new_logger_ddpg)

{'batch_size': 128, 'buffer_size': 50000, 'learning_rate': 0.001}
Using cpu device
Logging to results/ddpg


In [143]:
trained_ddpg = agent.train_model(model=model_ddpg,
                             tb_log_name='ddpg',
                             total_timesteps=5000) if if_using_ddpg else None

In [145]:
agent = DRLAgent(env = env_train)
model_ppo = agent.get_model("ppo",model_kwargs = PPO_PARAMS)

if if_using_ppo:
  # set up logger
  tmp_path = RESULTS_DIR + '/ppo'
  new_logger_ppo = configure(tmp_path, ["stdout", "csv", "tensorboard"])
  # Set new logger
  model_ppo.set_logger(new_logger_ppo)

{'n_steps': 2048, 'ent_coef': 0.01, 'learning_rate': 0.00025, 'batch_size': 64}
Using cpu device
Logging to results/ppo


In [146]:
trained_ppo = agent.train_model(model=model_ppo,
                             tb_log_name='ppo',
                             total_timesteps=5000) if if_using_ppo else None

-----------------------------------
| time/              |            |
|    fps             | 656        |
|    iterations      | 1          |
|    time_elapsed    | 3          |
|    total_timesteps | 2048       |
| train/             |            |
|    reward          | -3014.6902 |
-----------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 214         |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005292926 |
|    clip_fraction        | 0.0261      |
|    clip_range           | 0.2         |
|    entropy_loss         | -14.2       |
|    explained_variance   | -8.34e-07   |
|    learning_rate        | 0.00025     |
|    loss                 | 6.64e+07    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00606    |
|   

# Step 6: Backtesting
Compare and evaluate performance of three models as well as prediction from Mean-Variance Optimization (MVO) methdology

In [148]:
stock_dimension = len(test.tic.unique())
state_space = 1 + 2 * stock_dimension + len(INDICATORS) * stock_dimension
print(f"Stock Dimension: {stock_dimension}, State Space: {state_space}")
buy_cost_list = sell_cost_list = [0.001] * stock_dimension
num_stock_shares = [0] * stock_dimension

env_kwargs = {
    "hmax": 100,
    "initial_amount": 1000000,
    "num_stock_shares": num_stock_shares,
    "buy_cost_pct": buy_cost_list,
    "sell_cost_pct": sell_cost_list,
    "state_space": state_space,
    "stock_dim": stock_dimension,
    "tech_indicator_list": INDICATORS,
    "action_space": stock_dimension,
    "reward_scaling": 1e-4
}

Stock Dimension: 10, State Space: 31


In [149]:
e_test_gym = TradingEnvironment(df = test, **env_kwargs)
#new test environment

In [150]:
df_account_value_a2c, df_actions_a2c = DRLAgent.DRL_prediction(
    model=trained_a2c, 
    environment = e_test_gym) if if_using_a2c else (None, None)

Prediction Complete


In [151]:
df_account_value_ddpg, df_actions_ddpg = DRLAgent.DRL_prediction(
    model=trained_ddpg, 
    environment = e_test_gym) if if_using_ddpg else (None, None)

Prediction Complete


In [152]:
df_account_value_ppo, df_actions_ppo = DRLAgent.DRL_prediction(
    model=trained_ppo, 
    environment = e_test_gym) if if_using_ppo else (None, None)

Prediction Complete


In [153]:
def process_df_for_mvo(df):
  return df.pivot(index="date", columns="tic", values="close")

def StockReturnsComputing(StockPrice, Rows, Columns): 
  return (np.diff(StockPrice, axis=0) / StockPrice[:-1]) * 100

In [154]:
StockData = process_df_for_mvo(train)
testData = process_df_for_mvo(test)

testData.to_numpy()

array([[192.94, 538.21, 157.16, ..., 479.08, 236.66, 157.48],
       [191.98, 539.56, 157.62, ..., 478.41, 236.61, 158.03],
       [192.23, 539.88, 157.79, ..., 478.42, 237.65, 158.38],
       ...,
       [180.2 , 749.19, 152.22, ..., 493.31, 283.01,  58.79],
       [180.74, 743.59, 152.12, ..., 493.53, 282.66,  58.62],
       [180.72, 743.27, 152.01, ..., 493.6 , 282.64,  58.61]])

In [155]:
#compute asset returns
#array for numerical operations
arStockPrices = np.asarray(StockData)
[Rows, Cols] = arStockPrices.shape

# daily returns for each stock
arReturns = StockReturnsComputing(arStockPrices, Rows, Cols)
meanReturns = np.mean(arReturns, axis=0)
covReturns = np.cov(arReturns, rowvar=False)
np.set_printoptions(precision=3, suppress=True)

print('Mean returns of assets in k-portfolio 1\n', meanReturns)
print('Variance-Covariance matrix of returns\n', covReturns)


Mean returns of assets in k-portfolio 1
 [ 0.004  0.001  0.012 -0.     0.003  0.019 -0.001  0.001  0.005  0.004]
Variance-Covariance matrix of returns
 [[0.442 0.228 0.135 0.076 0.345 0.54  0.103 0.138 0.258 0.123]
 [0.228 0.365 0.092 0.087 0.225 0.322 0.123 0.138 0.173 0.182]
 [0.135 0.092 0.504 0.047 0.108 0.196 0.029 0.097 0.105 0.062]
 [0.076 0.087 0.047 0.151 0.073 0.047 0.094 0.108 0.073 0.07 ]
 [0.345 0.225 0.108 0.073 0.508 0.607 0.109 0.133 0.258 0.117]
 [0.54  0.322 0.196 0.047 0.607 1.655 0.098 0.179 0.41  0.12 ]
 [0.103 0.123 0.029 0.094 0.109 0.098 0.198 0.108 0.096 0.101]
 [0.138 0.138 0.097 0.108 0.133 0.179 0.108 0.31  0.118 0.091]
 [0.258 0.173 0.105 0.073 0.258 0.41  0.096 0.118 0.407 0.096]
 [0.123 0.182 0.062 0.07  0.117 0.12  0.101 0.091 0.096 0.252]]


In [156]:
from pypfopt.efficient_frontier import EfficientFrontier
from pypfopt.exceptions import OptimizationError

# Initialize the Efficient Frontier with the previously calculated mean returns and covariance matrix
try:
    ef = EfficientFrontier(meanReturns, covReturns, weight_bounds=(0, 1))
    # Optimize the portfolio to achieve the maximum Sharpe ratio
    raw_weights = ef.max_sharpe(risk_free_rate=0.005)
    cleaned_weights = ef.clean_weights()

    # Calculate the monetary value of the weights by scaling them up by balance
    mvo_weights = np.array([1_000_000 * cleaned_weights[ticker] for ticker in cleaned_weights])
    print("Optimized Portfolio Weights:", mvo_weights)

except OptimizationError as e:
    print("Optimization Error:", e)



Optimized Portfolio Weights: [     0.      0. 580480.      0.      0. 419520.      0.      0.      0.
      0.]


In [157]:
#last price from the stock data
LastPrice = np.array([1/p for p in StockData.tail(1).to_numpy()[0]])
Initial_Portfolio = np.multiply(mvo_weights, LastPrice)

# Compute the portfolio value using test data
Portfolio_Assets = testData @ Initial_Portfolio
MVO_result = pd.DataFrame(Portfolio_Assets, columns=["Mean Var"])
MVO_result

Unnamed: 0_level_0,Mean Var
date,Unnamed: 1_level_1
2023-07-03 09:00:00-04:00,1.001274e+06
2023-07-03 10:00:00-04:00,1.004380e+06
2023-07-03 11:00:00-04:00,1.002675e+06
2023-07-03 12:00:00-04:00,1.000236e+06
2023-07-03 13:00:00-04:00,1.000783e+06
...,...
2024-02-29 12:00:00-05:00,1.346609e+06
2024-02-29 13:00:00-05:00,1.348074e+06
2024-02-29 14:00:00-05:00,1.349317e+06
2024-02-29 15:00:00-05:00,1.349603e+06


In [159]:
df_result_a2c = (
    df_account_value_a2c.set_index(MVO_result.index)
    if if_using_a2c
    else None
)
df_result_ddpg = (
    df_account_value_ddpg.set_index(MVO_result.index)
    if if_using_ddpg
    else None
)
df_result_ppo = (
    df_account_value_ppo.set_index(MVO_result.index)
    if if_using_ppo
    else None
)
result = pd.DataFrame(
    {
        "a2c": df_result_a2c["account_value"] if if_using_a2c else None,
        "ddpg": df_result_ddpg["account_value"] if if_using_ddpg else None,
        "ppo": df_result_ppo["account_value"] if if_using_ppo else None,
        "mvo": MVO_result["Mean Var"],
    }
)

In [160]:
result

Unnamed: 0_level_0,a2c,ddpg,ppo,mvo
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-07-03 09:00:00-04:00,1.000000e+06,1.000000e+06,1.000000e+06,1.001274e+06
2023-07-03 10:00:00-04:00,1.000002e+06,9.997399e+05,9.999793e+05,1.004380e+06
2023-07-03 11:00:00-04:00,9.999204e+05,9.994429e+05,9.999795e+05,1.002675e+06
2023-07-03 12:00:00-04:00,9.999160e+05,9.997709e+05,9.999701e+05,1.000236e+06
2023-07-03 13:00:00-04:00,9.999169e+05,9.997657e+05,9.999648e+05,1.000783e+06
...,...,...,...,...
2024-02-29 12:00:00-05:00,1.258454e+06,1.227701e+06,1.079224e+06,1.346609e+06
2024-02-29 13:00:00-05:00,1.259610e+06,1.228664e+06,1.077750e+06,1.348074e+06
2024-02-29 14:00:00-05:00,1.261552e+06,1.229972e+06,1.077674e+06,1.349317e+06
2024-02-29 15:00:00-05:00,1.261603e+06,1.230042e+06,1.077944e+06,1.349603e+06


In [161]:
import matplotlib
matplotlib.use('TkAgg')  
import matplotlib.pyplot as plt

result.plot()
plt.savefig('myplot.png')  # Save the plot as a PNG file
plt.close()

