## Initial Environment - Long Only

In [1]:
class LongOnlyEnv(gym.Env):
    """Long-only portfolio-allocation environment driven by a pre-computed array.

    At every timestep ``t`` the observation is simply ``df[t]``. The agent
    emits one raw score per asset; the scores are turned into portfolio
    weights with a softmax (non-negative, summing to 1), and the reward is the
    weighted sum of each asset's simple return between ``t`` and ``t + 1``,
    computed from column 0 of the observation.

    NOTE(review): the code assumes ``df`` is indexable by an integer timestep
    and that each ``df[t]`` is a 2-D array/tensor of shape
    ``(stock_dim, state_space)`` whose column 0 holds the price used for
    returns -- confirm against the data-preparation code.

    Args:
        df: Time-indexed feature container (see note above).
        stock_dim: Number of assets (rows of each observation).
        initial_amount: Starting portfolio value.
        state_space: Number of features per asset (columns of each observation).
        action_space: Number of action components (one weight per asset).
        lookback: Stored on the instance; not used by the environment itself.
        max_timesteps: Upper bound on the number of timesteps per episode.
        mode: ``'train'`` prints an episode summary on termination; any other
            value suppresses it.
    """

    metadata = {'render.modes': ['human']}

    # Column labels applied by save_action_memory when the action width matches.
    # Per the original author's notes: 16 crypto assets followed by a cash slot.
    ASSET_NAMES = ['ADA', 'ATOM', 'AVAX', 'BNB', 'BTC', 'DOGE', 'DOT', 'ETH', 'LINK',
                   'LTC', 'MATIC', 'SHIB', 'SOL', 'TRX', 'UNI', 'XRP', 'USD']

    def __init__(self,
                 df,
                 stock_dim=17,
                 initial_amount=1e4,
                 state_space=13,
                 action_space=17,
                 lookback=13,
                 max_timesteps=1e4,
                 mode='train'):
        super().__init__()

        # Static configuration
        self.max_timesteps = max_timesteps
        self.lookback = lookback
        self.df = df
        self.stock_dim = stock_dim
        self.initial_amount = initial_amount
        self.state_space = state_space
        self.current_episode = 0                    # Incremented by every reset()
        self.mode = mode

        # Raw actions live in [0, 1]; step() softmax-normalises them into weights.
        self.action_space = spaces.Box(low=0, high=1, shape=(action_space,))

        # One observation per timestep: a (stock_dim, state_space) matrix, no time axis.
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(self.stock_dim, self.state_space))

        self._start_episode()

    def _start_episode(self):
        """Put every per-episode attribute back to its start-of-episode value."""
        self.t = 0
        self.data = self.df[self.t]
        self.state = self.data
        self.terminal = False

        # Defined up-front so a terminal first step() can still return a reward.
        self.reward = 0.0

        self.portfolio_value = self.initial_amount
        self.asset_memory = [self.initial_amount]       # Portfolio value per step
        self.portfolio_return_memory = [0]              # No return on the first step
        # Start fully allocated to the last slot (the cash asset per the original notes).
        self.actions_memory = [[0] * (self.stock_dim - 1) + [1]]
        self.timestep_memory = [self.t]

    def step(self, actions):
        """Advance one timestep using the given raw action scores.

        Args:
            actions: One raw score per asset (array-like or tensor).

        Returns:
            ``(state, reward, terminal, info)``. When the episode has already
            reached its last timestep nothing is advanced and the most recent
            reward is returned again together with ``terminal=True``.
        """
        # The episode ends at max_timesteps or when the data runs out, whichever
        # comes first; without the data bound, df[self.t] below would raise.
        last_t = min(self.max_timesteps, len(self.df)) - 1
        self.terminal = self.t >= last_t

        if self.terminal:
            cumulative_return = (self.portfolio_value / self.initial_amount) - 1

            if self.mode == 'train':
                print(f"Episode: {self.current_episode} | Timesteps: {self.t + 1} | Cumulative Return: {cumulative_return:.2%}")

            return self.state, self.reward, self.terminal, {}

        # Softmax guarantees non-negative weights that sum to 1 (long-only, fully invested).
        weights = self.softmax_normalization(actions)
        self.actions_memory.append(weights)

        # Keep the current observation so returns can be measured against it.
        previous_data = self.data

        # ------------------------ Next state ------------------------------
        self.t += 1
        self.data = self.df[self.t]
        self.state = self.data

        # Per-asset simple return from column 0: price_t / price_(t-1) - 1.
        # as_tensor lets the observation be either a NumPy array or a torch tensor.
        asset_returns = torch.as_tensor(self.data[:, 0] / previous_data[:, 0]) - 1
        portfolio_return = torch.sum(asset_returns * weights)

        # Compound the portfolio value by this step's return.
        self.portfolio_value = self.portfolio_value * (1 + portfolio_return)

        self.portfolio_return_memory.append(portfolio_return)
        self.timestep_memory.append(self.t)
        self.asset_memory.append(self.portfolio_value)

        # The reward is the immediate step return, not the cumulative return.
        self.reward = portfolio_return

        return self.state, self.reward, self.terminal, {}

    def reset(self):
        """Start a new episode and return the first observation."""
        self.current_episode += 1
        self._start_episode()
        return self.state

    def render(self, mode='human'):
        """Return the current observation; no visualisation is produced."""
        return self.state

    def softmax_normalization(self, actions):
        """Return the softmax of ``actions`` taken over all of its elements.

        The maximum is subtracted before exponentiating; this leaves the result
        mathematically unchanged but prevents overflow for large scores.
        """
        logits = torch.as_tensor(actions)
        if not logits.is_floating_point():
            logits = logits.float()
        if logits.numel() == 0:
            return logits

        exponentials = torch.exp(logits - logits.max())
        return exponentials / exponentials.sum()

    def save_asset_memory(self):
        """Return the list of portfolio values recorded during the current episode."""
        return self.asset_memory

    def save_action_memory(self):
        """Return the episode's portfolio weights as a DataFrame.

        Rows are indexed by timestep (index name ``'timestep'``), one column per
        asset. Columns are labelled with ``ASSET_NAMES`` when the number of
        weights matches; otherwise pandas' default integer labels are kept.
        """
        # Entries are a mix of plain lists and tensors; flatten all to Python numbers
        # so the DataFrame holds numeric cells instead of tensor objects.
        rows = [torch.as_tensor(weights).detach().tolist() for weights in self.actions_memory]

        df_actions = pd.DataFrame(rows)
        if df_actions.shape[1] == len(self.ASSET_NAMES):
            df_actions.columns = list(self.ASSET_NAMES)
        df_actions.index = pd.Index(self.timestep_memory, name='timestep')

        return df_actions

    def _seed(self, seed=None):
        """Seed the environment's random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def get_sb_env(self):
        """Wrap this environment in a ``DummyVecEnv`` for Stable Baselines.

        Returns:
            ``(vec_env, obs)`` where ``obs`` is the observation returned by
            resetting the vectorised environment.
        """
        # The lambda hands back this very instance, so the wrapper shares its state.
        e = DummyVecEnv([lambda: self])
        obs = e.reset()
        return e, obs

NameError: ignored