# In [None]:
import gym
from gym import spaces
import numpy as np

class CustomEnv(gym.Env):
    """Portfolio-allocation environment driven by pre-computed market data.

    On every step the agent supplies target weights for 200 stocks plus cash.
    The whole portfolio is liquidated on paper at the current prices and
    re-invested according to those weights, buying whole shares only; money
    that cannot buy a whole share is returned to cash.
    """

    # Number of tradable stocks; the action has one extra slot for cash.
    _NUM_STOCKS = 200

    def __init__(self, matrices, prices, initial_cash=1000000000):
        """Store the market data and declare the observation/action spaces.

        Args:
            matrices: Indexable sequence with one feature matrix per time
                step; each entry must support ``.flatten()``.
            prices: Indexable sequence with one per-stock price vector per
                time step, aligned with ``matrices``.
            initial_cash: Cash balance at the start of every episode.
        """
        super().__init__()

        self.matrices = matrices
        self.prices = prices
        # Kept for backward compatibility: earlier versions stored the price
        # data under ``price`` only (which broke every method reading
        # ``self.prices``).
        self.price = prices
        self.current_index = 0
        self.initial_cash = initial_cash
        # Initialised here as well as in reset() so that step() does not fail
        # with AttributeError when called on a freshly constructed env.
        self.cash = initial_cash
        self.stock_quantity = np.zeros(self._NUM_STOCKS, dtype=np.int32)

        # NOTE(review): the [0, 1] bounds on the first two spaces assume the
        # feature matrix and the prices are normalised — confirm against the
        # data pipeline.
        self.observation_space = spaces.Tuple((
            # 205*60 matrix (flattened): the yield of 200 stocks and 5
            # additional features for each of the 60 days
            spaces.Box(low=0, high=1, shape=(205 * 60,), dtype=np.float32),

            # 200-dimension vector: the closing prices of the 200 stocks
            spaces.Box(low=0, high=1, shape=(self._NUM_STOCKS,), dtype=np.float32),

            # 200-dimension vector: the number of shares held of each stock.
            # These are whole-share counts, so the upper bound is the int32
            # maximum rather than 1.
            spaces.Box(
                low=0,
                high=np.iinfo(np.int32).max,
                shape=(self._NUM_STOCKS,),
                dtype=np.int32,
            ),
        ))

        # 201-dimension vector: the proportion of the portfolio value to
        # invest in each of the 200 stocks, followed by the cash proportion.
        self.action_space = spaces.Box(
            low=0, high=1, shape=(self._NUM_STOCKS + 1,), dtype=np.float32
        )

    def reset(self):
        """Start a new episode with all wealth in cash and no holdings.

        Returns:
            Tuple ``(flattened feature matrix, prices, stock quantities)`` for
            the first time step.
        """
        self.current_index = 0
        self.cash = self.initial_cash
        self.stock_quantity = np.zeros(self._NUM_STOCKS, dtype=np.int32)

        initial_matrix = self.matrices[self.current_index].flatten()
        initial_prices = self.prices[self.current_index]
        return (initial_matrix, initial_prices, self.stock_quantity)

    def step(self, action):
        """Rebalance the portfolio to ``action`` and advance one time step.

        Args:
            action: 201 weights (200 stocks, then cash) that sum to 1.

        Returns:
            ``(observation, reward, done, info)``. Once the data is exhausted
            ``done`` is True and the observation is all zeros.

        Raises:
            AssertionError: If the weights do not sum to 1.
        """
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        if not np.isclose(np.sum(action), 1):
            raise AssertionError("Action probabilities must sum to 1")

        current_matrix = self.matrices[self.current_index]
        current_prices = self.prices[self.current_index]

        stock_weights = action[:-1]  # first 200 elements: stocks
        cash_weight = action[-1]     # last element: cash

        # Value the existing portfolio at current prices, then re-invest it.
        total_value = self.cash + np.sum(self.stock_quantity * current_prices)
        budget_per_stock = total_value * stock_weights
        desired_quantities = budget_per_stock // current_prices
        # Part of each stock's budget that is too small to buy another share.
        leftover_cash = budget_per_stock % current_prices

        # NOTE(review): int32 can overflow when total_value / price exceeds
        # ~2.1e9 (possible with the default initial cash and prices below
        # ~0.5) — confirm the price scale or widen the dtype.
        self.stock_quantity = desired_quantities.astype(np.int32)
        self.cash = total_value * cash_weight + np.sum(leftover_cash)

        # NOTE(review): the reward is computed before the index advances, so
        # it values the new holdings at the same prices they were bought at.
        reward = self.calculate_reward(current_matrix, action)

        self.current_index += 1
        done = self.current_index >= len(self.matrices)

        if done:
            observation = (
                np.zeros_like(self.matrices[0].flatten()),
                np.zeros(self._NUM_STOCKS, dtype=np.float32),
                np.zeros(self._NUM_STOCKS, dtype=np.int32),
            )
        else:
            next_matrix = self.matrices[self.current_index].flatten()
            next_prices = self.prices[self.current_index]
            observation = (next_matrix, next_prices, self.stock_quantity)

        return observation, reward, done, {}

    def calculate_reward(self, matrix, action):
        """Return the portfolio's profit relative to the initial cash.

        The portfolio (holdings plus cash) is valued at the prices of the
        current time step. ``matrix`` and ``action`` are accepted for
        interface compatibility but are not used.
        """
        current_prices = self.prices[self.current_index]
        portfolio_value = np.sum(self.stock_quantity * current_prices) + self.cash
        return portfolio_value - self.initial_cash
