In [None]:
import torch
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium.envs.registration import register
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

In [None]:
class PortfolioOptimization(gym.Env):
    """Gymnasium environment that walks through ``data`` one row per step.

    Every observation is a single row of ``data``. Every action is a vector of
    200 non-negative weights; it is normalized to sum to 1 and then scored
    against the row the environment moves to.

    Args:
        data: Indexable, sized collection of rows. ``data[i]`` must be
            convertible to a 205-element float vector (presumably a 2-D NumPy
            array -- confirm against the loading code).
        n_steps: Number of rows kept free after the random start index picked
            by ``reset``, so at least this many steps are available before the
            data runs out.
    """

    def __init__(self, data, n_steps):
        super().__init__()
        self.data = data
        self.n_steps = n_steps
        self.current_step = 0
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(205,), dtype=np.float32)
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(200,), dtype=np.float32)

    @staticmethod
    def _normalize_weights(action):
        """Scale ``action`` so it sums to 1, falling back to equal weights.

        An all-zero (or otherwise non-positive / non-finite) action would make
        the plain ``action / sum(action)`` produce NaN or inf weights, which
        then poison the reward; equal weights are used instead in that case.
        """
        weights = np.asarray(action, dtype=np.float64)
        total = weights.sum()
        if not np.isfinite(total) or total <= 0.0:
            return np.full(weights.shape, 1.0 / weights.size)
        return weights / total

    def _observation(self, index):
        """Return row ``index`` of the data cast to the observation dtype."""
        return np.asarray(self.data[index], dtype=self.observation_space.dtype)

    def step(self, action):
        """Advance one row and score ``action`` against the new row.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` becomes True once the last row of the data has been
            reached; ``truncated`` is always False.
        """
        weights = self._normalize_weights(action)

        # Stop on the last real row instead of wrapping around to row 0, so the
        # terminal observation and reward come from an actual next state.
        last_index = len(self.data) - 1
        self.current_step = min(self.current_step + 1, last_index)
        terminated = self.current_step >= last_index

        next_state = self._observation(self.current_step)
        reward = float(self.calculate_reward(weights, next_state))

        return next_state, reward, terminated, False, {}

    def reset(self, *, seed=None, options=None, **kwargs):
        """Start a new episode at a random row and return its observation.

        Args:
            seed: Optional seed forwarded to ``gym.Env.reset`` so that
                ``self.np_random`` (not the global NumPy RNG) is seeded.
            options: Accepted for Gymnasium API compatibility; unused.
            **kwargs: Accepted and ignored, for backward compatibility.

        Returns:
            ``(observation, info)`` with an empty ``info`` dict.

        Raises:
            ValueError: If the data has no more than ``n_steps`` rows, or if
                the starting row does not fit ``observation_space``.
        """
        super().reset(seed=seed)

        n_rows = len(self.data)
        if n_rows <= self.n_steps:
            raise ValueError(
                f"data has {n_rows} rows but more than n_steps={self.n_steps} are required."
            )

        # Leave n_steps rows after the start so the episode cannot end early.
        self.current_step = int(self.np_random.integers(n_rows - self.n_steps))
        observation = self._observation(self.current_step)

        if not self.observation_space.contains(observation):
            raise ValueError("Observation is not within the observation space.")

        return observation, {}

    def calculate_reward(self, action, state):
        """Return the dot product of the weights with ``state[4:-1]``.

        ``state[4:-1]`` skips the first four entries and the last one, leaving
        200 values (presumably per-asset returns -- confirm against the data
        layout).

        TODO: replace with a Sharpe ratio,
        (expected return - risk-free rate) / standard deviation.
        """
        return np.dot(action, state[4:-1])

In [None]:
# Make the custom environment available to Gymnasium under the id "PO-v0".
# NOTE(review): `data` is not defined in the cells shown here; it must already
# exist in the kernel when this cell runs -- confirm where it is loaded.
register(id="PO-v0", entry_point=PortfolioOptimization, kwargs=dict(data=data, n_steps=512))


In [None]:
# Parallel environments
vec_env = make_vec_env("PO-v0", n_envs=10)

# NOTE: PPO collects whole rollouts of n_steps * n_envs transitions, so even a
# tiny total_timesteps still runs one full rollout before learn() returns.
model = PPO("MlpPolicy", vec_env, verbose=1, n_steps=512, device="cuda" if torch.cuda.is_available() else "cpu")
model.learn(total_timesteps=20)
model.save("ppo_PO")

del model # remove to demonstrate saving and loading

model = PPO.load("ppo_PO")

# Roll the loaded policy forward for a fixed number of steps. The vectorized
# env resets finished sub-environments automatically, so an unbounded loop
# would never exit and the notebook could not run to completion.
EVAL_STEPS = 1_000
total_rewards = np.zeros(vec_env.num_envs)

obs = vec_env.reset()
for _ in range(EVAL_STEPS):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = vec_env.step(action)
    total_rewards += rewards

print(f"Mean reward per step over {EVAL_STEPS} steps: {total_rewards.mean() / EVAL_STEPS:.6f}")
