In [1]:


import gymnasium as gym
from gymnasium import spaces
from gym.wrappers import FlattenObservation
from gym.spaces.utils import unflatten
import numpy as np
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import SAC
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.noise import NormalActionNoise
import torch.nn as nn
from stable_baselines3 import TD3
import torch
from stable_baselines3 import A2C, DQN, PPO
import datetime
import pandas as pd
import warnings
import argparse
import os
import pathlib

from AdvancedStockGame import AdvancedStockGame
from AdvancedStockGameTD3 import AdvancedStockGameTD3

In [4]:
class CustomLSTMNetwork(BaseFeaturesExtractor):
    """Features extractor combining an LSTM over stock history with two dense branches.

    The observation space must be a dict-like space exposing ``.spaces`` with:

    * ``'stock_data'``      -- shape read as ``(num_stocks, seq_length, n_features)``.
    * ``'portfolio'``       -- first axis gives the width of the portfolio branch.
    * ``'portfolio_value'`` -- fed to a linear layer with 10 input features.

    Args:
        observation_space: the environment's observation space (see above).
        features_dim: width of the feature vector returned by ``forward``.
    """

    def __init__(self, observation_space, features_dim):
        super().__init__(observation_space, features_dim)

        stock_space = observation_space.spaces['stock_data']
        self.num_stocks = stock_space.shape[0]
        self.seq_length = stock_space.shape[1]
        self.n_features = stock_space.shape[2]

        portfolio_space = observation_space.spaces['portfolio']
        self.portfolio_dim = portfolio_space.shape[0]

        # Hard-coded input width of the portfolio-value branch.
        # NOTE(review): must equal the last axis of observations['portfolio_value'];
        # confirm against the environment definition.
        self.portfolio_value_dim = 10

        stock_layer_size = 1024
        portfolio_layer_size = 32
        portfolio_value_layer_size = 32

        # Every timestep presents the features of all stocks side by side.
        self.lstm = nn.LSTM(
            input_size=self.n_features * self.num_stocks,
            num_layers=3,
            hidden_size=stock_layer_size,
            batch_first=True,
        )
        self.portfolio_layer = nn.Linear(self.portfolio_dim, portfolio_layer_size)
        self.portfolio_value_layer = nn.Linear(self.portfolio_value_dim, portfolio_value_layer_size)
        self.final_layer = nn.Linear(
            stock_layer_size + portfolio_layer_size + portfolio_value_layer_size,
            features_dim,
        )
        self.relu = nn.ReLU()

    def forward(self, observations):
        """Map a batch of dict observations to a ``(batch, features_dim)`` tensor."""
        # The data arrives stock-major: (batch, stocks, time, features). A bare
        # ``view`` to (batch, time, stocks * features) would only reinterpret the
        # memory and mix values from different stocks and timesteps, so the time
        # axis is moved forward explicitly before the stock and feature axes are
        # merged. ``reshape`` is needed because the permuted tensor is not contiguous.
        stock_data = observations['stock_data'].reshape(
            -1, self.num_stocks, self.seq_length, self.n_features
        )
        main_input = stock_data.permute(0, 2, 1, 3).reshape(
            -1, self.seq_length, self.num_stocks * self.n_features
        )
        lstm_out, _ = self.lstm(main_input)
        lstm_out = lstm_out[:, -1, :]  # keep only the output at the last timestep

        portfolio_input = observations['portfolio']
        portfolio_out = self.relu(self.portfolio_layer(portfolio_input))

        portfolio_value_input = observations['portfolio_value']
        # squeeze(1) drops a singleton axis 1 if present; it is a no-op when the
        # branch output is already (batch, 32).
        portfolio_value_out = self.relu(self.portfolio_value_layer(portfolio_value_input)).squeeze(1)

        combined = torch.cat((lstm_out, portfolio_out, portfolio_value_out), dim=1)
        # ReLU over the concatenation also zeroes negative LSTM activations; the
        # two dense branches are already non-negative at this point.
        combined = self.relu(combined)

        return self.final_layer(combined)
        

In [3]:
def new_model(data_file_path):
    """Build an untrained DQN agent that uses ``CustomLSTMNetwork`` as features extractor.

    Args:
        data_file_path: path handed to ``AdvancedStockGame`` to construct the environment.

    Returns:
        A freshly initialised ``DQN`` model.
    """
    policy_kwargs = dict(
        features_extractor_class=CustomLSTMNetwork,
        # Must be a keyword mapping: ``dict(500)`` raises TypeError.
        features_extractor_kwargs=dict(features_dim=500),
    )
    model = DQN(
        # CustomLSTMNetwork indexes observation_space.spaces[...], so the env must
        # keep its dict observation (no flattening) and use the multi-input policy.
        policy='MultiInputPolicy',
        env=AdvancedStockGame(data_file_path),
        learning_rate=0.0001,
        buffer_size=16384,  # replay buffer capacity
        # NOTE(review): far larger than buffer_size, so no gradient update happens
        # during the first million environment steps -- confirm this is intended.
        learning_starts=1000000,
        batch_size=128,
        tau=0.01,  # soft-update coefficient for the target network (1.0 = hard update)
        gamma=1,  # no discounting of future rewards
        train_freq=4,  # train every 4 environment steps
        gradient_steps=1,  # gradient steps per training call
        optimize_memory_usage=False,  # keep the standard replay buffer layout
        target_update_interval=16384,  # environment steps between target-network updates
        exploration_fraction=0.2,  # share of training over which epsilon is annealed
        exploration_initial_eps=1.0,
        exploration_final_eps=0.1,
        max_grad_norm=2,  # gradient clipping threshold
        tensorboard_log=None,
        policy_kwargs=policy_kwargs,
        verbose=0,
        seed=40,
        device="cuda",
        _init_setup_model=True,  # build the networks at construction time
    )
    return model

In [3]:
def new_model(data_file_path):
    """Create an untrained A2C agent whose features come from ``CustomLSTMNetwork``.

    Args:
        data_file_path: path handed to ``AdvancedStockGame`` to construct the environment.

    Returns:
        A freshly initialised ``A2C`` model.
    """
    extractor_cfg = {
        "features_extractor_class": CustomLSTMNetwork,
        "features_extractor_kwargs": {"features_dim": 500},
    }
    trading_env = AdvancedStockGame(data_file_path)

    # Optimisation settings: undiscounted returns, GAE lambda of 1.
    optimisation = {
        "n_steps": 5,
        "gamma": 1,
        "gae_lambda": 1.0,
        "ent_coef": 0.01,
        "vf_coef": 0.5,
        "max_grad_norm": 2,
        "use_rms_prop": False,
        "normalize_advantage": False,
    }
    # State-dependent exploration is switched off.
    exploration = {"use_sde": False, "sde_sample_freq": -1}
    # Reproducibility and device settings.
    runtime = {"verbose": 0, "seed": 40, "device": "cuda", "_init_setup_model": True}

    return A2C(
        policy='MultiInputPolicy',
        env=trading_env,
        policy_kwargs=extractor_cfg,
        **optimisation,
        **exploration,
        **runtime,
    )

In [2]:
# Network layout shared by the PPO model below: two hidden layers of 2048 units
# each for the "pi" and the "vf" heads.
policy_kwargs = dict(
    net_arch=dict(
        pi=[2048, 2048],
        vf=[2048, 2048],
    ),
)
def new_model(data_file_path):
    """Create an untrained PPO agent on ``AdvancedStockGame`` built with ``window_size=4096``.

    The network layout is taken from the module-level ``policy_kwargs``.

    Args:
        data_file_path: path handed to ``AdvancedStockGame`` to construct the environment.

    Returns:
        A freshly initialised ``PPO`` model.
    """
    trading_env = AdvancedStockGame(data_file_path, window_size=4096)

    # Rollout / minibatch sizing: one full-batch update of 32768 steps, 3 epochs each.
    rollout_cfg = {"n_steps": 32768, "batch_size": 32768, "n_epochs": 3}

    # Loss and optimiser settings.
    # NOTE(review): at 1e-10 the parameter updates are vanishingly small --
    # presumably deliberate; confirm.
    loss_cfg = {
        "learning_rate": 1e-10,
        "gamma": 1,
        "gae_lambda": 0.95,
        "clip_range": 0.2,
        "ent_coef": 0.01,
        "vf_coef": 0.5,
        "max_grad_norm": 2,
        "target_kl": None,
    }

    # State-dependent exploration is switched off.
    exploration_cfg = {"use_sde": False, "sde_sample_freq": -1}

    # Logging, reproducibility and device settings.
    runtime_cfg = {
        "tensorboard_log": None,
        "verbose": 0,
        "seed": 40,
        "device": "cuda",
        "_init_setup_model": True,
    }

    return PPO(
        policy='MultiInputPolicy',
        env=trading_env,
        policy_kwargs=policy_kwargs,
        **rollout_cfg,
        **loss_cfg,
        **exploration_cfg,
        **runtime_cfg,
    )

In [20]:


def new_model(data_file_path):
    """Create an untrained TD3 agent on ``AdvancedStockGameTD3``.

    Args:
        data_file_path: path handed to ``AdvancedStockGameTD3`` to construct the environment.

    Returns:
        A freshly initialised ``TD3`` model.
    """
    trading_env = AdvancedStockGameTD3(data_file_path)

    # Replay buffer: default buffer class, learning begins once 2000 steps are stored.
    replay_cfg = {
        "buffer_size": 2000,
        "learning_starts": 2000,
        "batch_size": 64,
        "replay_buffer_class": None,
        "replay_buffer_kwargs": None,
        "optimize_memory_usage": False,
    }

    # Update schedule: train every 1000 steps; gradient_steps=-1 performs as many
    # gradient steps as environment steps were collected in the rollout.
    update_cfg = {
        "learning_rate": 0.001,
        "tau": 0.005,
        "gamma": 0.99,
        "train_freq": 1000,
        "gradient_steps": -1,
    }

    # No exploration noise and default policy arguments.
    policy_cfg = {"action_noise": None, "policy_kwargs": None}

    # Logging, reproducibility and device settings.
    runtime_cfg = {
        "tensorboard_log": None,
        "verbose": 0,
        "seed": 40,
        "device": "cuda",
        "_init_setup_model": True,
    }

    return TD3(
        policy='MultiInputPolicy',
        env=trading_env,
        **replay_cfg,
        **update_cfg,
        **policy_cfg,
        **runtime_cfg,
    )

In [None]:

def evaluate(model, env):
    """Run one episode with deterministic actions and record what happened.

    Args:
        model: object whose ``predict(obs, deterministic=True)`` returns
            ``(action, state)``.
        env: environment whose ``reset()`` returns ``(obs, info)``, whose
            ``step(action)`` returns ``(obs, reward, terminated, truncated, info)``
            and which exposes ``get_portfolio_value()``.

    Returns:
        ``(actions, portfolio, total_reward)``. ``portfolio`` has one entry per
        environment step; ``actions`` has one extra trailing entry, the action
        predicted for the final observation (which is never stepped).

    Raises:
        Exception: whatever ``model.predict`` raises; the offending observation
            is printed first to aid debugging.
    """
    obs, info = env.reset()
    portfolio = []
    actions = []
    total_reward = 0

    while True:
        try:
            action, _states = model.predict(obs, deterministic=True)
        except Exception:
            # Show the observation that broke the model, then surface the real
            # error instead of hiding it behind a bare ``except``.
            print(obs)
            raise

        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        actions.append(action)
        portfolio.append(env.get_portfolio_value())

        # The episode is over on either flag; ignoring truncation would keep
        # stepping an environment that has already hit its time limit.
        if terminated or truncated:
            break

    # Also record the action the model would choose at the final observation.
    action, _states = model.predict(obs, deterministic=True)
    actions.append(action)

    return actions, portfolio, total_reward
# Smoke-test an untrained agent: no learn() call happens before evaluate() here.
# `env` is a separate environment instance built from the same CSV; new_model()
# constructs its own environment internally.
env = AdvancedStockGame("full_data.csv")
model = new_model("full_data.csv")

# The (actions, portfolio, total_reward) tuple is not assigned; as the last
# expression of the cell it is only displayed.
evaluate(model, env)

In [3]:
# Build a fresh agent with whichever new_model() definition was executed last,
# train it for 10,000 timesteps and persist it under models/Cripplew4096/.
model = new_model("full_data.csv")
model.learn(total_timesteps=10_000)
model.save('models/Cripplew4096/Cripplew4096')
# Drop the reference so the trained model can be garbage-collected.
del model