In [4]:
import gymnasium as gym
from tqdm import tqdm
import numpy as np
from pathlib import Path
import polars as pl
import random

print(f"{gym.__version__}")

0.28.1


In [5]:
import torch as th
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

In [6]:
from stable_baselines3 import PPO, A2C, SAC, TD3
from stable_baselines3.common.evaluation import evaluate_policy

In [7]:
from stock_prediction_rl.envs.numpy.stock_trading_env import StockTradingEnv
from stock_prediction_rl.sb.utils import (
    create_numpy_array,
    create_envs,
)

In [8]:
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
th.manual_seed(SEED)
th.backends.cudnn.deterministic = True

In [9]:
ticker = "SBIN.NS"
datasets = Path.cwd().parent / ("datasets")
model_name = "A2C"
num_envs = 16
seed = 1337


train_file = datasets / f"{ticker}_train"
trade_file = datasets / f"{ticker}_trade"


train_df = pl.read_parquet(train_file)
trade_df = pl.read_parquet(trade_file)
train_array = create_numpy_array(train_df)
trade_arrays = create_numpy_array(trade_df)


trade_envs = create_envs(
    StockTradingEnv, trade_arrays, num_envs=num_envs, mode="trade", seed=seed
)


In [10]:
trained_model_dir = Path.cwd().parent.parent / ("trained_models")
model_filename = trained_model_dir / f"sb_{model_name}_{ticker}_single_digit_reward_default_parameters"
a2c_expert = A2C.load(model_filename, env=trade_envs, force_reset=False)

In [20]:
obs = trade_envs.reset()
counter = 0
while counter < num_envs:
    action, _ = a2c_expert.predict(obs, deterministic=False)
    obs, rewards, dones, infos = trade_envs.step(action)

    for i in range(num_envs):
        if dones[i]:
            profit_loss = [info["cummulative_profit_loss"] for info in (infos)]
            counter += 1
print(profit_loss)

[-18.5, 0, -4.89996337890625, 0, -32.14996337890625, -33.64996337890625, -23.64996337890625, 0, 0, -32.8499755859375, 0, -0.0999755859375, 0, -6.1500244140625, -20.39996337890625, -35.75]
