In [1]:
import numpy as np
import gymnasium as gym
from gymnasium.spaces import Discrete, Box
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from sklearn.preprocessing import StandardScaler
import pandas as pd
from TraderEnvNormilized import TraderEnvNormalized
from DataProvider import DataProvider
import os
from stable_baselines3.common.callbacks import BaseCallback
from sb3_contrib import RecurrentPPO


In [2]:
# All training artefacts (TensorBoard logs and saved checkpoints) share one root folder.
_TRAINING_DIR = 'Training'
log_path = os.path.join(_TRAINING_DIR, 'Logs')
# Checkpoint path prefix; later cells append a run tag directly to this string.
PPO_Path = os.path.join(_TRAINING_DIR, 'SavedModels', 'PPO_Model_Cartpole')

In [3]:
data_file_path = 'Data/Binance_BTCUSDT_2023_minute.csv'

# Read the dataset through the project's DataProvider and keep only the
# first 50,000 entries of both the raw and the normalized views.
data_provider = DataProvider(data_file_path)
head_window = slice(None, 50_000)
df_raw = data_provider.get_raw_data()[head_window]
df_normalized = data_provider.get_normalized_data()[head_window]

# Build the trading environment and wrap it in a single-env DummyVecEnv.
# The lambda hands back the already-built instance, so `trade_env` and the
# vectorized `env` refer to the same underlying environment object.
trade_env = TraderEnvNormalized(
    df_raw,
    df_normalized,
    trade_size_dollars=9_000,
    initial_capital=10_000,
)
env = DummyVecEnv([lambda: trade_env])

In [4]:
# Rollout length collected per update, and number of optimisation passes
# made over each collected rollout.
n_steps = 2048
n_epochs = 10

# Create a fresh recurrent (LSTM-policy) PPO agent. batch_size is set equal
# to n_steps; TensorBoard logs are written under log_path.
model = RecurrentPPO(
    "MlpLstmPolicy",
    env,
    n_steps=n_steps,
    batch_size=n_steps,
    n_epochs=n_epochs,
    ent_coef=0.01,
    tensorboard_log=log_path,
    verbose=1,
)
# Non-recurrent alternative, kept for reference:
# model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, ent_coef=0.01)


Using cuda device


In [5]:
class StopTrainingOnMaxSteps(BaseCallback):
    """Stop training once the environment's step counter reaches a limit.

    On every callback step, the ``current_step`` attribute of the first
    environment in the training VecEnv is read and compared with
    ``max_steps``.

    Args:
        max_steps: Threshold for the environment's ``current_step``; training
            is stopped when ``current_step >= max_steps``.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, max_steps=49999, verbose=0):
        super().__init__(verbose)
        self.max_steps = max_steps

    def _on_step(self) -> bool:
        """Return ``False`` (stop training) once the step limit is reached.

        Returns:
            ``True`` to continue training, ``False`` to stop it.
        """
        # Only the first sub-environment is inspected (index 0).
        # NOTE: an earlier revision immediately overwrote this value with the
        # env's `sharp_ratio`, so the check compared a Sharpe ratio against
        # max_steps and the step counter was never actually tested.
        current_step = self.training_env.get_attr("current_step")[0]

        # The `> 0` guard keeps a non-positive max_steps from stopping
        # training while the counter is still at zero.
        if current_step >= self.max_steps and current_step > 0:
            return False  # Returning False tells the trainer to stop
        return True

In [6]:
# Callback that is consulted on every training step to decide whether to stop.
max_steps_callback = StopTrainingOnMaxSteps(max_steps=49_999)

# Replace the freshly built model with the "SHOCK102" checkpoint, re-attaching
# the current env and passing the same hyperparameters as keyword overrides.
model = RecurrentPPO.load(
    f"{PPO_Path}SHOCK102",
    env=env,
    tensorboard_log=log_path,
    ent_coef=0.01,
    n_steps=n_steps,
    batch_size=n_steps,
    n_epochs=n_epochs,
)
# Continue training; the callback may end the run before the budget is used up.
model.learn(callback=max_steps_callback, total_timesteps=5_000_000)

Logging to Training\Logs\RecurrentPPO_40
-----------------------------
| time/              |      |
|    fps             | 357  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 103         |
|    iterations           | 2           |
|    time_elapsed         | 39          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.005156777 |
|    clip_fraction        | 0           |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.06       |
|    explained_variance   | -0.00175    |
|    learning_rate        | 0.0003      |
|    loss                 | 67          |
|    n_updates            | 1240        |
|    policy_gradient_loss | -0.000999   |
|    value_loss           | 136         |
-----------------------------------

<sb3_contrib.ppo_recurrent.ppo_recurrent.RecurrentPPO at 0x1e2bcbc4920>

In [7]:
# Manual rollout of the trained agent, printing a one-line summary per episode.
episodes = 1
for episode in range(1, episodes + 1):
    obs = env.reset()
    # Recurrent policy bookkeeping: no LSTM state yet, and the single env is
    # flagged as being at the start of an episode.
    lstm_states = None
    episode_starts = np.ones((1,), dtype=bool)
    score = 0
    done = False

    while not done:
        env.render(mode="human")
        action, lstm_states = model.predict(
            obs,
            state=lstm_states,
            episode_start=episode_starts,
        )
        obs, rewards, dones, info = env.step(action)
        score += rewards[0]
        # The done flags of this step become the episode-start flags of the next.
        episode_starts = dones
        done = dones[0]

    # Summary values come from the info dict of the last step taken;
    # keys the env did not report fall back to 'N/A'.
    env_info = info[0]
    current_capital, sharpe_ratio, current_step, trades_amount = (
        env_info.get(key, 'N/A')
        for key in ('current_capital', 'sharp_ratio', 'current_step', 'trades_amount')
    )
    print(f'Episode: {episode} Score: {score} Current Capital: {current_capital} Sharpe Ratio: {sharpe_ratio} Current step: {current_step} Trades amount: {trades_amount}')



Position Opened: Type: short, Entry Price: 16536.43, Step: 2
Position Closed: Exit Price: 17369.38, Close Step: 12564, Time in Position: 12562, Return from Last Trade: -457.8354539038962
Position Opened: Type: short, Entry Price: 17359.32, Step: 12565
Episode: 1 Score: 15048.188842773438 Current Capital: 9084.18883391383 Sharpe Ratio: -13136.113562662036 Current step: 15964 Trades amount: 2


In [8]:
# episodes = 1
# for episode in range(1, episodes + 1):
#     obs = env.reset()
#     done = False
#     score = 0
    
#     while not done:
#         env.render(mode="human")
#         action, _ = model.predict(obs)
#         obs, reward, done, info = env.step(action)
#         score += reward

#     env_info = info[0]
#     current_capital = env_info.get('current_capital', 'N/A')
#     sharpe_ratio = env_info.get('sharpe_ratio', 'N/A')
#     current_step = env_info.get('current_step', 'N/A')
#     trades_amount = env_info.get('trades_amount', 'N/A')
#     print(f'Episode: {episode} Score: {score} Current Capital: {current_capital} Sharpe Ratio: {sharpe_ratio} Current step: {current_step} Trades amount: {trades_amount}')

# env.close()

In [9]:
# # Evaluate the model
env.reset()
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5, deterministic= False)
print(f'Mean Reward: {mean_reward}, Std Reward: {std_reward}')

# Close the environments
env.close()



Mean Reward: 15568.297114229203, Std Reward: 886.4263444897309


In [10]:

# Persist the trained model under the next run tag ("SHOCK103").
model.save(f"{PPO_Path}SHOCK103")


In [11]:
# training_log_path = os.path.join(log_path, 'RecurrentPPO_1')


In [12]:
# training_log_path

In [13]:
# !tensorboard --logdir={training_log_path}