In [1]:
import numpy as np
import torch
from vctr.data.data_loader import get_data_with_features, scale_all_but_ohlc
from vctr.rl.agent import PPOAgent
from vctr.rl.environment import TradingEnvironment

# --- Experiment configuration -------------------------------------------------
symbol = 'ETH'
timeframe = '1h'
# Both bounds are forwarded unchanged to get_data_with_features.
# NOTE(review): presumably None means "no bound" - confirm against the loader.
start = None
end = None

# Prefer Apple's Metal (MPS) backend, but fall back to the CPU so the script
# still runs on machines where MPS is not available.
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
torch.set_default_device(device)

# --- Data ---------------------------------------------------------------------
data = get_data_with_features(symbol, timeframe, start=start, end=end)
data = scale_all_but_ohlc(data)

# --- Environment and agent ----------------------------------------------------
env = TradingEnvironment(data)
# The agent is sized from the environment's observation and action spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = PPOAgent(state_size, action_size)

# --- Training schedule --------------------------------------------------------
num_episodes = 1000          # number of rollouts (one agent.train call each)
timesteps_per_batch = 512    # maximum environment steps collected per rollout
# NOTE(review): num_mini_batches is not referenced in the visible part of this
# file - confirm whether it is used elsewhere or should be passed to the agent.
num_mini_batches = 100

GAMMA = 0.99  # discount factor applied to future rewards


def discounted_returns(rewards, done_flags, gamma=GAMMA):
    """Compute the discounted return for every step of a rollout.

    Walks the rollout backwards, accumulating ``reward + gamma * future``.
    The accumulated future return is dropped at any step whose done flag
    is truthy.

    Args:
        rewards: 1-D sequence of per-step rewards.
        done_flags: 1-D sequence of per-step done flags, same length as
            ``rewards``.
        gamma: Discount factor.

    Returns:
        A 1-D floating-point array with one discounted return per step.
    """
    rewards = np.asarray(rewards)
    done_flags = np.asarray(done_flags)

    # Always accumulate in floating point: an integer reward array would
    # otherwise truncate every discounted value written into the buffer.
    if np.issubdtype(rewards.dtype, np.floating):
        dtype = rewards.dtype
    else:
        dtype = np.float64
    returns = np.zeros_like(rewards, dtype=dtype)

    # The return after the final collected step starts at zero, so a rollout
    # that is cut off before the environment reports done is not bootstrapped.
    running_return = 0.0
    for t in reversed(range(len(rewards))):
        future = 0.0 if done_flags[t] else running_return
        running_return = rewards[t] + gamma * future
        returns[t] = running_return
    return returns


for episode in range(num_episodes):
    # Per-rollout buffers, refilled from scratch every episode.
    states = []
    actions = []
    rewards = []
    log_probs = []
    next_states = []
    done_flags = []

    state = env.reset()
    episode_reward = 0

    # Collect at most timesteps_per_batch transitions, stopping early if the
    # environment reports that it is done.
    for _ in range(timesteps_per_batch):
        action, log_prob = agent.get_action(state)
        # NOTE(review): the last two values returned by env.step are
        # discarded - confirm that neither one signals the end of an episode.
        next_state, reward, done, _, _ = env.step(action)
        episode_reward += reward

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        # Store the log-probability as a plain Python number.
        log_probs.append(log_prob.item())
        next_states.append(next_state)
        done_flags.append(done)

        state = next_state

        if done:
            break

    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    log_probs_old = np.array(log_probs)
    # next_states is collected but not passed to agent.train below.
    next_states = np.array(next_states)
    done_flags = np.array(done_flags)

    returns = discounted_returns(rewards, done_flags)

    # Advantage estimate: the returns standardised to zero mean and unit
    # variance. The epsilon guards against division by zero when every
    # return in the rollout is identical.
    advantages = (returns - np.mean(returns)) / (np.std(returns) + 1e-8)

    agent.train(states, actions, log_probs_old, returns, advantages)

    print(f'Episode {episode + 1}: Total reward = {episode_reward}')


Episode 1: Total reward = -159.0441020966626
Episode 2: Total reward = -148.87375534883876
Episode 3: Total reward = -146.31784137500017
Episode 4: Total reward = -132.41854221384637
Episode 5: Total reward = -133.32984092666123
Episode 6: Total reward = -130.8936778243714
Episode 7: Total reward = -119.70307742978265
Episode 8: Total reward = -109.90458393111126
Episode 9: Total reward = -114.44778164757476
Episode 10: Total reward = -105.20020358216323
Episode 11: Total reward = -110.75219773650696
Episode 12: Total reward = -90.1276408658876
Episode 13: Total reward = -85.08883225190998
Episode 14: Total reward = -85.4808429752658
Episode 15: Total reward = -94.87915555847856
Episode 16: Total reward = -73.12964399965884
Episode 17: Total reward = -78.62188577991598
Episode 18: Total reward = -73.2919148974748
Episode 19: Total reward = -82.62674316941765
Episode 20: Total reward = -65.08080698793458
Episode 21: Total reward = -64.4138709757507
Episode 22: Total reward = -58.6918340