In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gymnasium as gym
import gym_anytrading
from collections import deque
import matplotlib.pyplot as plt
import pandas as pd
import torch.optim as optim



In [2]:
class ActorCritic(nn.Module):
    def __init__(self, input_dim, n_actions):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions),
            nn.Softmax(dim=-1)
        )
        
        self.critic = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
        
    def forward(self, state):
        probs = self.actor(state)
        value = self.critic(state)
        return probs, value
    
    
def compute_loss(log_probs, values, rewards, gamma):
    # Compute returns and advantage
    returns = []
    Gt = 0
    pw = 0
    for reward in reversed(rewards):
        Gt = reward + gamma ** pw * Gt
        returns.insert(0, Gt)
        pw += 1
    
    log_probs = torch.stack(log_probs)
    returns = torch.tensor(returns)
    values = torch.cat(values)
    
    advantage = returns - values
    actor_loss = -(log_probs * advantage).mean()
    critic_loss = advantage.pow(2).mean()
    
    return actor_loss, critic_loss



In [3]:
def train(env, model, episode, T, actor_lr=0.01, critic_lr=0.05, gamma=0.99):
    state, _ = env.reset()
    
    
    log_probs = []
    values = []
    rewards = []
    episode_reward = 0

    for t in range(T):
        state = state.flatten()
        state = torch.tensor([state], dtype=torch.float32)
        
        probs, value = model(state)
        distribution = torch.distributions.Categorical(probs)
        action = distribution.sample()
        next_state, reward, terminated, truncated, info = env.step(action.item())
        done = terminated or truncated
        
        log_prob = distribution.log_prob(action)
        log_probs.append(log_prob)
        values.append(value)
        rewards.append(reward)
        episode_reward += reward
        if done:
            #print(f"Episode {episode} finished with total reward {episode_reward}")
            break
        state = next_state
    actor_loss, critic_loss = compute_loss(log_probs, values, rewards, gamma)
    
    for param in model.actor.parameters():
        if param.grad is not None:
            param.grad.data.zero_()
    actor_loss.backward(retain_graph=True)
    
    for param in model.actor.parameters():
        param.data -= actor_lr * param.grad.data
        
    
    for param in model.critic.parameters():
        if param.grad is not None:
            param.grad.data.zero_()
    critic_loss.backward()
    for param in model.critic.parameters():
        param.data -= critic_lr * param.grad.data
    
    return episode_reward, info


In [4]:
env = gym.make('forex-v0', frame_bound=(10, 500), window_size=10)
input_dim = env.observation_space.shape[1] * env.observation_space.shape[0]
n_actions = env.action_space.n

model = ActorCritic(input_dim, n_actions)
optimizer = optim.Adam(model.parameters(), lr=0.001)
T = env.frame_bound[1] - env.frame_bound[0]

num_episodes = 300
for episode in range(num_episodes):
    reward, info = train(env, model, episode, T)
    print(f"Episode: {episode+1}, Reward: {reward}")
    print("info: ", info)
    print()

  logger.warn(
  state = torch.tensor([state], dtype=torch.float32)


Episode: 1, Reward: 274.09911155700684
info:  {'total_reward': 274.09911155700684, 'total_profit': 0.9732487103798101, 'position': <Positions.Long: 1>}

Episode: 2, Reward: 109.48061943054199
info:  {'total_reward': 109.48061943054199, 'total_profit': 0.9634638999985708, 'position': <Positions.Short: 0>}

Episode: 3, Reward: -464.91026878356934
info:  {'total_reward': -464.91026878356934, 'total_profit': 0.9383310757435527, 'position': <Positions.Long: 1>}

Episode: 4, Reward: -439.9096965789795
info:  {'total_reward': -439.9096965789795, 'total_profit': 0.9411874808597205, 'position': <Positions.Short: 0>}

Episode: 5, Reward: 351.5148162841797
info:  {'total_reward': 351.5148162841797, 'total_profit': 0.9752010745711761, 'position': <Positions.Short: 0>}

Episode: 6, Reward: -300.49800872802734
info:  {'total_reward': -300.49800872802734, 'total_profit': 0.9466241312930493, 'position': <Positions.Long: 1>}

Episode: 7, Reward: -18.918514251708984
info:  {'total_reward': -18.918514251

Episode: 55, Reward: -290.50350189208984
info:  {'total_reward': -290.50350189208984, 'total_profit': 0.9488097327349099, 'position': <Positions.Long: 1>}

Episode: 56, Reward: -224.1194248199463
info:  {'total_reward': -224.1194248199463, 'total_profit': 0.9492979246188287, 'position': <Positions.Short: 0>}

Episode: 57, Reward: -2.6917457580566406
info:  {'total_reward': -2.6917457580566406, 'total_profit': 0.9598276065555743, 'position': <Positions.Short: 0>}

Episode: 58, Reward: 183.29977989196777
info:  {'total_reward': 183.29977989196777, 'total_profit': 0.9683023625003251, 'position': <Positions.Short: 0>}

Episode: 59, Reward: 337.0988368988037
info:  {'total_reward': 337.0988368988037, 'total_profit': 0.9743498378023, 'position': <Positions.Short: 0>}

Episode: 60, Reward: -616.985559463501
info:  {'total_reward': -616.985559463501, 'total_profit': 0.9298250679113595, 'position': <Positions.Short: 0>}

Episode: 61, Reward: -115.08822441101074
info:  {'total_reward': -115.0882

Episode: 108, Reward: -25.103092193603516
info:  {'total_reward': -25.103092193603516, 'total_profit': 0.9568178816568007, 'position': <Positions.Short: 0>}

Episode: 109, Reward: 248.11029434204102
info:  {'total_reward': 248.11029434204102, 'total_profit': 0.970580551068213, 'position': <Positions.Long: 1>}

Episode: 110, Reward: -137.09783554077148
info:  {'total_reward': -137.09783554077148, 'total_profit': 0.9528115270078751, 'position': <Positions.Long: 1>}

Episode: 111, Reward: 234.89952087402344
info:  {'total_reward': 234.89952087402344, 'total_profit': 0.9739205117818599, 'position': <Positions.Short: 0>}

Episode: 112, Reward: 128.5099983215332
info:  {'total_reward': 128.5099983215332, 'total_profit': 0.9661315301959953, 'position': <Positions.Long: 1>}

Episode: 113, Reward: -334.31291580200195
info:  {'total_reward': -334.31291580200195, 'total_profit': 0.9439410465882039, 'position': <Positions.Long: 1>}

Episode: 114, Reward: 380.91421127319336
info:  {'total_reward': 

Episode: 161, Reward: 106.10818862915039
info:  {'total_reward': 106.10818862915039, 'total_profit': 0.9676567861904711, 'position': <Positions.Short: 0>}

Episode: 162, Reward: 397.20892906188965
info:  {'total_reward': 397.20892906188965, 'total_profit': 0.9734347053896356, 'position': <Positions.Short: 0>}

Episode: 163, Reward: -212.30816841125488
info:  {'total_reward': -212.30816841125488, 'total_profit': 0.9539384870045253, 'position': <Positions.Short: 0>}

Episode: 164, Reward: -59.32331085205078
info:  {'total_reward': -59.32331085205078, 'total_profit': 0.9554947576363391, 'position': <Positions.Long: 1>}

Episode: 165, Reward: -95.39484977722168
info:  {'total_reward': -95.39484977722168, 'total_profit': 0.9522449652118995, 'position': <Positions.Short: 0>}

Episode: 166, Reward: -340.9159183502197
info:  {'total_reward': -340.9159183502197, 'total_profit': 0.9419416342592505, 'position': <Positions.Short: 0>}

Episode: 167, Reward: -300.50039291381836
info:  {'total_reward

Episode: 214, Reward: -131.49261474609375
info:  {'total_reward': -131.49261474609375, 'total_profit': 0.9532702114897995, 'position': <Positions.Short: 0>}

Episode: 215, Reward: -608.7207794189453
info:  {'total_reward': -608.7207794189453, 'total_profit': 0.9316905563152768, 'position': <Positions.Long: 1>}

Episode: 216, Reward: -143.88680458068848
info:  {'total_reward': -143.88680458068848, 'total_profit': 0.9497592196942392, 'position': <Positions.Short: 0>}

Episode: 217, Reward: -318.4974193572998
info:  {'total_reward': -318.4974193572998, 'total_profit': 0.9455898040133228, 'position': <Positions.Short: 0>}

Episode: 218, Reward: -18.89824867248535
info:  {'total_reward': -18.89824867248535, 'total_profit': 0.9613478419960609, 'position': <Positions.Long: 1>}

Episode: 219, Reward: 86.49349212646484
info:  {'total_reward': 86.49349212646484, 'total_profit': 0.9655894166544424, 'position': <Positions.Long: 1>}

Episode: 220, Reward: -608.1032752990723
info:  {'total_reward': 

Episode: 267, Reward: -450.87695121765137
info:  {'total_reward': -450.87695121765137, 'total_profit': 0.9368587998904923, 'position': <Positions.Short: 0>}

Episode: 268, Reward: 220.27850151062012
info:  {'total_reward': 220.27850151062012, 'total_profit': 0.9709945824977978, 'position': <Positions.Long: 1>}

Episode: 269, Reward: -478.6956310272217
info:  {'total_reward': -478.6956310272217, 'total_profit': 0.93833144178393, 'position': <Positions.Short: 0>}

Episode: 270, Reward: -138.08727264404297
info:  {'total_reward': -138.08727264404297, 'total_profit': 0.9558979442790017, 'position': <Positions.Long: 1>}

Episode: 271, Reward: 76.10321044921875
info:  {'total_reward': 76.10321044921875, 'total_profit': 0.966149726956486, 'position': <Positions.Long: 1>}

Episode: 272, Reward: -221.70066833496094
info:  {'total_reward': -221.70066833496094, 'total_profit': 0.9511785353627958, 'position': <Positions.Short: 0>}

Episode: 273, Reward: 248.08049201965332
info:  {'total_reward': 2