In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gymnasium as gym
import gym_anytrading
from collections import deque
import matplotlib.pyplot as plt

class REINFORCE(nn.Module):
    def __init__(self, input_dim, n_actions):
        super(REINFORCE, self).__init__()
        
        self.actor = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions),
            nn.Softmax(dim=-1)
        )
        
    def forward(self, state):
        probs = self.actor(state)
        return probs
    

    def get_log_prob_gradient(self, state, action):
        
        probs = self.forward(state)
        log_probs = torch.log(probs)
        log_prob_selected_action = log_probs[0, action]
        log_prob_selected_action.backward()
        
        gradients = {}
        for param in self.parameters():
            gradients[param] = param.grad.clone()
        self.zero_grad()
        
        return gradients

    def get_log_prob_gradient1(self, state, action):
        
        probs = self.forward(state)
        log_probs = torch.log(probs)
        log_prob_selected_action = log_probs[0, action]
        log_prob_selected_action.backward()
        
        gradients = {}
        for name, param in self.named_parameters():
            gradients[name] = param.grad.clone()
        self.zero_grad()
        
        
        return gradients
    
def calculate_cumulative_rewards(rewards):
    returns = []
    tot = 0
    for r in reversed(rewards):
        tot = r + 0.99 * tot
        returns.insert(0, tot)
    return returns

In [8]:
def train(env, model, episode, T, lr=0.01, gamma=0.99):
    state, _ = env.reset()
    state = state.flatten()
    state = torch.tensor([state], dtype=torch.float32)
    episode_rewards = []
    episode_actions = []
    episode_states = [state]
    total_rewards = 0
    
    for t in range(T):
        
        probs = model(state)
        distribution = torch.distributions.Categorical(probs)
        action = distribution.sample()
        next_state, reward, terminated, truncated, info = env.step(action.item())
        done = terminated or truncated
        total_rewards += reward
        
        episode_rewards.append(reward)
        episode_actions.append(action)
        episode_states.append(next_state)
        if done:
            break
        
        state = torch.tensor(next_state.flatten(), dtype=torch.float32).unsqueeze(0)

    discounted_values = calculate_cumulative_rewards(episode_rewards)
    for i in range(len(episode_actions)):
        
        curr_update_state = episode_states[i]
        curr_update_reward = discounted_values[i]
        curr_update_action = episode_actions[i]
        
        gradients = model.get_log_prob_gradient(curr_update_state, curr_update_action)
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        with torch.no_grad():
            for param in model.parameters():
                param += lr * (gamma ** (i+1)) * curr_update_reward * gradients[param]
    
    return total_rewards, info



In [9]:
# Get the environment
env = gym.make('forex-v0', frame_bound=(10, 500), window_size=10)

input_dim = env.observation_space.shape[1] * env.observation_space.shape[0]
n_actions = env.action_space.n

model = REINFORCE(input_dim, n_actions)
T = env.frame_bound[1] - env.frame_bound[0]

num_episodes = 300
for episode in range(num_episodes):
    reward, info = train(env, model, episode, T)
    print(f"Episode: {episode+1}, Reward: {reward}")
    print("info: ", info)
    print()

TypeError: linear(): argument 'input' (position 1) must be Tensor, not numpy.ndarray