PSEUDO CODE:

Algorithm: REINFORCE
Initialise parameters: step size 𝛼 ∈ (0,1]
Initialise policy network 𝜋 with parameters 𝜽

For episode = 1, 𝑀 do
    Generate an episode trajectory 𝜏~𝜋𝜽
    For 𝑡 = 1, 𝑇 − 1 do
       G <- Sum of rewards
       𝜽 <- update parameters
    End For
End For



In [4]:
import gym
import numpy as np
from collections import deque
import torch 
import torch.nn as nn

# Initialize the environment
env = gym.make('CartPole-v1')

# Define the policy network
class PolicyNetwork(nn.Module):
    def __init__(self, state_size, action_size, hidden_size=32):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        action_probs = torch.softmax(self.fc2(x), dim=-1)
        return action_probs

# Reinforce algorithm
def reinforce(num_episodes, max_steps_per_episode, gamma=0.99, lr=0.01):
    policy_net = PolicyNetwork(state_size=env.observation_space.shape[0],
                               action_size=env.action_space.n)
    optimizer = torch.optim.Adam(policy_net.parameters(), lr=lr)
    
    for episode in range(num_episodes):
        state = env.reset()

        state = state[0]
        episode_rewards = deque()

        
        for step in range(max_steps_per_episode):
            action_probs = policy_net(torch.from_numpy(state).float())
            action = np.random.choice(env.action_space.n, p=action_probs.detach().numpy())
            
            next_state, reward, done, _ = env.step(action)
            episode_rewards.append(reward)
            
            if done:
                break
                
            state = next_state
        
        # Compute the discounted cumulative reward
        discounted_rewards = []
        R = 0
        for r in episode_rewards[::-1]:
            R = r + gamma * R
            discounted_rewards.insert(0, R)
        
        # Normalize the discounted rewards
        discounted_rewards = torch.tensor(discounted_rewards)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)
        
        # Compute the policy gradient and update the policy network
        policy_losses = []
        for reward, log_prob in zip(discounted_rewards, log_probs):
            policy_losses.append(-log_prob * reward)
        
        policy_loss = torch.stack(policy_losses).sum()
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        
    return policy_net

# Train the policy network
policy_net = reinforce(num_episodes=1000, max_steps_per_episode=200)





  if not isinstance(terminated, (bool, np.bool8)):


ValueError: too many values to unpack (expected 4)

In [None]:
# Please write your code for Exercise 1 in this cell or in as many cells as you want ABOVE this cell.
# You should implement your MC agent here.
# Do NOT delete this cell.
from statistics import mean
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical



class Policynetwork(nn.Module):
    def __init__(self, input_size=4, hidden_size1=16, hidden_size2=8, output_size=9):
        super(Policynetwork, self).__init__()
        # Define your network structure
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size2, output_size)
        self.softmax = nn.Softmax(dim=-1)
    
    def forward(self, x):
        # Define the forward pass
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        logits = self.fc3(x)
        logits = torch.clamp(logits, -10, 10)  # Clip values to avoid overflow
        return self.softmax(logits)
        


# YOUR CODE HERE
class MCAgent:
    def __init__(self, env, gamma=0.9, learning_rate=0.1, epsilon = 0.3):
        self.gamma = gamma
        self.env = env
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        
        
    def generate_episode(self, policy_net):        
        # initialised lists and variable
        rewards = []
        next_states = []
        summed_rewards = 0
        actions = []
        
        # reset the env to get initial state
        state = self.env.reset()
        state = torch.tensor(state)
        terminal = False
        episode_reward = 0


        print("start episode")
        # generate an entire episode
        while not terminal:
            if random.random() < self.epsilon:  # With probability epsilon, choose a random action
                action = random.choice(self.env.get_actions().copy())
                actions.append(action)
            else:
                state = state.float()
                # we get the action depending on our policy network
                probs = policy_net(state)
                m = Categorical(probs)
                action = m.sample()
                actions.append(action)
                action = action.item()
  
            next_state, reward, terminal = self.env.step(action)
            
            rewards.append(reward)
            next_states.append(torch.tensor(next_state, dtype=torch.float32))
            
            summed_rewards += reward
#             print(summed_rewards)
            state = torch.tensor(next_state)
            
            
    
            if terminal:
                self.epsilon = max(0.01, self.epsilon * 0.995)
                break
        return rewards, next_states, summed_rewards, actions
    
    
    def learn(self, rewards, next_states, actions, policy_net):
        """
        After each episode, this function will be called. This function allows the agent to 
        calculate the total discounted return from each episode, and append the value into the list that corresponds
        to the first-visited (state, action) tuple in the sample-returns dict.
        """
        returns = [0] * len(rewards)
        
        G_t = 0
        for i in reversed(range(len(rewards))):
            G_t = rewards[i] + self.gamma * G_t
            returns[i] = G_t
        
        # convert lists to tensors
        next_states = torch.stack(next_states)
        actions = torch.tensor(actions)
        returns = torch.tensor(returns)

        # calculate log probs and multiply by returns
        log_probs = torch.log(policy_net(next_states))
        selected_log_probs = log_probs[range(len(actions)), actions]  # Only the log_probs of taken actions
        loss = -torch.sum(selected_log_probs * returns)  # Negative for gradient ascent
        
        optimiser = torch.optim.Adam(policy_net.parameters(), lr=self.learning_rate)
        optimiser.zero_grad()  # Clear previous gradients
        loss.backward()  # Compute new gradients
        optimiser.step()  # Update the parameters


num_agents = 20
num_episodes = 150
env = RacetrackEnv()
mc_rewards = []
policy_net = Policynetwork()

for i in range(num_agents):
    mcagent = MCAgent(env)
    episode_rewards = []
    print(f"Agent {i + 1}")
    
    for episode in range(num_episodes):
        # first generate an episode
        rewards, next_states, summed_rewards, actions = mcagent.generate_episode(policy_net)
        
        #learn from that episode
        mcagent.learn(rewards, next_states, actions, policy_net)
        
        episode_rewards.append(summed_rewards)
        
    mc_rewards.append(episode_rewards)
    