In [32]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Normal
from collections import namedtuple

env = gym.make("MountainCarContinuous-v0",render_mode = "rgb_array",goal_velocity =  0.1)

In [33]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super(Actor, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim)
        )
        self.log_std = nn.Parameter(torch.zeros(action_dim))  

    def forward(self, state):
        mean = self.net(state)
        std = torch.exp(self.log_std)
        return mean, std

class Critic(nn.Module):
    def __init__(self, state_dim, hidden_dim=128):
        super(Critic, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, state):
        return self.net(state)

In [34]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
print(device)

cuda


In [35]:
def train_actor_critic(env, actor, critic, actor_optimizer, critic_optimizer, num_episodes=500, gamma=0.99,num_steps = 1000):
    Transition = namedtuple('Transition', ['state', 'action', 'reward', 'next_state', 'done'])
    memory = []

    for episode in range(num_episodes):
        state, _ = env.reset()
        state = torch.tensor(state, dtype=torch.float32).to(device)
        total_reward = 0

        for t in range(num_steps):  
            mean, std = actor(state)
            dist = Normal(mean, std)
            action = dist.sample()
            action_clipped = action.clamp(env.action_space.low[0], env.action_space.high[0])

    
            next_state, reward, done, _, _ = env.step([action_clipped.item()])
            next_state = torch.tensor(next_state, dtype=torch.float32).to(device)
            reward = torch.tensor(reward, dtype=torch.float32).to(device)
            done = torch.tensor(done, dtype=torch.float32).to(device)

          
            memory.append(Transition(state, action, reward, next_state, done))

          
            state = next_state
            total_reward += reward.item()

            if done  == True:
                break

        
        returns = []
        R = 0
        for transition in reversed(memory):
            R = transition.reward + gamma * R * (1 - transition.done)
            returns.insert(0, R)

        returns = torch.tensor(returns, dtype=torch.float32).to(device)
        states = torch.stack([transition.state for transition in memory]).to(device)
        actions = torch.stack([transition.action for transition in memory]).to(device)

 
        values = critic(states).squeeze()
        critic_loss = ((returns - values)*(returns-values)).mean()
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()

      
        advantages = returns - values.detach()
        mean, std = actor(states)
        dist = Normal(mean, std)
        log_probs = dist.log_prob(actions).sum(dim=-1)
        actor_loss = -(log_probs * advantages).mean()
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()

        if episode%50 == 0:
            print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Steps: {t + 1}")

        memory.clear()

    print("Training completed!")
    return actor

In [36]:
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import torch
from torch.distributions import Normal
import time

def observe_policy(env, actor, num_episodes=5):
    for episode in range(num_episodes):
        state, _ = env.reset()
        state = torch.tensor(state, dtype=torch.float32)
        total_reward = 0

        # Set up the plot
        fig, ax = plt.subplots(figsize=(6, 4))

        while True:
            # Get the action from the actor
            mean, std = actor(state)
            dist = Normal(mean, std)
            action = dist.sample()
            action_clipped = action.clamp(env.action_space.low[0], env.action_space.high[0])

            # Take the action and observe the next state and reward
            next_state, reward, done, _, _ = env.step([action_clipped.item()])
            state = torch.tensor(next_state, dtype=torch.float32)
            total_reward += reward

            # Render the environment as an RGB array
            frame = env.render()

            # Display the frame in the notebook
            ax.clear()
            ax.imshow(frame)
            ax.axis('off')
            display(fig)
            clear_output(wait=True)

            # End the episode if done
            if done:
                print(f"Episode {episode + 1} finished with reward: {total_reward:.2f}")
                break

            time.sleep(0.002)  # Slow down the animation for visibility

        plt.close(fig)  # Close the figure after each episode

    env.close()


In [None]:

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

actor = Actor(state_dim, action_dim).to(device)
critic = Critic(state_dim).to(device).to(device)
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-1)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-1)

trained_actor = train_actor_critic(env, actor, critic, actor_optimizer, critic_optimizer,num_episodes=800,num_steps=1000)


Episode 0, Total Reward: -5.12, Steps: 100
Episode 50, Total Reward: -5.45, Steps: 100
Episode 100, Total Reward: -3.46, Steps: 100
Episode 150, Total Reward: -1.43, Steps: 100
Episode 200, Total Reward: -0.46, Steps: 100
Episode 250, Total Reward: -0.34, Steps: 100
Episode 300, Total Reward: -0.30, Steps: 100
Episode 350, Total Reward: -0.22, Steps: 100
Episode 400, Total Reward: -0.14, Steps: 100
Episode 450, Total Reward: -0.14, Steps: 100
Episode 500, Total Reward: -0.12, Steps: 100
Episode 550, Total Reward: -0.10, Steps: 100
Episode 600, Total Reward: -0.17, Steps: 100
Episode 650, Total Reward: -10.00, Steps: 100
Episode 700, Total Reward: -10.00, Steps: 100
Episode 750, Total Reward: -10.00, Steps: 100
Training completed!
