In [None]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical,normal

# Environment under training. The CartPole line is kept as a quick alternative.
# env = gym.make("CartPole-v1")
env = gym.make("Hopper-v5")


# Matplotlib setup: the IPython display helper is only imported when the
# active backend is an inline one; interactive plotting is switched on either way.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# Compute device, in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# One record per environment step: the log-probability of the chosen action
# and the critic's value estimate for the state it was chosen in.
SavedAction = namedtuple('SavedAction', ('log_prob', 'value'))

In [None]:
# Float32 machine epsilon as a plain Python float; added to denominators and
# standard deviations elsewhere in the notebook to keep them strictly positive.
eps = float(np.finfo(np.float32).eps)

In [None]:
class Policy(nn.Module):
    """
    Actor and critic in one model, for a continuous action space.

    A shared hidden layer feeds three heads:
      * ``action_head`` - mean of a Gaussian for each action dimension,
      * ``action_std``  - standard deviation of that Gaussian,
      * ``value_head``  - scalar estimate of the state value.

    Args:
        obs_dim: length of the observation vector (default 11).
        action_dim: number of action dimensions (default 3).
        hidden_dim: width of the shared hidden layer (default 128).

    Attributes:
        saved_actions: per-step buffer filled by the action-selection code.
        rewards: per-step reward buffer filled by the training loop.
    """

    # Smallest standard deviation the actor may emit (float32 machine epsilon),
    # so the Normal distribution built from it is always well defined.
    _MIN_STD = torch.finfo(torch.float32).eps

    def __init__(self, obs_dim=11, action_dim=3, hidden_dim=128):
        super().__init__()
        self.affine1 = nn.Linear(obs_dim, hidden_dim)

        # actor's layers
        self.action_head = nn.Linear(hidden_dim, action_dim)  # mean for each action
        self.action_std = nn.Linear(hidden_dim, action_dim)  # std for each action
        # critic's layer
        self.value_head = nn.Linear(hidden_dim, 1)

        # action & reward buffer
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """
        Forward pass of both actor and critic.

        Args:
            x: float tensor whose last dimension is ``obs_dim``.

        Returns:
            A 3-tuple ``(action_mean, action_std, state_values)``:
            the Gaussian mean per action dimension (squashed into [-1, 1]),
            the strictly positive standard deviation per action dimension,
            and the critic's value estimate (last dimension of size 1).
        """
        # Without a nonlinearity here every head would be a purely affine
        # function of the observation.
        x = F.relu(self.affine1(x))

        # actor: tanh keeps the mean inside the symmetric [-1, 1] range and,
        # unlike relu, lets the policy command negative actions.
        action_mean = torch.tanh(self.action_head(x))
        # softplus makes the std positive; the floor keeps it away from zero.
        action_std = F.softplus(self.action_std(x)) + self._MIN_STD

        # critic: evaluates being in the state s_t
        state_values = self.value_head(x)

        return action_mean, action_std, state_values

In [None]:
# Build the shared actor-critic network and the Adam optimizer that updates
# every one of its parameters.
model = Policy()
optimizer = optim.Adam(params=model.parameters(), lr=0.03)


In [None]:
gamma = 0.99 # discount factor for rewards
def select_action(state):
    """
    Sample a continuous action for ``state`` from the current policy.

    Runs the module-level ``model`` on the observation, builds one Normal
    distribution per action dimension from the predicted means and standard
    deviations, samples from it, and appends a ``SavedAction`` holding the
    joint log-probability of the sample and the critic's value estimate to
    ``model.saved_actions``.

    Args:
        state: observation as a NumPy array (any float dtype).

    Returns:
        The sampled action as a float32 NumPy array, one entry per action
        dimension.
    """
    state = torch.as_tensor(state, dtype=torch.float32)
    action_mean, action_std, state_value = model(state)

    # One vectorised Normal covers every action dimension, so each component
    # is sampled from - and scored against - its own mean and std.
    dist = normal.Normal(action_mean, action_std)

    # sample() does not track gradients; the gradient reaches the network
    # through log_prob below.
    action = dist.sample()

    # Dimensions are independent, so the joint log-probability is the sum of
    # the per-dimension log-probabilities.
    log_prob = dist.log_prob(action).sum(dim=-1)

    # save to action buffer
    model.saved_actions.append(SavedAction(log_prob, state_value))

    # the action vector to hand to the environment
    return action.detach().cpu().numpy()


def finish_episode():
    """
    Training code. Calculates actor and critic loss and performs backprop.

    Reads the rewards and saved actions collected on the module-level
    ``model`` during the last episode, turns the rewards into discounted
    returns (using ``gamma``), takes one optimizer step on the summed actor
    and critic losses, and finally empties both buffers.

    An episode with no recorded steps only clears the buffers. Returns are
    standardised only when there are at least two of them, because the sample
    standard deviation of a single value is NaN.
    """
    saved_actions = model.saved_actions

    # Nothing was collected: there is no loss to build, just reset the buffers.
    if not model.rewards or not saved_actions:
        del model.rewards[:]
        del model.saved_actions[:]
        return

    # Discounted return for every step, accumulated from the last step
    # backwards and then flipped into chronological order.
    running_return = 0.0
    returns = []
    for r in reversed(model.rewards):
        running_return = float(r) + gamma * running_return
        returns.append(running_return)
    returns.reverse()

    # float32 matches the network outputs; rewards may arrive as float64.
    returns = torch.tensor(returns, dtype=torch.float32)
    if returns.numel() > 1:
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_losses = [] # list to save actor (policy) loss
    value_losses = [] # list to save critic (value) loss
    for (log_prob, value), ret in zip(saved_actions, returns):
        # value.item() detaches the critic so the actor loss does not
        # back-propagate into the value head.
        advantage = ret - value.item()

        # actor (policy) loss; the sum collapses a per-dimension log_prob
        # vector to a scalar and is a no-op for an already-scalar log_prob
        policy_losses.append((-log_prob * advantage).sum())

        # critic (value) loss using L1 smooth loss
        value_losses.append(F.smooth_l1_loss(value, ret.reshape(value.shape)))

    # reset gradients
    optimizer.zero_grad()

    # sum up all the values of policy_losses and value_losses
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

    # perform backprop
    loss.backward()
    optimizer.step()

    # reset rewards and action buffer
    del model.rewards[:]
    del model.saved_actions[:]



In [None]:

running_reward = 10  # exponentially smoothed episode reward (starting value)
num_episodes = 10000
log_interval = 10
render=True  # currently unused: the render call in the loop is commented out
max_steps_per_episode = 9999  # hard cap so a single rollout can never loop forever
reward_smoothing = 0.05  # weight of the newest episode in running_reward

# An environment may have no spec, or a spec without a reward threshold
# (None). Comparing a float with None raises TypeError, so resolve it once
# and skip the "solved" check when no threshold is available.
reward_threshold = getattr(env.spec, "reward_threshold", None)

# run at most num_episodes episodes
for i_episode in range(num_episodes):

    # reset environment and episode reward
    state, _ = env.reset()
    ep_reward = 0

    # roll out one episode, bounded by max_steps_per_episode
    for t in range(1, max_steps_per_episode + 1):

        # select action from policy
        action = select_action(state)

        # take the action
        state, reward, terminated, truncated, _ = env.step(action)

        # if render:
        #     env.render()

        model.rewards.append(reward)
        ep_reward += reward
        if terminated or truncated:
            break

    # update cumulative reward
    running_reward = reward_smoothing * ep_reward + (1 - reward_smoothing) * running_reward

    # perform backprop
    finish_episode()

    # log results
    if i_episode % log_interval == 0:
        print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))

    # stop early once the smoothed reward passes the environment's threshold
    if reward_threshold is not None and running_reward > reward_threshold:
        print("Solved! Running reward is now {} and "
                "the last episode runs to {} time steps!".format(running_reward, t))
        break

