In [1]:
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical,normal

# env = gym.make("CartPole-v1")
# Environment used throughout the notebook.
env = gym.make("Hopper-v5")


# Matplotlib setup: IPython's display helper is only imported when the active
# backend name contains "inline".
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch pyplot to interactive mode.
plt.ion()

# Pick the compute device in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"
device = torch.device(_device_name)

# Lightweight record pairing an action's log-probability with a value estimate.
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

In [None]:
# Discount factor: weight applied to the running return when accumulating
# discounted rewards (R = r + gamma * R).
gamma = 0.99

In [None]:
class Actor(nn.Module):
    """Gaussian policy network.

    Maps an observation to the mean and standard deviation of an independent
    normal distribution per action dimension.

    Args:
        obs_dim: Length of the observation vector. Defaults to 11, the value
            that was previously hard-coded.
        act_dim: Number of action dimensions. Defaults to 3.
        hidden_dim: Width of the single hidden layer. Defaults to 128.
    """

    # Floor added to the std: softplus can underflow to exactly 0 for very
    # negative inputs, and Normal requires a strictly positive scale.
    MIN_STD = 1e-6

    def __init__(self, obs_dim=11, act_dim=3, hidden_dim=128):
        super().__init__()
        # Attribute names are unchanged so existing state_dicts still load.
        self.linear = nn.Linear(obs_dim, hidden_dim)

        self.action_mean = nn.Linear(hidden_dim, act_dim)  # mean of each action dimension
        self.action_std = nn.Linear(hidden_dim, act_dim)  # std of each action dimension

    def forward(self, x):
        """Compute the policy distribution parameters for ``x``.

        Args:
            x: Float tensor whose last dimension equals ``obs_dim``.

        Returns:
            tuple: ``(action_mean, action_std)``, each with last dimension
            ``act_dim``. ``action_mean`` lies in [-1, 1]; ``action_std`` is
            strictly positive.
        """
        # Hidden non-linearity: without it both heads are purely linear
        # functions of the observation.
        hidden = F.relu(self.linear(x))

        # tanh (instead of relu) lets the mean be negative and keeps it bounded.
        # NOTE(review): assumes the environment's action bounds are [-1, 1] --
        # confirm against env.action_space.
        action_mean = torch.tanh(self.action_mean(hidden))
        action_std = F.softplus(self.action_std(hidden)) + self.MIN_STD

        return action_mean, action_std
    
class Critic(nn.Module):
    """State-value network: one ReLU hidden layer followed by a scalar head."""

    def __init__(self):
        super().__init__()
        # 11 observation features -> 128 hidden units -> 1 value estimate.
        self.linear = nn.Linear(11, 128)
        self.value = nn.Linear(128, 1)

    def forward(self, x):
        """Return the value estimate for ``x`` (last dimension must be 11)."""
        hidden = self.linear(x).relu()
        return self.value(hidden)

In [None]:
# Build the policy and value networks on the selected device (actor first,
# then critic), and give each its own Adam optimizer with the same step size.
actor = Actor().to(device)
critic = Critic().to(device)
actor_optimizer, critic_optimizer = (
    optim.Adam(net.parameters(), lr=3e-2) for net in (actor, critic)
)
# hyperparameters
# float32 machine epsilon as a plain Python float.
eps = float(np.finfo(np.float32).eps)

In [3]:
def get_action(state):
    """Sample an action from the current Gaussian policy.

    Args:
        state: Observation, either a NumPy array or a torch tensor.

    Returns:
        tuple: ``(action, logp)``. ``action`` is a NumPy array with one entry
        per action dimension. ``logp`` is the log-probability of the sampled
        action summed over action dimensions (a scalar tensor for a single
        observation); it stays attached to the autograd graph so it can be
        used directly in a policy-gradient loss.
    """
    # as_tensor accepts ndarrays and tensors alike, and places the input on the
    # same device as the networks.
    state = torch.as_tensor(state, dtype=torch.float32, device=device)

    # Query the policy network itself; the optimizer object is not callable.
    action_mean, action_std = actor(state)

    # A single Normal over all action dimensions replaces the per-dimension loop.
    dist = normal.Normal(action_mean, action_std)
    action = dist.sample()

    # Sum with torch (not numpy) so the gradient path back to the actor is kept.
    logp = dist.log_prob(action).sum(dim=-1)
    return action.cpu().numpy(), logp

def get_value(state):
    """Return the critic's value estimate for ``state`` as a Python float.

    Args:
        state: Observation, either a NumPy array or a torch tensor.

    Returns:
        float: The scalar output of the critic for this observation.
    """
    # Put the input on the same device as the critic; accepts ndarrays and tensors.
    state = torch.as_tensor(state, dtype=torch.float32, device=device)
    # Only the scalar is returned, so no autograd graph is needed here.
    with torch.no_grad():
        state_value = critic(state)
    return state_value.item()

In [None]:
def compute_loss(rewards, acts, values, logp, discount=None):
    """Policy-gradient (actor) loss for one episode.

    The discounted return is computed for every step, the value estimate is
    subtracted to form the advantage, and ``-log_prob * advantage`` is summed
    over the episode.

    Args:
        rewards: Per-step rewards, in time order.
        acts: Per-step actions. Unused; kept so existing callers keep working.
        values: Per-step value estimates (floats or one-element tensors). They
            are treated as constants: no gradient flows through them.
        logp: Per-step log-probability tensors of the actions taken.
        discount: Discount factor. Defaults to the module-level ``gamma``.

    Returns:
        torch.Tensor: Scalar loss. A zero tensor is returned for an empty episode.
    """
    if discount is None:
        discount = gamma

    # Accumulate discounted returns backwards from the final step.
    returns = []
    running_return = 0.0
    for reward in reversed(rewards):
        running_return = float(reward) + discount * running_return
        returns.append(running_return)
    returns.reverse()

    policy_losses = []
    for log_prob, value, ret in zip(logp, values, returns):
        # float() handles both plain floats and one-element tensors.
        advantage = ret - float(value)
        policy_losses.append(-log_prob * advantage)

    if not policy_losses:
        # torch.stack rejects an empty list; return a differentiable zero instead.
        return torch.zeros((), requires_grad=True)

    # Sum once, after the loop, and hand the result back to the caller.
    return torch.stack(policy_losses).sum()

In [None]:
num_episodes = 10000
log_interval = 10
render = False
# Exponential moving average of episode returns; seeded with the first episode.
running_reward = None
# Some environments register no reward threshold; skip the "solved" check then.
reward_threshold = env.spec.reward_threshold if env.spec is not None else None

# run up to num_episodes episodes
for i_episode in range(num_episodes):

    # reset environment and per-episode buffers
    state, _ = env.reset()
    episode_states = []
    episode_rewards = []
    episode_acts = []
    episode_values = []
    episode_logp = []
    # for each episode, only run 9999 steps so that we don't
    # infinite loop while learning
    for t in range(1, 10000):

        # select action from policy and record the critic's estimate
        action, logp = get_action(state)
        value = get_value(state)
        # keep the state the action was chosen in, for the critic update below
        episode_states.append(state)
        # take the action
        state, reward, terminated, truncated, _ = env.step(action)

        episode_rewards.append(reward)
        episode_acts.append(action)
        episode_values.append(value)
        episode_logp.append(logp)
        if terminated or truncated:
            break

    ep_reward = float(sum(episode_rewards))
    if running_reward is None:
        running_reward = ep_reward
    else:
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

    # actor update: zero, backprop and step the SAME optimizer
    actor_loss = compute_loss(episode_rewards, episode_acts, episode_values, episode_logp)
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # critic update: get_value returns detached floats, so the critic is fitted
    # here by regressing its predictions onto the observed discounted returns.
    discounted_returns = []
    running_return = 0.0
    for r in reversed(episode_rewards):
        running_return = float(r) + gamma * running_return
        discounted_returns.append(running_return)
    discounted_returns.reverse()
    states_t = torch.as_tensor(np.asarray(episode_states), dtype=torch.float32, device=device)
    returns_t = torch.as_tensor(discounted_returns, dtype=torch.float32, device=device)
    critic_loss = F.smooth_l1_loss(critic(states_t).squeeze(-1), returns_t)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # log results
    if i_episode % log_interval == 0:
        print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))

    # stop once the running reward passes the environment's registered threshold
    if reward_threshold is not None and running_reward > reward_threshold:
        print("Solved! Running reward is now {} and "
                "the last episode runs to {} time steps!".format(running_reward, t))
        break



In [None]:
def train_one_epoch(batch_size=10000):
    """Collect one batch of episodes and take a single policy-gradient step.

    Episodes are run with the current policy until more than ``batch_size``
    steps have been gathered. Each step's log-probability is weighted by the
    total return R(tau) of the episode it belongs to.

    Args:
        batch_size: Minimum number of environment steps to collect before
            updating. Collection stops at the first episode boundary after
            this many steps.

    Returns:
        tuple: ``(batch_loss, batch_rets, batch_lens)`` -- the scalar loss
        tensor, the list of episode returns, and the list of episode lengths.
    """
    # make some empty lists for logging.
    batch_states = []       # for observations
    batch_acts = []         # for actions
    batch_logps = []        # for log-probabilities of the actions taken
    batch_weights = []      # for R(tau) weighting in policy gradient
    batch_rets = []         # for measuring episode returns
    batch_lens = []         # for measuring episode lengths

    # reset episode-specific variables; reset() returns (observation, info)
    state, _ = env.reset()
    epoch_rewards = []      # rewards accrued in the current episode

    # collect experience by acting in the environment with current policy
    while True:

        # save state
        batch_states.append(np.array(state))

        # act in the environment; get_action returns (action, log-probability)
        act, logp = get_action(state)
        state, reward, terminated, truncated, _ = env.step(act)

        # save action, log-probability, reward
        batch_acts.append(act)
        batch_logps.append(logp)
        epoch_rewards.append(reward)

        if terminated or truncated:
            # if episode is over, record info about episode
            ep_ret, ep_len = sum(epoch_rewards), len(epoch_rewards)
            batch_rets.append(ep_ret)
            batch_lens.append(ep_len)

            # the weight for each logprob(a|s) is R(tau)
            batch_weights += [ep_ret] * ep_len

            # reset the variables the loop actually reads
            state, _ = env.reset()
            epoch_rewards = []

            # end experience loop if we have enough of it
            if len(batch_states) > batch_size:
                break

    # take a single policy gradient update step
    # reshape(-1) guards against log-probs of shape (1,) broadcasting against the weights
    logps = torch.stack(batch_logps).reshape(-1)
    weights = torch.as_tensor(batch_weights, dtype=torch.float32, device=logps.device)
    batch_loss = -(logps * weights).mean()

    # zero, backprop and step the SAME (actor) optimizer
    actor_optimizer.zero_grad()
    batch_loss.backward()
    actor_optimizer.step()
    return batch_loss, batch_rets, batch_lens