In [15]:
from collections import namedtuple, deque
import math
import random
from tqdm import tqdm

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# if GPU is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. CartPole environment

In [2]:
""" Environment Information
ref: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py#L17

Action Space
    The action is a `ndarray` with shape `(1,)` which can take values `{0, 1}` indicating the direction
     of the fixed force the cart is pushed with.
    | Num | Action                 |
    |-----|------------------------|
    | 0   | Push cart to the left  |
    | 1   | Push cart to the right |
    **Note**: The velocity that is reduced or increased by the applied force is not fixed and it depends on the angle
     the pole is pointing. The center of gravity of the pole varies the amount of energy needed to move the cart underneath it
Observation Space
    The observation is a `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:
    | Num | Observation           | Min                 | Max               |
    |-----|-----------------------|---------------------|-------------------|
    | 0   | Cart Position         | -4.8                | 4.8               |
    | 1   | Cart Velocity         | -Inf                | Inf               |
    | 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |
    | 3   | Pole Angular Velocity | -Inf                | Inf               |
    **Note:** While the ranges above denote the possible values for observation space of each element,
        it is not reflective of the allowed values of the state space in an unterminated episode. Particularly:
    -  The cart x-position (index 0) can be take values between `(-4.8, 4.8)`, but the episode terminates
       if the cart leaves the `(-2.4, 2.4)` range.
    -  The pole angle can be observed between  `(-.418, .418)` radians (or **±24°**), but the episode terminates
       if the pole angle is not in the range `(-.2095, .2095)` (or **±12°**)
Rewards
    Since the goal is to keep the pole upright for as long as possible, a reward of `+1` for every step taken,
    including the termination step, is allotted. The threshold for rewards is 475 for v1.
Episode End
    The episode ends if any one of the following occurs:
    1. Termination: Pole Angle is greater than ±12°
    2. Termination: Cart Position is greater than ±2.4 (center of the cart reaches the edge of the display)
    3. Truncation: Episode length is greater than 500 (200 for v0)
"""
env = gym.make("CartPole-v1")
# env.reset()
# env.render()

Sample environment image

<img width=300 src="cartpole.png" />

In [3]:
print("observation_space: ", env.observation_space)
print("action_space: ", env.action_space)
state = env.reset()
print("sample obs: ", state)

observation_space:  Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
action_space:  Discrete(2)
sample obs:  [ 0.01738542 -0.02711608 -0.03005168 -0.00942616]


In [4]:
# sample step
new_state, reward, terminate, info = env.step(env.action_space.sample())
print("sample step: ", (new_state, reward, terminate, info))

sample step:  (array([ 0.0168431 ,  0.16842367, -0.0302402 , -0.31143722], dtype=float32), 1.0, False, {})


# 2. Actor-Critic (base class)

In [5]:
class ActorNN(nn.Module):
    """ Actor Network represents the agent's policy
    forward output: probability distribution over actions
    """

    def __init__(self, n_observations, n_actions):
        super().__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.softmax(self.layer3(x), dim=-1)  # dim need to be -1 to prevent NaN results
        return x
    
class CriticNN(nn.Module):
    """ Critic Network represents the agent's value function, typically represented as a function approximator 
    forward output: expected cumulative reward from a given state (or state-action pair)
    """

    def __init__(self, n_observations, n_actions, is_value=False):
        super().__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        if is_value:
            self.layer3 = nn.Linear(128, 1)
        else:
            self.layer3 = nn.Linear(128, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x

In [6]:
sample_actor = ActorNN(n_observations=env.observation_space.shape[0], n_actions=env.action_space.n)
pred = sample_actor(torch.tensor(state).float())
print("actor net (probs): ", pred)

sample_critic = CriticNN(n_observations=env.observation_space.shape[0], n_actions=env.action_space.n)
pred = sample_critic(torch.tensor(state).float())
print("critic net (value): ", pred)

sample_critic = CriticNN(n_observations=env.observation_space.shape[0], n_actions=env.action_space.n, is_value=True)
pred = sample_critic(torch.tensor(state).float())
print("critic net (value): ", pred)

actor net (probs):  tensor([0.4941, 0.5059], grad_fn=<SoftmaxBackward0>)
critic net (value):  tensor([-0.0517, -0.0616], grad_fn=<AddBackward0>)
critic net (value):  tensor([-0.0672], grad_fn=<AddBackward0>)


In [7]:
class ActorCritic:
    def __init__(self, env, gamma=0.99, learning_rate=3e-4, critic_value=False, device="cpu"):
        self.env = env
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.critic_value = critic_value
        self.device = device

        # get number of actions and observations
        self._n_observations = self.env.observation_space.shape[0]
        self._n_actions = self.env.action_space.n

        # setup NN model
        self.actor_net = ActorNN(self._n_observations, self._n_actions).to(self.device)
        self.critic_net = CriticNN(self._n_observations, self._n_actions, is_value=self.critic_value).to(self.device)

        # setup optimizer
        self.actor_optimizer = optim.Adam(self.actor_net.parameters(), lr=self.learning_rate)
        self.critic_optimizer = optim.Adam(self.critic_net.parameters(), lr=self.learning_rate)
        
    def choose_action(self, state):
        # TODO: apply epsilon-greedy for exploration
        _state = torch.from_numpy(state).float()
        action_probs = self.actor_net(_state)
        m = Categorical(action_probs)
        selected_action = m.sample()
        log_prop = m.log_prob(selected_action)
        return selected_action.item(), log_prop

    def compute_loss(self):
        """loss calculation for actor and critic network"""
        # return actor_loss, critic_loss
        raise NotImplementedError()

    def train(self, num_episodes=1000, max_steps=500, batch_size=1, log_interval=100, solve_score=450):
        episode_rewards = list()
        experience_buffer = list()
        running_rewards = deque(maxlen=100)  # track recent last return to identify if the environment is solved

        for episode in range(num_episodes):
            state = env.reset()
            episode_reward = 0

            for _ in range(max_steps):
                action, log_prop = self.choose_action(state)
                next_state, reward, done, _ = env.step(action)
                episode_reward += reward

                # store transitions
                experience_buffer.append((state, action, reward, next_state, done, log_prop))

                state = next_state

                if done or len(experience_buffer) >= batch_size:
                    # update actor and critic network using the collected experiences
                    states, actions, rewards, next_states, dones, log_props = zip(*experience_buffer)
                    self.update(states, actions, rewards, next_states, dones, log_props)
                    # clear the buffer
                    experience_buffer.clear()

                    if done:
                        break

            episode_rewards.append(episode_reward)
            
            # log results
            running_rewards.append(episode_reward)
            running_mean = np.array(running_rewards).mean()
            running_std_dev = np.array(running_rewards).std()
            running_max = np.array(running_rewards).max()
            if (episode+1) % log_interval == 0:
                print(f"Episode {episode+1}\taverage reward: {running_mean:.2f}, std dev: {running_std_dev:.2f}, max: {running_max:.2f}")
            
            # check for early stopping
            if running_mean >= solve_score and len(running_rewards) >= 100:
                print(f"Solved! Running reward is now {running_mean:.2f}")
                print(f"Episode {episode+1}\taverage reward: {running_mean:.2f}, std dev: {running_std_dev:.2f}, max: {running_max:.2f}")
                break
            
        return episode_rewards

    def update(self, states, actions, rewards, next_states, dones, log_props):
        # preprocess transitions (change to numpy-array first to speed up the tensor conversion process)
        states = torch.tensor(np.array(states), dtype=torch.float32)
        actions = torch.tensor(np.array(actions), dtype=torch.int64)
        rewards = torch.tensor(np.array(rewards), dtype=torch.float32)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32)
        dones = torch.tensor(np.array(dones), dtype=torch.float32)
        log_props = torch.stack(log_props)

        # calculate loss on actor and critic network
        actor_loss, critic_loss = self.compute_loss(states, actions, rewards, next_states, dones, log_props)

        # perform one step of the optimization on the actor network
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # perform one step of the optimization on the critic network
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def get_trajectory(self, max_steps=500, render=False):
        trajectory = list()
        done = False  # incase of early loop-termination (max_steps) before the environment terminated
        state = self.env.reset()

        for _ in range(max_steps):
            selected_action, _ = self.choose_action(state)
            next_state, reward, done, _ = self.env.step(selected_action)
            trajectory.append((state, selected_action, reward))
            state = next_state
            
            if render:
                self.env.render()

            if done:
                break

        return trajectory, done
    
    def visualize_policy(self, num_episodes=1, max_steps=500):
        for _ in range(num_episodes):
            _ = self.get_trajectory(max_steps=max_steps, render=True)
            
    def evaluate_policy(self, num_episodes=100, max_steps=500):
        win = 0
        for _ in range(num_episodes):
            trajectory, _ = self.get_trajectory(max_steps=max_steps)
            # CUSTOMIZE: for cartpole environment
            if len(trajectory) >= 490:
                win += 1
        return win / num_episodes

# 3. Advantage Actor-Critic (A2C)

In [58]:
class AdvantageActorCritic(ActorCritic):
    critic_value = True  # set Critic Network to estimate the Value of a given state

    def __init__(self, env, gamma=0.99, learning_rate=3e-3, device="cpu"):
        super().__init__(env, gamma, learning_rate, self.critic_value, device)

    def _compute_returns(self, next_value, rewards, masks):
        R = next_value
        returns = list()
        for idx in reversed(range(len(rewards))):
            R = rewards[idx] + self.gamma * R * masks[idx]
            returns.insert(0, R)
        return torch.tensor(returns, dtype=torch.float32)

    def compute_loss(self, states, actions, rewards, next_states, dones, log_props):
        masks = 1 - dones  # prevent the target Q-value from being updated when an episode ends (done: mask=0, not done: mask=1)

        # estimate the Values of current state and next state using Critic Network
        values = self.critic_net(states).squeeze(-1)
        batch_next_value = self.critic_net(next_states[-1])  # for the last state in the batch

        # calculate the target value (expected real value at current state)
        expected_returns = self._compute_returns(batch_next_value, rewards, masks)
        target_values = rewards + self.gamma * expected_returns * masks

        # compute MSE loss for Critic Network
        mse_loss = nn.MSELoss()
        critic_loss = mse_loss(values, target_values.detach())

        # compute loss for Actor Network 
        # calculate advantage function
        advantages = expected_returns - values.detach()
        actor_loss = -(log_props * advantages.detach()).mean()

        return actor_loss, critic_loss

In [59]:
env = gym.make("CartPole-v1")
a2c_agent = AdvantageActorCritic(env)
episode_rewards = a2c_agent.train(num_episodes=int(1e3), max_steps=200, batch_size=12, log_interval=100, solve_score=200)

Episode 100	average reward: 63.53, std dev: 52.98, max: 200.00
Episode 200	average reward: 174.20, std dev: 43.24, max: 200.00
Solved! Running reward is now 200.00
Episode 294	average reward: 200.00, std dev: 0.00, max: 200.00


In [60]:
# test the trained agent in 1 episode
trajectory, done = a2c_agent.get_trajectory(max_steps=501)
print("done status: ", done)
print("step taken (maximum 500): ", len(trajectory))

done status:  True
step taken (maximum 500):  500


In [61]:
# test the trained agent
test_episodes = 100
win_rate = a2c_agent.evaluate_policy(num_episodes=test_episodes)
print(f'agent win rate: {win_rate*100 :.2f}% from {test_episodes} test episodes')

agent win rate: 100.00% from 100 test episodes


In [63]:
# visualize the trained agent on a separate window
a2c_agent.visualize_policy(num_episodes=10, max_steps=200)

# 4. TD Actor-Critic

In [69]:
class TDActorCritic(ActorCritic):
    critic_value = True  # set Critic Network to estimate the Value of a given state

    def __init__(self, env, gamma=0.99, learning_rate=3e-3, device="cpu"):
        super().__init__(env, gamma, learning_rate, self.critic_value, device)

    def compute_loss(self, states, actions, rewards, next_states, dones, log_props):
        masks = 1 - dones  # prevent the target Q-value from being updated when an episode ends (done: mask=0, not done: mask=1)

        # estimate the Values of current state and next state using Critic Network
        values = self.critic_net(states).squeeze(-1)
        batch_next_value = self.critic_net(next_states[-1])  # for the last state in the batch

        # calculate the target value (expected real value at current state)
        expected_returns = self._compute_returns(batch_next_value, rewards, masks)
        target_values = rewards + self.gamma * expected_returns * masks

        # compute MSE loss for Critic Network
        mse_loss = nn.MSELoss()
        critic_loss = mse_loss(values, target_values.detach())

        # compute loss for Actor Network
        # calculate TD error
        td_error = target_values - values.detach()
        actor_loss = -(log_props * td_error.detach()).mean()

        return actor_loss, critic_loss

In [73]:
env = gym.make("CartPole-v1")
td_ac_agent = TDActorCritic(env)
episode_rewards = td_ac_agent.train(num_episodes=int(1e3), max_steps=200, batch_size=12, log_interval=100, solve_score=200)

Episode 100	average reward: 62.69, std dev: 66.82, max: 200.00
Episode 200	average reward: 136.11, std dev: 75.04, max: 200.00
Episode 300	average reward: 168.15, std dev: 44.55, max: 200.00
Solved! Running reward is now 200.00
Episode 349	average reward: 200.00, std dev: 0.00, max: 200.00


In [74]:
# test the trained agent in 1 episode
trajectory, done = td_ac_agent.get_trajectory(max_steps=501)
print("done status: ", done)
print("step taken (maximum 500): ", len(trajectory))

done status:  True
step taken (maximum 500):  500


In [75]:
# test the trained agent
test_episodes = 100
win_rate = td_ac_agent.evaluate_policy(num_episodes=test_episodes)
print(f'agent win rate: {win_rate*100 :.2f}% from {test_episodes} test episodes')

agent win rate: 100.00% from 100 test episodes


In [76]:
# visualize the trained agent on a separate window
td_ac_agent.visualize_policy(num_episodes=10, max_steps=500)

# 5. Q Actor-Critic (not success)

In [8]:
class ActorNN2(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(ActorNN2, self).__init__()
        self.layer1 = nn.Linear(n_observations, 64)
        self.dropout = nn.Dropout(p=0.6)
        self.layer2 = nn.Linear(64, n_actions)

    def forward(self, x):
        x = self.layer1(x)
        x = F.relu(self.dropout(x))
        x = F.softmax(self.layer2(x), dim=-1)  # dim need to be -1 to prevent NaN results
        return x

In [38]:
class QActorCritic(ActorCritic):
    critic_value = False  # set Critic Network to estimate Q-Values
    
    def __init__(self, env, gamma=0.99, learning_rate=3e-4, epsilon=0.1, device="cpu"):
        super().__init__(env, gamma, learning_rate, self.critic_value, device)
        self.actor_net = ActorNN2(self._n_observations, self._n_actions).to(self.device)
        self.epsilon = epsilon

    # apply epsilon-greedy policy
    def choose_action(self, state):
        _state = torch.from_numpy(state).float()
        action_probs = self.actor_net(_state)
        if random.random() < self.epsilon:
            # Random action
            selected_action = np.random.choice(self._n_actions)
        else:
            m = Categorical(action_probs)
            selected_action = m.sample().item()
        log_prop = torch.log(action_probs[selected_action])
        return selected_action, log_prop

    def compute_loss(self, states, actions, rewards, next_states, dones, log_props):
        masks = 1 - dones  # prevent the target Q-value from being updated when an episode ends (done: mask=0, not done: mask=1)

        # estimate the Q-values of current state and next state using Critic Network
        q_values = self.critic_net(states)
        batch_next_q_values = self.critic_net(next_states[-1])
        
        # calculate the target Q-value (expected real Q-value at current state)
        selected_q_values = q_values.gather(1, actions.unsqueeze(-1)).squeeze(-1)
        
        # calculate the target value (expected real value at current state)
        batch_max_q_value = batch_next_q_values.max(dim=-1).values
        target_q_values = rewards + self.gamma * batch_max_q_value * masks

        # compute MSE loss for Critic Network
        mse_loss = nn.MSELoss()
        critic_loss = mse_loss(selected_q_values, target_q_values.detach())

        # compute loss for Actor Network
        # calculate advantage function
        qadvantages = target_q_values - selected_q_values.detach()
        actor_loss = -(log_props * qadvantages.detach()).mean()

        return actor_loss, critic_loss

In [39]:
env = gym.make("CartPole-v1")
q_ac_agent = QActorCritic(env)
episode_rewards = q_ac_agent.train(num_episodes=int(5e3), max_steps=200, batch_size=12, log_interval=250, solve_score=200)

Episode 250	average reward: 23.06, std dev: 11.73, max: 59.00
Episode 500	average reward: 20.50, std dev: 10.75, max: 67.00
Episode 750	average reward: 20.03, std dev: 8.41, max: 64.00
Episode 1000	average reward: 20.99, std dev: 10.93, max: 62.00
Episode 1250	average reward: 20.57, std dev: 12.03, max: 77.00
Episode 1500	average reward: 20.40, std dev: 10.16, max: 62.00
Episode 1750	average reward: 20.94, std dev: 10.54, max: 57.00
Episode 2000	average reward: 22.29, std dev: 10.62, max: 77.00
Episode 2250	average reward: 22.62, std dev: 12.30, max: 66.00
Episode 2500	average reward: 21.78, std dev: 10.15, max: 58.00
Episode 2750	average reward: 19.85, std dev: 8.87, max: 47.00
Episode 3000	average reward: 21.18, std dev: 10.35, max: 75.00
Episode 3250	average reward: 19.55, std dev: 10.44, max: 77.00
Episode 3500	average reward: 19.92, std dev: 10.19, max: 72.00
Episode 3750	average reward: 19.40, std dev: 10.95, max: 59.00
Episode 4000	average reward: 21.70, std dev: 9.90, max: 54.0

In [40]:
# test the trained agent in 1 episode
trajectory, done = q_ac_agent.get_trajectory(max_steps=501)
print("done status: ", done)
print("step taken (maximum 500): ", len(trajectory))

done status:  True
step taken (maximum 500):  11


In [643]:
# test the trained agent
test_episodes = 100
win_rate = q_ac_agent.evaluate_policy(num_episodes=test_episodes)
print(f'agent win rate: {win_rate*100 :.2f}% from {test_episodes} test episodes')

agent win rate: 0.00% from 100 test episodes


In [644]:
# visualize the trained agent on a separate window
q_ac_agent.visualize_policy(num_episodes=10)