In [1]:
from tqdm import tqdm
from collections import deque

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

# if GPU is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Lunar Lander Environment

In [2]:
""" Environment Information
ref: https://github.com/openai/gym/blob/master/gym/envs/box2d/lunar_lander.py#L75

Action Space
    There are four discrete actions available: do nothing, fire left
    orientation engine, fire main engine, fire right orientation engine.
Observation Space
    The state is an 8-dimensional vector: the coordinates of the lander in `x` & `y`, its linear
    velocities in `x` & `y`, its angle, its angular velocity, and two booleans
    that represent whether each leg is in contact with the ground or not.
Rewards
    For each step, the reward:
    - is increased/decreased the closer/further the lander is to the landing pad.
    - is increased/decreased the slower/faster the lander is moving.
    - is decreased the more the lander is tilted (angle not horizontal).
    - is increased by 10 points for each leg that is in contact with the ground.
    - is decreased by 0.03 points each frame a side engine is firing.
    - is decreased by 0.3 points each frame the main engine is firing.
    The episode receive an additional reward of -100 or +100 points for crashing or landing safely respectively.
    An episode is considered a solution if it scores at least 200 points.
"""
env = gym.make("LunarLander-v2")
# env.reset()
# env.render()

Sample environment image

<img width=300 src="lunar_lander.png" />

In [3]:
print("observation_space: ", env.observation_space)
print("action_space: ", env.action_space)
state = env.reset()
print("sample obs: ", state)

observation_space:  Box([-inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf], (8,), float32)
action_space:  Discrete(4)
sample obs:  [ 0.00572481  1.402558    0.57985747 -0.37165737 -0.00662694 -0.13134624
  0.          0.        ]


In [4]:
# sample step
new_state, reward, done, info = env.step(1)
print("sample step: ", (new_state, reward, done, info))

sample step:  (array([ 0.01137152,  1.3936093 ,  0.5692361 , -0.39774814, -0.01114556,
       -0.09038049,  0.        ,  0.        ], dtype=float32), -0.1594018753271473, False, {})


# 2. REINFORCE: Vanilla Policy Gradient

In [7]:
class VanillaNN(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(VanillaNN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 64)
        self.dropout = nn.Dropout(p=0.6)
        self.layer2 = nn.Linear(64, n_actions)

    def forward(self, x):
        x = self.layer1(x)
        x = F.relu(self.dropout(x))
        x = F.softmax(self.layer2(x), dim=-1)  # dim need to be -1 to prevent NaN results
        return x

In [22]:
sample_nn = VanillaNN(n_observations=8, n_actions=4)
pred = sample_nn(torch.tensor(state).float())
print(pred)

tensor([0.2157, 0.3474, 0.2500, 0.1869], grad_fn=<SoftmaxBackward0>)


In [8]:
class Reinforce:
    def __init__(self, env, gamma=0.99, learning_rate=1e-2):
        self.env = env
        self.gamma = gamma
        self.learning_rate = learning_rate

        # get number of actions and observations
        self._n_observations = self.env.observation_space.shape[0]
        self._n_actions = self.env.action_space.n

        # setup NN model
        self.policy_net = VanillaNN(self._n_observations, self._n_actions).to(device)

        # setup optimizer
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.esp = np.finfo(np.float32).eps.item() # prevent division by zero

    def get_trajectory(self, max_steps=500, render=False):
        trajectory = list()
        done = False  # incase of early loop-termination (max_steps) before the environment terminated
        state = self.env.reset()

        for _ in range(max_steps):
            _state = torch.from_numpy(state).float()
            action_probs = self.policy_net(_state)
            m = Categorical(action_probs)
            selected_action = m.sample()
            log_prop = m.log_prob(selected_action)
            next_state, reward, done, _ = self.env.step(selected_action.item())
            trajectory.append((state, selected_action, reward, log_prop))
            state = next_state
            
            if render:
                self.env.render()

            if done:
                break

        return trajectory, done

    def train(self, num_episodes=1000, max_steps=500, SEED=123):
        self.env.seed(SEED)
        _ = torch.manual_seed(SEED)
        self.num_returns = 0
        self.sum_returns = 0
        self.sum_returns_squared = 0

        running_rewards = deque(maxlen=100)
        
        for episode in range(num_episodes):
            # get sample trajectory from the policy network
            trajectory, _ = self.get_trajectory(max_steps=max_steps)
            
            # prepare data for training
            states, actions, rewards, log_props = zip(*trajectory)
            states = torch.Tensor(states)
            actions = torch.Tensor(actions)
            log_props = torch.stack(log_props)
            
            # calculate expected discounted returns
            expected_returns_batch = list()
            for idx in range(len(rewards)):
                discounted_rewards = [self.gamma**i * reward for i, reward in enumerate(rewards[idx:])]
                expected_returns_batch.append(sum(discounted_rewards))
            
            # normalize and reformat expected returns
            expected_returns = torch.tensor(expected_returns_batch)
            
            self.num_returns += len(expected_returns)
            self.sum_returns += sum(expected_returns)
            self.sum_returns_squared += sum(expected_returns**2)
            
            _return_mean = self.sum_returns / self.num_returns
            _return_std_dev = np.sqrt(self.sum_returns_squared / self.num_returns - _return_mean**2)
            normalized_expected_returns = (expected_returns - _return_mean) / (_return_std_dev + self.esp)
            
            # calculate loss
            loss = - torch.sum(log_props * normalized_expected_returns)
            
            # log results
            _episode_reward = sum(rewards)
            running_rewards.append(_episode_reward)
            running_mean = np.array(running_rewards).mean()
            running_std_dev = np.array(running_rewards).std()
            if (episode+1) % 100 == 0:
                print(f"Episode {episode+1}\taverage reward: {running_mean:.2f}, std dev: {running_std_dev:.2f}")
                
            if running_mean > 150:
                print(f"Solved! Running reward is now {running_mean:.2f}")
                print(f"Episode {episode+1}\taverage reward: {running_mean:.2f}, std dev: {running_std_dev:.2f}")
                break
            
            # optimize the model with backpropagation
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
    def visualize_policy(self, num_episodes=1, max_steps=500):
        for _ in range(num_episodes):
            trajectory, done = self.get_trajectory(max_steps=max_steps, render=True)
            
    def evaluate_policy(self, num_episodes=100, max_steps=500):
        win = 0
        for _ in range(num_episodes):
            trajectory, done = self.get_trajectory(max_steps=max_steps)
            if done and trajectory[-1][2] == 100:
                win += 1
        return win / num_episodes

In [24]:
env = gym.make("LunarLander-v2")
reinforce_agent = Reinforce(env)
reinforce_agent.train(num_episodes=int(3e3), max_steps=5000)

Episode 100	average reward: -170.52, std dev: 93.68
Episode 200	average reward: -127.20, std dev: 118.32
Episode 300	average reward: -75.02, std dev: 68.39
Episode 400	average reward: -51.39, std dev: 50.13
Episode 500	average reward: -49.71, std dev: 71.61
Episode 600	average reward: -32.07, std dev: 46.09
Episode 700	average reward: -4.18, std dev: 44.44
Episode 800	average reward: 36.79, std dev: 100.43
Episode 900	average reward: 3.80, std dev: 88.15
Episode 1000	average reward: 33.73, std dev: 111.47
Episode 1100	average reward: 4.97, std dev: 121.44
Episode 1200	average reward: -6.69, std dev: 117.71
Episode 1300	average reward: 45.07, std dev: 84.62
Episode 1400	average reward: 72.18, std dev: 68.44
Episode 1500	average reward: 69.10, std dev: 78.80
Episode 1600	average reward: 84.25, std dev: 67.69
Episode 1700	average reward: 102.80, std dev: 67.47
Solved! Running reward is now 150.96
Episode 1765	average reward: 150.96, std dev: 99.37


In [29]:
# test the trained agent
test_episodes = 100
win_rate = reinforce_agent.evaluate_policy(num_episodes=test_episodes, max_steps=5000)
print(f'agent win rate: {win_rate*100 :.2f}% from {test_episodes} test episodes')

agent win rate: 94.00% from 100 test episodes


In [33]:
# visualize the trained agent on a separate window
reinforce_agent.visualize_policy(num_episodes=10)

# 3. REINFORCE with adaptive Baseline (state-value function)
ref: https://medium.com/nerd-for-tech/policy-gradients-reinforce-with-baseline-6c871a3a068

ref: https://github.com/riccardocadei/LunarLander-v2-REINFORCE/


In [11]:
class ValueFunctionNN(nn.Module):
    def __init__(self, n_observations):
        super(ValueFunctionNN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64, 1)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)
        return x

In [14]:
sample_nn = ValueFunctionNN(n_observations=8)
pred = sample_nn(torch.tensor(state).float())
print(pred)

tensor([-0.1136], grad_fn=<AddBackward0>)


In [16]:
class ReinforceWithBaseline(Reinforce):
    def __init__(self, env, gamma=0.99, learning_rate=1e-2):
        super(ReinforceWithBaseline, self).__init__(env, gamma, learning_rate)
        
        # setup state-value function network and optimizer
        self.value_net = ValueFunctionNN(self._n_observations).to(device)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.learning_rate)

    def train(self, num_episodes=1000, max_steps=500, SEED=123):
        self.env.seed(SEED)
        _ = torch.manual_seed(SEED)
        self.num_returns = 0
        self.sum_returns = 0
        self.sum_returns_squared = 0

        running_rewards = deque(maxlen=100)
        
        for episode in range(num_episodes):
            # get sample trajectory from the policy network
            trajectory, _ = self.get_trajectory(max_steps=max_steps)
            
            # prepare data for training
            states, actions, rewards, log_props = zip(*trajectory)
            states = torch.Tensor(states)
            actions = torch.Tensor(actions)
            log_props = torch.stack(log_props)
            
            # calculate expected discounted returns
            expected_returns_batch = list()
            for idx in range(len(rewards)):
                discounted_rewards = [self.gamma**i * reward for i, reward in enumerate(rewards[idx:])]
                expected_returns_batch.append(sum(discounted_rewards))
            
            # normalize and reformat expected returns
            expected_returns = torch.tensor(expected_returns_batch)
            
            # calculate advantage function from value function
            values = self.value_net(states)
            advantages = expected_returns - values.squeeze().detach()

            # calculate loss
            loss = - torch.sum(log_props * advantages)
            
            # update value function with MSE loss
            mse_loss = nn.MSELoss()
            value_loss = mse_loss(values, expected_returns.detach())
            
            # log results
            _episode_reward = sum(rewards)
            running_rewards.append(_episode_reward)
            running_mean = np.array(running_rewards).mean()
            running_std_dev = np.array(running_rewards).std()
            if (episode+1) % 100 == 0:
                print(f"Episode {episode+1}\taverage reward: {running_mean:.2f}, std dev: {running_std_dev:.2f}")
                
            if running_mean > 150:
                print(f"Solved! Running reward is now {running_mean:.2f}")
                print(f"Episode {episode+1}\taverage reward: {running_mean:.2f}, std dev: {running_std_dev:.2f}")
                break
            
            # optimize the model with backpropagation
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            
            # optimize the model with backpropagation
            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()

In [39]:
env = gym.make("LunarLander-v2")
reinforce_baseline_agent = ReinforceWithBaseline(env)
reinforce_baseline_agent.train(num_episodes=int(3e3), max_steps=5000)

Episode 100	average reward: -118.09, std dev: 81.62
Episode 200	average reward: -77.20, std dev: 30.90
Episode 300	average reward: -55.22, std dev: 84.67
Episode 400	average reward: -23.79, std dev: 54.12
Episode 500	average reward: 31.83, std dev: 91.50
Episode 600	average reward: 56.44, std dev: 119.68
Episode 700	average reward: 10.43, std dev: 124.32
Episode 800	average reward: 28.55, std dev: 66.03
Episode 900	average reward: 50.65, std dev: 78.87
Episode 1000	average reward: 94.40, std dev: 48.34
Episode 1100	average reward: 48.90, std dev: 84.25
Episode 1200	average reward: 76.41, std dev: 62.40
Episode 1300	average reward: 74.27, std dev: 79.07
Episode 1400	average reward: 96.90, std dev: 65.42
Episode 1500	average reward: 75.55, std dev: 89.40
Episode 1600	average reward: 88.47, std dev: 69.44
Solved! Running reward is now 150.61
Episode 1679	average reward: 150.61, std dev: 62.11
