In [11]:
import numpy as np

from collections import deque

from PIL import Image

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym
import gym
import gym_pygame

import imageio

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [13]:
env_id = "Pixelcopter-PLE-v0"
env = gym.make(env_id)
eval_env = gym.make(env_id)
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

In [14]:
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample())  # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  7
Sample observation [-0.7926859  -0.20177512 -0.19728962  0.71813095 -0.65745956  2.0722008
  1.0338264 ]


In [15]:
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample())  # Take a random action


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 0


In [16]:
class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size * 2)
        self.fc3 = nn.Linear(h_size * 2, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

In [17]:
# Define a variable to keep track of the best average score
best_loss = np.Inf
best_policy_state = None

def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
    global best_loss, best_policy_state  # To access and modify the global variables
    # Set the policy to training mode
    policy.train()
    # Help us to calculate the score during the training
    scores_deque = deque(maxlen=100)
    scores = []
    # Line 3 of pseudocode
    for i_episode in range(1, n_training_episodes+1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        # Line 4 of pseudocode
        for t in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        scores_deque.append(sum(rewards))
        scores.append(sum(rewards))

        # Line 6 of pseudocode: calculate the return
        returns = deque(maxlen=max_t)
        n_steps = len(rewards)
        # Compute the discounted returns at each timestep,
        # as the sum of the gamma-discounted return at time t (G_t) + the reward at time t

        # In O(N) time, where N is the number of time steps
        # (this definition of the discounted return G_t follows the definition of this quantity
        # shown at page 44 of Sutton&Barto 2017 2nd draft)
        # G_t = r_(t+1) + r_(t+2) + ...

        # Given this formulation, the returns at each timestep t can be computed
        # by re-using the computed future returns G_(t+1) to compute the current return G_t
        # G_t = r_(t+1) + gamma*G_(t+1)
        # G_(t-1) = r_t + gamma* G_t
        # (this follows a dynamic programming approach, with which we memorize solutions in order
        # to avoid computing them multiple times)

        # This is correct since the above is equivalent to (see also page 46 of Sutton&Barto 2017 2nd draft)
        # G_(t-1) = r_t + gamma*r_(t+1) + gamma*gamma*r_(t+2) + ...


        ## Given the above, we calculate the returns at timestep t as:
        #               gamma[t] * return[t] + reward[t]
        #
        ## We compute this starting from the last timestep to the first, in order
        ## to employ the formula presented above and avoid redundant computations that would be needed
        ## if we were to do it from first to last.

        ## Hence, the queue "returns" will hold the returns in chronological order, from t=0 to t=n_steps
        ## thanks to the appendleft() function which allows to append to the position 0 in constant time O(1)
        ## a normal python list would instead require O(N) to do this.
        for t in range(n_steps)[::-1]:
            disc_return_t = (returns[0] if len(returns)>0 else 0)
            returns.appendleft(gamma * disc_return_t + rewards[t])

        ## standardization of the returns is employed to make training more stable
        eps = np.finfo(np.float32).eps.item()

        ## eps is the smallest representable float, which is
        # added to the standard deviation of the returns to avoid numerical instabilities
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        # Line 7:
        policy_loss = []
        for log_prob, disc_return in zip(saved_log_probs, returns):
            policy_loss.append(-log_prob * disc_return)
        policy_loss = torch.cat(policy_loss).sum()

        # Line 8: PyTorch prefers gradient descent
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        
        # Check if the current average score is better than the best
        if policy_loss < best_loss:
            best_loss = policy_loss
            torch.save(policy.state_dict(), 'policy.pth')

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))

    return scores

In [18]:
pixelcopter_hyperparameters = {
    "h_size": 64,
    "n_training_episodes": 5000,
    "n_evaluation_episodes": 10,
    "max_t": 10000,
    "gamma": 1.01,
    "lr": 1e-4,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

In [19]:
# Create policy and place it to the device
# torch.manual_seed(50)
pixelcopter_policy = Policy(
    pixelcopter_hyperparameters["state_space"],
    pixelcopter_hyperparameters["action_space"],
    pixelcopter_hyperparameters["h_size"],
).to(device)
pixelcopter_optimizer = optim.Adam(pixelcopter_policy.parameters(), lr=pixelcopter_hyperparameters["lr"])

In [25]:
scores = reinforce(
    pixelcopter_policy,
    pixelcopter_optimizer,
    pixelcopter_hyperparameters["n_training_episodes"],
    pixelcopter_hyperparameters["max_t"],
    pixelcopter_hyperparameters["gamma"],
    100,
)

Episode 100	Average Score: 9.06
Episode 200	Average Score: 8.76
Episode 300	Average Score: 9.59
Episode 400	Average Score: 10.17
Episode 500	Average Score: 10.03
Episode 600	Average Score: 9.14
Episode 700	Average Score: 7.99
Episode 800	Average Score: 9.21
Episode 900	Average Score: 9.26
Episode 1000	Average Score: 8.47
Episode 1100	Average Score: 8.64
Episode 1200	Average Score: 8.83
Episode 1300	Average Score: 7.97
Episode 1400	Average Score: 8.33
Episode 1500	Average Score: 10.45
Episode 1600	Average Score: 6.37
Episode 1700	Average Score: 8.11
Episode 1800	Average Score: 7.78
Episode 1900	Average Score: 7.18
Episode 2000	Average Score: 9.90
Episode 2100	Average Score: 7.47
Episode 2200	Average Score: 9.37
Episode 2300	Average Score: 8.26
Episode 2400	Average Score: 7.09
Episode 2500	Average Score: 8.04
Episode 2600	Average Score: 8.82
Episode 2700	Average Score: 9.90
Episode 2800	Average Score: 8.01
Episode 2900	Average Score: 8.33
Episode 3000	Average Score: 9.98
Episode 3100	Ave

In [26]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and returns average reward and std of reward.
    :param env: The evaluation environment
    :param n_eval_episodes: Number of episode to evaluate the agent
    :param policy: The Reinforce agent
    """
    # Set the policy to evaluation mode
    policy.eval()
    episode_rewards = []
    for episode in range(n_eval_episodes):
        state = env.reset()
        step = 0
        done = False
        total_rewards_ep = 0

        for step in range(max_steps):
            action, _ = policy.act(state)
            new_state, reward, done, info = env.step(action)
            total_rewards_ep += reward

            if done:
                break
            state = new_state
        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward

In [27]:
evaluate_agent(
    eval_env, pixelcopter_hyperparameters["max_t"], pixelcopter_hyperparameters["n_evaluation_episodes"], pixelcopter_policy
)

(10.4, 15.837929157563499)

In [28]:
def record_video(env, policy, out_directory, fps=30):
    """
    Generate a replay video of the agent
    :param env
    :param Qtable: Qtable of our agent
    :param out_directory
    :param fps: how many frame per seconds (with taxi-v3 and frozenlake-v1 we use 1)
    """
    upscale_factor = 10
    images = []  
    done = False
    state = env.reset()
    img = env.render(mode='rgb_array')
    # Resize the image using PIL
    img_pil = Image.fromarray(img)
    img_pil = img_pil.resize((img_pil.width * upscale_factor, img_pil.height * upscale_factor), Image.LANCZOS)
    img_resized = np.array(img_pil)
    images.append(img_resized)
    while not done:
        # Take the action (index) that have the maximum expected future reward given that state
        action, _ = policy.act(state)
        state, reward, done, info = env.step(action) # We directly put next_state = state for recording logic
        img = env.render(mode='rgb_array')
        # Resize the image using PIL
        img_pil = Image.fromarray(img)
        img_pil = img_pil.resize((img_pil.width * upscale_factor, img_pil.height * upscale_factor), Image.LANCZOS)
        img_resized = np.array(img_pil)
        images.append(img_resized)
    imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)

In [29]:
state_dict = torch.load('policy.pth')
pixelcopter_policy = Policy(
    pixelcopter_hyperparameters["state_space"],
    pixelcopter_hyperparameters["action_space"],
    pixelcopter_hyperparameters["h_size"],
).to(device) # Re-create the policy model
pixelcopter_policy.load_state_dict(state_dict)

record_video(eval_env, pixelcopter_policy, 'video.mp4')