In [1]:
%%capture
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip install pyvirtualdisplay
!pip install pyglet==1.5.1

In [2]:
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7e56f86d7550>

In [3]:
!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt

Collecting git+https://github.com/ntasfi/PyGame-Learning-Environment.git (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt (line 1))
  Cloning https://github.com/ntasfi/PyGame-Learning-Environment.git to /tmp/pip-req-build-a8iud646
  Running command git clone --filter=blob:none --quiet https://github.com/ntasfi/PyGame-Learning-Environment.git /tmp/pip-req-build-a8iud646
  Resolved https://github.com/ntasfi/PyGame-Learning-Environment.git to commit 3dbe79dc0c35559bb441b9359948aabf9bb3d331
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting git+https://github.com/simoninithomas/gym-games (from -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit4/requirements-unit4.txt (line 2))
  Cloning https://github.com/simoninithomas/gym-games to /tmp/pip-req-build-_z6al8by
  Running command git clone --filter=blob:none --quiet https://github.com/simoninithomas/gym-games /tmp/pip-req-build

In [4]:
from collections import deque

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.distributions import Categorical

# Gym
import gym
import gym_pygame

# HuggingFace Hub
from huggingface_hub import notebook_login, login
import imageio

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
env_id = "Pixelcopter-PLE-v0"
env = gym.make(env_id)
eval_env = gym.make(env_id)
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

couldn't import doomish
Couldn't import doom


In [7]:
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample())  # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  7
Sample observation [-0.1123758   0.26218843  0.83339405  0.70509326 -1.598042    0.01823723
  0.8897045 ]


In [8]:
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample())  # Take a random action


 _____ACTION SPACE_____ 

The Action Space is:  2
Action Space Sample 1


In [9]:
class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super(Policy, self).__init__() 
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size * 2)
        self.fc3 = nn.Linear(h_size * 2, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

In [10]:
# Estimates a value given a state. Used as baseline
class Value(nn.Module):
    def __init__(self, s_size, h_size):
        super(Value, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        # output size is 1 because we are estimating a numeric value for each state
        self.fc2 = nn.Linear(h_size, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [11]:
def reinforce(policy, baseline, policy_optimizer, baseline_optimizer, n_training_episodes, max_t, gamma, print_every):
    # Help us to calculate the score during the training
    scores_deque = deque(maxlen=100)
    scores = []

    # Line 3 of pseudocode
    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        states = []

        # reset the environment
        state = env.reset()

        # Line 4 of pseudocode (Generate an episode)
        for t in range(max_t):
            # get the action
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            states.append(state)
            # Take a step in the environment
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break

        total_rewards = sum(rewards)
        scores_deque.append(total_rewards)
        scores.append(total_rewards)

        # Line 5 and 6 of pseudocode: calculate the return
        returns = deque(maxlen=max_t)
        n_steps = len(rewards)

        # Calculate the sum of discounted rewards starting at timestep t
        # G_t = r_(t+1) + gamma * r_(t+2) + ... + gamma ^n * r_(T-1)
        # We can do it backwards from max_t - 1 to 0 to avoid recomputing redundant values
        # G_t = r_(t+1) + gamma * G_(t+1)
        # The queue "returns" will hold the returns in chronological order from t=0
        for t in range(n_steps-1, -1, -1):
            disc_return_t = returns[0] if len(returns) > 0 else 0
            returns.appendleft(rewards[t] + gamma * disc_return_t)

        # eps is the smallest representable float (machine epsilon)
        eps = np.finfo(np.float32).eps.item()

        # standardization of the returns is employed to make training more stable
        returns = torch.tensor(returns).to(device)
        returns = (returns - returns.mean()) / (returns.std() + eps)

        # Calculate delta to use in place of returns
        states_tensor = torch.from_numpy(np.array(states)).float().to(device)
        baseline_preds = baseline(states_tensor).squeeze()
        deltas = returns - baseline_preds.detach()

        # Update value network (also baseline network)
        baseline_loss = nn.MSELoss()(baseline_preds, returns)
        baseline_optimizer.zero_grad()
        baseline_loss.backward()
        baseline_optimizer.step()

        # Line 7 of pseudocode (Calculate gradient delta)
        # Negative because we are performing gradient descent instead of ascent
        policy_loss = -torch.stack(saved_log_probs) * deltas
        policy_loss = policy_loss.sum()

        # Line 8: Update policy network
        # PyTorch perfers gradient descent
        # Set gradients to zero before GD
        policy_optimizer.zero_grad()
        policy_loss.backward()
        policy_optimizer.step()
        

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'
                  .format(i_episode, np.mean(scores_deque)))

    return scores

In [12]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and returns average reward and std of reward.
    :param env: The evaluation environment
    :param n_eval_episodes: Number of episode to evaluate the agent
    :param policy: The Reinforce agent
    """
    episode_rewards = []
    for episode in range(n_eval_episodes):
        state = env.reset()
        total_rewards_ep = 0

        for step in range(max_steps):
            action, _ = policy.act(state)
            state, reward, done, _ = env.step(action)
            total_rewards_ep += reward
            if done:
                break
        episode_rewards.append(total_rewards_ep)

    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)
    return mean_reward, std_reward

In [13]:
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.repocard import metadata_eval_result, metadata_save
from pathlib import Path
import datetime
import json
import imageio
import tempfile
import os

def record_video(env, policy, out_directory, fps=30):
    """
    Generate a replay video of the agent
    :param env
    :param Qtable: Qtable of our agent
    :param out_directory
    :param fps: how many frame per seconds (with taxi-v3 and frozenlake-v1 we use 1)
    """

    images = []
    done = False
    state = env.reset()
    img = env.render(mode="rgb_array")
    images.append(np.array(img))

    while not done:
        # Take the action (index) that have the maximum expected future reward given that state
        action, _ = policy.act(state)
        state, reward, done, info = env.step(action)  # We directly put next_state = state for recording logic  
        img = env.render(mode="rgb_array")
        images.append(np.array(img))
    
    imageio.mimsave(out_directory, images, fps=fps)

def push_to_hub(repo_id,
                model,
                hyperparameters,
                eval_env,
                video_fps=30
                ):
  """
  Evaluate, Generate a video and Upload a model to Hugging Face Hub.
  This method does the complete pipeline:
  - It evaluates the model
  - It generates the model card
  - It generates a replay video of the agent
  - It pushes everything to the Hub

  :param repo_id: repo_id: id of the model repository from the Hugging Face Hub
  :param model: the pytorch model we want to save
  :param hyperparameters: training hyperparameters
  :param eval_env: evaluation environment
  :param video_fps: how many frame per seconds to record our video replay
  """

  _, repo_name = repo_id.split("/")
  api = HfApi()

  # Step 1: Create the repo
  repo_url = api.create_repo(
        repo_id=repo_id,
        exist_ok=True,
  )

  with tempfile.TemporaryDirectory() as tmpdirname:
    local_directory = Path(tmpdirname)

    # Step 2: Save the model
    torch.save(model, local_directory / "model.pt")

    # Step 3: Save the hyperparameters to JSON
    with open(local_directory / "hyperparameters.json", "w") as outfile:
      json.dump(hyperparameters, outfile)

    # Step 4: Evaluate the model and build JSON
    mean_reward, std_reward = evaluate_agent(eval_env,
                                            hyperparameters["max_t"],
                                            hyperparameters["n_evaluation_episodes"],
                                            model)

    # Get datetime
    eval_datetime = datetime.datetime.now()
    eval_form_datetime = eval_datetime.isoformat()

    evaluate_data = {
          "env_id": hyperparameters["env_id"],
          "mean_reward": mean_reward,
          "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"],
          "eval_datetime": eval_form_datetime,
    }

    # Write a JSON file
    with open(local_directory / "results.json", "w") as outfile:
        json.dump(evaluate_data, outfile)

    # Step 5: Create the model card
    env_name = hyperparameters["env_id"]

    metadata = {}
    metadata["tags"] = [
          env_name,
          "reinforce",
          "reinforcement-learning",
          "custom-implementation",
          "deep-rl-class"
      ]

    # Add metrics
    eval = metadata_eval_result(
        model_pretty_name=repo_name,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=env_name,
        dataset_id=env_name,
      )

    # Merges both dictionaries
    metadata = {**metadata, **eval}

    model_card = f"""
  # **Reinforce** Agent playing **{env_id}**

  This is a trained model of a **Reinforce** agent playing **{env_id}** .

  To learn to use this model and train yours check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction

  """

    readme_path = local_directory / "README.md"
    readme = ""
    if readme_path.exists():
        with readme_path.open("r", encoding="utf8") as f:
          readme = f.read()
    else:
      readme = model_card


    with readme_path.open("w", encoding="utf-8") as f:
      f.write(readme)

    # Save our metrics to Readme metadata
    metadata_save(readme_path, metadata)

    # Step 6: Record a video
    video_path =  local_directory / "replay.mp4"
    record_video(env, model, video_path, video_fps)

    # Step 7. Push everything to the Hub
    api.upload_folder(
          repo_id=repo_id,
          folder_path=local_directory,
          path_in_repo=".",
    )

    print(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")

In [14]:
pixelcopter_hyperparameters = {
    "h_size": 128, 
    "n_training_episodes": 50000, #originally 50k
    "n_evaluation_episodes": 10,
    "max_t": 10000,
    "gamma": 0.99,
    "lr": 1e-4,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

In [15]:
pixelcopter_policy = Policy(
    pixelcopter_hyperparameters["state_space"],
    pixelcopter_hyperparameters["action_space"],
    pixelcopter_hyperparameters["h_size"],
).to(device)

pixelcopter_baseline = Value(
    pixelcopter_hyperparameters["state_space"],
    pixelcopter_hyperparameters["h_size"],
).to(device)

policy_optimizer = optim.Adam(pixelcopter_policy.parameters(), lr=pixelcopter_hyperparameters["lr"])
baseline_optimizer = optim.Adam(pixelcopter_baseline.parameters(), lr=pixelcopter_hyperparameters["lr"])

In [16]:
scores = reinforce(
    policy=pixelcopter_policy,
    baseline=pixelcopter_baseline,
    policy_optimizer=policy_optimizer,
    baseline_optimizer=baseline_optimizer,
    n_training_episodes=pixelcopter_hyperparameters["n_training_episodes"],
    max_t=pixelcopter_hyperparameters["max_t"],
    gamma=pixelcopter_hyperparameters["gamma"],
    print_every=1000,
)

  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(
  if not isinstance(done, (bool, np.bool8)):
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Episode 1000	Average Score: -5.00
Episode 2000	Average Score: -5.00
Episode 3000	Average Score: -5.00
Episode 4000	Average Score: -5.00
Episode 5000	Average Score: -5.00
Episode 6000	Average Score: -5.00
Episode 7000	Average Score: -5.00
Episode 8000	Average Score: -5.00
Episode 9000	Average Score: -5.00
Episode 10000	Average Score: -5.00
Episode 11000	Average Score: -5.00
Episode 12000	Average Score: -5.00
Episode 13000	Average Score: -5.00
Episode 14000	Average Score: -5.00
Episode 15000	Average Score: -5.00
Episode 16000	Average Score: -5.00
Episode 17000	Average Score: -5.00
Episode 18000	Average Score: -5.00
Episode 19000	Average Score: -5.00
Episode 20000	Average Score: -5.00
Episode 21000	Average Score: -5.00
Episode 22000	Average Score: -5.00
Episode 23000	Average Score: -5.00
Episode 24000	Average Score: -5.00
Episode 25000	Average Score: -5.00
Episode 26000	Average Score: -5.00
Episode 27000	Average Score: -5.00
Episode 28000	Average Score: -5.00
Episode 29000	Average Score: 

In [None]:
from huggingface_hub import notebook_login
notebook_login()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [18]:
repo_id = "wowthecoder/reinforce-pixelcopter" 
push_to_hub(
    repo_id,
    pixelcopter_policy,  # The model we want to save
    pixelcopter_hyperparameters,  # Hyperparameters
    eval_env,  # Evaluation environment
    video_fps=30
)

  logger.warn(


model.pt:   0%|          | 0.00/142k [00:00<?, ?B/s]

Your model is pushed to the Hub. You can view your model here: https://huggingface.co/wowthecoder/reinforce-pixelcopter
