# Create a virtual display 🔽

In [1]:
%%capture
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

In [2]:
# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7bd99a61e5f0>

# Install dependencies 🔽

In [3]:
!pip install stable-baselines3[extra]
!pip install gymnasium
!pip install huggingface_sb3
!pip install huggingface_hub
!pip install panda_gym

Collecting shimmy~=1.1.0 (from shimmy[atari]~=1.1.0; extra == "extra"->stable-baselines3[extra])
  Downloading Shimmy-1.1.0-py3-none-any.whl.metadata (3.3 kB)
Collecting autorom~=0.6.1 (from autorom[accept-rom-license]~=0.6.1; extra == "extra"->stable-baselines3[extra])
  Downloading AutoROM-0.6.1-py3-none-any.whl.metadata (2.4 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.6.1; extra == "extra"->stable-baselines3[extra])
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ale-py~=0.8.1 (from shimmy[atari]~=1.1.0; extra == "extra"->stable-baselines3[extra])
  Downloading ale_py-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux20

# Import the packages 📦

In [4]:
import os

import gymnasium as gym
import panda_gym

from huggingface_sb3 import load_from_hub, package_to_hub

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env

from huggingface_hub import login

  from jax import xla_computation as _xla_computation


# PandaReachDense-v3 🦾
The agent we’re going to train is a robotic arm that needs to do controls (moving the arm and using the end-effector).

In robotics, the end-effector is the device at the end of a robotic arm designed to interact with the environment.

In PandaReach, the robot must place its end-effector at a target position (green ball).

We’re going to use the dense version of this environment. It means we’ll get a dense reward function that will provide a reward at each timestep (the closer the agent is to completing the task, the higher the reward). Contrary to a sparse reward function where the environment return a reward if and only if the task is completed.

Also, we’re going to use the End-effector displacement control, it means the action corresponds to the displacement of the end-effector. We don’t control the individual motion of each joint (joint control).

## The environment 🎮

In [5]:
env_id = "PandaReachDense-v3"

env = gym.make(env_id)

s_size = env.observation_space.shape
a_size = env.action_space

  and should_run_async(code)


In [6]:
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

_____OBSERVATION SPACE_____ 

The State Space is:  None
Sample observation OrderedDict([('achieved_goal', array([8.114315 , 1.5747985, 5.0605803], dtype=float32)), ('desired_goal', array([-8.458551 , -9.493125 ,  3.1096568], dtype=float32)), ('observation', array([-6.698305 , -4.4603224, -4.3671637, -9.454221 ,  6.4856853,
       -7.460611 ], dtype=float32))])


  and should_run_async(code)


The observation space is a dictionary with 3 different elements:

- achieved_goal: (x,y,z) position of the goal.
- desired_goal: (x,y,z) distance between the goal position and the current object position.
- observation: position (x,y,z) and velocity of the end-effector (vx, vy, vz).

In [7]:
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample()) # Take a random action


 _____ACTION SPACE_____ 

The Action Space is:  Box(-1.0, 1.0, (3,), float32)
Action Space Sample [ 0.6445129   0.71101964 -0.51594174]


The action space is a vector with 3 values:

- Control x, y, z movement

## Normalize observation and rewards

In [8]:
env = make_vec_env(env_id, n_envs=4)

# Adding this wrapper to normalize the observation and the reward
env = VecNormalize(env, norm_obs=True, norm_reward=True)

# Create the A2C Model 🤖

In [9]:
model = A2C("MultiInputPolicy", env, device="cpu", verbose=1)

Using cpu device


In [10]:
model.learn(1_000_000)

  and should_run_async(code)


------------------------------------
| rollout/              |          |
|    ep_len_mean        | 44.1     |
|    ep_rew_mean        | -12.7    |
| time/                 |          |
|    fps                | 357      |
|    iterations         | 100      |
|    time_elapsed       | 5        |
|    total_timesteps    | 2000     |
| train/                |          |
|    entropy_loss       | -4.3     |
|    explained_variance | 0.814    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 0.918    |
|    std                | 1.01     |
|    value_loss         | 0.278    |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 43       |
|    ep_rew_mean        | -11.9    |
| time/                 |          |
|    fps                | 360      |
|    iterations         | 200      |
|    time_elapsed       | 11       |
|    total_timesteps    | 4000     |
|

<stable_baselines3.a2c.a2c.A2C at 0x7bd88b65e200>

In [11]:
# Save the model and  VecNormalize statistics when saving the agent
model.save("a2c-PandaReachDense-v3")
env.save("vec_normalize.pkl")

# Evaluate the agent 📈

In [12]:
eval_env = DummyVecEnv([lambda: gym.make(env_id)])
eval_env = VecNormalize.load("vec_normalize.pkl", eval_env)

eval_env.render_mode = "rgb_array"
# do not update at test time
eval_env.training = False
# no need to normalize reward at test time
eval_env.norm_reward = False

model = A2C.load("a2c-PandaReachDense-v3")
mean_reward, std_reward = evaluate_policy(model, eval_env)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

Mean reward: -0.25 +/- 0.12


  th_object = th.load(file_content, map_location=device)


# Publish your trained model on the Hub 🔥

In [None]:
from huggingface_hub import notebook_login
notebook_login()

In [14]:
from huggingface_sb3 import package_to_hub

package_to_hub(
    model=model,
    model_name=f"a2c-{env_id}",
    model_architecture="A2C",
    env_id=env_id,
    eval_env=eval_env,
    repo_id=f"wowthecoder/a2c-{env_id}", 
    commit_message="Initial commit",
)

[38;5;4mℹ This function will save, evaluate, generate a video of your agent,
create a model card and push everything to the hub. It might take up to 1min.
This is a work in progress: if you encounter a bug, please open an issue.[0m
Saving video to /tmp/tmpd9ly021e/-step-0-to-step-1000.mp4


  """


Moviepy - Building video /tmp/tmpd9ly021e/-step-0-to-step-1000.mp4.
Moviepy - Writing video /tmp/tmpd9ly021e/-step-0-to-step-1000.mp4



                                                                

Moviepy - Done !
Moviepy - video ready /tmp/tmpd9ly021e/-step-0-to-step-1000.mp4
[38;5;4mℹ Pushing repo wowthecoder/a2c-PandaReachDense-v3 to the Hugging Face
Hub[0m


policy.optimizer.pth:   0%|          | 0.00/48.2k [00:00<?, ?B/s]

a2c-PandaReachDense-v3.zip:   0%|          | 0.00/111k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

policy.pth:   0%|          | 0.00/46.3k [00:00<?, ?B/s]

vec_normalize.pkl:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

[38;5;4mℹ Your model is pushed to the Hub. You can view your model here:
https://huggingface.co/wowthecoder/a2c-PandaReachDense-v3/tree/main/[0m


CommitInfo(commit_url='https://huggingface.co/wowthecoder/a2c-PandaReachDense-v3/commit/02bec1dc136955edadacbdff33f6faba2944fc39', commit_message='Initial commit', commit_description='', oid='02bec1dc136955edadacbdff33f6faba2944fc39', pr_url=None, repo_url=RepoUrl('https://huggingface.co/wowthecoder/a2c-PandaReachDense-v3', endpoint='https://huggingface.co', repo_type='model', repo_id='wowthecoder/a2c-PandaReachDense-v3'), pr_revision=None, pr_num=None)