<a href="https://colab.research.google.com/github/yootazi/Reinforcement_Learning_Gym/blob/main/RL_PPO_BipedalWalker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bipedal Walker

Action Space: Box(-1.0, 1.0, (4,), float32)
Observation Space: (24,)

import: gym.make("BipedalWalker-v3")
Algorithm: PPO
Policy: "MlpPolicy"

# 1. Import dependencies

In [None]:
pip install git+https://github.com/DLR-RM/stable-baselines3

In [None]:
!pip3 install box2d-py
!pip3 install gym[Box_2D]

In [3]:
import gym
import torch
import os
import tensorflow as tf
from stable_baselines3 import SAC, TD3
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Setup the Environment

In [4]:
# Create the BipedalWalker-v3 environment; later cells use `env` for the random rollout, training and evaluation.
env = gym.make("BipedalWalker-v3")

In [5]:
# Smoke-test the environment: play a few episodes with a purely random policy
# and report the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    # reset() starts a new episode and returns the initial observation
    # (observation space is (24,) for BipedalWalker, see the header above).
    obs, done, score = env.reset(), False, 0

    while not done:
        # env.render(mode='human') does not work in this notebook environment, so nothing is drawn.
        # Random action drawn from the action space: Box(-1.0, 1.0, (4,), float32).
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward  # accumulate the episode return

    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:-99.4896033230933
Episode:2 Score:-108.94805569125204
Episode:3 Score:-87.53951255855898
Episode:4 Score:-103.27160644020482
Episode:5 Score:-109.39670468833981


# 3. Vectorizing the Environment and Training the Model

RL Algorithms:
    1. Model-Based - predict future states to generate best possible action
        1. Learn the Model
        2. Given the Model --> AlphaZero
    2. Model-Free (a lot of development is happening here) - uses the current state to make predictions
        1. Policy Optimization --> PPO (Proximal Policy Optimization) / A2C
        2. Q-Learning --> DQN
Choosing an algorithm based on the action space in Stable-Baselines3:
    A2C - works on all action space types (Box, Discrete, MultiDiscrete, MultiBinary) and supports multiprocessing
    PPO - works on all action space types (Box, Discrete, MultiDiscrete, MultiBinary) and supports multiprocessing
    DQN - Discrete action spaces only

In [6]:
# Directory handed to PPO as its TensorBoard log location, for monitoring training.
log_path = os.path.join('Training', 'Logs')

# PPO with the multilayer-perceptron policy (a standard feed-forward network);
# verbose=1 turns on the training log output.
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)
model.learn(total_timesteps=1_000_000)  # 1M

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
|    loss                 | 1.86      |
|    n_updates            | 2610      |
|    policy_gradient_loss | -0.0113   |
|    std                  | 0.341     |
|    value_loss           | 6.15      |
---------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.06e+03    |
|    ep_rew_mean          | 214         |
| time/                   |             |
|    fps                  | 445         |
|    iterations           | 263         |
|    time_elapsed         | 1209        |
|    total_timesteps      | 538624      |
| train/                  |             |
|    approx_kl            | 0.020912768 |
|    clip_fraction        | 0.314       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.36       |
|    explained_variance   | 0.0995      |
|    learning_rate        | 0.0003      |
|    loss              

<stable_baselines3.ppo.ppo.PPO at 0x7f589f3c7190>

# 4. Evaluation

In [7]:
# The task counts as solved at 300 points within 1600 time steps.
# The first element of the returned tuple is the average reward over the
# evaluation episodes (the second is presumably its standard deviation -
# confirm against the stable-baselines3 docs).
evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=False,
)



(308.66683010110285, 0.890446320783785)

# 5. Save and Reload Model

In [8]:
# Mount Google Drive at /content/gdrive; the save/load cells below use paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [9]:
# Persist the trained model under the mounted Drive folder, creating the folder if needed.
save_dir = '/content/gdrive/MyDrive/RL_Models'
os.makedirs(save_dir, exist_ok=True)
model.save(f"{save_dir}/PPO_BipedalW_1M")

In [10]:
# List the files stored inside the saved model archive.
import zipfile

# The context manager closes the archive handle once the listing is done.
with zipfile.ZipFile("/content/gdrive/MyDrive/RL_Models/PPO_BipedalW_1M.zip", 'r') as archive:
    for member in archive.filelist:
        print(member.filename)

data
pytorch_variables.pth
policy.pth
policy.optimizer.pth
_stable_baselines3_version
system_info.txt


In [12]:
# Remove the in-memory model (presumably so the following cell demonstrates restoring it from disk).
del model

In [None]:
# Load the trained model.
# `model` was deleted in the previous cell, and PPO.load is a classmethod that
# returns a new model instead of filling an existing one, so the result must be
# bound to a name. The environment is attached so the model can be trained again.
model = PPO.load('/content/gdrive/MyDrive/RL_Models/PPO_BipedalW_1M', env=env)

In [None]:
# Run another 1M timesteps of training on the current model.
model.learn(total_timesteps=1_000_000)

 # 6. Watching the Performance!

In [None]:
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!pip install -U colabgymrender

In [48]:
!apt-get install x11-utils > /dev/null 2>&1 and !pip install pyglet==v1.3.2

In [None]:
pip install xvfbwrapper

In [None]:
pip install gym[box2d]==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*

In [None]:
import gym
from colabgymrender.recorder import Recorder

# Build a fresh BipedalWalker environment and wrap it in the Recorder,
# handing it ./video as its directory.
directory = './video'
env = Recorder(gym.make("BipedalWalker-v3"), directory)

In [None]:
# Roll out a single episode in the recording environment, then play it back.
# NOTE(review): actions are sampled at random, so this shows a random policy
# and not the trained PPO model - confirm whether model.predict was intended.
observation, terminal = env.reset(), False

while not terminal:
    action = env.action_space.sample()
    observation, reward, terminal, info = env.step(action)

env.play()

In [None]:
# NOTE(review): presumably the location of the recorded video - confirm against colabgymrender's Recorder.
print(env.path)