# Bipedal_Walker
## Deep Deterministic Policy Gradient (DDPG) TD3 variant
https://pylessons.com/BipedalWalker-v3-PPO

In [1]:
# import standard libraries
import os
import base64
import random
import time

# import third-party libraries
import gymnasium as gym
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.models import Model

import imageio
import cv2
from IPython.display import clear_output, display, HTML
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

## Hardware info

In [2]:
# Ask the NVIDIA driver tool to list its GPUs (needs the nvidia-smi CLI on PATH)
!nvidia-smi -L

zsh:1: command not found: nvidia-smi


In [3]:
# Show what TensorFlow can see: logical devices, all physical devices,
# then the physical GPUs and how many of them there are.
print(tf.config.list_logical_devices())
print(tf.config.list_physical_devices())
print(tf.config.experimental.list_physical_devices("GPU"))
print(len(tf.config.experimental.list_physical_devices("GPU")))

[LogicalDevice(name='/device:CPU:0', device_type='CPU'), LogicalDevice(name='/device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
1


2023-06-18 17:23:23.477657: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2023-06-18 17:23:23.477694: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-06-18 17:23:23.477707: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-06-18 17:23:23.477772: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-06-18 17:23:23.477802: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [4]:
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): TensorFlow has already been imported and has listed its devices in
# the cells above, so this presumably comes too late to affect it — confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [5]:
# Report how many physical GPUs TensorFlow detects
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [6]:
# Print the TensorFlow version, then the list of physical GPUs it detects
print(tf.__version__)
physical_devices = tf.config.list_physical_devices('GPU')
tf.print(physical_devices)

2.13.0-rc1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [7]:
# Do not force tf.function-wrapped code to run eagerly
tf.config.run_functions_eagerly(False)

## Create directories

In [8]:
# Create the output folders for images and data (no error if they already exist)
os.makedirs('img/pendulum', exist_ok=True)
os.makedirs('data/pendulum', exist_ok=True)
# List both top-level folders to show what is currently on disk
print('./img :')
!ls -al img
print('./data :')
!ls -al data

./img :
total 30808
drwxr-xr-x   7 thibaudperrin  staff       224 18 jui 16:26 [34m.[m[m
drwxr-xr-x  19 thibaudperrin  staff       608 18 jui 17:22 [34m..[m[m
-rw-r--r--   1 thibaudperrin  staff    464369 18 jui 12:32 BipedalWalker-v3-w0-e0.gif
-rw-r--r--   1 thibaudperrin  staff    889495 18 jui 16:26 Pendulum-v1.gif
-rw-r--r--   1 thibaudperrin  staff   1480449 18 jui 12:32 bipedal_walker.gif
-rw-r--r--   1 thibaudperrin  staff  12930388 18 jui 12:32 lunar_lander.gif
drwxr-xr-x   2 thibaudperrin  staff        64 18 jui 15:11 [34mpendulum[m[m
./data :
total 0
drwxr-xr-x   3 thibaudperrin  staff   96 18 jui 15:12 [34m.[m[m
drwxr-xr-x  19 thibaudperrin  staff  608 18 jui 17:22 [34m..[m[m
drwxr-xr-x   2 thibaudperrin  staff   64 18 jui 15:12 [34mpendulum[m[m


## Constants

In [9]:
# Gymnasium environment id used for both training and visualisation
ENV_NAME = 'Pendulum-v1'

# Hyperparameters
MEMORY_SIZE = 1_000_000               # capacity of the replay buffer (previously tried: 100_000)
MINIBATCH_SIZE = 64                   # number of transitions sampled per update
ACTOR_LR = 0.001                      # learning rate of the Actor model
CRITIC_LR = 0.002                     # learning rate of the Critic model (previously tried: 1e-4)
GAMMA = 0.99                          # discount factor
TAU = 0.005                           # soft update parameter for the target networks
EPISODES = 100                        # total number of training episodes

# Exploration noise
NOISE_MEAN = 0.0                      # mean of the Gaussian noise
NOISE_STDDEV = 0.2                    # initial standard deviation of the Gaussian noise
NOISE_DECAY = 0.99                    # factor that shrinks the noise magnitude over time


## Utils

In [10]:
def progress(value, max_value, visualize: bool = False) -> str:
    """Render a ten-cell text progress bar followed by ``(value/max_value)``.

    Args:
        value: Current position.
        max_value: Position that corresponds to a completely filled bar.
        visualize: When True, the rendered bar is also printed.

    Returns:
        The rendered bar, e.g. ``'█████░░░░░ (5/10)'``.
    """
    cells = 10
    # A max_value of 0 used to raise ZeroDivisionError; show an empty bar instead.
    filled = int(value * cells / max_value) if max_value else 0
    bar = "".join("█" if cell < filled else "░" for cell in range(cells))
    # Named `text` rather than `progress` so the function name is not shadowed.
    text = f"{bar} ({value}/{max_value})"
    if visualize:
        print(text)
    return text

In [11]:
def save_gif(img_list, path, fps=20, size=(608, 400)):
    """Resize a sequence of frames and write them to ``path`` as a looping GIF.

    Args:
        img_list: Iterable of image arrays accepted by ``PIL.Image.fromarray``.
        path: Destination file path of the GIF.
        fps: Frames per second of the animation (default 20, the value that
            used to be hard-coded).
        size: ``(width, height)`` every frame is resized to. The default keeps
            the original 608x400 output, both sides being multiples of 16.

    Raises:
        ValueError: If ``fps`` is not strictly positive.
    """
    if fps <= 0:
        raise ValueError("fps must be strictly positive")

    # Resize every frame with PIL and convert it back to an array
    resized_frames = [np.array(Image.fromarray(img).resize(size)) for img in img_list]

    # Per-frame duration derived from fps (1000 / fps, i.e. expressed in milliseconds);
    # loop=0 is forwarded unchanged from the original call.
    imageio.mimsave(path, resized_frames, 'GIF', duration=int(1000 / fps), loop=0)

In [12]:
def get_actor_loss(critic, states, actions):
    """Return the actor loss: the negated mean of the critic's Q-values.

    Args:
        critic: Object whose ``model`` attribute is called on ``[states, actions]``.
        states: Batch of states fed to the critic.
        actions: Batch of actions fed to the critic alongside ``states``.

    Returns:
        A scalar tensor; minimising it maximises the mean Q-value.
    """
    mean_q = tf.reduce_mean(critic.model([states, actions]))
    return -mean_q

In [13]:
def update_target_weights(model, target_model, tau=TAU):
    """Blend the weights of ``model`` into ``target_model`` (soft update).

    Each target weight becomes ``tau * source + (1 - tau) * target``.

    Args:
        model: Network providing the source weights.
        target_model: Network whose weights are overwritten with the blend.
        tau: Interpolation factor given to the source weights.
    """
    blended = target_model.get_weights()
    for idx, source_w in enumerate(model.get_weights()):
        blended[idx] = source_w * tau + blended[idx] * (1 - tau)
    target_model.set_weights(blended)

## Actor

In [14]:
class Actor:
    """Deterministic policy network that maps a state to a bounded action.

    Attributes are set in ``__init__``:
        state_dim: Size of the state vector.
        action_dim: Size of the action vector.
        action_bound: Magnitude the tanh output is scaled to.
        model: Compiled Keras model built by ``create_model``.
    """

    def __init__(self, state_dim, action_dim, action_bound, learning_rate=ACTOR_LR):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.model = self.create_model(learning_rate)

    def create_model(self, learning_rate):
        """Build and compile the policy network.

        Args:
            learning_rate: Learning rate given to the Adam optimizer.

        Returns:
            A compiled Keras ``Model`` whose output lies in
            ``[-action_bound, action_bound]``.
        """
        states = Input(shape=(self.state_dim,))
        out = Dense(24, activation='relu')(states)
        out = Dense(24, activation='relu')(out)
        out = Dense(self.action_dim, activation='tanh')(out)
        # tanh only spans [-1, 1]; action_bound used to be stored but never applied,
        # so the policy could not reach the full action range. The scaling layer has
        # no trainable weights, so get_weights()/set_weights() are unaffected.
        actions = tf.keras.layers.Rescaling(float(self.action_bound))(out)
        model = Model(inputs=states, outputs=actions)
        # The optimizer attached here is the one used later through model.optimizer
        model.compile(loss='mse', optimizer=Adam(learning_rate))
        return model

## Critic

In [15]:
class Critic:
    """Q-network that scores a (state, action) pair with one scalar value."""

    def __init__(self, state_dim, action_dim, learning_rate=CRITIC_LR):
        self.state_dim, self.action_dim = state_dim, action_dim
        self.model = self.create_model(learning_rate)

    def create_model(self, learning_rate):
        """Build and compile the Q-network.

        The state and action inputs are concatenated, passed through two
        24-unit ReLU layers and reduced to a single linear output.

        Args:
            learning_rate: Learning rate given to the Adam optimizer.

        Returns:
            A compiled Keras ``Model`` taking ``[states, actions]``.
        """
        state_in = Input(shape=(self.state_dim,))
        action_in = Input(shape=(self.action_dim,))
        hidden = Concatenate()([state_in, action_in])
        for _ in range(2):
            hidden = Dense(24, activation='relu')(hidden)
        network = Model(inputs=[state_in, action_in], outputs=Dense(1)(hidden))
        network.compile(loss='mse', optimizer=Adam(learning_rate))
        return network

## ReplayBuffer

In [16]:
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Once ``max_size`` records are held, each new record overwrites the oldest
    one in place. This replaces the previous ``list.pop(0)`` eviction, which
    shifted every remaining element on each insert into a full buffer.
    Because of the in-place overwrite, ``self.buffer`` is not kept in
    insertion order once full; sampling is uniform, so order is irrelevant.
    """

    def __init__(self, max_size):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of records kept.

        Raises:
            ValueError: If ``max_size`` is not strictly positive.
        """
        if max_size <= 0:
            raise ValueError("max_size must be strictly positive")
        self.max_size = max_size
        self.buffer = []
        # Slot that the next record overwrites once the buffer is full
        self._write_idx = 0

    def add_record(self, state, action, reward, next_state, done):
        """Store one transition, evicting the oldest one when full."""
        record = (state, action, reward, next_state, done)
        if len(self.buffer) < self.max_size:
            self.buffer.append(record)
        else:
            self.buffer[self._write_idx] = record
        # While filling, this walks 0..max_size-1 and wraps to 0 exactly when
        # the buffer becomes full, so it always points at the oldest record.
        self._write_idx = (self._write_idx + 1) % self.max_size

    def sample_batch(self, batch_size):
        """Draw ``batch_size`` records uniformly at random, with replacement.

        Returns:
            Five numpy arrays ``(states, actions, rewards, next_states, dones)``,
            each with ``batch_size`` entries along the first axis.
        """
        idxs = np.random.choice(len(self.buffer), size=batch_size)
        batch = [self.buffer[i] for i in idxs]
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, dones

## Training
### create environment

In [17]:
# Build the training environment (no render mode requested)
env = gym.make(ENV_NAME)
# First entry of the observation / action space shapes
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Upper limit of the first action component; used as the clipping magnitude
action_bound = env.action_space.high[0]

### Create actor, critic and their target networks

In [18]:
# Online networks and their target counterparts
actor = Actor(state_dim, action_dim, action_bound, ACTOR_LR)
target_actor = Actor(state_dim, action_dim, action_bound, ACTOR_LR)
critic = Critic(state_dim, action_dim, CRITIC_LR)
target_critic = Critic(state_dim, action_dim, CRITIC_LR)

# Each network above gets its own random initialisation. Start the targets as
# exact copies of the online networks so the first TD targets are consistent
# with the critic being trained; the soft updates take over from there.
target_actor.model.set_weights(actor.model.get_weights())
target_critic.model.set_weights(critic.model.get_weights())

### Create replay buffer

In [19]:
# Replay memory holding at most MEMORY_SIZE transitions
buffer = ReplayBuffer(MEMORY_SIZE)

In [20]:
# List that collects the rewards recorded during training
total_rewards = []

# NOTE: despite the name, this is not an Ornstein-Uhlenbeck process. It is a single
# Gaussian sample (mean NOISE_MEAN, std NOISE_STDDEV) drawn once, and it keeps the
# same value until this cell is run again.
ou_noise = np.random.normal(NOISE_MEAN, NOISE_STDDEV, size=action_dim)

In [21]:
# Exploration scale for this run. A local copy is decayed instead of the
# NOISE_STDDEV constant, so re-running this cell always starts from the
# configured value rather than compounding earlier decays.
noise_stddev = NOISE_STDDEV

for ep in range(EPISODES):
    state, _ = env.reset()
    done = False
    timestep = 0
    episode_reward = 0
    while not done:
        # Overwrite the progress lines printed by the previous iteration
        clear_output(wait=True)
        print(f"Episodes: {progress(ep+1, EPISODES)}")
        print(f"Timestep: {progress(timestep+1, 200)}, Episode reward = {episode_reward}")

        # Select an action. The model is called directly: for one observation
        # this avoids the per-call overhead of Model.predict.
        action = actor.model(state.reshape(1, -1), training=False).numpy()[0]

        # Add Gaussian exploration noise drawn afresh at every step. A noise
        # offset sampled once and reused would bias every action the same way.
        noise = np.random.normal(NOISE_MEAN, noise_stddev, size=action_dim)
        action = np.clip(action + noise, -action_bound, action_bound)

        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Save in the replay buffer. `terminated` is stored rather than `done`:
        # a time-limit truncation ends the episode but is not a terminal state,
        # so the TD target must still bootstrap from next_state.
        buffer.add_record(state, action, reward, next_state, terminated)
        state = next_state
        episode_reward += reward
        # Count every environment step, including those taken before training starts
        timestep += 1

        if len(buffer.buffer) >= MINIBATCH_SIZE:
            states, actions, rewards, next_states, dones = buffer.sample_batch(MINIBATCH_SIZE)

            # Reshape to column vectors (batch, 1) to match the critic output.
            # With flat (batch,) arrays the target broadcasts to (batch, batch).
            rewards = rewards.astype(np.float32).reshape(-1, 1)
            not_terminal = 1.0 - dones.astype(np.float32).reshape(-1, 1)

            next_actions = target_actor.model(next_states, training=False)
            target_q = target_critic.model([next_states, next_actions], training=False)
            targets = rewards + GAMMA * target_q * not_terminal

            # Train critic
            with tf.GradientTape() as tape:
                critic_q = critic.model([states, actions])
                critic_loss = tf.reduce_mean((targets - critic_q) ** 2)
            critic_grad = tape.gradient(critic_loss, critic.model.trainable_variables)
            critic.model.optimizer.apply_gradients(zip(critic_grad, critic.model.trainable_variables))

            # Train actor
            with tf.GradientTape() as tape:
                new_actions = actor.model(states)
                actor_loss = get_actor_loss(critic, states, new_actions)
            actor_grad = tape.gradient(actor_loss, actor.model.trainable_variables)
            actor.model.optimizer.apply_gradients(zip(actor_grad, actor.model.trainable_variables))

            # Update target networks
            update_target_weights(actor.model, target_actor.model)
            update_target_weights(critic.model, target_critic.model)

    # End of episode: decay the exploration noise and record the episode return
    # exactly once (both used to sit inside the step loop).
    noise_stddev *= NOISE_DECAY
    total_rewards.append(episode_reward)

Episodes: ░░░░░░░░░░ (2/100)
Timestep: ███████░░░ (141/200), Episode reward = -869.584845770507


KeyboardInterrupt: 

## Visualizing
### create environment

In [None]:
# Re-create the environment with render_mode="rgb_array" so env.render() can be captured
env = gym.make(ENV_NAME, render_mode="rgb_array")
# First entry of the observation / action space shapes
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Upper limit of the first action component
action_bound = env.action_space.high[0]

In [None]:
# Run one episode with the current actor (no exploration noise) and collect
# an annotated frame for every step.
state, _ = env.reset()
done = False
timestep = 0
episode_reward = 0
screen_list = []
while not done:
    # Overwrite the progress line printed by the previous iteration
    clear_output(wait=True)
    print(f"Timestep: {progress(timestep+1, 200)}, Episode reward = {episode_reward}")

    # Select an action. The model is called directly: for one observation
    # this avoids the per-call overhead of Model.predict.
    action = actor.model(state.reshape(1, -1), training=False).numpy()[0]

    # Grab the rendered frame (rgb_array)
    screen = env.render()
    # Stamp the timestep onto a copy of the frame
    screen = cv2.putText(
        np.array(screen),
        f"Timestep=[{timestep +1}]",
        (25, 25),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.5,
        (0, 0, 0),
        1,
        cv2.LINE_AA
    )
    screen_list.append(screen)

    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated

    state = next_state
    episode_reward += reward

    timestep += 1

### Save gif

In [None]:
# Write the collected frames to ./img/<ENV_NAME>.gif
path = f"./img/{ENV_NAME}.gif"
save_gif(screen_list, path)

### Embed the video

In [None]:
# Read the GIF through a context manager so the file handle is closed right away
# (the previous bare open(...).read() never closed it).
with open(path, 'rb') as gif_file:
    video = gif_file.read()

# Inline the GIF as a base64 data URI so it is embedded in the notebook output
b64_video = base64.b64encode(video)
video_tag = '<img src="data:image/gif;base64,{0}">'.format(b64_video.decode())

display(HTML(video_tag))