# Topic 5 Overview of Advanced RL Algorithms

## A2C and A3C

### A2C on Cartpole Demo - Single Environment

In [None]:
import gym
from stable_baselines3 import A2C

# Build the CartPole environment and attach a fresh A2C agent (MLP policy) to it.
env = gym.make('CartPole-v0')
model = A2C('MlpPolicy', env, verbose=1)

# Train the agent with a budget of 10000 timesteps.
model.learn(total_timesteps=10000)

# Roll out the trained agent for a few rendered episodes and print the reward
# accumulated in each one.
episodes = 5
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()
        # Ask the policy for its deterministic action for the current observation.
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break
    print(f'Episode:{episode} Score:{score}')

env.close()

### A3C on Cartpole Demo - Multiple Environments

In [None]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments: four CartPole copies wrapped in one vectorized env.
env = make_vec_env("CartPole-v0", n_envs=4)

model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=25000)

# Watch the trained agent. The loop has no exit condition and runs until the
# cell/script is interrupted; the try/finally guarantees that the environments
# and the render window are released when that happens.
# There is no reset inside the loop: SB3 vectorized environments reset a
# sub-environment automatically once its episode ends.
obs = env.reset()
try:
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
finally:
    env.close()
    

### Activity: A2C on Lunar Landing Environment

In [None]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

# Environment the A2C agent in this activity interacts with.
env = gym.make('LunarLander-v2')

In [None]:
# TODO(student): replace the blank with the construction of the A2C model.
model = _________________

In [None]:
# Evaluate on a dedicated environment instance rather than on `env`.
eval_env = gym.make('LunarLander-v2')

# Baseline: score the not-yet-trained model over 10 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# TODO(student): replace the blank with the arguments for the training call.
model.learn(__________________________)
# Persist the trained agent to disk under the name "a2c_lunar".
model.save("a2c_lunar")
del model  # delete trained model to demonstrate loading
# Rebind `model` to the agent restored from the saved file.
model = A2C.load("a2c_lunar")

In [None]:
# Play 10 rendered episodes with the loaded agent and print each episode's
# accumulated reward.
episodes = 10
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        # TODO(student): replace the blank with the call that returns the
        # agent's action for the current observation `obs`.
        action, _states = ___________________
        obs, reward, done, info = env.step(action)
        env.render()
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
# Score the trained agent on the evaluation environment over 10 episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

### Solution: A2C on Lunar Landing Environment

In [None]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy

# Environment used by the A2C solution cells below.
env = gym.make('LunarLander-v2')

In [None]:
# Baseline before any learning: play 10 rendered episodes using actions drawn
# at random from the action space, printing the reward collected per episode.
episodes = 10
for episode in range(1, episodes + 1):
    state, done, score = env.reset(), False, 0

    while True:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
# A2C agent with the 'MlpPolicy' policy, bound to `env`; verbose=1 enables
# training log output.
model = A2C('MlpPolicy', env, verbose=1)

In [None]:
# Evaluate on a dedicated environment instance rather than on `env`.
eval_env = gym.make('LunarLander-v2')

# Baseline: score the not-yet-trained model over 10 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Train for 100,000 timesteps, then write the agent to disk.
model.learn(total_timesteps=100_000)
model.save("a2c_lunar")

# Drop the in-memory agent so that the reload below is what later cells use.
del model
model = A2C.load("a2c_lunar")

In [None]:
# Play 10 rendered episodes with the reloaded agent and print the reward
# accumulated in each one.
episodes = 10
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
# Score the trained agent on the evaluation environment over 10 episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

## PPO

In [None]:
import gym
from stable_baselines3 import PPO

# Build the CartPole environment, attach a PPO agent (MLP policy) and train it
# with a budget of 10000 timesteps.
env = gym.make('CartPole-v0')
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Round-trip the agent through disk: save it, discard the in-memory object and
# load it back, so the rollout below uses the restored agent.
model.save("ppo_cartpole")
del model
model = PPO.load("ppo_cartpole")

# Roll out the restored agent for a few rendered episodes and print the reward
# accumulated in each one.
episodes = 5
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break
    print(f'Episode:{episode} Score:{score}')
env.close()

### Evaluation

In [None]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Reload the saved agent and create a fresh CartPole environment so this cell
# does not depend on an `env` left over (and already closed) by an earlier cell.
model = PPO.load("ppo_cartpole")
env = gym.make('CartPole-v0')

# Run 10 rendered evaluation episodes and report the mean and standard
# deviation of the episode rewards instead of discarding the result.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

env.close()

### Testing

In [None]:
import gym
from stable_baselines3 import PPO

# Reload the saved agent and create a fresh CartPole environment so this cell
# does not depend on an `env` left over (and already closed) by an earlier cell.
model = PPO.load("ppo_cartpole")
env = gym.make('CartPole-v0')

# Play rendered episodes back to back, printing each episode's accumulated
# reward. The loop has no exit condition and runs until the cell/script is
# interrupted; the try/finally guarantees the environment and its render window
# are released when that happens.
episode = 1
try:
    while True:
        obs = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            score += reward
        print(f'Episode:{episode} Score:{score}')
        episode += 1
finally:
    env.close()


### Activity: PPO on Lunar Landing Environment

In [None]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Environment the PPO agent in this activity interacts with.
env = gym.make('LunarLander-v2')

In [None]:
# TODO(student): replace the blank with the construction of the PPO model.
model = ___________________

In [None]:
# TODO(student): replace the blank with the call that trains the model.
model._________________________
# Persist the trained agent to disk under the name "ppo_lunar2".
model.save("ppo_lunar2")
del model  # delete trained model to demonstrate loading
# Rebind `model` to the agent restored from the saved file.
model = PPO.load("ppo_lunar2")

In [None]:
# Play 10 rendered episodes with the reloaded agent and print the reward
# accumulated in each one.
episodes = 10
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
# Separate env for evaluation. It is created here because no earlier cell of
# this PPO activity defines `eval_env`; without it the call below only works
# when an `eval_env` happens to be left over from another section.
eval_env = gym.make('LunarLander-v2')

# Evaluate the trained agent over 10 episodes and report the mean and standard
# deviation of the episode rewards.
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

### Solution: PPO on Lunar Landing Environment

In [None]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Environment used by the PPO solution cells below.
env = gym.make('LunarLander-v2')

In [None]:
# Baseline before any learning: play 10 rendered episodes using actions drawn
# at random from the action space, printing the reward collected per episode.
episodes = 10
for episode in range(1, episodes + 1):
    state, done, score = env.reset(), False, 0

    while True:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
# PPO agent with the 'MlpPolicy' policy, bound to `env`; verbose=1 enables
# training log output.
model = PPO('MlpPolicy', env, verbose=1)

In [None]:
# Evaluate on a dedicated environment instance rather than on `env`.
eval_env = gym.make('LunarLander-v2')

# Baseline: score the not-yet-trained model over 10 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# NOTE(review): this cell repeats the preceding pre-training evaluation cell
# verbatim; it builds another evaluation environment and scores the same
# not-yet-trained model again.
eval_env = gym.make('LunarLander-v2')

# Baseline: score the not-yet-trained model over 10 evaluation episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")

In [None]:
# Train for 100,000 timesteps, then write the agent to disk.
model.learn(total_timesteps=100_000)
model.save("ppo_lunar2")

# Drop the in-memory agent so that the reload below is what later cells use.
del model
model = PPO.load("ppo_lunar2")

In [None]:
# Play 10 rendered episodes with the reloaded agent and print the reward
# accumulated in each one.
episodes = 10
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render()
        score = score + reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
# Score the trained agent on the evaluation environment over 10 episodes.
mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=10,
    deterministic=True,
)

print(f"mean_reward={mean_reward:.2f} +/- {std_reward}")