# Topic 5 Overview of Advanced RL Algorithms

## A2C and A3C

### A2C on Cartpole Demo - Single Environment

In [None]:
import gym
from stable_baselines3 import A2C

# Train an A2C agent (MLP policy) on a single CartPole environment.
env = gym.make('CartPole-v0')
model = A2C('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

# Watch the trained agent for a fixed number of rendered episodes and
# print the summed reward of each one.
episodes = 5
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()
        # deterministic=True asks predict() for a deterministic action.
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print(f'Episode:{episode} Score:{score}')

env.close()

### A3C on Cartpole Demo - Multiple Environments

In [None]:
import gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env

# Parallel environments
# Parallel environments: four CartPole copies wrapped as one vectorized env.
env = make_vec_env("CartPole-v0", n_envs=4)

model = A2C("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=25000)

# Render the trained agent until the kernel is interrupted. The loop is
# intentionally unbounded and issues no explicit reset inside it; interrupting
# the kernel is the way to stop the demo. The try/finally ensures the
# environment is closed when that happens.
obs = env.reset()
try:
    while True:
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
except KeyboardInterrupt:
    # Expected way to end the demo; swallow it so the cell finishes cleanly.
    pass
finally:
    env.close()
    



### Activity: A2C on Taxi Environment

In [None]:
import gym
from stable_baselines3 import A2C

# Activity template: the long-underscore names below are blanks to be filled in
# before running this cell.
env = gym.make('Taxi-v3')

# TODO: replace the placeholder with the arguments for the A2C constructor.
model = A2C(______________________)
model.learn(total_timesteps=10000)

# Run a fixed number of rendered episodes, accumulating each episode's reward
# into `score` and the sum over all episodes into `total_score`.
episodes = 5
total_score = 0
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        # TODO: replace the placeholder with the call that produces the
        # (action, state) pair for the current observation.
        action, _state = _________________
        obs, reward, done, info = env.step(action)
        score+=reward
    print(f'Episode:{episode} Score:{score}')
    total_score += score
    
env.close()
# Mean episode score over the episodes played above.
ave_score = total_score/episodes
print(f'Average Score:{ave_score}')


### Solution: A2C on Taxi Environment

In [1]:
import gym
from stable_baselines3 import A2C

# NOTE(review): the activity cell and the section heading refer to the Taxi
# environment, while this cell trains on 'Acrobot-v1' -- confirm which
# environment is intended.
env = gym.make('Acrobot-v1')

# Train an A2C agent with an MLP policy.
model = A2C('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

# Play a fixed number of rendered episodes, printing each episode's summed
# reward and keeping a running total for the average reported at the end.
episodes = 5
total_score = 0
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print(f'Episode:{episode} Score:{score}')
    total_score = total_score + score

env.close()

# Mean episode score over the episodes played above.
avg_score = total_score / episodes
print(f'Average Score:{avg_score}')

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 415      |
|    ep_rew_mean        | -414     |
| time/                 |          |
|    fps                | 1042     |
|    iterations         | 100      |
|    time_elapsed       | 0        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -1.07    |
|    explained_variance | -0.34    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -2.72    |
|    value_loss         | 7.58     |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 264      |
|    ep_rew_mean        | -263     |
| time/                 |          |
|    fps                | 1058     |
|    iterations         | 200      |
|    time_elapsed 

-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 145       |
|    ep_rew_mean        | -144      |
| time/                 |           |
|    fps                | 1055      |
|    iterations         | 1400      |
|    time_elapsed       | 6         |
|    total_timesteps    | 7000      |
| train/                |           |
|    entropy_loss       | -0.753    |
|    explained_variance | -0.000636 |
|    learning_rate      | 0.0007    |
|    n_updates          | 1399      |
|    policy_loss        | -1.42     |
|    value_loss         | 1.19      |
-------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 141      |
|    ep_rew_mean        | -140     |
| time/                 |          |
|    fps                | 1053     |
|    iterations         | 1500     |
|    time_elapsed       | 7        |
|    total_timesteps    | 7500     |
| train/             



Episode:1 Score:-78.0
Episode:2 Score:-81.0
Episode:3 Score:-82.0
Episode:4 Score:-106.0
Episode:5 Score:-75.0
Average Score:-84.4


## PPO

In [None]:
import gym
from stable_baselines3 import PPO

# Train a PPO agent (MLP policy) on CartPole.
env = gym.make('CartPole-v0')
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Round-trip the model through disk: save it, drop the in-memory object,
# then load it back from the saved file.
model.save("ppo_cartpole")
del model # remove to demonstrate saving and loading
model = PPO.load("ppo_cartpole")

# Play a fixed number of rendered episodes with the reloaded model and
# print each episode's summed reward.
episodes = 5
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print(f'Episode:{episode} Score:{score}')

env.close()

### Evaluation

In [None]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Build the environment in this cell so it does not rely on an `env` left over
# from (and already closed by) the training cell.
env = gym.make('CartPole-v0')

# Load the model saved to "ppo_cartpole" and evaluate it over 10 rendered
# episodes.
model = PPO.load("ppo_cartpole")
try:
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
finally:
    # Close the environment even if evaluation fails.
    env.close()

# Last expression of the cell: shown as the cell output, as before.
(mean_reward, std_reward)

### Testing

In [None]:
import gym
from stable_baselines3 import PPO

# Build the environment in this cell so it does not rely on an `env` left over
# from (and already closed by) an earlier cell.
env = gym.make('CartPole-v0')
model = PPO.load("ppo_cartpole")

# Play rendered episodes back to back until the kernel is interrupted,
# printing the summed reward of each episode. The loop is intentionally
# unbounded; the try/finally ensures the environment is closed on exit.
episode = 1
try:
    while True:
        obs = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            score += reward
        print(f'Episode:{episode} Score:{score}')
        episode += 1
except KeyboardInterrupt:
    # Expected way to end the demo; swallow it so the cell finishes cleanly.
    pass
finally:
    env.close()


### Activity: PPO on Acrobot Environment

In [None]:
import gym
from stable_baselines3 import PPO

# Activity template: the long-underscore names below are blanks to be filled in
# before running this cell.
env = gym.make('Acrobot-v1')

# TODO: replace the placeholder with the expression that constructs the PPO model.
model = _____________________________
model.learn(total_timesteps=10000)

# TODO: replace the placeholders with the path to save the model to and the
# path to load it back from.
model.save(_________________)
del model # remove to demonstrate saving and loading
model = PPO.load(_______________)

# Run a fixed number of rendered episodes with the reloaded model, printing
# each episode's summed reward.
episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        # Unlike the other demo cells, predict() is called here without
        # deterministic=True.
        action, _state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score+=reward
    print(f'Episode:{episode} Score:{score}')
env.close()

In [None]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Activity template: the underscore names are blanks to be filled in.
# TODO: replace the placeholder with the path of the saved model to load.
model = PPO.load(__________________)
# TODO: replace the placeholder with the arguments for evaluate_policy.
evaluate_policy(___________________________)

### Solution: PPO on Acrobot Environment

In [None]:
import gym
from stable_baselines3 import PPO

# Train a PPO agent (MLP policy) on Acrobot.
env = gym.make('Acrobot-v1')
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Round-trip the model through disk: save it, drop the in-memory object,
# then load it back from the saved file.
# NOTE(review): the checkpoint is named "ppo_taxi" although the model is
# trained on 'Acrobot-v1'; the following evaluation cell loads the same name,
# so rename both together if this is changed.
model.save("ppo_taxi")
del model # remove to demonstrate saving and loading
model = PPO.load("ppo_taxi")

# Play a fixed number of rendered episodes with the reloaded model and
# print each episode's summed reward.
episodes = 5
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while True:
        env.render()
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print(f'Episode:{episode} Score:{score}')

env.close()

In [None]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

# Build the environment in this cell so it does not rely on an `env` left over
# from (and already closed by) the training cell. 'Acrobot-v1' matches the
# environment the "ppo_taxi" checkpoint was trained on in the previous cell.
env = gym.make('Acrobot-v1')

# Load the saved model and evaluate it over 10 rendered episodes.
model = PPO.load("ppo_taxi")
try:
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
finally:
    # Close the environment even if evaluation fails.
    env.close()

# Last expression of the cell: shown as the cell output, as before.
(mean_reward, std_reward)