### 1. Import Dependencies

In [20]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

### 2. Load Environment

In [2]:
environment_name = 'CartPole-v1'
env = gym.make(environment_name, render_mode='human')

In [3]:
episodes = 5
for episode in range(episodes):
    state = env.reset()
    done = False
    truncate = False
    score = 0

    while not done and not truncate:
        # env.render()
        action = env.action_space.sample()
        n_state, reward, done, truncate, info = env.step(action)
        score += reward
    print(f"Episode:{episode+1} Score:{score}")
env.close()

Episode:1 Score:33.0
Episode:2 Score:35.0
Episode:3 Score:12.0
Episode:4 Score:15.0
Episode:5 Score:15.0


### Understanding the Environment

In [5]:
env.action_space

Discrete(2)

##### Two values:
- 0: Push the cart to left
- 1: Push the cart to right

In [6]:
env.action_space.sample()

np.int64(0)

In [7]:
env.observation_space

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

##### Four Values:
- 0: Cart Position
- 1: Cart Velocity
- 2: Pole Angle
- 3: Pole Angular Velocity

In [8]:
env.observation_space.sample()

array([-3.411042  ,  0.40734375,  0.25830874, -2.2261162 ], dtype=float32)

### 3. Training an RL Model

In [16]:
# Make the directories first
log_path = "Logs"

In [17]:
log_path

'Logs'

In [18]:
env = gym.make(environment_name, render_mode='human')
env = DummyVecEnv([lambda : env])
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [19]:
model.learn(total_timesteps=5000)
env.close()

Logging to Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 45   |
|    iterations      | 1    |
|    time_elapsed    | 44   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 89          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007752626 |
|    clip_fraction        | 0.0813      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.00413    |
|    learning_rate        | 0.0003      |
|    loss                 | 6.78        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0133     |
|    value_loss           | 51.1        |
-----------------------------------------
------------

### 4. Save and Reload Model

In [21]:
PPO_Path = os.path.join('Saved Models', 'PPO_Model_CartPole')

In [22]:
model.save(PPO_Path)

In [23]:
del model

In [25]:
model = PPO.load(PPO_Path, env=env)

In [26]:
model

<stable_baselines3.ppo.ppo.PPO at 0x205163c6780>

### 5. Evaluation

##### Core metrics to look at:
- Average Reward
- Avg episode length

In [36]:
env = gym.make('CartPole-v1', render_mode="human")
evaluate_policy(model, env, n_eval_episodes=10, render=True)
# env.close()

(np.float64(356.9), np.float64(151.88314587208154))

In [37]:
env.close()

### 6. Test Model

In [40]:
env = gym.make('CartPole-v1', render_mode="human")
episodes = 5
for episode in range(episodes):
    obs, _ = env.reset()
    done = False
    truncate = False
    score = 0

    while not done and not truncate:
        # env.render()
        action, _ = model.predict(obs)
        obs, reward, done, truncate, info = env.step(action)
        score += reward
    print(f"Episode:{episode+1} Score:{score}")
env.close()

Episode:1 Score:32.0
Episode:2 Score:118.0
Episode:3 Score:166.0
Episode:4 Score:110.0
Episode:5 Score:27.0


#### To imporove performance:
##### Training Strategies:
 1. Train for longer
 2. Hyperparameter Tuning
 3. Try different algorithms

### 7. View Logs in TensorBoard

In [41]:
training_log_path = os.path.join(log_path, 'PPO_1')

In [42]:
!tensorboard --logdir={training_log_path}

^C


### 8. Adding a callback to the training stage

In [43]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [44]:
save_path = "Saved Models"

In [58]:
eval_env = gym.make('CartPole-v1')
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best = stop_callback,
    eval_freq=10000,
    best_model_save_path = save_path,
    verbose=1
)

In [60]:
model = PPO('MlpPolicy', eval_env, verbose=1, tensorboard_log = log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [61]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Logs\PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.1     |
|    ep_rew_mean     | 21.1     |
| time/              |          |
|    fps             | 2189     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 25.2        |
|    ep_rew_mean          | 25.2        |
| time/                   |             |
|    fps                  | 1349        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009013742 |
|    clip_fraction        | 0.106       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00227     |
|    lea

<stable_baselines3.ppo.ppo.PPO at 0x205174cee10>

### 9. Changing Policies

In [62]:
net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])]

In [63]:
model = PPO('MlpPolicy', eval_env, verbose=1, tensorboard_log = log_path, policy_kwargs={'net_arch':net_arch})

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [64]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Logs\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.9     |
|    ep_rew_mean     | 23.9     |
| time/              |          |
|    fps             | 1639     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.9        |
|    ep_rew_mean          | 28.9        |
| time/                   |             |
|    fps                  | 1006        |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013612106 |
|    clip_fraction        | 0.18        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.0024     |
|    lea

<stable_baselines3.ppo.ppo.PPO at 0x205175360c0>

### 10. Using an Alterante Algorithm

In [65]:
from stable_baselines3 import DQN

In [66]:
model = DQN('MlpPolicy', eval_env, tensorboard_log=log_path, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [67]:
model.learn(total_timesteps=20000)

Logging to Logs\DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 22       |
|    ep_rew_mean      | 22       |
|    exploration_rate | 0.958    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 3705     |
|    time_elapsed     | 0        |
|    total_timesteps  | 88       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23.9     |
|    ep_rew_mean      | 23.9     |
|    exploration_rate | 0.909    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2052     |
|    time_elapsed     | 0        |
|    total_timesteps  | 191      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.467    |
|    n_updates        | 22       |
----------------------------------
----------------------------------
| rollout/            |          

<stable_baselines3.dqn.dqn.DQN at 0x205175c24b0>