### Importing dependencies

In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

### Loading Environment

In [2]:
# Create the CartPole-v1 environment with on-screen ("human") rendering; it is
# used by the random-action rollout in the following cell.
env = gym.make('CartPole-v1', render_mode="human")

In [3]:
# Roll out a few episodes with uniformly random actions to sanity-check the
# environment and get a baseline score before any training.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        # gymnasium's reset() returns (observation, info); neither is needed
        # here because actions are sampled at random.
        env.reset()
        done = False
        score = 0

        while not done:
            # No explicit env.render(): the env was created with
            # render_mode="human", so it is drawn as part of step().
            action = env.action_space.sample()
            _, reward, terminated, truncated, _ = env.step(action)
            # An episode is over when it fails (terminated) OR when the time
            # limit is reached (truncated). Checking only the first flag would
            # keep stepping an episode that has already ended.
            done = terminated or truncated
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always release the render window, even if the rollout is interrupted.
    env.close()

Episode:1 Score:38.0
Episode:2 Score:22.0
Episode:3 Score:9.0
Episode:4 Score:13.0
Episode:5 Score:24.0


### Train RL model

In [4]:
# Directory handed to the models as `tensorboard_log`.
log_path = os.path.join(
    'Training',
    'Logs',
)

In [5]:
from stable_baselines3.common.env_util import make_vec_env

# Build a vectorized wrapper around a single CartPole-v1 instance (n_envs=1).
vec_env = make_vec_env("CartPole-v1", n_envs=1)
# PPO agent using the "MlpPolicy" policy on that env; verbose=1 prints the
# training tables, and TensorBoard logs are written under log_path.
model = PPO("MlpPolicy", vec_env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [6]:
# Train the PPO agent for a budget of 25000 timesteps. The call is the last
# expression in the cell, so its return value is displayed.
model.learn(total_timesteps=25000)

Logging to Training/Logs/PPO_5
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 24.8     |
|    ep_rew_mean     | 24.8     |
| time/              |          |
|    fps             | 1539     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30          |
|    ep_rew_mean          | 30          |
| time/                   |             |
|    fps                  | 1000        |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009105258 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00424    |

<stable_baselines3.ppo.ppo.PPO at 0x7f3e86af7f70>

### Save and Reload Model

In [7]:
# Location the trained PPO model is saved to and later reloaded from.
PPO_path = os.path.join(
    'Training', 'Saved_Models', 'PPO_model'
)

In [8]:
# Persist the trained PPO model to PPO_path.
model.save(PPO_path)

In [9]:
# Drop the in-memory model so that the reload in the next cell demonstrably
# comes from the saved file rather than from leftover kernel state.
del model

In [10]:
# Restore the PPO model from the file written by model.save(PPO_path).
# No environment is passed to load() here.
model = PPO.load(PPO_path)

### Evaluation

In [11]:
# Evaluate the reloaded model on vec_env for 10 episodes without rendering.
# The returned tuple is displayed as the cell output (per the SB3 docs it is
# the mean and standard deviation of the episode reward).
evaluate_policy(model, vec_env, n_eval_episodes=10, render=False)

(500.0, 0.0)

In [12]:
# Close the human-rendered env. The random-rollout cell above already calls
# env.close(), so this is presumably a harmless repeat — confirm.
env.close()

### Model Testing

In [13]:
# Watch the trained policy play one episode in a rendered window.
env = gym.make('CartPole-v1', render_mode="human")
# gymnasium's reset() returns (observation, info).
obs, info = env.reset()
while True:
    action, _states = model.predict(obs)
    # No explicit env.render(): the env was created with render_mode="human",
    # so it is drawn as part of step().
    obs, rewards, done, trunc, info = env.step(action)
    # The episode is over when the pole falls (done) OR when the time limit is
    # reached (trunc). A well-trained policy ends by truncation, so testing
    # only `done` would keep stepping an episode that has already ended.
    if done or trunc:
        print('info', info)
        break

info {}


In [14]:
# Close the human-rendered env opened for the model test above.
env.close()

### View logs in TensorBoard

In [15]:
# Path of one specific run directory under log_path.
# NOTE(review): the run name is hard-coded to 'PPO_4', while the training
# output recorded in this notebook reports "Logging to Training/Logs/PPO_5"
# — confirm which run is meant before launching TensorBoard.
training_log_path = os.path.join(log_path, 'PPO_4')

In [16]:
# Uncomment to launch TensorBoard on the run directory in `training_log_path`:
# !tensorboard --logdir={training_log_path}

### Using Callback

In [17]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [27]:
# Output locations for the callback-driven training runs.
log_path = os.path.join('Training', 'Logs')  # passed as tensorboard_log
save_path = os.path.join('Training', 'Saved_Models')  # passed as best_model_save_path

In [28]:
# Human-rendered env.
# NOTE(review): in the visible cells this env is never stepped, only closed
# again later — confirm it is actually needed.
env = gym.make('CartPole-v1', render_mode="human")
# Fresh single-instance vectorized env; the cells below use it for training
# and for the evaluation callback.
vec_env = make_vec_env("CartPole-v1", n_envs=1)

In [29]:
# Child callback carrying the reward threshold (190) for stopping training.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1)

# Evaluation callback running on vec_env with eval_freq=10000. It is given
# save_path for the best model and hands over to stop_callback whenever a new
# best is found.
eval_callback = EvalCallback(
    vec_env,
    eval_freq=10000,
    best_model_save_path=save_path,
    callback_on_new_best=stop_callback,
    verbose=1,
)

In [30]:
# New PPO agent ("MlpPolicy") on vec_env for the callback-driven run;
# TensorBoard logs go under log_path.
model = PPO("MlpPolicy", vec_env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [31]:
# Train with a budget of 20000 timesteps, passing eval_callback so the
# evaluation/stop logic configured above is attached to this run.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_8
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 19.8     |
|    ep_rew_mean     | 19.8     |
| time/              |          |
|    fps             | 1366     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.2        |
|    ep_rew_mean          | 26.2        |
| time/                   |             |
|    fps                  | 813         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008539352 |
|    clip_fraction        | 0.0918      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.000284    |

<stable_baselines3.ppo.ppo.PPO at 0x7f3e5caa1990>

In [32]:
# Evaluate the callback-trained model on vec_env for 10 episodes, no rendering.
evaluate_policy(model, vec_env, n_eval_episodes=10, render=False)

(440.0, 92.22038820130828)

In [33]:
# Close the human-rendered env created at the start of the callback section.
env.close()

### Using custom neural network architecture

In [34]:
# Custom network layout: four hidden layers of 128 units each for the policy
# network (pi) and, separately, for the value network (vf).
# The dict is passed directly instead of being wrapped in a list: the
# list-wrapped form `[dict(pi=..., vf=...)]` is deprecated since
# Stable-Baselines3 1.8 and only kept with a warning.
net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])

In [35]:
# PPO agent using the custom actor/critic layout from net_arch.
# tensorboard_log is passed here as for every other model in this notebook, so
# this run is logged under log_path and can be compared in TensorBoard.
model = PPO(
    'MlpPolicy',
    vec_env,
    verbose=1,
    policy_kwargs={'net_arch': net_arch},
    tensorboard_log=log_path,
)

Using cpu device


In [36]:
# Use a fresh evaluation callback for this model. A callback object keeps its
# call counter and its best mean reward between learn() calls, so reusing the
# instance from the previous PPO run would evaluate at shifted steps and would
# compare this model against the previous model's best score.
# The best model is saved in its own sub-directory so it does not overwrite
# the best model saved by the earlier run.
custom_arch_eval_callback = EvalCallback(
    vec_env,
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, 'PPO_custom_arch'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=custom_arch_eval_callback)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.8     |
|    ep_rew_mean     | 21.8     |
| time/              |          |
|    fps             | 821      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 29.6        |
|    ep_rew_mean          | 29.6        |
| time/                   |             |
|    fps                  | 627         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015071506 |
|    clip_fraction        | 0.233       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.00428     |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x7f3e5caa0520>

In [38]:
# Evaluate the custom-architecture PPO model on vec_env for 10 episodes,
# no rendering.
evaluate_policy(model, vec_env, n_eval_episodes=10, render=False)

(500.0, 0.0)

### Using Alternate Algorithms

In [37]:
from stable_baselines3 import DQN

In [39]:
# Swap the algorithm: DQN agent with 'MlpPolicy' on the same vec_env,
# logging to TensorBoard under log_path.
model = DQN('MlpPolicy', vec_env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [40]:
# Use a fresh evaluation callback for the DQN run. A callback object keeps its
# call counter and its best mean reward between learn() calls, so reusing the
# instance from the PPO runs would compare DQN against PPO's best score and
# the save-best / stop logic would not reflect this model.
# The best model is saved in its own sub-directory so it does not overwrite
# the best PPO model.
dqn_eval_callback = EvalCallback(
    vec_env,
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, 'DQN'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=dqn_eval_callback)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 24.8     |
|    ep_rew_mean      | 24.8     |
|    exploration_rate | 0.953    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6577     |
|    time_elapsed     | 0        |
|    total_timesteps  | 99       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23.8     |
|    ep_rew_mean      | 23.8     |
|    exploration_rate | 0.91     |
| time/               |          |
|    episodes         | 8        |
|    fps              | 6898     |
|    time_elapsed     | 0        |
|    total_timesteps  | 190      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 23       |
|    ep_rew_mean      | 23       |
|    exploration_rate | 0.869    |
| time/               | 

<stable_baselines3.dqn.dqn.DQN at 0x7f3e5caa34c0>

In [41]:
# Target path for saving the DQN model. The directory name is 'Saved_Models'
# (underscore), matching the directory used for the PPO model and for the
# best-model checkpoints elsewhere in this notebook.
dqn_path = os.path.join('Training', 'Saved_Models', 'DQN_model')

In [43]:
# Evaluate the DQN model on vec_env for 10 episodes.
evaluate_policy(model, vec_env, n_eval_episodes=10)

(9.5, 0.6708203932499369)

In [44]:
# Final cleanup. The rendered env is closed (it was already closed after the
# callback section, so this is kept only as a safeguard), and the vectorized
# env used for training and evaluation, which was never closed, is released.
env.close()
vec_env.close()