In [305]:
%pip install stable-baselines3[extra]

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\harna\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [306]:
import gym 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [307]:

# Gym registry id of the task used throughout the notebook.
environment_name = 'CartPole-v0'

In [308]:
# Instantiate the environment registered under `environment_name`.
env = gym.make(environment_name)

In [310]:
# Baseline: play a handful of episodes with uniformly random actions and
# report the total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    # At least one step is always taken; the loop ends once the env reports
    # that the episode is over.
    while True:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score = score + reward
        if done:
            break

    print('Episode:{} Score:{}'.format(episode, score))


Episode:1 Score:20.0
Episode:2 Score:12.0
Episode:3 Score:32.0
Episode:4 Score:14.0
Episode:5 Score:25.0


In [311]:
# Close the environment now that the random rollouts are finished.
env.close()

In [312]:
# Understanding the environment: draw one random action from the action space.
# 0 - push cart to the left, 1 - push cart to the right
env.action_space.sample()


1

In [313]:
# Draw one random observation from the observation space.
# Layout: [cart position, cart velocity, pole angle, pole angular velocity]
env.observation_space.sample()

array([-2.8818517e+00,  8.5390506e+37, -8.8730782e-02, -1.8720668e+38],
      dtype=float32)

In [315]:
# Train an RL model: directory (relative to the working directory) that is
# handed to the algorithms as their TensorBoard log location.
import os

log_path = os.path.join("train", "logs")


In [316]:
# Display the joined log directory (the separator depends on the platform).
log_path

'train\\logs'

In [317]:

# Build the vectorized training environment and the PPO agent.
# The factory passed to DummyVecEnv creates its own gym env instead of closing
# over the notebook-level name `env`, which this very cell rebinds to the
# vectorized wrapper. A self-contained factory stays correct no matter when it
# is invoked or how often the cell is re-run.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# verbose=1 prints training progress; TensorBoard event files go under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [318]:
# Train the agent for 20,000 environment steps.
model.learn(total_timesteps=20000)

Logging to train\logs\PPO_13
-----------------------------
| time/              |      |
|    fps             | 1058 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 804         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008961154 |
|    clip_fraction        | 0.111       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00143    |
|    learning_rate        | 0.0003      |
|    loss                 | 7.77        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.017      |
|    value_loss           | 55.3        |
-----------------------------------------
-----

<stable_baselines3.ppo.ppo.PPO at 0x254542fceb0>

In [319]:
import os

# Destination of the saved PPO model. The directory names use the same
# lowercase spelling as the notebook's other paths ('train', 'saved models');
# mixing 'Train'/'Saved Models' with 'train'/'saved models' produces two
# separate directory trees on case-sensitive filesystems.
PPO_path = os.path.join('train', 'saved models', 'PPO_cartpole_model')

In [320]:
# Persist the trained model at PPO_path.
model.save(PPO_path)

In [321]:
# Drop the in-memory model so that the reload below really comes from disk.
del model

In [322]:
# Reload the saved model and attach the existing environment to it.
model = PPO.load(PPO_path,env=env)

In [323]:
from stable_baselines3.common.evaluation import evaluate_policy

In [324]:
# Evaluate the reloaded model over 10 rendered episodes; the cell displays the
# returned pair (the output below shows (200.0, 0.0), presumably mean and
# standard deviation of the episode reward).
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

In [325]:
# Close the environment after the rendered evaluation.
env.close()

In [326]:
# Start a new episode and keep the initial observation for the prediction below.
# NOTE(review): the environment was closed in the previous cell; this relies on
# the env remaining usable after close() — confirm for the env in use.
obs = env.reset()

In [327]:
# Ask the trained policy for an action given the current observation.
# The second return value is bound to `_states` rather than `_`: in IPython `_`
# holds the previous cell's output, and assigning to it shadows that feature.
action, _states = model.predict(obs)


In [328]:
# Display the predicted action.
action

array([0], dtype=int64)

In [329]:
# Testing and evaluation: roll out the trained policy for a few rendered
# episodes and print the accumulated reward of each one.
episodes = 5
try:
    for episode in range(1, episodes + 1):
        obs = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            # deterministic=True picks the policy's most likely action instead
            # of sampling, so the scores reflect the learned policy itself
            # (consistent with evaluate_policy, which is deterministic by
            # default). The unused second return value goes to `_states`, not
            # `_`, so IPython's last-output variable is left alone.
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            # `env` is vectorized, so `reward` is an array and `score` prints
            # in array form, as in the output below.
            score += reward
        print('Episode:{} score:{}'.format(episode,score))
finally:
    # Always release the render window, even if an episode raises.
    env.close()


Episode:1 score:[200.]
Episode:2 score:[200.]
Episode:3 score:[200.]
Episode:4 score:[200.]
Episode:5 score:[200.]


In [330]:
# Display the raw return value of predict(): a pair whose second element is
# None here, as the output below shows.
model.predict(obs)

(array([0], dtype=int64), None)

In [331]:
import tensorboard
import tensorflow

In [332]:
# Directory of the single training run to inspect in TensorBoard.
# NOTE(review): the run name 'PPO_9' is hard-coded, while the training cells in
# this notebook report logging to PPO_13 / PPO_14 / PPO_15 — confirm this is
# the run you intend to view.
training_log_path = os.path.join(log_path,'PPO_9')


In [333]:
# Display the run directory that will be inspected in TensorBoard.
training_log_path

'train\\logs\\PPO_9'

In [334]:
# Register the %tensorboard magic with this kernel.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [335]:
import  tensorflow as tf
import datetime

In [336]:
%tensorboard --logdir="/Users/harna/Desktop/code/reinforced learning models/train/logs/PPO_9" --port=6006

Reusing TensorBoard on port 6006 (pid 20260), started 2:54:31 ago. (Use '!kill 20260' to kill it.)

In [337]:
# Adding a callback to the training stage
# Setting a reward threshold
from stable_baselines3.common.callbacks import EvalCallback,StopTrainingOnRewardThreshold


In [338]:
# Directory passed to EvalCallback as the place to store the best model.
save_path = os.path.join("train", "saved models")

In [339]:
# NOTE(review): this import looks like an accidental IDE auto-completion —
# `verbose` is only used as a keyword argument below, never as a name. Confirm
# and remove.
from tabnanny import verbose


# Evaluate on a dedicated environment. The callback resets and steps the env
# it is given, so sharing the training env would disturb the episode that the
# learner is in the middle of collecting.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Stop training as soon as an evaluation reaches the reward threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate every 10,000 calls, save the best model seen so far under
# save_path, and run stop_callback whenever a new best is found.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [340]:
# Fresh PPO agent with the default MLP policy; progress is printed and
# TensorBoard event files are written under log_path.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)


Using cpu device


In [341]:
# Train for up to 20,000 steps, with eval_callback attached to the run.
model.learn(total_timesteps=20000,callback=eval_callback)


Logging to train\logs\PPO_14
-----------------------------
| time/              |      |
|    fps             | 1020 |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 746          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0090633705 |
|    clip_fraction        | 0.101        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.686       |
|    explained_variance   | 0.00527      |
|    learning_rate        | 0.0003       |
|    loss                 | 4.16         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0145      |
|    value_loss           | 46.7         |
------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x25454317760>

In [342]:
# Changing policies: custom network layout with four hidden layers of 128
# units each for the policy head (pi) and for the value head (vf).
_HIDDEN_UNITS = 128
_HIDDEN_LAYERS = 4
net_arch = [
    {
        "pi": [_HIDDEN_UNITS] * _HIDDEN_LAYERS,
        "vf": [_HIDDEN_UNITS] * _HIDDEN_LAYERS,
    }
]

In [343]:
# PPO agent whose MLP policy uses the custom layout stored in net_arch.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs={'net_arch': net_arch},
)


Using cpu device


In [344]:
# Train the custom-architecture agent with its own evaluation callback.
# An EvalCallback keeps its best mean reward and call counter for its whole
# lifetime, so reusing the instance from the previous model would compare this
# model against the old model's best score and could skip the stop and
# save-best logic entirely.
custom_arch_eval_callback = EvalCallback(
    # Dedicated evaluation env so evaluation never disturbs the training env.
    DummyVecEnv([lambda: gym.make(environment_name)]),
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1),
    eval_freq=10000,
    # Separate folder so this run's best model does not overwrite the one
    # saved for the default-architecture agent.
    best_model_save_path=os.path.join(save_path, 'custom_net_arch'),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=custom_arch_eval_callback)


Logging to train\logs\PPO_15
-----------------------------
| time/              |      |
|    fps             | 669  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 502         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013747925 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.00428    |
|    learning_rate        | 0.0003      |
|    loss                 | 3.72        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0196     |
|    value_loss           | 20.4        |
-----------------------------------------
-----

<stable_baselines3.ppo.ppo.PPO at 0x254542fe0b0>

In [345]:
#USING AN ALTERNATE ALGORITHM
from stable_baselines3 import DQN


In [346]:
# Same task with a different algorithm: a DQN agent with the default MLP
# policy, logging to the same TensorBoard directory.
model = DQN(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
)


Using cpu device


In [347]:
# Train the DQN agent for 20,000 environment steps (no callback attached).
model.learn(total_timesteps=20000)


Logging to train\logs\DQN_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.939    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 7185     |
|    time_elapsed     | 0        |
|    total_timesteps  | 129      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.907    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7509     |
|    time_elapsed     | 0        |
|    total_timesteps  | 195      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.857    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 6995     |
|    time_elapsed     | 0        |
|    total_timesteps  | 300      |
----------------------------------
---------------------------

<stable_baselines3.dqn.dqn.DQN at 0x254542fe4d0>

In [348]:
# NOTE(review): this only displays the `load` method object; nothing is loaded.
# Presumably leftover exploration — an actual reload would need a call with a
# saved-model path.
DQN.load

<bound method BaseAlgorithm.load of <class 'stable_baselines3.dqn.dqn.DQN'>>