## Learning Notes of Renee. Article:
https://colab.research.google.com/drive/1m5Ppsrv6B5maUJ-vMgbZtMeSxqFfUVSP?usp=sharing#scrollTo=9xd4fB8ZLcRT
https://reneelin2019.medium.com/use-stable-baselines3-to-solve-mountain-car-continuous-in-gym-3216912cd5e3

In [34]:
# pip install stable-baselines3[extra]

In [35]:
import gym

from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

In [36]:
import os
import tqdm
import time

In [37]:
# Per-run output folders, intended for TensorBoard logs (logdir) and saved
# models (models_dir).
# The timestamp is taken once so both folders share the same run suffix;
# calling time.time() separately for each path gives them different names,
# which makes it hard to match a run's logs to its saved models.
run_stamp = time.time()
models_dir = f"models/Mountain-{run_stamp}"
logdir = f"logs/Mountain-{run_stamp}"

# exist_ok=True replaces the check-then-create pair and tolerates a folder
# that already exists.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(logdir, exist_ok=True)

In [38]:
# Seed handed to the environment factory.
seed = 42

# Mean-evaluation-reward cut-offs the training loop compares against when
# deciding whether to save the "pi2" (lower) or "pi1" (higher) checkpoint.
reward_threshold_pi2 = -500
reward_threshold_pi1 = -190

In [39]:
# Training environment: a vectorised wrapper holding a single Pendulum-v1 copy
# (n_envs=1, so nothing actually runs in parallel here).
# Alternative id noted by the author: MountainCarContinuous-v0.
# Optional extras the author experimented with:
#   env_kwargs={"render_mode": "human"}, max_episode_steps=2000
env = make_vec_env("Pendulum-v1", n_envs=1, seed=seed)

# The learning agent and its hyperparameters.
# Untuned knobs left at library defaults: batch_size (author tried 64).
model = PPO(
    policy="MlpPolicy",
    env=env,
    n_steps=1024,
    gae_lambda=0.95,
    gamma=0.9,
    n_epochs=10,
    ent_coef=0.0,
    learning_rate=1.0e-3,
    clip_range=0.2,
    use_sde=True,
    sde_sample_freq=4,
    # Seed the model as well as the env; seeding only the env leaves network
    # initialisation and action sampling unseeded.
    seed=seed,
    # Without tensorboard_log the tb_log_name passed to learn() has no effect
    # and the logdir folder stays empty.
    tensorboard_log=logdir,
)
    

# model = PPO(
#     policy=MlpPolicy,
#     env=env,
#     seed=0,
#     batch_size=256,
#     ent_coef= 0.00429,
#     learning_rate=7.77e-05,
#     n_epochs=10,
#     n_steps=8,
#     gae_lambda=0.9,
#     gamma=0.9999,
#     clip_range=0.1,
#     max_grad_norm =5,
#     vf_coef= 0.19,
#     use_sde=True,
#     policy_kwargs=dict(log_std_init=-3.29, ortho_init=False),
#     verbose=0,
#     tensorboard_log=logdir
#     )


In [40]:
# Train in fixed-size chunks; after every chunk score the policy and save it
# as "pi2" or "pi1" when the mean reward falls in the corresponding band.
# NOTE(review): evaluation reuses the training `env`; SB3 guidance is to
# evaluate on a separate environment instance - confirm whether that matters here.
TIMESTEPS = 100000
for i in range(25):
    # reset_num_timesteps=False asks learn() to continue the running timestep
    # count across chunks instead of restarting it (see SB3 `learn` docs).
    model.learn(
        total_timesteps=TIMESTEPS,
        reset_num_timesteps=False,
        tb_log_name="PPO",
    )

    # Score the current policy over 25 deterministic episodes.
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=25, deterministic=True)
    print(f" at update {i}: Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

    # "pi2": strictly between reward_threshold_pi2 and -450.
    if reward_threshold_pi2 < mean_reward < -450:
        print(f"Saving model pi2 at {mean_reward}")
        model.save("pi2")
    # "pi1": anything above reward_threshold_pi1.
    elif mean_reward > reward_threshold_pi1:
        print(f"Saving model pi1 at {mean_reward}")
        model.save("pi1")



KeyboardInterrupt: 

In [None]:
# Re-score the current model over 25 deterministic episodes and report it.
# `i` is whatever value the training loop's counter was left at.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=25, deterministic=True)
print(f" at update {i}: Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

 at update 24: Mean reward: -172.15 +/- 103.49


In [None]:
# model.save("final_model")
# mean_reward, std_reward = evaluate_policy(
#     model,
#     env,
#     n_eval_episodes=100,
#     deterministic=True,
# )
# print(f"Final mean reward: {mean_reward} +/- {std_reward}")

In [None]:
# # Check model performance
# # load the best model you observed in TensorBoard - the one that reaches the goal / obtains the highest return
# models_dir = "models/Mountain-1653282767.3143597"
# model_path = f"{models_dir}/80000"
# best_model = PPO.load(model_path, env=env)

# obs = env.reset()
# while True:
#     action, _states = best_model.predict(obs)
#     obs, rewards, dones, info = env.step(action)
#     # env.render()  use a Python IDE to check; I haven't figured out how to render in a notebook

In [None]:
# env.close()
# model = None     # drop references to free logger file handles

#in terminal?
# python train.py && rm -rf logs/ models/
