# 1. Import Dependencies

In [None]:
# Install SWIG first (needed to build the Box2D dependency installed below): https://sourceforge.net/projects/swig/files/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip/download?use_mirror=ixpeering

In [None]:
# !pip install gym[box2d] pyglet==1.3.2

In [None]:
import gym 
from stable_baselines3 import PPO, TD3
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement
import os

# 2. Test Environment

In [None]:
# Gym id of the environment used throughout this notebook.
environment_name = 'CarRacing-v0'

In [None]:
# Instantiate the Gym environment registered under `environment_name`.
env = gym.make(environment_name)

In [None]:
# Smoke-test the environment: play `episodes` episodes with randomly sampled
# actions, rendering every frame, and print each episode's total reward.
episodes = 2
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    # At least one step is always taken; the loop ends as soon as the
    # environment reports the episode as finished.
    while True:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
# Close the environment (repeats the close() at the end of the previous cell,
# useful if that cell was interrupted before reaching it).
env.close()

In [None]:
# Draw one random action to inspect what the action space produces.
env.action_space.sample()

In [None]:
# Draw one random observation to inspect what the observation space produces.
env.observation_space.sample()

# 3. Train Model

### 3.1 Adding a callback to the training stage

In [None]:
# Directory handed to EvalCallback as `best_model_save_path`.
save_path = os.path.join("Training", "Saved Models")

In [None]:
# Early stopping: once at least 20 evaluations have run, stop training when
# 10 consecutive evaluations fail to produce a new best mean reward.
stop_callback = StopTrainingOnNoModelImprovement(
    max_no_improvement_evals=10, min_evals=20, verbose=1)

# Evaluate the agent every 10000 steps and keep the best-scoring model under
# `save_path`.
# The stop callback has to be attached via `callback_after_eval` so that it
# runs after *every* evaluation. Attached via `callback_on_new_best` it would
# only run right after an improvement, so its no-improvement counter could
# never increase and training would never be stopped early.
# NOTE(review): the training env is reused for evaluation here; consider
# passing a separate evaluation environment instead.
eval_callback = EvalCallback(env,
                             callback_after_eval=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)


In [None]:
# Directory passed to the model as `tensorboard_log`.
log_path = os.path.join("Training", "Logs")

In [None]:
# Build a PPO agent with a CNN policy on `env`, logging to `log_path` for
# TensorBoard.
model = PPO(
    "CnnPolicy",
    env,
    verbose=1,
    tensorboard_log=log_path,
)

In [None]:
# Train for 350,000 timesteps, running `eval_callback` during training.
model.learn(
    total_timesteps=350_000,
    callback=eval_callback,
)

# 4. Save Model 

In [None]:
# Destination for the model as it stands when training ends (the final
# weights, not the best checkpoint).
# NOTE(review): the file name says "500k" but the learn() call in this
# notebook uses total_timesteps=350000 - confirm which one is intended.
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_CNNPolicy_Driving_model_500k') # saves the last, not the best

In [None]:
# Write the current in-memory model to `ppo_path`.
model.save(ppo_path)

# 5. Evaluate and Test

In [None]:
# Run 20 rendered evaluation episodes and print what evaluate_policy returns.
evaluation_result = evaluate_policy(
    model,
    env,
    n_eval_episodes=20,
    render=True,
)
print(evaluation_result)

In [None]:
# Close the environment after the rendered evaluation.
env.close()

# 6. Viewing Logs in TensorBoard

In [None]:
# training_log_path = os.path.join(log_path, 'PPO_4')

In [None]:
# !tensorboard --logdir={training_log_path} --port=6001

In [None]:
# Watch the trained agent drive: replay episodes with the deterministic policy
# until the cell is interrupted, printing each episode's total reward.
episode = 0
try:
    while True:
        obs = env.reset()
        score = 0
        for _ in range(1000):  # hard cap on steps per episode
            action, _states = model.predict(obs.copy(), deterministic=True)
            obs, rewards, dones, info = env.step(action)
            score += rewards
            env.render()
            if dones:
                # The episode has ended; a finished environment must be
                # reset() before it is stepped again.
                break
        episode += 1
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Release the render window even when the loop is stopped by an interrupt.
    env.close()

In [None]:
# Close the environment once the playback loop above has been stopped.
env.close()

# 7. Loading the best model after training

In [None]:
# del model # delete current model in memory (because it is the last one, not the best one)
# model

In [None]:
# Replace the in-memory model with the best checkpoint and attach it to `env`.
# NOTE(review): this path contains a 'the_best_ppo_cnn' folder that no other
# cell in this notebook creates - presumably the checkpoint was moved there by
# hand; confirm it exists before running.
ppo_path = os.path.join(
    'Training', 'Saved Models', 'the_best_ppo_cnn', 'best_model',
)
model = PPO.load(ppo_path, env=env)