# 1. Import Dependencies

In [1]:
# Prerequisite: install SWIG (Windows build, swigwin 4.0.2) before installing gym[box2d]: https://sourceforge.net/projects/swig/files/swigwin/swigwin-4.0.2/swigwin-4.0.2.zip/download?use_mirror=ixpeering

In [2]:
# !pip install gym[box2d] pyglet==1.3.2

In [2]:
import gym 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnNoModelImprovement
import os

# 2. Test Environment

In [3]:
# Gym environment ID; passed to gym.make() when the environment is created.
environment_name = "CarRacing-v0"

In [4]:
# Create the environment. This single `env` object is reused by the random
# rollout, the training, and the evaluation cells of this notebook.
env = gym.make(environment_name)

In [6]:
# Sanity check: drive the car with uniformly random actions for a couple of
# episodes and report the accumulated reward of each one.
episodes = 2
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0

    # Step until the environment reports the end of the episode.
    while True:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print('Episode:{} Score:{}'.format(episode, score))

# Release the render window once all episodes have been played.
env.close()

Track generation: 1094..1380 -> 286-tiles track




Episode:1 Score:-26.315789473684553
Track generation: 1083..1358 -> 275-tiles track
Episode:2 Score:-30.65693430656975


In [7]:
# Close the environment (and any window opened by env.render()).
# NOTE(review): the rollout cell already ends with env.close(); presumably this
# is a manual fallback for when that cell was interrupted — confirm.
env.close()

In [8]:
# Draw one random action to inspect the action format
# (the recorded output is a float32 array with three components).
env.action_space.sample()

array([0.36200923, 0.01603416, 0.331214  ], dtype=float32)

In [9]:
# Draw one random observation to inspect the observation format
# (the recorded output is a nested integer array; it is truncated in the display).
env.observation_space.sample()

array([[[250, 243, 244],
        [241, 196, 179],
        [ 73, 247, 118],
        ...,
        [ 41, 177, 122],
        [222, 157,  41],
        [  4, 235, 116]],

       [[ 52, 179,  68],
        [  4, 155, 131],
        [ 22, 166, 200],
        ...,
        [211, 156, 125],
        [151, 234,  79],
        [ 84,  64, 112]],

       [[105,  32, 213],
        [ 67, 100, 103],
        [112,  69, 141],
        ...,
        [189,  92,  13],
        [ 59, 131, 223],
        [162, 166, 148]],

       ...,

       [[207, 125, 122],
        [  9, 152,  73],
        [ 39,  22, 189],
        ...,
        [ 21,  80, 210],
        [164,  53, 164],
        [ 67,  99, 214]],

       [[ 32, 243,  84],
        [254, 243,   3],
        [218, 147,  78],
        ...,
        [199,  60,  13],
        [250, 185, 223],
        [246,  32, 165]],

       [[140,  98,  85],
        [107,  24,  29],
        [131,  53, 158],
        ...,
        [164,  69,  36],
        [ 72, 221,  47],
        [192, 162,  85]]

# 3. Train Model

### 3.1 Adding a callback to the training stage

In [10]:
# Directory for saved models; given to EvalCallback as best_model_save_path.
save_path = os.path.join('Training', 'Saved Models')

In [11]:
# Early stopping: end training once the evaluation reward has failed to improve
# for more than `max_no_improvement_evals` consecutive evaluations. The check
# only starts after the first `min_evals` evaluations.
stop_callback = StopTrainingOnNoModelImprovement(
    max_no_improvement_evals=10, min_evals=20, verbose=1)

# Evaluate the agent every `eval_freq` steps and store the best model in `save_path`.
#
# The stop callback has to run after *every* evaluation, so it is attached via
# `callback_after_eval`. Attached via `callback_on_new_best` it would only be
# invoked when a new best model was just found; its "no improvement" counter
# could then never advance and training would never be stopped early.
#
# NOTE(review): `env` is the same object that is later handed to PPO for
# training; consider a dedicated evaluation environment so evaluation episodes
# do not reset the training environment mid-rollout — confirm.
eval_callback = EvalCallback(env,
                             callback_after_eval=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)


In [12]:
# Directory for training logs; given to PPO as tensorboard_log.
log_path = os.path.join('Training', 'Logs')

In [13]:
# Build the PPO agent with the CNN policy on the environment, logging
# training metrics for TensorBoard under `log_path`.
model = PPO(
    policy="CnnPolicy",
    env=env,
    verbose=1,
    tensorboard_log=log_path,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [14]:
# Train for 350k environment steps; `eval_callback` runs the periodic
# evaluations configured above during training.
model.learn(
    total_timesteps=350_000,
    callback=eval_callback,
)

Track generation: 1233..1545 -> 312-tiles track
Logging to Training\Logs\PPO_6




Track generation: 1247..1563 -> 316-tiles track
Track generation: 1197..1500 -> 303-tiles track
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -56.9    |
| time/              |          |
|    fps             | 127      |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 2048     |
---------------------------------
Track generation: 1094..1377 -> 283-tiles track
Track generation: 1223..1533 -> 310-tiles track
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1e+03        |
|    ep_rew_mean          | -57          |
| time/                   |              |
|    fps                  | 79           |
|    iterations           | 2            |
|    time_elapsed         | 51           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.00660193



Track generation: 1135..1423 -> 288-tiles track
Track generation: 1068..1339 -> 271-tiles track
Track generation: 1106..1392 -> 286-tiles track
Track generation: 1127..1418 -> 291-tiles track
Track generation: 1104..1384 -> 280-tiles track
Eval num_timesteps=10000, episode_reward=127.27 +/- 249.02
Episode length: 810.00 +/- 187.57
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 810          |
|    mean_reward          | 127          |
| time/                   |              |
|    total_timesteps      | 10000        |
| train/                  |              |
|    approx_kl            | 0.0102175735 |
|    clip_fraction        | 0.121        |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.16        |
|    explained_variance   | 0.15         |
|    learning_rate        | 0.0003       |
|    loss                 | 0.15         |
|    n_updates            | 40           |
|    policy_gradient_l

<stable_baselines3.ppo.ppo.PPO at 0x20615d89610>

# 4. Save Model 

In [15]:
# Destination for the model as it is at the END of training. This is the last
# set of weights, not the best-scoring one; the best model is written
# separately by EvalCallback (best_model_save_path).
ppo_path = os.path.join('Training', 'Saved Models', 'PPO_CnnPolicy_Driving_model_350k')

In [16]:
# Persist the current model to `ppo_path`.
model.save(ppo_path)

# 5. Evaluate and Test

In [35]:
# Run 20 rendered evaluation episodes with the current model and print
# whatever evaluate_policy returns.
evaluation_summary = evaluate_policy(model, env, n_eval_episodes=20, render=True)
print(evaluation_summary)

Track generation: 1124..1409 -> 285-tiles track
Track generation: 1110..1392 -> 282-tiles track
Track generation: 1107..1393 -> 286-tiles track
Track generation: 1220..1529 -> 309-tiles track
Track generation: 1152..1444 -> 292-tiles track
Track generation: 1203..1508 -> 305-tiles track
Track generation: 1203..1508 -> 305-tiles track
Track generation: 1232..1544 -> 312-tiles track
Track generation: 1228..1539 -> 311-tiles track
Track generation: 1188..1489 -> 301-tiles track
Track generation: 951..1193 -> 242-tiles track
Track generation: 975..1223 -> 248-tiles track
Track generation: 1123..1408 -> 285-tiles track
Track generation: 1136..1424 -> 288-tiles track
Track generation: 1047..1318 -> 271-tiles track
Track generation: 1007..1263 -> 256-tiles track
Track generation: 1206..1512 -> 306-tiles track
Track generation: 1197..1500 -> 303-tiles track
Track generation: 1052..1323 -> 271-tiles track
Track generation: 1268..1589 -> 321-tiles track
Track generation: 1047..1313 -> 266-tiles 

In [31]:
# Close the environment (and the window opened by the rendered evaluation).
env.close()

In [10]:
# Watch the trained agent drive.
#
# The number of episodes is bounded so that this cell terminates on its own
# (an endless `while True` loop blocks "Restart & Run All" and makes the
# following env.close() cell unreachable without a manual interrupt).
n_watch_episodes = 5          # raise this to watch the agent for longer
max_steps_per_episode = 1000  # hard cap on the steps played per episode

for episode in range(1, n_watch_episodes + 1):
    obs = env.reset()
    score = 0
    for _ in range(max_steps_per_episode):
        # NOTE(review): the copy presumably protects the environment's own
        # observation buffer from being modified by predict() — confirm.
        action, _states = model.predict(obs.copy())
        obs, rewards, dones, info = env.step(action)
        score += rewards
        env.render()
        if dones:
            # The episode is over: stop here instead of continuing to step an
            # environment that has already reported it is done.
            break
    print('Episode:{} Score:{}'.format(episode, score))

In [8]:
# Close the environment (and the window opened by env.render()).
env.close()

In [5]:
# del model
# model

In [5]:
# Load a previously saved model from the 'the_best_ppo_cnn' folder and bind it
# to the current environment.
ppo_path = os.path.join(
    'Training', 'Saved Models', 'the_best_ppo_cnn', 'best_model'
)
model = PPO.load(ppo_path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
