Test skripta koja prikazuje rezultate rada ručne implementacije u poređenju sa off-the-shelf implementacijom DQN algoritma u modelu autoputa.

In [1]:
import gymnasium as gym
import torch
import matplotlib.pyplot as plt
import numpy as np
from stable_baselines3 import PPO

# Create the 'highway-fast-v0' environment with render_mode='rgb_array'.
# This single instance is shared by the evaluation cells further down.
env = gym.make('highway-fast-v0', render_mode='rgb_array')
# Reset once up front and keep the first observation/info pair.
obs, info = env.reset()

In [2]:
def plot_graph(x_axis, x_label, y_label, file_name, title, fig_no):
    """Draw one line plot on a fresh 12x5 inch figure.

    Despite its name, ``x_axis`` is the data series itself: it is handed to
    the plot call as the only data argument.

    Args:
        x_axis: Sequence of values to plot.
        x_label: Text for the horizontal axis label.
        y_label: Text for the vertical axis label.
        file_name: Accepted for interface compatibility; not used here.
        title: Text for the plot title.
        fig_no: Accepted for interface compatibility; not used here.
    """
    _, axes = plt.subplots(figsize=(12, 5))
    axes.plot(x_axis)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)

In [3]:
def moving_average(values, window):
    """Return the simple moving average of ``values`` over full windows only.

    Each output element is the mean of ``window`` consecutive inputs, so the
    result has ``len(values) - window + 1`` elements.

    Args:
        values: One-dimensional sequence of numbers.
        window: Number of consecutive samples per average; must be >= 1.

    Returns:
        A NumPy array of averages. It is empty when ``values`` has fewer than
        ``window`` elements, because no full window exists in that case.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    samples = np.asarray(values)
    # np.convolve(..., 'valid') swaps its operands when the kernel is longer
    # than the signal, which would return numbers that are not window means.
    # It also rejects empty input. Handle both cases explicitly.
    if samples.size < window:
        return np.empty(0, dtype=float)

    weights = np.repeat(1.0, window) / window
    return np.convolve(samples, weights, 'valid')

In [None]:
# Evaluate the hand-written agent loaded from disk on the shared `env`.

# Reward settings for this evaluation. They are written through
# `env.unwrapped` so the update reaches the underlying environment object
# directly instead of relying on the wrappers returned by `gym.make` to
# forward attribute access.
env.unwrapped.config.update({
    'right_lane_reward': 0.76,
    'lane_change_reward': 0.15,
    'collision_reward': -0.1,
    'reward_speed_range': [20, 30],
    'normalize_reward': False,
})

# NOTE(review): torch.load unpickles the saved object, which can execute
# arbitrary code - only load checkpoints that come from a trusted source.
model = torch.load("out/stable/model2_debugged/model.pt")

rewards_avg = []  # mean reward per step, one entry per episode
rewards = []      # total reward, one entry per episode
eps = []          # 1-based episode numbers
for ep_cnt_ in range(1, 501):
    done = truncated = False
    obs, info = env.reset()
    # Count only the steps actually taken so the per-step mean is not
    # divided by steps + 1.
    cnt = 0
    reward_ = 0
    eps.append(ep_cnt_)
    while not (done or truncated):
        action = model.action_to_take(obs, env)
        obs, reward, done, truncated, info = env.step(action)
        reward_ += reward
        cnt += 1
    rewards.append(reward_)
    # The loop body runs at least once per episode, so cnt >= 1 here.
    rewards_avg.append(reward_ / cnt)

plot_graph(moving_average(rewards, 50), "eps", "rewards", "model test", "Test", 0)
plot_graph(moving_average(rewards_avg, 50), "eps", "rewards", "model test", "Test", 1)

In [4]:
from stable_baselines3 import DQN

# Environment instance handed to the DQN constructor below.
env_2 = gym.make('highway-fast-v0', render_mode='rgb_array')

# Hyperparameters of the Stable-Baselines3 DQN baseline, gathered in one place.
dqn_settings = {
    'policy_kwargs': {'net_arch': [256, 256]},
    'learning_rate': 5e-4,
    'buffer_size': 15000,
    'learning_starts': 3000,
    'batch_size': 128,
    'gamma': 0.99,
    'train_freq': 1,
    'gradient_steps': 1,
    'target_update_interval': 50,
    'verbose': 1,
    'tensorboard_log': "highway_dqn/",
    'exploration_initial_eps': 0.9,
    'exploration_final_eps': 0.65,
}
model = DQN('MlpPolicy', env_2, **dqn_settings)

# Train for 10000 timesteps, then write the model to disk.
model.learn(10000, progress_bar=True)
model.save("highway_dqn/model")

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to highway_dqn/DQN_8


Output()

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 15       |
|    ep_rew_mean      | 11.1     |
|    exploration_rate | 0.782    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 26       |
|    time_elapsed     | 2        |
|    total_timesteps  | 60       |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 14.2     |
|    ep_rew_mean      | 10.5     |
|    exploration_rate | 0.766    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 27       |
|    time_elapsed     | 4        |
|    total_timesteps  | 114      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 13.9     |
|    ep_rew_mean      | 10.2     |
|    exploration_rate | 0.75     |
| time/               |          |
|    episodes         | 12       |
|    fps              | 28       |
|    time_elapsed     | 5        |
|    total_timesteps  | 167      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 11.7     |
|    ep_rew_mean      | 8.5      |
|    exploration_rate | 0.744    |
| time/               |          |
|    episodes         | 16       |
|    fps              | 28       |
|    time_elapsed     | 6        |
|    total_timesteps  | 187      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.9     |
|    ep_rew_mean      | 7.98     |
|    exploration_rate | 0.734    |
| time/               |          |
|    episodes         | 20       |
|    fps              | 27       |
|    time_elapsed     | 7        |
|    total_timesteps  | 219      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.8     |
|    ep_rew_mean      | 7.99     |
|    exploration_rate | 0.722    |
| time/               |          |
|    episodes         | 24       |
|    fps              | 27       |
|    time_elapsed     | 9        |
|    total_timesteps  | 260      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.9     |
|    ep_rew_mean      | 8.06     |
|    exploration_rate | 0.709    |
| time/               |          |
|    episodes         | 28       |
|    fps              | 28       |
|    time_elapsed     | 10       |
|    total_timesteps  | 304      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.2     |
|    ep_rew_mean      | 7.64     |
|    exploration_rate | 0.702    |
| time/               |          |
|    episodes         | 32       |
|    fps              | 28       |
|    time_elapsed     | 11       |
|    total_timesteps  | 327      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.1     |
|    ep_rew_mean      | 7.47     |
|    exploration_rate | 0.691    |
| time/               |          |
|    episodes         | 36       |
|    fps              | 28       |
|    time_elapsed     | 12       |
|    total_timesteps  | 362      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.2     |
|    ep_rew_mean      | 7.59     |
|    exploration_rate | 0.678    |
| time/               |          |
|    episodes         | 40       |
|    fps              | 27       |
|    time_elapsed     | 14       |
|    total_timesteps  | 408      |
----------------------------------


----------------------------------
| rollout/            |          |
|    ep_len_mean      | 10.7     |
|    ep_rew_mean      | 7.92     |
|    exploration_rate | 0.659    |
| time/               |          |
|    episodes         | 44       |
|    fps              | 28       |
|    time_elapsed     | 16       |
|    total_timesteps  | 471      |
----------------------------------


In [61]:
# Evaluate the saved off-the-shelf DQN agent and plot a moving average of
# its per-episode total reward.

# `load` is a classmethod that returns a new model. Calling it on the existing
# instance and dropping the result leaves `model` untouched, so rebind it.
model = DQN.load("highway_dqn/model")

rewards_off_the_shelf = []
for i in range(500):
    done = truncated = False
    obs, info = env.reset()
    rewards_in_scope = 0
    while not (done or truncated):
        action, _states = model.predict(obs, deterministic=True)
        # Keep the new observation so the next prediction is based on the
        # current state rather than the one returned by reset(). The step
        # reward is bound to `reward` so the `rewards` list built by the
        # earlier evaluation cell is not overwritten.
        obs, reward, done, truncated, info = env.step(action)
        rewards_in_scope += reward
        env.render()
    print(i)
    rewards_off_the_shelf.append(rewards_in_scope)

plot_graph(moving_average(rewards_off_the_shelf, 50), "Episodes", "Moving average", None, "Off the shelf rewards", None)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53


In [None]:
# Roll out the saved PPO policy for 1000 episodes, rendering after every step.
model = PPO.load("highway_ppo/model")
for i in range(1000):
    obs, info = env.reset()
    while True:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env.step(action)
        env.render()
        # Stop the episode as soon as the environment reports either flag.
        if done or truncated:
            break