Test skripta koja prikazuje rezultate rada rucne implementacije u poredjenju sa off-the-shelf implementacijom DQN algoritma u modelu ukljucenja na autoput.

In [None]:
import gymnasium as gym
import torch
import matplotlib.pyplot as plt
import numpy as np
from stable_baselines3 import PPO

# Create the highway-merge environment used by the evaluation cells below.
# Frames are returned as RGB arrays instead of being shown in a window.
# NOTE(review): 'merge-v0' presumably has to be registered by importing
# highway_env first - confirm for the installed package versions.
env = gym.make('merge-v0', render_mode='rgb_array')
# Initial reset; every evaluation episode below resets the environment again.
obs, info = env.reset()

Pomoćna funkcija koja se koristi za prikazivanje trenda nagrada tokom rada agenta.

In [None]:
def plot_graph(x_axis, x_label, y_label, file_name, title, fig_no):
    """Draw one line plot of a series on a new 12x5 inch figure.

    Args:
        x_axis: Series to plot. Despite the name, it is handed to ``plot``
            as the only data argument, so the values end up on the y axis
            and are drawn against their index.
        x_label: Label for the x axis.
        y_label: Label for the y axis.
        file_name: Accepted for interface compatibility; not used here.
        title: Title shown above the plot.
        fig_no: Accepted for interface compatibility; not used here.
    """
    figure = plt.figure(figsize=(12, 5))
    axes = figure.add_subplot()
    axes.plot(x_axis)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.set_title(title)

Dodatna metoda koja se koristi kako bi se preciznije video trend, u slucaju kada se losije vidi trend nagrada sa prosecnim nagradama po epizodi.

In [None]:
def moving_average(values, window):
    """Return the simple moving average of *values* over *window* samples.

    Only full windows are averaged, so the result has
    ``len(values) - window + 1`` elements.

    Args:
        values: One-dimensional sequence of numbers (e.g. episode rewards).
        window: Number of consecutive samples averaged per output point.

    Returns:
        A NumPy array with the averaged values. It is empty when *values*
        holds fewer than *window* samples.

    Raises:
        ValueError: If *window* is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be a positive integer, got {window!r}")

    series = np.asarray(values)
    # With fewer samples than the window, 'valid' convolution would swap the
    # operand roles and return partial sums that are not averages at all.
    if series.size < window:
        return np.empty(0, dtype=float)

    weights = np.full(window, 1.0 / window)
    return np.convolve(series, weights, 'valid')

U ovom koraku se prethodno trenirani model ucitava i pusta u rad na 500 epizoda. Na osnovu tih epizoda se prikupljaju nagrade i prikazuju na graficima (ukupna nagrada po epizodi i prosecna nagrada po koraku). Pre toga se konfigurise okruzenje, gde se dodeljuju specificne vrednosti nagrada dobijene eksperimentalnim putem.

In [None]:
# Reward shaping values found experimentally.
# The configuration is reached through `unwrapped` because gym.make() returns
# the environment inside wrappers, and attribute access through wrappers is
# deprecated in recent Gymnasium releases.
env.unwrapped.config['right_lane_reward'] = 0.76
env.unwrapped.config['lane_change_reward'] = 0.15
env.unwrapped.config['collision_reward'] = -0.1
env.unwrapped.config['reward_speed_range'] = [20, 30]
env.unwrapped.config['normalize_reward'] = False

# NOTE(review): torch.load unpickles a whole Python object here, so only load
# checkpoints from a trusted source. Recent PyTorch versions presumably need
# weights_only=False for a fully pickled agent - confirm for the installed
# version.
model = torch.load("out/stable/model2_debugged/model.pt")

rewards_avg = []  # mean reward per step, one entry per episode
rewards = []      # total reward, one entry per episode
eps = []          # 1-based episode numbers
ep_cnt_ = 0
for i in range(500):
    done = truncated = False
    obs, info = env.reset()
    # Count only the steps actually taken, so the per-step average below is
    # not divided by one step too many.
    cnt = 0
    reward_ = 0
    ep_cnt_ += 1
    eps.append(ep_cnt_)
    while not (done or truncated):
        action = model.action_to_take(obs, env)
        obs, reward, done, truncated, info = env.step(action)
        reward_ += reward
        cnt += 1
    rewards.append(reward_)
    # The loop above always runs at least once, so cnt >= 1 here.
    rewards_avg.append(reward_ / cnt)

# Smooth both series with a 50-episode moving average before plotting.
plot_graph(moving_average(rewards, 50), "eps", "rewards", "model test", "Test", 0)
plot_graph(moving_average(rewards_avg, 50), "eps", "rewards", "model test", "Test", 1)

Dodatna validacija uz pomoc biblioteke stable-baselines3 i njene implementacije DQN algoritma.

In [None]:
from stable_baselines3 import DQN

# Separate environment instance used only for training the baseline agent.
env_2 = gym.make('merge-v0', render_mode='rgb_array')

# Off-the-shelf DQN agent with an MLP policy of two 256-unit hidden layers.
model = DQN(
    policy='MlpPolicy',
    env=env_2,
    policy_kwargs=dict(net_arch=[256, 256]),
    # Optimisation settings.
    learning_rate=5e-4,
    batch_size=128,
    gamma=0.99,
    train_freq=1,
    gradient_steps=1,
    target_update_interval=50,
    # Replay buffer settings.
    buffer_size=15000,
    learning_starts=3000,
    # Exploration rate bounds.
    exploration_initial_eps=0.9,
    exploration_final_eps=0.65,
    # Logging.
    verbose=1,
    tensorboard_log="highway_dqn/",
)

# Train for 10000 timesteps, then store the agent on disk.
model.learn(total_timesteps=10000, progress_bar=True)
model.save(path="highway_dqn/model")

In [None]:
from stable_baselines3 import DQN

# `load` is a classmethod that returns a new model instead of updating the
# instance it is called on, so the result has to be assigned.
model = DQN.load("highway_dqn/model")

# Total reward collected in each evaluation episode.
rewards_off_the_shelf = []
for i in range(500):
    done = truncated = False
    obs, info = env.reset()
    rewards_in_scope = 0
    while not (done or truncated):
        action, _ = model.predict(obs, deterministic=True)
        # Keep the new observation so the next prediction uses the current
        # state, and store the step reward under its own name so the
        # `rewards` list built by the earlier cell is not overwritten.
        obs, reward, done, truncated, _ = env.step(action)
        rewards_in_scope += reward
        # NOTE(review): the returned frame is discarded - confirm whether
        # rendering is still needed during evaluation.
        env.render()
    print(i)
    rewards_off_the_shelf.append(rewards_in_scope)

# Smooth the episode rewards with a 50-episode moving average before plotting.
plot_graph(moving_average(rewards_off_the_shelf, 50), "Episodes", "Moving average", None, "Off the shelf rewards", None)