In [1]:
import itertools
import os
import random
from collections import deque

import numpy as np
import torch
from torch import nn
from torch.utils.tensorboard import SummaryWriter

from car import CarEnv,WIN

pygame 2.1.0 (SDL 2.0.16, Python 3.9.16)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# --- Q-learning ------------------------------------------------------------
GAMMA = 0.99                # discount factor applied to the bootstrapped target
LEARNING_RATE = 0.001       # step size handed to the Adam optimizer

# --- Experience replay -----------------------------------------------------
BATCH_SIZE = 64             # transitions sampled per gradient step
BUFFER_SIZE = 200_000       # maximum number of stored transitions
MIN_REPLAY_SIZE = 1_000     # random-policy steps collected before training

# --- Epsilon-greedy exploration ---------------------------------------------
EPSILON_START = 1.0         # exploration probability at step 0
EPSILON_END = 0.1           # exploration probability once the decay is over
EPSILON_DECAY = 10_000      # number of steps over which epsilon is interpolated

# --- Target network sync / checkpointing -------------------------------------
TARGET_UPDATE_FREQ = 1000                   # steps between online -> target copies
TARGET_SAVE_FREQ = 25 * TARGET_UPDATE_FREQ  # steps between checkpoints on disk

# --- Output locations --------------------------------------------------------
MODELS_DIR = '../saved_models'
LOG_DIR = '../logs/car_6_2'

In [3]:
# TensorBoard writer; event files for this run are written under LOG_DIR.
summary_writer = SummaryWriter(log_dir=LOG_DIR)

In [4]:
class Network(nn.Module):
    """Fully connected Q-network: one observation in, one Q-value per action out.

    Args:
        env: Object exposing ``get_observation_space_size()`` and
            ``get_action_space_size()``. Their return values set the width of
            the input layer and of the output layer respectively.
    """

    def __init__(self,env):
        super().__init__()
        # Two hidden layers (50 and 100 units) with ReLU activations; the last
        # layer is linear so the outputs are unbounded Q-value estimates.
        self.net = nn.Sequential(
            nn.Linear(env.get_observation_space_size(), 50),
            nn.ReLU(),
            nn.Linear(50, 100),
            nn.ReLU(),
            nn.Linear(100, env.get_action_space_size()),
        )

    def forward(self, x):
        """Return the network output for ``x`` (last dimension = observation size)."""
        return self.net(x)

    def act(self,obs):
        """Return the greedy action for a single observation.

        Args:
            obs: One observation; anything ``torch.as_tensor`` can convert to
                a float32 tensor (list, NumPy array, tensor, ...).

        Returns:
            int: Index of the action with the largest predicted Q-value.
        """
        obs_t = torch.as_tensor(obs, dtype=torch.float32)
        # Action selection is pure inference: disable autograd so no graph is
        # built (and immediately discarded) on every environment step.
        with torch.no_grad():
            # unsqueeze(0) adds the batch dimension expected by the layers.
            q_values = self(obs_t.unsqueeze(0))
        max_q_index = torch.argmax(q_values, dim=1)[0]
        return max_q_index.item()

In [5]:
# Environment the agent interacts with.
env = CarEnv()

# Experience replay memory: once BUFFER_SIZE transitions are stored, appending
# a new one evicts the oldest.
replay_buffer = deque(maxlen=BUFFER_SIZE)

# Rolling windows limited to the 100 most recent entries.
reward_buffer = deque(maxlen=100)
score_buffer = deque(maxlen=100)

# Sum of the rewards collected so far in the episode being played.
episode_reward = 0.0

In [6]:
# Two networks with the same architecture: the online one is trained, the
# target one starts as an exact copy of its weights.
online_net, target_net = Network(env), Network(env)
target_net.load_state_dict(online_net.state_dict())

# Only the online network's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(params=online_net.parameters(), lr=LEARNING_RATE)

In [7]:
# Initialize the replay buffer: play MIN_REPLAY_SIZE steps with actions drawn
# from env.sample_from_action_space() and store every transition.
obs = env.reset()

for _ in range(MIN_REPLAY_SIZE):
    action = env.sample_from_action_space()
    new_obs, rew, done = env.step(action)

    # Stored layout: (obs, action, reward, done, next_obs).
    replay_buffer.append((obs, action, rew, done, new_obs))

    # Continue from the next observation, or from a fresh episode if this one ended.
    obs = env.reset() if done else new_obs

In [8]:
# training loop
#
# DQN update, repeated until the kernel is interrupted:
#   1. act epsilon-greedily and store the transition in the replay buffer;
#   2. sample a minibatch and regress the online network's Q(s, a) towards
#      r + GAMMA * (1 - done) * max_a' Q_target(s', a');
#   3. periodically sync the target network, checkpoint it and log progress.

LOG_FREQ = 1000  # steps between console / TensorBoard reports

# Make sure the checkpoint directory exists so torch.save cannot fail on a
# fresh checkout.
os.makedirs(MODELS_DIR, exist_ok=True)

obs = env.reset()
# The environment was just reset, so the running return must restart as well;
# otherwise re-running this cell after an interrupt would carry over the
# partial return of the aborted episode.
episode_reward = 0.0

try:
    for step in itertools.count():
        # Linear interpolation from EPSILON_START to EPSILON_END over the first
        # EPSILON_DECAY steps; np.interp holds EPSILON_END afterwards.
        epsilon = np.interp(step,[0,EPSILON_DECAY],[EPSILON_START,EPSILON_END])

        if random.random() < epsilon:
            action = env.sample_from_action_space()
        else:
            action = online_net.act(obs)

        new_obs, rew, done = env.step(action)

        # Stored layout: (obs, action, reward, done, next_obs).
        replay_buffer.append((obs,action,rew,done,new_obs))
        obs = new_obs

        episode_reward += rew

        if done:
            obs = env.reset()
            reward_buffer.append(episode_reward)
            episode_reward = 0.0

        # start gradient step
        transitions = random.sample(replay_buffer,BATCH_SIZE)

        # Transpose the list of transitions into one array per field.
        obses, actions, rewards, dones, new_obses = map(np.asarray, zip(*transitions))

        obses_t = torch.as_tensor(obses, dtype=torch.float32)
        # Trailing unit dimension so these line up with the (batch, 1) tensors
        # produced by gather() / max(keepdim=True) below.
        actions_t = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(-1)
        rewards_t = torch.as_tensor(rewards, dtype=torch.float32).unsqueeze(-1)
        dones_t = torch.as_tensor(dones, dtype=torch.float32).unsqueeze(-1)
        new_obses_t = torch.as_tensor(new_obses, dtype=torch.float32)

        # compute targets
        # Targets are constants for this update: without no_grad, backward()
        # would also traverse the target network and accumulate gradients that
        # no optimizer ever uses.
        with torch.no_grad():
            target_q_values = target_net(new_obses_t)
            max_target_q_values = target_q_values.max(dim=1,keepdim=True)[0]
            # (1 - done) removes the bootstrap term for terminal transitions.
            targets = rewards_t + GAMMA * (1-dones_t) * max_target_q_values

        # loss
        q_values = online_net(obses_t)
        # Keep only the Q-value of the action that was actually taken.
        action_q_values = torch.gather(input = q_values, dim=1, index = actions_t)
        loss = nn.functional.smooth_l1_loss(action_q_values,targets)

        # gradient step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # update target network if needed
        if step % TARGET_UPDATE_FREQ == 0:
            target_net.load_state_dict(online_net.state_dict())

        # checkpointing
        if step % TARGET_SAVE_FREQ == 0:
            print("Saving target net")
            torch.save(target_net.state_dict(), os.path.join(MODELS_DIR, "car_target_net.pth"))

        # Logging
        if step % LOG_FREQ == 0:
            if reward_buffer:
                rew_mean = np.mean(reward_buffer)
                summary_writer.add_scalar('avg_rew', rew_mean, global_step=step)
            else:
                # No episode has finished yet: report NaN on the console but do
                # not take the mean of an empty buffer (RuntimeWarning) or write
                # a NaN point to TensorBoard.
                rew_mean = float('nan')
            print()
            print('Step', step)
            print('Avg Rew',rew_mean)
finally:
    # The loop only ends through an interrupt or an error; push pending
    # TensorBoard events to disk before the exception propagates.
    summary_writer.flush()


    

Saving target net

Step 0
Avg Rew nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



Step 1000
Avg Rew 0.5263157894736842

Step 2000
Avg Rew 0.4358974358974359

Step 3000
Avg Rew 0.6

Step 4000
Avg Rew 0.7464788732394366

Step 5000
Avg Rew 0.872093023255814

Step 6000
Avg Rew 0.9897959183673469

Step 7000
Avg Rew 1.13

Step 8000
Avg Rew 1.32

Step 9000
Avg Rew 1.59

Step 10000
Avg Rew 1.8

Step 11000
Avg Rew 1.93

Step 12000
Avg Rew 2.17

Step 13000
Avg Rew 2.37

Step 14000
Avg Rew 2.34

Step 15000
Avg Rew 2.82

Step 16000
Avg Rew 2.99

Step 17000
Avg Rew 3.41

Step 18000
Avg Rew 3.68

Step 19000
Avg Rew 3.99

Step 20000
Avg Rew 4.14

Step 21000
Avg Rew 4.14

Step 22000
Avg Rew 4.14

Step 23000
Avg Rew 4.14

Step 24000
Avg Rew 5.39
Saving target net

Step 25000
Avg Rew 5.63

Step 26000
Avg Rew 5.74

Step 27000
Avg Rew 6.19

Step 28000
Avg Rew 6.39

Step 29000
Avg Rew 6.69

Step 30000
Avg Rew 6.9

Step 31000
Avg Rew 7.17

Step 32000
Avg Rew 7.23

Step 33000
Avg Rew 7.51

Step 34000
Avg Rew 7.67

Step 35000
Avg Rew 7.82

Step 36000
Avg Rew 8.19

Step 37000
Avg Rew 8.32


KeyboardInterrupt: 