In [1]:
import math
import random
import sys
from itertools import count

import numpy as np
import torch
import torch.nn as nn
from alpyne.client.alpyne_client import AlpyneClient

sys.path.append("../..")  # must run before the local `thesis` imports below
from thesis.q_learning.q_table import QTable
from thesis.dqn.replay_memory import ReplayMemory, Transition

# Prefer a CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed PyTorch's global RNG and Python's `random` module with fixed values.
torch.manual_seed(69)
random.seed(42)

device(type='cpu')

## Set Up Environment

In [2]:
# Launch the exported AnyLogic model through Alpyne.
# The path is a raw string: it contains Windows-style backslashes, and "\A" in a
# normal string literal is an invalid escape sequence (a SyntaxWarning on recent
# Python versions). The runtime value of the path is unchanged.
client = AlpyneClient(
    r"../../models\AGV_SlipCourse Exported\AGV_SlipCourse.zip",
    blocking=True,
    port=51150,
)

# Start from the model's configuration template and override the slip chance.
config = client.configuration_template
config.inSlipChance = 0.3

# Create the reinforcement-learning run, then start it; `run` ends up bound to
# whatever `.run()` returns, exactly as before.
run = client.create_reinforcement_learning(config)
run = run.run()

  warn(f"Unzipping to temporary directory ({tmp_dir})")


In [3]:
# Display the current observation of the freshly started run as the cell output.
run.get_observation()

[outObservation:DOUBLE_ARRAY=[1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0], outReward:DOUBLE=0]

## Q-Network

In [4]:
inputs = 12
n_actions = 4


def build_q_network(n_inputs=inputs, n_outputs=n_actions, hidden=200):
    """Build the fully connected Q-network used for both policy and target nets.

    The network is two hidden ReLU layers followed by a *linear* output layer.
    The output is deliberately not squashed: Q-values are estimates of
    discounted return, and the environment hands out negative rewards, so a
    Sigmoid head (range (0, 1)) could never fit the TD targets.

    Args:
        n_inputs: Size of the observation vector.
        n_outputs: Number of discrete actions (one Q-value per action).
        hidden: Width of both hidden layers.

    Returns:
        An ``nn.Sequential`` mapping ``(batch, n_inputs)`` to ``(batch, n_outputs)``.
    """
    return nn.Sequential(
        nn.Linear(n_inputs, hidden),
        nn.ReLU(),
        nn.Linear(hidden, hidden),
        nn.ReLU(),
        nn.Linear(hidden, n_outputs),
    )


# Layers are created in the same order as before (policy first, then target),
# so the seeded weight initialisation is unchanged; parameter names in the
# state dict ("0.*", "2.*", "4.*") are unchanged as well.
policy_net = build_q_network()
target_net = build_q_network()

## Training

In [5]:
# Start the target network as an exact copy of the policy network's weights.
target_net.load_state_dict(policy_net.state_dict())
# Put the target network in evaluation mode; only policy_net is handed to the optimizer below.
target_net.eval()

# Adam with library-default hyperparameters, updating the policy network's parameters.
optimizer = torch.optim.Adam(policy_net.parameters())
# Replay memory built with argument 10000 — presumably its capacity in transitions; confirm against ReplayMemory.
memory = ReplayMemory(10000)

In [6]:
# Discount factor applied to the next-state value when forming the TD target.
GAMMA = 0.99
# Epsilon-greedy schedule: epsilon decays exponentially from EPS_START towards
# EPS_END, with EPS_DECAY as the time constant measured in action selections.
EPS_START = 1
EPS_END = 0.1
EPS_DECAY = 30000
# The target network is re-synced with the policy network every TARGET_UPDATE episodes.
TARGET_UPDATE = 5
# Number of transitions sampled from replay memory per optimisation step.
BATCH_SIZE = 128
# Step index at which an episode is cut off and its reward replaced by -1.
MAX_STEPS = 1000

In [7]:
# Global count of action selections so far; drives the epsilon decay schedule.
steps_done = 0


def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy policy.

    Epsilon decays exponentially from EPS_START towards EPS_END as the global
    ``steps_done`` counter grows; the counter is incremented on every call and
    the current epsilon is printed every 1000 calls.

    Args:
        state: Observation, either already a ``torch.Tensor`` or something
            ``torch.Tensor([state])`` can wrap into a single-row batch.

    Returns:
        A ``(1, 1)`` long tensor holding the chosen action index.
    """
    global steps_done

    if not isinstance(state, torch.Tensor):
        state = torch.Tensor([state])

    # Draw the exploration sample first so the RNG call order stays fixed.
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    epsilon = EPS_END + (EPS_START - EPS_END) * decay

    if steps_done % 1000 == 0:
        print(epsilon, steps_done)
    steps_done += 1

    # Explore: uniformly random action.
    if draw <= epsilon:
        random_index = random.randrange(n_actions)
        return torch.tensor([[random_index]], device=device, dtype=torch.long)

    # Exploit: max(1) yields (values, indices) per row; element [1] is the
    # index of the action with the highest predicted Q-value.
    with torch.no_grad():
        q_values = policy_net(state)
        best_action = q_values.max(1)[1]
        return best_action.view(1, 1)

In [8]:
def optimize_model():
    """Run one DQN optimisation step on a minibatch drawn from replay memory.

    Does nothing until the memory holds at least BATCH_SIZE transitions.
    Otherwise it samples a batch, computes Q(s_t, a_t) with ``policy_net``,
    builds the TD target ``r + GAMMA * max_a' Q_target(s_{t+1}, a')`` (with the
    bootstrap term set to 0 for terminal transitions), minimises the Huber
    loss between the two, and clips every gradient element to [-1, 1] before
    the optimizer step.

    Returns:
        None. ``policy_net`` and ``optimizer`` are updated in place.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Mask of transitions whose next state exists (a missing next state marks
    # the transition after which the simulation ended).
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a_t): evaluate all actions, then pick the column of the action
    # that was actually taken in each transition.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the "older" target_net; stays 0 for terminal transitions.
    # torch.cat raises on an empty list, so skip the forward pass entirely when
    # every sampled transition is terminal.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_next_states:
        # The target is a constant w.r.t. the policy parameters: no graph needed.
        with torch.no_grad():
            target_q = target_net(torch.cat(non_final_next_states))
            next_state_values[non_final_mask] = target_q.max(1)[0]

    # Expected Q values (TD target)
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss between prediction and target
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Element-wise gradient clipping to [-1, 1]; parameters without a gradient
    # are skipped instead of raising on `None.data`.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [9]:
num_episodes = 2000
# `reward` and `t` are printed at the start of every episode, so they need a
# value before the first episode has produced one.
reward = 0
t = 0
action_tmp = client.action_template
for i_episode in range(num_episodes):
    # Progress line: episode index, followed by the last reward and the last
    # step index of the *previous* episode.
    print(i_episode, reward, t)
    # Initialize the environment and state
    run.reset()
    # torch.tensor (not the legacy torch.Tensor constructor, which rejects
    # non-CPU devices) with an explicit float32 dtype for the network input.
    state = torch.tensor([run.get_observation().outObservation],
                         dtype=torch.float32, device=device)

    for t in count():
        # Select and perform an action
        next_action = select_action(state)
        action_tmp.inAction = int(next_action)
        run.take_action(action_tmp)

        # One optimisation step between issuing the action and waiting for it
        # to finish — presumably so training overlaps with the simulation step.
        optimize_model()
        action = next_action

        run.wait_for_completion()
        done = run.is_terminal()
        next_state, reward = run.get_observation().values()

        # Penalise hitting the step limit. The reward is always stored as
        # float32 so that replay-memory batches never mix int and float tensors.
        if t >= MAX_STEPS:
            reward = -1.0
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        # A terminal transition is stored with next_state=None.
        if done:
            next_state = None
        else:
            next_state = torch.tensor([next_state], dtype=torch.float32, device=device)

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        if done or t >= MAX_STEPS:
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

print('Complete')

0 0 0
1.0 0
1 tensor([-1.]) 100
2 tensor([-1.]) 274
3 tensor([-1.]) 484
4 tensor([-1.]) 30
0.9704944904338053 1000
5 tensor([-1.]) 162
6 tensor([-1.]) 32
7 tensor([-1.]) 146
8 tensor([-1.]) 174
9 tensor([-1.]) 210
10 tensor([-1.]) 71
11 tensor([-1.]) 42
12 tensor([-1.]) 169
0.941956286528456 2000
13 tensor([-1.]) 521
14 tensor([-1.]) 11
0.9143536762323635 3000
15 tensor([-1.]) 572
16 tensor([-1.]) 206
17 tensor([-1.]) 61
18 tensor([-1.]) 479
19 tensor([-1.]) 187
0.8876559871386527 4000
20 tensor([-1.]) 141
21 tensor([-1.]) 158
22 tensor([-1.]) 277
23 tensor([-1.]) 352
24 tensor([-1.]) 13
25 tensor([-1.]) 13
0.8618335524015527 5000
26 tensor([-1.]) 98
27 tensor([-1.]) 28
28 tensor([-1.]) 216
29 tensor([-1.]) 30
30 tensor([-1.]) 101
31 tensor([-1.]) 295
32 tensor([-1.]) 16
33 tensor([-1.]) 276
0.8368576777701836 6000
34 tensor([-1.]) 38
35 tensor([-1.]) 113
36 tensor([-1.]) 50
37 tensor([-1.]) 59
38 tensor([-1.]) 148
39 tensor([-1.]) 121
40 tensor([-1.]) 23
41 tensor([-1.]) 425
0.8127006