### 3.2 Navigating with Q-learning

In [1]:
from Gridworld import Gridworld

In [2]:
game = Gridworld(size=4, mode='static')

In [3]:
game.display()

array([['+', '-', ' ', 'P'],
       [' ', 'W', ' ', ' '],
       [' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ']], dtype='<U2')

In [4]:
game.makeMove('d')

In [5]:
game.display()

array([['+', '-', ' ', ' '],
       [' ', 'W', ' ', 'P'],
       [' ', ' ', ' ', ' '],
       [' ', ' ', ' ', ' ']], dtype='<U2')

In [6]:
game.reward()

-1

Each matrix encodes the position of one of the four objects: the player, the goal, the pit, and the wall

In [7]:
game.board.render_np()

array([[[0, 0, 0, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=uint8)

#### 3.2.7 A neural network as the Q function

In [8]:
import numpy as np
import torch
from Gridworld import Gridworld
import random
from matplotlib import pylab as plt

In [9]:
import torch
from torch import nn
from torch import optim

In [10]:
l1 = n_pixels = 64
l2 = hidden_1 = 150
l3 = hidden_2 = 100
l4 = n_actions = 4

In [11]:
model = nn.Sequential(
    nn.Linear(n_pixels, hidden_1),
    nn.ReLU(),
    nn.Linear(hidden_1, hidden_2),
    nn.ReLU(),
    nn.Linear(hidden_2, n_actions)
)

In [12]:
loss_func = nn.MSELoss()
learning_rate = 1e-3

In [13]:
optimizier = optim.Adam(model.parameters(), lr=learning_rate)

In [14]:
gamma = 0.9
epsilon = 1.0

In [15]:
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r'
}

In [16]:
epochs = 100
losses = []

In [17]:
import ipdb

In [None]:
for i in range(epochs):
    game = Gridworld(size=4, mode='static')
    
    noises = np.random.rand(1, 64) / 10
    board_state = game.board.render_np().reshape(1, 64) + noises
    
    board_state = torch.from_numpy(board_state).float()
    
    status = 1
    
    while(status == 1):
        predicted_reward = model(board_state)
        # predicted_reward = predicted_award.data.numpy()
        
        if (random.random() < epsilon):
            # do a random action
            action = np.random.randint(0, 4)
        else:
            # choose action with highest reward
            #action = np.argmax(predicted_reward)
            action = torch.argmax(predicted_reward, dim=-1)
        
        # map the index action to board action
        action_item = action.item() if type(action) == torch.Tensor else action
        board_action = action_set[action_item]
        game.makeMove(board_action)
        
        # noises = np.random.rand(1, 64) / 10
        noises = torch.randn(1, 64) / 10
        new_board_state = game.board.render_np().reshape(1, 64)
        new_board_state = torch.from_numpy(new_board_state).float() + noises
        
        reward = game.reward()
        with torch.no_grad():
            next_predicted_reward = model(new_board_state)
        
        max_next_predicted_reward = torch.max(next_predicted_reward)
        
        if reward == -1:
            y = reward + (gamma * max_next_predicted_reward)
        else:
            y = reward
            
        y = torch.tensor([y]).float().detach()
        x = predicted_reward.squeeze()[action]
        # ipdb.set_trace()
        loss = loss_func(x, y)
        losses.append(loss.item())
        print(i, loss.item())
            
        optimizier.zero_grad()
        loss.backward()
        optimizier.step()
        board_state = new_board_state
            
        if reward != -1:
            status = 0
            
    if epsilon > 0.1:
        epsilon -= (1/epochs)

0 0.11097139120101929
0 0.5617592334747314
0 0.1987077295780182
0 0.001668112468905747
0 0.3621941804885864
0 0.20956547558307648
0 0.04010184109210968
0 0.3634780943393707
0 0.38536956906318665
0 0.4198072850704193
0 0.0014484327984973788
0 0.30854055285453796
0 0.02473548799753189
0 0.19024935364723206
0 0.14784100651741028
0 0.18802711367607117
0 0.4699954390525818
0 0.005103636998683214
0 0.01815406233072281
0 0.3440820276737213
0 0.024973491206765175
0 0.12828406691551208
0 0.04405996948480606
0 0.28007543087005615
0 0.26557859778404236
0 0.5676853656768799
0 0.21459850668907166
0 0.0009886397747322917
0 0.030531344935297966
0 0.4630725085735321
0 0.41960957646369934
0 0.08997751027345657
0 0.6663221716880798
0 1.4103680849075317
0 0.5476836562156677
0 0.021969327703118324
0 0.1684989333152771
0 0.2918976843357086
0 0.018027331680059433
0 0.6009304523468018
0 0.030693864449858665
0 0.12209904938936234
0 0.05629337579011917
0 1.0258772373199463
0 0.24097348749637604
0 0.00565777765

In [29]:
loss_fn = torch.nn.MSELoss()

In [32]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [18]:
epochs = 1000
losses = [] #A
for i in range(epochs): #B
    game = Gridworld(size=4, mode='static') #C
    state_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0 #D
    state =state1 = torch.from_numpy(state_).float() #E
    status = 1 #F
    while(status == 1): #G
        qval = model(state1) #H
        qval_ = qval.data.numpy()
        if (random.random() < epsilon): #I
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        
        action = action_set[action_] #J
        game.makeMove(action) #K
        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/10.0
        state2 = torch.from_numpy(state2_).float() #L
        reward = game.reward()
        with torch.no_grad():
            newQ = model(state2.reshape(1,64))
        maxQ = torch.max(newQ) #M
        if reward == -1: #N
            Y = reward + (gamma * maxQ)
        else:
            Y = reward
        Y = torch.Tensor([Y]).detach()
        X = qval.squeeze()[action_] #O
        ipdb.set_trace()
        loss = loss_fn(X, Y) #P
        print(i, loss.item())
        # clear_output(wait=True)
        optimizer.zero_grad()
        loss.backward()
        losses.append(loss.item())
        optimizer.step()
        state1 = state2
        if reward != -1: #Q
            status = 0
    if epsilon > 0.1: #R
        epsilon -= (1/epochs)
    break

> [0;32m/tmp/ipykernel_162/919283308.py[0m(31)[0;36m<cell line: 3>[0;34m()[0m
[0;32m     30 [0;31m        [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 31 [0;31m        [0mloss[0m [0;34m=[0m [0mloss_fn[0m[0;34m([0m[0mX[0m[0;34m,[0m [0mY[0m[0;34m)[0m [0;31m#P[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     32 [0;31m        [0mprint[0m[0;34m([0m[0mi[0m[0;34m,[0m [0mloss[0m[0;34m.[0m[0mitem[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  p X


tensor(0.1497, grad_fn=<SelectBackward0>)


ipdb>  p Y


tensor([-0.8712])


ipdb>  q


In [17]:
game = Gridworld(size=4, mode='static')
    
noises = np.random.rand(1, 64) / 10
state_ = game.board.render_np().reshape(1, 64) + noises

In [18]:
state_

array([[0.01471271, 0.06032012, 0.07127256, 1.00500285, 0.00792098,
        0.00314291, 0.02458515, 0.07165251, 0.04608435, 0.03568782,
        0.01463065, 0.04912329, 0.02254903, 0.05141171, 0.07320424,
        0.07633822, 1.05049042, 0.02625306, 0.02009018, 0.01187027,
        0.02935323, 0.07509029, 0.09681164, 0.07926751, 0.01381738,
        0.08274171, 0.02428693, 0.07390254, 0.03472809, 0.08194706,
        0.01986409, 0.09043415, 0.0211539 , 1.03356975, 0.02037663,
        0.04863698, 0.00840975, 0.06374379, 0.02673741, 0.05044182,
        0.07577376, 0.03977445, 0.05295397, 0.08837413, 0.01337966,
        0.09536006, 0.01063841, 0.0483612 , 0.02925482, 0.07546094,
        0.03488519, 0.0526915 , 0.02143795, 1.07059728, 0.04020031,
        0.0802544 , 0.0295922 , 0.01433514, 0.04139732, 0.0872749 ,
        0.04445284, 0.03103007, 0.0072175 , 0.07799303]])