In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Build the LunarLander-v2 environment and give it a fixed seed.
env = gym.make('LunarLander-v2')
env.seed(0)
# Report the observation shape and the number of discrete actions; the network
# sizes used further down (8 inputs, 4 outputs) come from these two values.
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
State shape:  (8,)
Number of actions:  4


In [3]:
from dqn_agent import Agent

In [4]:
# Bounded buffer: keeps only the 100 most recently appended items.
# NOTE(review): presumably holds recent episode scores; nothing in the visible cells appends to it.
scores_window = deque(maxlen=100)

In [5]:
# Exploration threshold for the epsilon-greedy choice below. random.random()
# is always < 1.0, so with eps = 1.0 the `r > eps` test never passes and every
# action is picked at random.
eps = 1.0

In [6]:
# Start a new episode and keep the initial observation it returns.
state = env.reset()

In [7]:
# Running total of rewards; the final cell adds each step's reward to it.
score = 0

In [8]:
# Counter initialised to zero.
# NOTE(review): presumably a time-step index; it is not referenced in the visible cells.
t = 0

In [9]:
# One seed for the whole walkthrough. It is passed to every QNetwork
# constructor below, and is also applied here to Python's `random` (used for
# the exploration draw and for replay sampling) and to NumPy's global RNG, so
# that a fresh "Restart & Run All" reproduces the same random choices.
seed = 59
random.seed(seed)
np.random.seed(seed)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    """Fully connected Q-network: maps a state vector to one value per action.

    Layout: state_size -> fc1_units -> fc2_units -> action_size, with a ReLU
    after each of the two hidden layers and a plain linear output layer.
    """

    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        """Build the three linear layers.

        Args:
            state_size (int): Number of features in one state vector.
            action_size (int): Number of actions (width of the output layer).
            seed (int): Value passed to ``torch.manual_seed`` before the layers
                are created, so two networks built with the same arguments
                start from the same weights.
            fc1_units (int): Width of the first hidden layer (default 64).
            fc2_units (int): Width of the second hidden layer (default 64).
        """
        super(QNetwork, self).__init__()
        # Seeding must happen before the nn.Linear constructors run, because
        # they draw the initial weights from torch's global generator.
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        """Return the action values for ``state``.

        Args:
            state (torch.Tensor): Float tensor whose last dimension is
                ``state_size``.

        Returns:
            torch.Tensor: Tensor with last dimension ``action_size``; no
            activation is applied to the output layer.
        """
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [11]:
# Online ("local") network: 8 state inputs and 4 action outputs, matching the
# shapes printed for the environment. This is the network the optimizer trains.
dqn_local = QNetwork(8, 4, seed)

In [12]:
# Target network with the same sizes and seed as dqn_local. QNetwork seeds
# torch before creating its layers, so the two should start with equal weights.
dqn_target = QNetwork(8, 4, seed)

In [13]:
import torch.optim as optim
# parameters() returns a generator, so this print only shows the generator's repr.
print(dqn_local.parameters())
# Adam over the local network's parameters only (learning rate 5e-4); the
# target network is not handed to the optimizer.
optimizer = optim.Adam(dqn_local.parameters(), lr=5e-4)

<generator object Module.parameters at 0x10bb21518>


In [14]:
# Replay memory: a deque capped at 100,000 entries, so once it is full each
# append drops the oldest entry.
memory = deque(maxlen=int(1e5))

In [15]:
# Display the raw observation currently held in `state`.
state

array([-5.91564178e-04,  9.42304904e-01, -5.99357188e-02,  1.12770955e-01,
        6.92289264e-04,  1.35763153e-02,  0.00000000e+00,  0.00000000e+00])

In [16]:
# Turn the observation into a float32 tensor with a leading batch dimension of 1,
# which is the input the network is called with below.
# np.asarray accepts both the raw NumPy observation and an already converted
# tensor, and reshape(1, -1) leaves a (1, n) input unchanged, so re-running
# this cell is harmless (torch.from_numpy(state) alone raises on a tensor).
state = torch.from_numpy(np.asarray(state, dtype=np.float32)).reshape(1, -1)
print(state)

tensor([[-0.0006,  0.9423, -0.0599,  0.1128,  0.0007,  0.0136,  0.0000,
          0.0000]])


In [17]:
# Put the local network in evaluation mode before querying it for action
# values; as the last expression of the cell it also echoes the module summary.
# NOTE(review): no later visible cell calls dqn_local.train(). QNetwork only has
# Linear layers, so the mode should not change its outputs — confirm if
# dropout or batch-norm layers are ever added.
dqn_local.eval()

QNetwork(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=4, bias=True)
)

In [21]:
# Query the local network for the action values of the current state.
# Gradients are not needed for action selection, so autograd is disabled.
with torch.no_grad():
    action_values = dqn_local(state)
    print(action_values)
    # Use torch's own argmax rather than np.argmax on the tensor: the NumPy
    # call depends on NumPy/PyTorch interop and may return a different type
    # depending on the installed versions. With no `dim`, argmax works on the
    # flattened tensor, which for a single-row batch is the greedy action index.
    print(action_values.argmax())

tensor(1.00000e-02 *
       [[-0.6673,  5.7288, -9.8835,  5.0341]])
tensor(1)


In [22]:
# Epsilon-greedy action selection.
# Draw a uniform number in [0, 1): above eps take the greedy action, otherwise
# take a uniformly random one.
r = random.random()
print(r)

if r > eps:
    # .item() turns the one-element argmax result into a plain Python int, so
    # both branches give env.step() the same kind of value. This assumes
    # action_values holds a single row (batch size 1), as produced above.
    action_id = action_values.argmax(dim=1).item()
else:
    # The number of actions is read from the network output instead of being
    # hard-coded, so the cell keeps working if the action space changes.
    action_id = random.randrange(action_values.shape[1])

print(action_id)

0.6558345055531296
0


In [23]:
# Advance the environment one step with the chosen action. The step returns
# the next observation, the reward, a done flag and a fourth value that is
# discarded here.
next_state, reward, done, _ = env.step(action_id)
print(next_state)
print(reward)
print(done)

[-0.00118332  0.94361125 -0.05985444  0.08708715  0.00136317  0.01341959
  0.          0.        ]
2.00584477863886
False


In [24]:
# Bundle one transition as [state, action, reward, next_state, done] and push
# it onto the replay memory.
# NOTE(review): as the printed experience shows, `state` is stored as the
# batched tensor while `next_state` is the raw NumPy observation; the sampling
# cell stacks both with np.vstack, which accepts either.
experience = [state, action_id, reward, next_state, done]
print(experience)
memory.append(experience)
# Printing the whole deque is only practical while it holds a single entry.
print(memory)

[tensor([[-0.0006,  0.9423, -0.0599,  0.1128,  0.0007,  0.0136,  0.0000,
          0.0000]]), 0, 2.00584477863886, array([-0.00118332,  0.94361125, -0.05985444,  0.08708715,  0.00136317,
        0.01341959,  0.        ,  0.        ]), False]
deque([[tensor([[-0.0006,  0.9423, -0.0599,  0.1128,  0.0007,  0.0136,  0.0000,
          0.0000]]), 0, 2.00584477863886, array([-0.00118332,  0.94361125, -0.05985444,  0.08708715,  0.00136317,
        0.01341959,  0.        ,  0.        ]), False]], maxlen=100000)


Once `memory` has grown large enough, I sample a batch from it and learn from that batch. For now, let me pretend that a size of `1` is large enough.

In [35]:
# Draw a minibatch (here a single experience) from the replay memory.
experiences = random.sample(memory, k=1)
print(experiences)

# Every experience is [state, action, reward, next_state, done]. Pull each
# field out by position, stack it into one row per experience and convert it
# to a tensor of the dtype the learning step needs.
states = torch.from_numpy(
    np.vstack([s for s, a, r, s2, d in experiences])
).float()
actions = torch.from_numpy(
    np.vstack([a for s, a, r, s2, d in experiences])
).long()
rewards = torch.from_numpy(
    np.vstack([r for s, a, r, s2, d in experiences])
).float()
next_states = torch.from_numpy(
    np.vstack([s2 for s, a, r, s2, d in experiences])
).float()
# The boolean done flags go through uint8 before becoming a float 0/1 mask.
dones = torch.from_numpy(
    np.vstack([d for s, a, r, s2, d in experiences]).astype(np.uint8)
).float()

print(states)
print(actions)
print(rewards)
print(next_states)
print(dones)

[[tensor([[-0.0006,  0.9423, -0.0599,  0.1128,  0.0007,  0.0136,  0.0000,
          0.0000]]), 0, 2.00584477863886, array([-0.00118332,  0.94361125, -0.05985444,  0.08708715,  0.00136317,
        0.01341959,  0.        ,  0.        ]), False]]
tensor([[-0.0006,  0.9423, -0.0599,  0.1128,  0.0007,  0.0136,  0.0000,
          0.0000]])
tensor([[ 0]])
tensor([[ 2.0058]])
tensor([[-0.0012,  0.9436, -0.0599,  0.0871,  0.0014,  0.0134,  0.0000,
          0.0000]])
tensor([[ 0.]])


In [37]:
# Largest target-network action value for each sampled next state.
# The target network created earlier is reused: rebuilding it here would throw
# away any soft updates already applied to it whenever this cell is re-run,
# and would re-seed torch as a side effect.
# One forward pass under no_grad is enough, since the targets must not carry
# gradients.
with torch.no_grad():
    temp = dqn_target(next_states)
print(temp)
# max(1) returns (values, indices) along the action dimension; [0] keeps the values.
print(temp.max(1))
print(temp.max(1)[0])
# unsqueeze(1) restores a column shape so it lines up with rewards and dones.
Q_targets_next = temp.max(1)[0].unsqueeze(1)
print(Q_targets_next)

tensor(1.00000e-02 *
       [[-0.5892,  5.7058, -9.8002,  4.9709]])
(tensor(1.00000e-02 *
       [ 5.7058]), tensor([ 1]))
tensor(1.00000e-02 *
       [ 5.7058])
tensor(1.00000e-02 *
       [[ 5.7058]])


In [39]:
# One-step TD target: the reward plus 0.99 times the best target-network value
# of the next state. The (1 - dones) factor zeroes the bootstrap term for
# transitions whose done flag is 1.
Q_targets = rewards + (0.99 * Q_targets_next * (1 - dones))
print(Q_targets)

tensor([[ 2.0623]])


In [40]:
# Value the local network currently gives to the action that was actually
# taken: gather(1, actions) picks, in each row, the column indexed by `actions`.
Q_expected = dqn_local(states).gather(1, actions)
print(Q_expected)

tensor(1.00000e-03 *
       [[-6.6728]])


In [41]:
# We want to update the local DQN using the target DQN: the loss is the mean
# squared error between the local network's estimate (Q_expected) and the
# target computed from the target network (Q_targets).
loss = F.mse_loss(Q_expected, Q_targets)
print(loss)

tensor(4.2808)


In [42]:
# Clear any gradients already stored on the local network's parameters so the
# upcoming backward pass does not add to them.
optimizer.zero_grad()

In [43]:
# Backpropagate the loss to fill in the gradients of the local network's parameters.
loss.backward()

In [44]:
# Apply one Adam update to the local network using the gradients just computed.
optimizer.step()

In [88]:
# Soft update of the target network: each target parameter becomes
# tau * local + (1 - tau) * target, so with tau = 1e-3 it moves only slightly
# toward the local network on each run of this cell.
tau = 1e-3

# zip pairs the parameters positionally; both networks are QNetwork instances
# built with the same sizes, so the pairs line up layer by layer.
for target_param, local_param in zip(dqn_target.parameters(), dqn_local.parameters()):
    # Writing through .data copies the blended values in place without autograd tracking.
    target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [89]:
# Move to the next time step: the observation just received becomes the
# current state, and its reward is added to the running score.
# NOTE(review): `score += reward` accumulates, so re-running this cell counts
# the same reward again. `state` is now the raw NumPy observation, not the
# batched tensor used for the forward pass.
state = next_state
score += reward
print(state)
print(score)

[-0.00118332  0.94361125 -0.05985444  0.08708715  0.00136317  0.01341959
  0.          0.        ]
2.00584477863886
