# Deep Q-Network (DQN)
---
In this notebook, you will implement a DQN agent with OpenAI Gym's LunarLander-v2 environment.

### 1. Import the Necessary Packages

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from os import path
import sys

repo_path = path.dirname(path.dirname(path.dirname(path.abspath("__file__"))))
sys.path.append(repo_path)

In [3]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline


### 2. Instantiate the Environment and Agent

Initialize the environment in the code cell below.

In [4]:
env = gym.make('LunarLander-v2')
env.seed(0)
print('State shape: ', env.observation_space.shape)
print('Number of actions: ', env.action_space.n)

State shape:  (8,)
Number of actions:  4


Before running the next code cell, familiarize yourself with the code in **Step 2** and **Step 3** of this notebook, along with the code in `dqn_agent.py` and `model.py`.  Once you have an understanding of how the different files work together, 
- Define a neural network architecture in `model.py` that maps states to action values.  This file is mostly empty - it's up to you to define your own deep Q-network!
- Finish the `learn` method in the `Agent` class in `dqn_agent.py`.  The sampled batch of experience tuples is already provided for you; you need only use the local and target Q-networks to compute the loss, before taking a step towards minimizing the loss.

Once you have completed the code in `dqn_agent.py` and `model.py`, run the code cell below.  (_If you end up needing to make multiple changes and get unexpected behavior, please restart the kernel and run the cells from the beginning of the notebook!_)

You can find the solution files, along with saved model weights for a trained agent, in the `solution/` folder.  (_Note that there are many ways to solve this exercise, and the "solution" is just one way of approaching the problem, to yield a trained agent._)

### Run Test on Agent-Environment

Initialize Network

In [5]:
from src.dqn_agent import Agent

agent = Agent(state_size=8, action_size=4, seed=0)

In [6]:
w_local_init = [w for w in agent.qnetwork_local.parameters()]
w_targets_init = [w for w in agent.qnetwork_target.parameters()]

In [7]:
agent.qnetwork_local.out.weight

Parameter containing:
tensor([[-0.0317, -0.0682,  0.1063,  0.1617, -0.1443,  0.0955,  0.1219,
          0.0309,  0.0021,  0.1360,  0.0052, -0.0441, -0.0131,  0.0567,
          0.0443, -0.0095,  0.0558,  0.0476, -0.1432,  0.0440,  0.1082,
          0.0203, -0.0161,  0.1354, -0.1544, -0.0807, -0.0799,  0.1527,
         -0.1710,  0.0175, -0.1712, -0.0058],
        [ 0.0659, -0.0413,  0.1716,  0.0006, -0.0901,  0.0456,  0.0919,
         -0.0004,  0.1512, -0.1119,  0.0754,  0.1004,  0.0878, -0.0001,
          0.0505, -0.1387, -0.1387, -0.0110, -0.0222,  0.1313,  0.1284,
          0.1199, -0.0810, -0.1388, -0.1658,  0.0640, -0.1593, -0.1765,
         -0.0893, -0.1493, -0.0697, -0.1113],
        [-0.0907,  0.1681, -0.0333,  0.1466,  0.0039, -0.0227, -0.0518,
         -0.1264, -0.1420,  0.1224, -0.0545, -0.1724,  0.1622,  0.0033,
         -0.1183, -0.1390, -0.1485, -0.1550,  0.1600,  0.1219,  0.0145,
         -0.1507, -0.0462, -0.1661,  0.0609, -0.0469, -0.0378, -0.0552,
          0.0829,  0.0

In [8]:
w_local_init  # an element of each layer

[Parameter containing:
 tensor([[-0.0026,  0.1897, -0.2910, -0.2602, -0.1362,  0.0948, -0.0070,
           0.2803],
         [-0.0314,  0.0936, -0.1068, -0.0695, -0.3378, -0.2342, -0.1457,
           0.0131],
         [ 0.1398,  0.2121, -0.2397, -0.1540,  0.1284,  0.2936, -0.0728,
           0.2646],
         [-0.0570,  0.0374,  0.3201, -0.3280, -0.2226, -0.0895, -0.1378,
           0.3055],
         [-0.2292, -0.1628, -0.2470, -0.3311, -0.2064,  0.3039,  0.1578,
           0.1714],
         [ 0.0186, -0.1813,  0.0598, -0.3301, -0.2555, -0.1823,  0.2231,
           0.2073],
         [-0.1568, -0.0128,  0.2261,  0.3515,  0.1403,  0.0478,  0.2371,
          -0.2082],
         [ 0.0659, -0.2741, -0.2450, -0.1826,  0.1600,  0.1422, -0.2094,
           0.1068],
         [ 0.1941, -0.0446,  0.0135,  0.0819,  0.2193,  0.3395, -0.2725,
          -0.1296],
         [ 0.1389,  0.2929,  0.3077,  0.3120,  0.0704, -0.3074,  0.0325,
          -0.2212],
         [-0.3295,  0.3141,  0.2688, -0.3527,  

In [9]:
w_targets_init  # an element of each layer

[Parameter containing:
 tensor([[-0.0026,  0.1897, -0.2910, -0.2602, -0.1362,  0.0948, -0.0070,
           0.2803],
         [-0.0314,  0.0936, -0.1068, -0.0695, -0.3378, -0.2342, -0.1457,
           0.0131],
         [ 0.1398,  0.2121, -0.2397, -0.1540,  0.1284,  0.2936, -0.0728,
           0.2646],
         [-0.0570,  0.0374,  0.3201, -0.3280, -0.2226, -0.0895, -0.1378,
           0.3055],
         [-0.2292, -0.1628, -0.2470, -0.3311, -0.2064,  0.3039,  0.1578,
           0.1714],
         [ 0.0186, -0.1813,  0.0598, -0.3301, -0.2555, -0.1823,  0.2231,
           0.2073],
         [-0.1568, -0.0128,  0.2261,  0.3515,  0.1403,  0.0478,  0.2371,
          -0.2082],
         [ 0.0659, -0.2741, -0.2450, -0.1826,  0.1600,  0.1422, -0.2094,
           0.1068],
         [ 0.1941, -0.0446,  0.0135,  0.0819,  0.2193,  0.3395, -0.2725,
          -0.1296],
         [ 0.1389,  0.2929,  0.3077,  0.3120,  0.0704, -0.3074,  0.0325,
          -0.2212],
         [-0.3295,  0.3141,  0.2688, -0.3527,  

Run an episode

In [10]:

hist = []
# watch an untrained agent
state = env.reset()

for j in range(200):
    
    action = agent.act(state)
    exp = (state, action)
    next_state, reward, done, _ = env.step(action)
    agent.step(state, action, reward, next_state, done)
    exp = exp + (reward,)
    hist.append(exp)
    state = next_state
    
    if done:
        break 
        
env.close()

In [11]:
len(hist)

70

Explore nets

In [12]:
states_batch = torch.tensor([exp[0] for exp in hist])
actions_batch = torch.tensor([exp[1].astype(int) for exp in hist], dtype=torch.long).view(-1,1)
actions_batch.size(), states_batch.size()

(torch.Size([70, 1]), torch.Size([70, 8]))

In [13]:
ps_local = agent.qnetwork_local.forward(states_batch)
ps_local[:5,:]

tensor([[-0.0480, -0.1037, -0.1024,  0.0779],
        [-0.0502, -0.1045, -0.1028,  0.0748],
        [-0.0526, -0.1045, -0.1037,  0.0721],
        [-0.0541, -0.1036, -0.1048,  0.0697],
        [-0.0561, -0.1030, -0.1061,  0.0673]])

In [14]:
ps_local = ps_local.gather(1, actions_batch)
ps_local.size()

torch.Size([70, 1])

In [15]:
ps_target = agent.qnetwork_target.forward(states_batch).max(dim=1)[0].view(-1,1)
ps_target.size()

torch.Size([70, 1])

In [16]:
ps_target.size()

torch.Size([70, 1])

In [17]:
import torch.nn.functional as F

In [18]:
loss = F.mse_loss(ps_local, ps_target)
loss

tensor(1.00000e-03 *
       4.0142)

Explore history

In [19]:
len(hist)  # this should be greater than 64

70

In [20]:
hist[0][0]  # state

array([-5.9156417e-04,  1.4134574e+00, -5.9935719e-02,  1.1277095e-01,
        6.9228926e-04,  1.3576316e-02,  0.0000000e+00,  0.0000000e+00],
      dtype=float32)

In [21]:
hist[0][1]  # action

3

In [22]:
hist[0][2]  # reward

2.5724821158297018

In [23]:
mem_buffer = agent.memory
len(mem_buffer)

70

Explore Memory Buffer

Access Agent's memory buffer and simulate a sample().
Each sample is a batch or a set of experiences <s, a, r, s', done> where each element is a tensor 

In [24]:
mem_buffer = agent.memory 
experiences = mem_buffer.sample()  # batch

In [25]:
mem_buffer.batch_size

64

In [26]:
experiences[0].size()  # states: batch_size x state_space_size

torch.Size([64, 8])

In [27]:
experiences[1].size()  # actions: batch_size x 1

torch.Size([64, 1])

In [28]:
experiences[2].size()  # rewards: batch_size x 1

torch.Size([64, 1])

In [29]:
experiences[3].size()  # states_next: batch_size x state_space_size

torch.Size([64, 8])

In [30]:
experiences[4].size() # done: batch_size x 1

torch.Size([64, 1])

Explore Train weights

In [31]:
w_local_b1 = [w for w in agent.qnetwork_local.parameters()]
w_targets_b1 = [w for w in agent.qnetwork_target.parameters()]

In [32]:
w_local_b1

[Parameter containing:
 tensor([[-0.0011,  0.1927, -0.2880, -0.2631, -0.1389,  0.0918, -0.0070,
           0.2803],
         [-0.0344,  0.0906, -0.1099, -0.0665, -0.3348, -0.2311, -0.1457,
           0.0131],
         [ 0.1371,  0.2092, -0.2426, -0.1511,  0.1313,  0.2965, -0.0728,
           0.2646],
         [-0.0600,  0.0404,  0.3231, -0.3250, -0.2196, -0.0925, -0.1378,
           0.3055],
         [-0.2262, -0.1598, -0.2440, -0.3341, -0.2094,  0.3009,  0.1578,
           0.1714],
         [ 0.0158, -0.1783,  0.0628, -0.3329, -0.2540, -0.1853,  0.2231,
           0.2073],
         [-0.1539, -0.0157,  0.2232,  0.3544,  0.1433,  0.0507,  0.2371,
          -0.2082],
         [ 0.0659, -0.2741, -0.2450, -0.1826,  0.1600,  0.1422, -0.2094,
           0.1068],
         [ 0.1913, -0.0416,  0.0105,  0.0849,  0.2165,  0.3366, -0.2725,
          -0.1296],
         [ 0.1359,  0.2899,  0.3047,  0.3150,  0.0734, -0.3044,  0.0325,
          -0.2212],
         [-0.3265,  0.3171,  0.2718, -0.3557,  

In [33]:
w_targets_b1

[Parameter containing:
 tensor([[-0.0026,  0.1897, -0.2910, -0.2602, -0.1362,  0.0948, -0.0070,
           0.2803],
         [-0.0314,  0.0936, -0.1069, -0.0695, -0.3378, -0.2342, -0.1457,
           0.0131],
         [ 0.1398,  0.2121, -0.2397, -0.1540,  0.1284,  0.2936, -0.0728,
           0.2646],
         [-0.0570,  0.0374,  0.3201, -0.3280, -0.2226, -0.0895, -0.1378,
           0.3055],
         [-0.2292, -0.1628, -0.2470, -0.3311, -0.2064,  0.3039,  0.1578,
           0.1714],
         [ 0.0186, -0.1813,  0.0598, -0.3301, -0.2555, -0.1823,  0.2231,
           0.2073],
         [-0.1568, -0.0128,  0.2261,  0.3515,  0.1403,  0.0478,  0.2371,
          -0.2082],
         [ 0.0659, -0.2741, -0.2450, -0.1826,  0.1600,  0.1422, -0.2094,
           0.1068],
         [ 0.1941, -0.0446,  0.0135,  0.0819,  0.2193,  0.3395, -0.2725,
          -0.1296],
         [ 0.1389,  0.2929,  0.3077,  0.3120,  0.0704, -0.3074,  0.0325,
          -0.2212],
         [-0.3295,  0.3141,  0.2688, -0.3527,  