# DQN Prototyping

In [1]:
##DQN prototyping

import gym
import numpy as np
import tensorflow as tf
from collections import deque, namedtuple

In [2]:
#Hard coded variables
EPISODES = 10
EPSILON = 1
EPSILON_DECAY = 0.99
SAMPLE_BATCH = 64
GAMMA = 0.99
MIN_EPSILON = 0.01

In [3]:
from tensorflow import keras

In [4]:
Exp = namedtuple("experience", field_names=['s', 'a', 'r','done', 's_p'])

In [5]:
class ExperienceReplay():
    def __init__(self, size):
        self.buffer = deque(maxlen=size)
        
    def __len__(self):
        return len(self.buffer)
    
    def add(self, experience):
        self.buffer.append(experience)
        
    def sample(self, sample_size = SAMPLE_BATCH):
        state = []
        actions = []
        rewards = []
        dones = []
        state_p = []
        ##get sample of indices
        if sample_size > len(self.buffer):
            
            for i in range(len(self.buffer)):
                s, a, r, d, s_p = self.buffer[i]
                state.append(s)
                actions.append(a)
                rewards.append(r)
                dones.append(d)
                state_p.append(s_p)
            return np.array(state), np.array(actions), np.array(rewards), np.array(dones), np.array(state_p)
        else:
            indices = np.random.choice(len(self.buffer), 
                                       sample_size, 
                                       replace=False)
            for i in indices:
                s, a, r, d, s_p = self.buffer[i]
                state.append(s)
                actions.append(a)
                rewards.append(r)
                dones.append(d)
                state_p.append(s_p)
            return np.array(state), np.array(actions), np.array(rewards), np.array(dones), np.array(state_p)
            
            

In [4]:
class Model(keras.Model):
    def __init__(self, action_size):
        super(Model, self).__init__()
        self.layer1 = keras.layers.Dense(64)
        self.layer2 = keras.layers.Dense(128)
#         self.layer3 = keras.layers.Dense(128)
        self.layer4 = keras.layers.Dense(action_size)
        
    def call(self, input):
        x = self.layer1(input)
        x = self.layer2(x)
#         x = self.layer3(x)
        x = self.layer4(x)
        
        return x

In [5]:
env = gym.make("LunarLander-v2")

In [6]:
env.action_space.sample()

0

In [7]:
env.observation_space.shape[0]

8

In [10]:
def dqn(EPSILON):
    
    try:
        model = Model(env.action_space.n)

        model.compile(loss=tf.losses.CategoricalCrossentropy(), optimizer=tf.optimizers.Adam(), metrics = tf.keras.metrics.CategoricalAccuracy())
        replay = ExperienceReplay(10000)
        best_reward = 0
        scores = []
        for episode in range(EPISODES):

            state = env.reset()
            score = 0

            while True:

                ## Choose an action
                if np.random.random() > EPSILON:
                    action = model.predict(state.reshape(1, -1))
                else:
                    action = env.action_space.sample()
                env.render()
                state_p, reward, done, _ = env.step(action=action)
                replay.add(Exp(s = state, 
                               a = action,
                               r = reward,
                               done= done, 
                               s_p = state_p))
                #Learn 
                states, actions, rewards, dones, states_p = replay.sample()
                q_vals = model.predict(states)
                print(q_vals.shape)
                print(actions)
                q_vals_new = np.max(model.predict(states_p), axis = 1)

                q_target = rewards + GAMMA * q_vals_new * dones
                print(q_target.shape)
                print(states.shape)
                model.fit(states, q_target, epochs = 1)

                EPSILON *= EPSILON_DECAY
                state = state_np
                score += reward

                if done:
                    if score > best_reward:
                        best_reward = score
                    print("Episode {}  Best Reward {} Last Reward {} Epsilon {}"\
                          .format(i, best_reward, score, EPSILON))
                    break

            scores.append[score]
    except Exception as e:
        print(e)
            
                
            
            

In [11]:
dqn(EPSILON)

(1, 4)
(1,)
(1, 8)
in user code:

    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    F

In [12]:
m = Model(4)

In [8]:
states = np.random.random((100,8))

In [30]:
actions = np.array([np.random.randint(0,3) for _ in range(100)])
dones = tf.convert_to_tensor(np.array([np.random.choice([True, False]) for _ in range(100)]), dtype=float)
rewards = tf.convert_to_tensor(np.random.random(100), dtype=float) 

In [31]:
states = tf.convert_to_tensor(states)

In [32]:
q_vals_new = m.predict(states)



In [33]:
q_vals_new = tf.reduce_max(q_vals_new, axis = 1)

In [34]:
# future_reward =  GAMMA * q_vals_new * dones
q_target = rewards + GAMMA * q_vals_new * dones

In [36]:
q_target

<tf.Tensor: shape=(100,), dtype=float32, numpy=
array([ 4.6802300e-01,  7.3251384e-01,  5.9462130e-01,  3.7499362e-01,
        4.2816612e-01,  2.4111138e-01,  7.1863309e-02, -6.3981861e-07,
        9.3522227e-01,  4.8337314e-01,  5.1459688e-01,  7.2414517e-01,
        2.2765218e-01,  5.8940673e-01,  2.0365399e-01,  3.1388038e-01,
        6.8997127e-01,  2.9539933e-02,  3.7801873e-02,  5.0249841e-02,
        5.6596142e-01,  1.3937636e-01,  1.1201465e+00,  7.8489655e-01,
        1.6528982e-01,  1.1773119e+00,  1.0038668e+00,  2.7358162e-01,
        2.3002504e-01,  1.2752679e-01,  7.8065705e-01,  1.5613222e-01,
        9.2335290e-01,  2.6859796e-01,  2.3024434e-02,  4.0808398e-01,
        7.0484877e-01,  6.3417631e-01,  7.1476310e-01,  6.6262549e-01,
        3.3574754e-01,  6.8542975e-01,  4.6406010e-01,  8.3527100e-01,
        1.1809965e+00,  8.5010499e-01,  7.4649982e-02,  8.7912297e-01,
        6.3631570e-01,  8.1315839e-01,  6.4485055e-01,  5.3647113e-01,
        9.8471850e-02,  2.903

In [26]:
q_vals = m.predict(states)



In [27]:
q_vals

array([[-0.09044285,  0.78429145,  0.22290823, -0.22095975],
       [ 0.06582987,  0.8060102 ,  0.21171093,  0.05288252],
       [-0.02123485,  0.78302485,  0.30825198, -0.06496517],
       [ 0.0634696 ,  0.6149648 ,  0.09870172,  0.29003182],
       [ 0.14686641,  0.5924538 ,  0.44316274,  0.2361229 ],
       [ 0.1105642 ,  0.28230122,  0.40268916,  0.19998103],
       [ 0.00114902,  0.57714707,  0.1451543 ,  0.02638064],
       [ 0.05291217,  0.5928583 ,  0.15167826,  0.16922504],
       [ 0.11161931,  0.7220261 ,  0.29377592,  0.23279035],
       [ 0.10644533,  0.51654917,  0.40414834,  0.03691044],
       [-0.04124548,  0.959939  ,  0.11080159,  0.11617209],
       [ 0.01196576,  0.74462247,  0.08967584,  0.20435905],
       [ 0.10151611,  0.32137164,  0.23601817,  0.23200189],
       [-0.12369305,  0.77988875,  0.18715343, -0.03689567],
       [-0.0033296 ,  0.7215647 ,  0.25049186, -0.01530738],
       [ 0.18982309,  0.49643707,  0.36266565,  0.13021822],
       [-0.0223823 ,  0.

In [28]:
q_vals_new = np.max(q_vals, axis = 1)

In [29]:
q_vals_new

array([0.78429145, 0.8060102 , 0.78302485, 0.6149648 , 0.5924538 ,
       0.40268916, 0.57714707, 0.5928583 , 0.7220261 , 0.51654917,
       0.959939  , 0.74462247, 0.32137164, 0.77988875, 0.7215647 ,
       0.49643707, 0.60878474, 0.8319981 , 0.5816212 , 0.27555102,
       0.48252717, 0.43086812, 0.5244136 , 0.27403802, 0.41813347,
       0.54460466, 0.71389663, 0.5302336 , 0.6228853 , 0.70055676,
       0.857682  , 0.61169446, 0.5729729 , 0.6630689 , 0.17471473,
       0.73713565, 0.6050472 , 0.569217  , 0.46024138, 0.57352567,
       0.6768075 , 0.5126452 , 0.78347206, 0.46514654, 0.6131806 ,
       0.64612585, 0.54629946, 0.64494157, 0.37578356, 0.26264685,
       0.49940652, 0.6289438 , 0.39027512, 0.29145074, 0.7621701 ,
       0.8342236 , 0.8312341 , 0.33164984, 0.3055746 , 0.70319116,
       0.35827547, 0.62008363, 0.8125546 , 0.6944517 , 0.48879552,
       0.4038457 , 0.45224434, 0.6488539 , 0.7158497 , 0.5582352 ,
       0.6222996 , 0.57102674, 0.34720746, 0.40188372, 0.56878

In [18]:
indexer = [[i,a] for i, a in enumerate(actions)]

In [30]:
q_target = rewards + GAMMA * q_vals_new * dones

In [37]:
actions

array([2, 1, 2, 0, 2, 0, 2, 0, 1, 0, 1, 2, 1, 2, 1, 0, 2, 1, 2, 2, 2, 1,
       1, 0, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 2,
       2, 1, 0, 1, 0, 1, 1, 2, 0, 1, 1, 1, 0, 0, 2, 1, 2, 0, 0, 0, 2, 0,
       2, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 0, 1, 2, 0, 1, 0, 0, 0,
       1, 0, 2, 2, 2, 0, 0, 2, 1, 2, 0, 0])

In [39]:
mask = tf.one_hot(actions, env.action_space.n)

In [31]:
actions.reshape(1,-1)

array([[1, 2, 1, 1, 0, 2, 0, 1, 2, 2, 1, 0, 0, 1, 2, 2, 1, 0, 2, 2, 1, 2,
        0, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 0, 1, 2,
        2, 0, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0,
        1, 2, 2, 0, 2, 2, 1, 2, 1, 0, 1, 0, 2, 2, 0, 1, 0, 0, 1, 2, 2, 1,
        0, 2, 0, 2, 0, 2, 0, 0, 1, 2, 0, 1]])

In [32]:
q_vals[ np.array(range(len(actions))), actions] = q_target

In [33]:
q_vals

array([[-9.04428512e-02,  1.33480012e-01,  2.22908229e-01,
        -2.20959753e-01],
       [ 6.58298656e-02,  8.06010187e-01,  3.93272400e-01,
         5.28825223e-02],
       [-2.12348476e-02,  9.09851313e-01,  3.08251977e-01,
        -6.49651736e-02],
       [ 6.34696037e-02,  1.21117008e+00,  9.87017155e-02,
         2.90031821e-01],
       [ 1.32922339e+00,  5.92453778e-01,  4.43162739e-01,
         2.36122906e-01],
       [ 1.10564202e-01,  2.82301217e-01,  1.12999940e+00,
         1.99981034e-01],
       [ 1.12994373e+00,  5.77147067e-01,  1.45154297e-01,
         2.63806432e-02],
       [ 5.29121682e-02,  2.77841300e-01,  1.51678264e-01,
         1.69225037e-01],
       [ 1.11619309e-01,  7.22026110e-01,  1.46574366e+00,
         2.32790351e-01],
       [ 1.06445327e-01,  5.16549170e-01,  6.58480704e-01,
         3.69104445e-02],
       [-4.12454829e-02,  1.92897737e+00,  1.10801585e-01,
         1.16172090e-01],
       [ 7.40475297e-01,  7.44622469e-01,  8.96758437e-02,
      

In [22]:
np.insert(q_vals, actions.reshape(1,-1), q_target,axis = 1).shape

ValueError: index array argument obj to insert must be one dimensional or scalar

In [37]:
for idx, ele in enumerate(q_target):
    q_vals[idx, actions[idx]] = ele

In [38]:
q_vals

array([[ 1.05933082e+00, -4.09482986e-01, -1.09645113e-01,
        -2.76938468e-01],
       [ 1.12121284e-01,  3.34523022e-02,  1.21548146e-01,
        -3.76550615e-01],
       [ 1.25785962e-01,  1.63199946e-01,  5.73666096e-01,
        -2.02540964e-01],
       [ 3.57886940e-01,  2.09592879e-02, -2.70189941e-01,
        -3.16969961e-01],
       [ 2.56461918e-01,  4.36659724e-01, -2.42995560e-01,
        -4.05976593e-01],
       [ 2.21720740e-01, -7.73252994e-02,  9.54361558e-01,
        -4.45882589e-01],
       [ 1.46730423e-01,  5.99445403e-02,  5.86728036e-01,
        -3.00942630e-01],
       [ 1.63625538e-01, -2.89448023e-01,  1.08740056e+00,
        -3.56372267e-01],
       [ 1.93658173e-01,  1.05069327e+00, -1.52085647e-01,
        -3.55726123e-01],
       [ 9.50317979e-02,  8.95255327e-01, -5.36086738e-01,
        -3.53307188e-01],
       [ 1.08486712e-01,  5.86275339e-01, -1.17670894e-01,
        -4.50735390e-01],
       [ 2.05686882e-01,  9.44465816e-01, -4.77745503e-01,
      

In [42]:
actions

array([1, 1, 2, 0, 2, 0, 0, 0, 2, 1, 0, 1, 2, 1, 2, 2, 1, 1, 0, 2, 0, 2,
       1, 2, 1, 0, 2, 0, 2, 2, 0, 2, 2, 1, 0, 0, 2, 2, 1, 2, 2, 1, 2, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 2, 2, 1, 0, 0, 2, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 2, 0, 1, 2, 2, 2, 2, 2, 1, 2, 0, 2, 1, 1, 1, 0, 0, 2, 0, 2, 2,
       0, 0, 2, 1, 0, 2, 1, 1, 2, 1, 0, 1])

In [63]:
env.reset()
env.step(env.action_space.sample())

(array([-0.00389986,  1.391949  , -0.20417786, -0.43419787,  0.00670153,
         0.0897917 ,  0.        ,  0.        ], dtype=float32),
 -2.2680782152933134,
 False,
 {})

In [90]:
env.step(env.action_space.sample())

(array([-0.06623831,  0.9630391 , -0.25177294, -0.9925629 ,  0.21233341,
         0.09543299,  0.        ,  0.        ], dtype=float32),
 -0.48689421732714666,
 False,
 {})

In [138]:
np.argmax(model.predict(env.reset().reshape(1,-1)), axis = 1)



array([1])

In [141]:
state = np.random.random((10,8))
rewards = np.random.random((10,1))
dones = np.array([np.random.choice([True, False]) for i in range(10)])

In [144]:
q_fut = np.max(model.predict(state), axis = 1)



In [154]:
dones*q_fut*GAMMA +rewards.reshape(-1)

array([0.40931207, 0.09657527, 0.90224723, 0.31664349, 0.51111311,
       1.25086158, 0.32520274, 1.35936309, 0.42618603, 0.67265275])

In [148]:
q_fut

array([0.31978217, 0.08766681, 0.42509478, 0.30133817, 0.30999145,
       0.2773019 , 0.38368082, 0.36521935, 0.26070544, 0.1978873 ],
      dtype=float32)

In [156]:
o = np.array([])
o = np.append(o, np.array([1,2,3]))
o = np.append(o, np.array([4,5,7]))

In [158]:
o = [[1,2,3],[1,3,4]]

(2, 3)