# DQN Prototyping

In [1]:
##DQN prototyping

import gym
import numpy as np
import tensorflow as tf
from collections import deque, namedtuple

In [2]:
#Hard coded variables
EPISODES = 10
EPSILON = 1
EPSILON_DECAY = 0.99
SAMPLE_BATCH = 64
GAMMA = 0.99
MIN_EPSILON = 0.01

In [3]:
from tensorflow import keras

In [4]:
Exp = namedtuple("experience", field_names=['s', 'a', 'r','done', 's_p'])

In [5]:
class ExperienceReplay():
    def __init__(self, size):
        self.buffer = deque(maxlen=size)
        
    def __len__(self):
        return len(self.buffer)
    
    def add(self, experience):
        self.buffer.append(experience)
        
    def sample(self, sample_size = SAMPLE_BATCH):
        state = []
        actions = []
        rewards = []
        dones = []
        state_p = []
        ##get sample of indices
        if sample_size > len(self.buffer):
            
            for i in range(len(self.buffer)):
                s, a, r, d, s_p = self.buffer[i]
                state.append(s)
                actions.append(a)
                rewards.append(r)
                dones.append(d)
                state_p.append(s_p)
            return np.array(state), np.array(actions), np.array(rewards), np.array(dones), np.array(state_p)
        else:
            indices = np.random.choice(len(self.buffer), 
                                       sample_size, 
                                       replace=False)
            for i in indices:
                s, a, r, d, s_p = self.buffer[i]
                state.append(s)
                actions.append(a)
                rewards.append(r)
                dones.append(d)
                state_p.append(s_p)
            return np.array(state), np.array(actions), np.array(rewards), np.array(dones), np.array(state_p)
            
            

In [7]:
class Model(keras.Model):
    def __init__(self, action_size):
        super(Model, self).__init__()
        self.layer1 = keras.layers.Dense(64)
        self.layer2 = keras.layers.Dense(128)
        self.layer3 = keras.layers.Dense(128)
        self.layer4 = keras.layers.Dense(action_size)
        
    def call(self, input):
        x = self.layer1(input)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        
        return x

In [7]:
env = gym.make("LunarLander-v2")

In [8]:
env.action_space.sample()

0

In [9]:
env.observation_space.shape[0]

8

In [10]:
def dqn(EPSILON):
    
    try:
        model = Model(env.action_space.n)

        model.compile(loss=tf.losses.CategoricalCrossentropy(), optimizer=tf.optimizers.Adam(), metrics = tf.keras.metrics.CategoricalAccuracy())
        replay = ExperienceReplay(10000)
        best_reward = 0
        scores = []
        for episode in range(EPISODES):

            state = env.reset()
            score = 0

            while True:

                ## Choose an action
                if np.random.random() > EPSILON:
                    action = model.predict(state.reshape(1, -1))
                else:
                    action = env.action_space.sample()
                env.render()
                state_p, reward, done, _ = env.step(action=action)
                replay.add(Exp(s = state, 
                               a = action,
                               r = reward,
                               done= done, 
                               s_p = state_p))
                #Learn 
                states, actions, rewards, dones, states_p = replay.sample()
                q_vals = model.predict(states)
                print(q_vals.shape)
                print(actions)
                q_vals_new = np.max(model.predict(states_p), axis = 1)

                q_target = rewards + GAMMA * q_vals_new * dones
                print(q_target.shape)
                print(states.shape)
                model.fit(states, q_target, epochs = 1)

                EPSILON *= EPSILON_DECAY
                state = state_np
                score += reward

                if done:
                    if score > best_reward:
                        best_reward = score
                    print("Episode {}  Best Reward {} Last Reward {} Epsilon {}"\
                          .format(i, best_reward, score, EPSILON))
                    break

            scores.append[score]
    except Exception as e:
        print(e)
            
                
            
            

In [11]:
dqn(EPSILON)

(1, 4)
(1,)
(1, 8)
in user code:

    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "C:\Users\tanma\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    F

In [8]:
m = Model(4)

In [6]:
states = np.random.random((100,8))

In [9]:
actions = [np.random.randint(0,3) for _ in range(100)]

In [11]:
q_vals = m.predict(states)

In [12]:
q_vals.shape

(100, 4)

In [63]:
env.reset()
env.step(env.action_space.sample())

(array([-0.00389986,  1.391949  , -0.20417786, -0.43419787,  0.00670153,
         0.0897917 ,  0.        ,  0.        ], dtype=float32),
 -2.2680782152933134,
 False,
 {})

In [90]:
env.step(env.action_space.sample())

(array([-0.06623831,  0.9630391 , -0.25177294, -0.9925629 ,  0.21233341,
         0.09543299,  0.        ,  0.        ], dtype=float32),
 -0.48689421732714666,
 False,
 {})

In [138]:
np.argmax(model.predict(env.reset().reshape(1,-1)), axis = 1)



array([1])

In [141]:
state = np.random.random((10,8))
rewards = np.random.random((10,1))
dones = np.array([np.random.choice([True, False]) for i in range(10)])

In [144]:
q_fut = np.max(model.predict(state), axis = 1)



In [154]:
dones*q_fut*GAMMA +rewards.reshape(-1)

array([0.40931207, 0.09657527, 0.90224723, 0.31664349, 0.51111311,
       1.25086158, 0.32520274, 1.35936309, 0.42618603, 0.67265275])

In [148]:
q_fut

array([0.31978217, 0.08766681, 0.42509478, 0.30133817, 0.30999145,
       0.2773019 , 0.38368082, 0.36521935, 0.26070544, 0.1978873 ],
      dtype=float32)

In [156]:
o = np.array([])
o = np.append(o, np.array([1,2,3]))
o = np.append(o, np.array([4,5,7]))

In [158]:
o = [[1,2,3],[1,3,4]]

(2, 3)