# DQN Prototyping

In [1]:
##DQN prototyping

import gym
import numpy as np
import tensorflow as tf
from collections import deque, namedtuple

In [143]:
#Hard coded variables
EPISODES = 10
EPSILON = 1
EPSILON_DECAY = 0.99
SAMPLE_BATCH = 64
GAMMA = 0.99

In [45]:
from tensorflow import keras

In [2]:
c = deque(maxlen=100)

In [103]:
Exp = namedtuple("experience", field_names=['s', 'a', 'r','done', 's_p'])

In [4]:
c.append(Exp(1,2,3,5))
c.append(Exp(2,3,4,5))


In [6]:
state = np.array([])
actions = np.array([])
rewards = np.array([])
state_p = np.array([])
for i in range(len(c)):
 
    s, a, r, s_p = c[i]
    state = np.append(state,s) 
    actions = np.append(actions, a) 
    rewards = np.append(rewards, r) 
    state_p = np.append(state_p, s_p)

In [140]:
class ExperienceReplay():
    def __init__(self, size):
        self.buffer = deque(maxlen=size)
        
    def __len__(self):
        return len(self.buffer)
    
    def add(self, experience):
        self.buffer.append(experience)
        
    def sample(self, sample_size = SAMPLE_BATCH):
        state = []
        actions = []
        rewards = []
        dones = []
        state_p = []
        ##get sample of indices
        if sample_size > len(self.buffer):
            for i in range(len(self.buffer)):
                s, a, r, d, s_p = c[i]
                state.append(s)
                actions.append(a)
                rewards.append(r)
                dones.append(d)
                state_p.append(s_p)
            return np.array(state), np.array(actions), np.array(rewards), np.array(dones), np.array(state_p)
        else:
            indices = np.random.choice(len(self.buffer), 
                                       sample_size, 
                                       replace=False)
            for i in indices:
                s, a, r, d, s_p = c[i]
                state.append(s)
                actions.append(a)
                rewards.append(r)
                dones.append(d)
                state_p.append(s_p)
            return np.array(state), np.array(actions), np.array(rewards), np.array(dones), np.array(state_p)
            
            

In [48]:
class Model(keras.Model):
    def __init__(self, action_size):
        super(Model, self).__init__()
        self.layer1 = keras.layers.Dense(64)
        self.layer2 = keras.layers.Dense(128)
        self.layer3 = keras.layers.Dense(128)
        self.layer4 = keras.layers.Dense(action_size)
        
    def call(self, input):
        x = self.layer1(input)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        
        return x

In [49]:
model = Model(4)

2022-05-25 17:57:55.821359: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-25 17:57:55.826675: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-25 17:57:55.827051: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-25 17:57:55.828016: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [51]:
model.compile(loss=tf.losses.CategoricalCrossentropy(), optimizer=tf.optimizers.Adam(), metrics = tf.keras.metrics.CategoricalAccuracy())

In [54]:
model.build((None, 8))

In [55]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               multiple                  576       
                                                                 
 dense_1 (Dense)             multiple                  8320      
                                                                 
 dense_2 (Dense)             multiple                  16512     
                                                                 
 dense_3 (Dense)             multiple                  516       
                                                                 
Total params: 25,924
Trainable params: 25,924
Non-trainable params: 0
_________________________________________________________________


In [8]:
env = gym.make("LunarLander-v2")

In [24]:
env.action_space.sample()

2

In [44]:
env.observation_space.shape[0]

8

In [60]:
model.predict(np.random.random((100,8)))



array([[ 1.39343739e-02, -1.85523719e-01,  3.60337198e-01,
        -3.33276466e-02],
       [ 3.48664880e-01, -8.08867216e-02,  3.28666449e-01,
        -2.76468620e-02],
       [-9.57832187e-02,  6.69447631e-02,  2.56931841e-01,
         3.70903052e-02],
       [ 2.51774490e-02,  1.18934087e-01,  3.41448307e-01,
        -9.37756598e-02],
       [ 8.47860202e-02, -3.82518172e-01,  2.94760585e-01,
        -7.31362104e-02],
       [ 7.38018006e-03, -3.53653133e-01,  2.65057981e-01,
         2.39874423e-02],
       [-3.09006214e-01, -2.73080856e-01,  2.56869465e-01,
         1.27394527e-01],
       [ 3.86793405e-01, -2.24604547e-01,  2.11883411e-01,
        -1.95798516e-01],
       [-1.29346967e-01, -1.20875269e-01,  1.93650678e-01,
        -1.16823442e-01],
       [ 1.50138989e-01,  2.48946771e-02,  8.45889077e-02,
         2.10751966e-02],
       [ 2.38221273e-01, -2.49520361e-01,  4.47792053e-01,
        -1.15226731e-01],
       [-9.95132625e-02,  5.47733903e-02,  1.90395266e-01,
      

In [58]:
np.random.random((10,8))

array([[0.02831499, 0.62038775, 0.18093637, 0.80595114, 0.79132862,
        0.18363344, 0.84024099, 0.71797677],
       [0.99562149, 0.83630511, 0.9476401 , 0.09238772, 0.85096023,
        0.43776523, 0.49075761, 0.76702867],
       [0.90276082, 0.24806449, 0.28766574, 0.31268968, 0.00549859,
        0.65040789, 0.02128169, 0.62538051],
       [0.70238679, 0.58748033, 0.05697965, 0.5815584 , 0.63398542,
        0.87392228, 0.04425838, 0.38453657],
       [0.63241762, 0.81631012, 0.45111601, 0.93229515, 0.69750908,
        0.88138831, 0.92759001, 0.58468893],
       [0.09503773, 0.83689978, 0.03577982, 0.55687207, 0.50928529,
        0.65407984, 0.9163363 , 0.15907204],
       [0.84665889, 0.92591672, 0.02879576, 0.14624993, 0.72940469,
        0.64491741, 0.03670241, 0.53269122],
       [0.8258057 , 0.3505615 , 0.71720313, 0.33999732, 0.56805005,
        0.4035784 , 0.91059843, 0.19157799],
       [0.59254459, 0.95163202, 0.40136231, 0.09203811, 0.01644002,
        0.52409186, 0.088691

In [101]:
def dqn():
    
    model = Model()
    target = Model()
    
    model.compile(loss=tf.losses.CategoricalCrossentropy(), optimizer=tf.optimizers.Adam(), metrics = tf.keras.metrics.CategoricalAccuracy())
    target.compile(loss=tf.losses.CategoricalCrossentropy(), optimizer=tf.optimizers.Adam(), metrics = tf.keras.metrics.CategoricalAccuracy())
    replay = ExperienceReplay(10000)
    
    for episode in range(EPISODES):
    
        state = env.reset()
        score = 0
        
        while True:
            
            if np.random.random() > EPSILON:
                action = model.predict(state.reshape(1, -1))
            else:
                action = env.action_space.sample()
                
            state_p, reward, done, _ = env.step(action=action)
            replay.add(Exp(s = state, 
                           a = action,
                           r = reward,
                           done= done, 
                           s_p = state_p))
            #Learn 
            states, actions, rewards, dones, states_p = replay.sample()
            q_vals = model.predict(states)
            q_vals_new = np.max(model.predict(states_p), axis = 1)
            
            q_target = rewards + GAMMA * q_vals_new * dones
            
                       
                
            
            

SyntaxError: invalid syntax (2576453929.py, line 19)

In [63]:
env.reset()
env.step(env.action_space.sample())

(array([-0.00389986,  1.391949  , -0.20417786, -0.43419787,  0.00670153,
         0.0897917 ,  0.        ,  0.        ], dtype=float32),
 -2.2680782152933134,
 False,
 {})

In [90]:
env.step(env.action_space.sample())

(array([-0.06623831,  0.9630391 , -0.25177294, -0.9925629 ,  0.21233341,
         0.09543299,  0.        ,  0.        ], dtype=float32),
 -0.48689421732714666,
 False,
 {})

In [138]:
np.argmax(model.predict(env.reset().reshape(1,-1)), axis = 1)



array([1])

In [141]:
state = np.random.random((10,8))
rewards = np.random.random((10,1))
dones = np.array([np.random.choice([True, False]) for i in range(10)])

In [144]:
q_fut = np.max(model.predict(state), axis = 1)



In [154]:
dones*q_fut*GAMMA +rewards.reshape(-1)

array([0.40931207, 0.09657527, 0.90224723, 0.31664349, 0.51111311,
       1.25086158, 0.32520274, 1.35936309, 0.42618603, 0.67265275])

In [148]:
q_fut

array([0.31978217, 0.08766681, 0.42509478, 0.30133817, 0.30999145,
       0.2773019 , 0.38368082, 0.36521935, 0.26070544, 0.1978873 ],
      dtype=float32)

In [156]:
o = np.array([])
o = np.append(o, np.array([1,2,3]))
o = np.append(o, np.array([4,5,7]))

In [158]:
o = [[1,2,3],[1,3,4]]

(2, 3)