![CartPole](CartPole.jpg)

|Agent Class|Env Class|
|---------|---------|
|`__init__()`|`reset()`|
|`build_model()`|`step(action)`|
|`remember()` # storage||
|`act(state)`||
|`replay(batch_size)` # training||
|`adaptiveEGreedy()` # adaptive $\epsilon$||



In [2]:
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

class DQLAgent:
    """Deep Q-learning agent with epsilon-greedy acting and experience replay.

    The agent owns a small fully connected Q-network, a bounded replay
    memory, and an exploration rate (``epsilon``) that is decayed by
    ``adaptiveEGreedy``.
    """

    def __init__(self, env):
        """Read the problem dimensions from ``env`` and build the Q-network.

        Args:
            env: Environment object exposing ``observation_space.shape`` and
                ``action_space.n``. It is only inspected here; the agent
                does not keep a reference to it.
        """
        # parameter / hyperparameter
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n

        self.gamma = 0.95  # discount applied to the bootstrapped next-state value
        self.learning_rate = 0.001

        self.epsilon = 1  # explore
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

        # Oldest transitions are dropped once 1000 are stored.
        self.memory = deque(maxlen=1000)

        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model: one 48-unit tanh hidden layer and
            a linear output with one unit per action, trained with MSE loss.
        """
        # neural network for deep q learning
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size, activation="tanh"))
        model.add(Dense(self.action_size, activation="linear"))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by recent Keras releases.
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition to memory."""
        # storage
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Observation passed straight to ``self.model.predict``; the
                callers in this file reshape it to ``(1, state_size)``.

        Returns:
            An action index in ``range(self.action_size)``.
        """
        # acting: explore or exploit
        if random.uniform(0, 1) <= self.epsilon:
            # Draw from the agent's own action count rather than a
            # module-level `env` global, so the method works on any instance.
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the Q-network on one random minibatch from replay memory.

        Does nothing until at least ``batch_size`` transitions are stored.
        Targets are ``reward`` for terminal transitions and
        ``reward + gamma * max_a Q(next_state, a)`` otherwise.

        Args:
            batch_size: Number of transitions to sample.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        # Unzip into typed arrays. Building one np.array from the mixed
        # (array, int, float, array, bool) tuples would need a ragged object
        # array, which newer NumPy versions refuse to create implicitly.
        states, actions, rewards, next_states, dones = zip(*minibatch)
        states = np.vstack(states)
        next_states = np.vstack(next_states)
        actions = np.asarray(actions, dtype=int)
        targets = np.asarray(rewards, dtype=float)
        not_done = ~np.asarray(dones, dtype=bool)

        # Only non-terminal transitions bootstrap from the next state.
        if not_done.any():
            q_next = self.model.predict(next_states, verbose=0)
            targets[not_done] += self.gamma * np.max(q_next[not_done], axis=1)

        # Keep the network's own predictions for the actions not taken so
        # that only the taken action contributes to the loss.
        q_target = self.model.predict(states, verbose=0)
        q_target[np.arange(batch_size), actions] = targets
        self.model.fit(states, q_target, epochs=1, verbose=0)

    def adaptiveEGreedy(self):
        """Multiply ``epsilon`` by ``epsilon_decay`` while it is above ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Replace ``self.model`` with the model stored at path ``name``."""
        # Imported here because the file's import cell does not bring
        # `load_model` into scope.
        from keras.models import load_model

        self.model = load_model(name)

    def save(self, name):
        """Save ``self.model`` to path ``name``."""
        self.model.save(name)

In [4]:
if __name__ == "__main__":

    # initialize gym env and agent
    env = gym.make("CartPole-v0")
    agent = DQLAgent(env)

    state_size = env.observation_space.shape[0]
    # Use the environment's own step limit as the "solved" threshold.
    # A hard-coded 500 can never be reached on CartPole-v0, whose episodes
    # are cut off earlier, so the model would never be saved.
    max_steps = env.spec.max_episode_steps

    batch_size = 16
    episodes = 100
    try:
        for e in range(episodes):

            # initialize environment; the network expects a (1, state_size) batch
            state = env.reset()
            state = np.reshape(state, [1, state_size])

            # Step counter for this episode. Named `time_t` so it does not
            # collide with the `time` module.
            time_t = 0
            while True:

                env.render()

                # act
                action = agent.act(state)  # select an action

                # step
                next_state, reward, done, _ = env.step(action)
                next_state = np.reshape(next_state, [1, state_size])

                # remember / storage
                agent.remember(state, action, reward, next_state, done)

                # update state
                state = next_state

                # replay: one training pass per environment step
                agent.replay(batch_size)

                # adjust epsilon
                agent.adaptiveEGreedy()

                time_t += 1

                if done:
                    print("Episode: {}, time: {}".format(e, time_t))
                    # Surviving to the step limit counts as a trained model.
                    if time_t == max_steps:
                        print("Saving trained model as cartpole-dqn.h5")
                        agent.save("cartpole-dqn.h5")
                    break
    finally:
        # Always release the environment, even if training is interrupted
        # or raises.
        env.close()

Episode: 0/100, time: 21
Episode: 1/100, time: 21
Episode: 2/100, time: 28
Episode: 3/100, time: 77
Episode: 4/100, time: 27


Exception ignored in: <function Viewer.__del__ at 0x0000026190C79160>
Traceback (most recent call last):
  File "C:\Users\Dilemre\anaconda3\lib\site-packages\gym\envs\classic_control\rendering.py", line 185, in __del__
    self.close()
  File "C:\Users\Dilemre\anaconda3\lib\site-packages\gym\envs\classic_control\rendering.py", line 101, in close
    self.window.close()
  File "C:\Users\Dilemre\anaconda3\lib\site-packages\pyglet\window\win32\__init__.py", line 328, in close
    super(Win32Window, self).close()
  File "C:\Users\Dilemre\anaconda3\lib\site-packages\pyglet\window\__init__.py", line 857, in close
    app.windows.remove(self)
  File "C:\Users\Dilemre\anaconda3\lib\_weakrefset.py", line 114, in remove
    self.data.remove(ref(item))
KeyError: <weakref at 0x000002619A254F90; to 'Win32Window' at 0x0000026198C54A90>


Episode: 5/100, time: 23
Episode: 6/100, time: 12
Episode: 7/100, time: 12
Episode: 8/100, time: 10
Episode: 9/100, time: 12
Episode: 10/100, time: 12
Episode: 11/100, time: 9
Episode: 12/100, time: 10
Episode: 13/100, time: 10
Episode: 14/100, time: 11
Episode: 15/100, time: 11
Episode: 16/100, time: 10
Episode: 17/100, time: 10
Episode: 18/100, time: 12
Episode: 19/100, time: 12
Episode: 20/100, time: 10
Episode: 21/100, time: 11
Episode: 22/100, time: 14
Episode: 23/100, time: 12
Episode: 24/100, time: 11
Episode: 25/100, time: 11
Episode: 26/100, time: 17
Episode: 27/100, time: 28
Episode: 28/100, time: 27
Episode: 29/100, time: 91
Episode: 30/100, time: 34
Episode: 31/100, time: 17
Episode: 32/100, time: 23
Episode: 33/100, time: 17
Episode: 34/100, time: 15
Episode: 35/100, time: 31
Episode: 36/100, time: 22
Episode: 37/100, time: 19
Episode: 38/100, time: 31
Episode: 39/100, time: 19
Episode: 40/100, time: 20
Episode: 41/100, time: 49
Episode: 42/100, time: 43
Episode: 43/100, t

In [4]:
import time

# Evaluation: load the saved network and run it greedily for 10 episodes.
env = gym.make("CartPole-v1")
trained_model = DQLAgent(env)
trained_model.load("cartpole-dqn.h5")
state_size = env.observation_space.shape[0]

try:
    for i in range(10):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        time_t = 0

        while True:
            env.render()
            # Query the loaded network directly. A freshly constructed
            # DQLAgent starts with epsilon = 1, so `act()` would always
            # explore and the loaded model would never be used.
            q_values = trained_model.model.predict(state)
            action = np.argmax(q_values[0])
            next_state, reward, done, _ = env.step(action)
            state = np.reshape(next_state, [1, state_size])
            time_t += 1
            time.sleep(0.2)  # slow the rendering down so it can be watched
            if done:
                print(i, ": ", time_t)
                break
    print("Done")
finally:
    # Close the environment even when the run is interrupted.
    env.close()

14
Done
12
Done
21
Done
13
Done
26
Done
72
Done
32
Done
14
Done
33
Done
14
Done
28
Done
14
Done
15
Done
10
Done
21
Done
35
Done
10
Done
44
Done
15
Done
20
Done
13
Done
15
Done
66
Done
28
Done
27
Done
13
Done
18
Done
17
Done
63
Done
15
Done
14
Done
13
Done
11
Done
14
Done
30
Done
53
Done
12
Done
13
Done


KeyboardInterrupt: 

In [5]:
# Close the environment manually: the evaluation cell above was stopped by a
# KeyboardInterrupt before it reached its own env.close() call.
env.close()