# Cartpole DQN

Deep Q-Learning Network with Keras and OpenAI Gym, based on [Keon Kim's code](https://github.com/keon/deep-q-learning/blob/master/dqn.py).

#### Import dependencies

In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import os # for creating directories

Using TensorFlow backend.


#### Set hyperparameters

In [2]:
env = gym.make('CartPole-v0') # create the CartPole-v0 environment the agent will interact with

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.


In [3]:
# length of one observation vector; passed to DQNAgent as the network's input width
state_size = env.observation_space.shape[0]
state_size  # bare expression so the notebook displays the value

4

In [4]:
# number of discrete actions; passed to DQNAgent as the network's output width
action_size = env.action_space.n
action_size  # bare expression so the notebook displays the value

2

In [5]:
batch_size = 32 # number of remembered experiences sampled per call to agent.replay()

In [6]:
n_episodes = 1000 # number of episodes (games) the agent plays during training

In [7]:
output_dir = 'model_output/cartpole/' # directory that receives the periodic weight checkpoints

In [8]:
# Create the checkpoint directory (including missing parents). exist_ok=True
# replaces the separate exists() check, so re-running the cell is a no-op and
# there is no window between checking and creating.
os.makedirs(output_dir, exist_ok=True)

#### Define agent

In [9]:
class DQNAgent:
    """Deep Q-learning agent backed by a small fully connected Keras network.

    Experiences are kept in a bounded replay memory and sampled at random for
    training. Actions are chosen epsilon-greedily, and epsilon is decayed once
    per call to ``replay`` until it reaches ``epsilon_min``.
    """

    def __init__(self, state_size, action_size):
        """Store problem dimensions, set hyperparameters and build the network.

        Args:
            state_size: number of values in one state vector (network input width).
            action_size: number of discrete actions (network output width).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000) # bounded replay memory; oldest experiences are dropped once 2000 are stored
        self.gamma = 0.95 # discount rate applied to the estimated future reward
        self.epsilon = 1.0 # exploration rate: probability of acting randomly; starts fully random
        self.epsilon_decay = 0.995 # multiplicative decay applied to epsilon after each replay
        self.epsilon_min = 0.01 # floor below which epsilon is never decayed
        self.learning_rate = 0.001 # step size handed to the Adam optimizer
        self.model = self._build_model() # private method

    def _build_model(self):
        """Build and compile the network that approximates the Q-value function.

        Returns:
            A compiled ``Sequential`` model mapping a state to one Q-value per action.
        """
        model = Sequential()
        model.add(Dense(32, activation='relu',
                        input_dim=self.state_size)) # 1st hidden layer; states as input
        model.add(Dense(32, activation='relu')) # 2nd hidden layer
        model.add(Dense(self.action_size, activation='linear')) # one linear output per action
        # NOTE(review): newer Keras releases name this argument `learning_rate`;
        # `lr` is kept here to match the Keras version this notebook was run with.
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory for later training."""
        self.memory.append((state, action,
                            reward, next_state, done))

    def replay(self, batch_size):
        """Train the network on a random minibatch of remembered transitions.

        Each sampled transition is fitted individually (one ``fit`` call per
        sample). Afterwards epsilon is decayed, but never below ``epsilon_min``.

        If fewer than ``batch_size`` transitions are stored, the call returns
        without training or decaying epsilon, instead of letting
        ``random.sample`` raise ``ValueError``.

        Args:
            batch_size: number of transitions to sample from memory.
        """
        if len(self.memory) < batch_size:
            return # not enough experience collected yet
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward # terminal transition: nothing to bootstrap from
            if not done:
                # non-terminal: reward plus discounted best predicted Q-value of the next state
                target = (reward +
                          self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state) # current Q-value estimates for this state
            target_f[0][action] = target # only the taken action's value is moved towards the target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            # clamp so a decay step cannot undershoot the configured floor
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def act(self, state):
        """Choose an action epsilon-greedily for ``state``.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon: # explore
            return random.randrange(self.action_size)
        act_values = self.model.predict(state) # exploit: predicted Q-value per action
        return np.argmax(act_values[0])

    def save(self, name):
        """Write the network weights to the file ``name`` (weights only)."""
        self.model.save_weights(name)

    def load(self, name):
        """Load network weights from the file ``name`` into the current model."""
        self.model.load_weights(name)

#### Interact with environment

In [10]:
agent = DQNAgent(state_size, action_size) # build the agent and its Q-network for this environment's state/action sizes

In [11]:
# Step limit registered for this environment, if any. It is used below to tell
# "the episode was cut off by the time limit" apart from "the pole was dropped",
# so that surviving to the limit is not punished like a failure.
max_episode_steps = getattr(env.spec, "max_episode_steps", None)

for e in range(n_episodes): # iterate over new episodes of the game
    state = env.reset() # reset state at start of each new episode of the game
    state = np.reshape(state, [1, state_size]) # the network expects a batch dimension

    for step in range(5000): # upper bound on frames per episode
        # env.render()  # uncomment to watch the agent play
        action = agent.act(state) # epsilon-greedy choice of action index
        next_state, reward, done, _ = env.step(action) # advance the environment by one frame
        timed_out = (done and max_episode_steps is not None
                     and step + 1 >= max_episode_steps)
        failed = done and not timed_out # the episode ended before the step limit
        if failed:
            reward = -10 # penalise only genuine failures
        next_state = np.reshape(next_state, [1, state_size])
        # store `failed` rather than `done` as the terminal flag: a time-limit
        # cut-off is not a terminal state, so replay should still bootstrap from it
        agent.remember(state, action, reward, next_state, failed)
        state = next_state # the next state becomes the current state for the upcoming frame
        if done: # episode over: failure or step limit reached
            # score is the zero-based index of the final frame
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, n_episodes-1, step, agent.epsilon))
            break
    if len(agent.memory) > batch_size:
        agent.replay(batch_size) # one training pass on a sampled minibatch per episode
    if e % 50 == 0: # checkpoint the weights every 50 episodes
        agent.save(os.path.join(output_dir,
                                "weights_{:04d}.hdf5".format(e)))

episode: 0/999, score: 19, e: 1.0
episode: 1/999, score: 14, e: 1.0
episode: 2/999, score: 37, e: 0.99
episode: 3/999, score: 11, e: 0.99
episode: 4/999, score: 35, e: 0.99
episode: 5/999, score: 41, e: 0.98
episode: 6/999, score: 18, e: 0.98
episode: 7/999, score: 10, e: 0.97
episode: 8/999, score: 9, e: 0.97
episode: 9/999, score: 24, e: 0.96
episode: 10/999, score: 18, e: 0.96
episode: 11/999, score: 16, e: 0.95
episode: 12/999, score: 16, e: 0.95
episode: 13/999, score: 12, e: 0.94
episode: 14/999, score: 27, e: 0.94
episode: 15/999, score: 22, e: 0.93
episode: 16/999, score: 16, e: 0.93
episode: 17/999, score: 30, e: 0.92
episode: 18/999, score: 16, e: 0.92
episode: 19/999, score: 14, e: 0.91
episode: 20/999, score: 10, e: 0.91
episode: 21/999, score: 19, e: 0.9
episode: 22/999, score: 32, e: 0.9
episode: 23/999, score: 17, e: 0.9
episode: 24/999, score: 34, e: 0.89
episode: 25/999, score: 9, e: 0.89
episode: 26/999, score: 21, e: 0.88
episode: 27/999, score: 10, e: 0.88
episode: 

episode: 226/999, score: 61, e: 0.32
episode: 227/999, score: 56, e: 0.32
episode: 228/999, score: 36, e: 0.32
episode: 229/999, score: 63, e: 0.32
episode: 230/999, score: 31, e: 0.32
episode: 231/999, score: 29, e: 0.32
episode: 232/999, score: 54, e: 0.31
episode: 233/999, score: 94, e: 0.31
episode: 234/999, score: 57, e: 0.31
episode: 235/999, score: 58, e: 0.31
episode: 236/999, score: 54, e: 0.31
episode: 237/999, score: 45, e: 0.31
episode: 238/999, score: 38, e: 0.3
episode: 239/999, score: 52, e: 0.3
episode: 240/999, score: 42, e: 0.3
episode: 241/999, score: 30, e: 0.3
episode: 242/999, score: 99, e: 0.3
episode: 243/999, score: 93, e: 0.3
episode: 244/999, score: 48, e: 0.3
episode: 245/999, score: 63, e: 0.29
episode: 246/999, score: 16, e: 0.29
episode: 247/999, score: 42, e: 0.29
episode: 248/999, score: 72, e: 0.29
episode: 249/999, score: 69, e: 0.29
episode: 250/999, score: 25, e: 0.29
episode: 251/999, score: 56, e: 0.29
episode: 252/999, score: 40, e: 0.28
episode:

episode: 447/999, score: 199, e: 0.11
episode: 448/999, score: 95, e: 0.11
episode: 449/999, score: 127, e: 0.11
episode: 450/999, score: 166, e: 0.11
episode: 451/999, score: 164, e: 0.1
episode: 452/999, score: 99, e: 0.1
episode: 453/999, score: 68, e: 0.1
episode: 454/999, score: 101, e: 0.1
episode: 455/999, score: 199, e: 0.1
episode: 456/999, score: 199, e: 0.1
episode: 457/999, score: 199, e: 0.1
episode: 458/999, score: 132, e: 0.1
episode: 459/999, score: 108, e: 0.1
episode: 460/999, score: 126, e: 0.1
episode: 461/999, score: 177, e: 0.1
episode: 462/999, score: 199, e: 0.099
episode: 463/999, score: 45, e: 0.099
episode: 464/999, score: 41, e: 0.098
episode: 465/999, score: 108, e: 0.098
episode: 466/999, score: 126, e: 0.097
episode: 467/999, score: 20, e: 0.097
episode: 468/999, score: 19, e: 0.096
episode: 469/999, score: 42, e: 0.096
episode: 470/999, score: 90, e: 0.095
episode: 471/999, score: 72, e: 0.095
episode: 472/999, score: 69, e: 0.094
episode: 473/999, score

episode: 661/999, score: 199, e: 0.037
episode: 662/999, score: 199, e: 0.036
episode: 663/999, score: 94, e: 0.036
episode: 664/999, score: 156, e: 0.036
episode: 665/999, score: 199, e: 0.036
episode: 666/999, score: 159, e: 0.036
episode: 667/999, score: 199, e: 0.035
episode: 668/999, score: 199, e: 0.035
episode: 669/999, score: 36, e: 0.035
episode: 670/999, score: 24, e: 0.035
episode: 671/999, score: 43, e: 0.035
episode: 672/999, score: 199, e: 0.035
episode: 673/999, score: 199, e: 0.034
episode: 674/999, score: 45, e: 0.034
episode: 675/999, score: 68, e: 0.034
episode: 676/999, score: 199, e: 0.034
episode: 677/999, score: 37, e: 0.034
episode: 678/999, score: 28, e: 0.034
episode: 679/999, score: 69, e: 0.033
episode: 680/999, score: 199, e: 0.033
episode: 681/999, score: 199, e: 0.033
episode: 682/999, score: 199, e: 0.033
episode: 683/999, score: 199, e: 0.033
episode: 684/999, score: 199, e: 0.033
episode: 685/999, score: 199, e: 0.032
episode: 686/999, score: 199, e: 0

episode: 873/999, score: 199, e: 0.013
episode: 874/999, score: 199, e: 0.013
episode: 875/999, score: 199, e: 0.013
episode: 876/999, score: 199, e: 0.012
episode: 877/999, score: 199, e: 0.012
episode: 878/999, score: 199, e: 0.012
episode: 879/999, score: 199, e: 0.012
episode: 880/999, score: 119, e: 0.012
episode: 881/999, score: 199, e: 0.012
episode: 882/999, score: 199, e: 0.012
episode: 883/999, score: 199, e: 0.012
episode: 884/999, score: 199, e: 0.012
episode: 885/999, score: 199, e: 0.012
episode: 886/999, score: 199, e: 0.012
episode: 887/999, score: 199, e: 0.012
episode: 888/999, score: 199, e: 0.012
episode: 889/999, score: 199, e: 0.012
episode: 890/999, score: 199, e: 0.012
episode: 891/999, score: 199, e: 0.012
episode: 892/999, score: 199, e: 0.011
episode: 893/999, score: 199, e: 0.011
episode: 894/999, score: 199, e: 0.011
episode: 895/999, score: 199, e: 0.011
episode: 896/999, score: 199, e: 0.011
episode: 897/999, score: 158, e: 0.011
episode: 898/999, score: 

In [12]:
# saved weights can be restored with agent.load("./path/filename.hdf5"); only the network weights are saved and loaded, not epsilon or the replay memory