In [1]:
import random
from collections import deque

import gym
import numpy as np
from keras.layers import Input, Dense
from keras.models import Model, load_model
from keras.optimizers import Adam, RMSprop

In [2]:
# Shared module-level CartPole-v1 environment; Random_games() reads this global.
env = gym.make("CartPole-v1")

In [3]:
def Random_games():
    """Play 10 episodes with randomly sampled actions, rendering and printing every step.

    Operates on the module-level ``env``. Each episode runs for at most 500
    steps and stops early as soon as the environment reports ``done``.
    Nothing is returned; the per-step data is only printed.
    """

    def _play_one_episode(max_steps):
        # Start a fresh episode; the initial observation is not needed here.
        env.reset()
        step_index = 0
        while step_index < max_steps:
            # Draw the current frame of the environment.
            env.render()

            # Pick an arbitrary valid action from the environment's action space.
            action = env.action_space.sample()

            # Apply the action; the environment hands back the next observation,
            # the reward, the episode-finished flag and an info object.
            next_state, reward, done, info = env.step(action)

            print(step_index, next_state, reward, done, info, action)
            if done:
                return
            step_index += 1

    for _ in range(10):
        # 500 is the per-episode step cap; random play is not expected to reach it.
        _play_one_episode(500)

In [16]:
# Sanity check: run the random-action episodes (renders the environment and prints each step).
Random_games()

0 [ 0.0070667   0.23074133  0.00252513 -0.2503905 ] 1.0 False {} 1
1 [ 0.01168153  0.42582713 -0.00248268 -0.54227589] 1.0 False {} 1
2 [ 0.02019807  0.23074016 -0.0133282  -0.25037624] 1.0 False {} 0
3 [ 0.02481287  0.03581105 -0.01833572  0.03807308] 1.0 False {} 0
4 [ 0.02552909 -0.15904323 -0.01757426  0.32491497] 1.0 False {} 0
5 [ 0.02234823  0.03632448 -0.01107596  0.02674209] 1.0 False {} 1
6 [ 0.02307472 -0.15863689 -0.01054112  0.31590995] 1.0 False {} 0
7 [ 0.01990198 -0.35360712 -0.00422292  0.60525001] 1.0 False {} 0
8 [ 0.01282984 -0.54866977  0.00788208  0.89659983] 1.0 False {} 0
9 [ 0.00185644 -0.74389768  0.02581407  1.19174992] 1.0 False {} 0
10 [-0.01302151 -0.54911951  0.04964907  0.90726849] 1.0 False {} 1
11 [-0.0240039  -0.35470361  0.06779444  0.63059483] 1.0 False {} 1
12 [-0.03109797 -0.55070276  0.08040634  0.9438345 ] 1.0 False {} 0
13 [-0.04211203 -0.35675074  0.09928303  0.67746041] 1.0 False {} 1
14 [-0.04924704 -0.16313812  0.11283224  0.41761293] 1.0 F

13 [-0.02156353 -0.74930845 -0.01235379  0.90862458] 1.0 False {} 1
14 [-0.0365497  -0.94426102  0.0058187   1.1973992 ] 1.0 False {} 0
15 [-0.05543492 -1.13945779  0.02976669  1.4919001 ] 1.0 False {} 0
16 [-0.07822408 -1.33492909  0.05960469  1.79372727] 1.0 False {} 0
17 [-0.10492266 -1.14052343  0.09547923  1.52015002] 1.0 False {} 1
18 [-0.12773313 -0.94667646  0.12588223  1.25873221] 1.0 False {} 1
19 [-0.14666666 -0.75336972  0.15105688  1.00797817] 1.0 False {} 1
20 [-0.16173405 -0.95015002  0.17121644  1.34402942] 1.0 False {} 0
21 [-0.18073705 -1.14696143  0.19809703  1.68502199] 1.0 False {} 0
22 [-0.20367628 -0.95460672  0.23179747  1.45999315] 1.0 True {} 1
0 [-0.036913    0.1872989   0.03839906 -0.30961422] 1.0 False {} 1
1 [-0.03316702 -0.00834852  0.03220678 -0.00507287] 1.0 False {} 0
2 [-0.03333399 -0.2039172   0.03210532  0.29759518] 1.0 False {} 0
3 [-0.03741234 -0.00926728  0.03805723  0.01520809] 1.0 False {} 1
4 [-0.03759768  0.18528881  0.03836139 -0.26522859] 1

29 [-0.01258099 -0.43289633 -0.03693426  0.47567966] 1.0 False {} 0
30 [-0.02123892 -0.62747781 -0.02742066  0.75649663] 1.0 False {} 0
31 [-0.03378847 -0.8222113  -0.01229073  1.04042639] 1.0 False {} 0
32 [-0.0502327  -1.01716783  0.0085178   1.32922571] 1.0 False {} 0
33 [-0.07057606 -1.21239621  0.03510231  1.62456187] 1.0 False {} 0
34 [-0.09482398 -1.01770445  0.06759355  1.34302208] 1.0 False {} 1
35 [-0.11517807 -1.21360859  0.09445399  1.65606463] 1.0 False {} 0
36 [-0.13945024 -1.40969713  0.12757528  1.97661456] 1.0 False {} 0
37 [-0.16764418 -1.60591155  0.16710758  2.30594979] 1.0 False {} 0
38 [-0.19976241 -1.80212562  0.21322657  2.64507229] 1.0 True {} 0


In [4]:
# Neural network model for Deep Q Learning
def OurModel(input_shape, action_space):
    """Build, compile and summarise the fully connected Q-network.

    Architecture: input -> Dense(512) -> Dense(256) -> Dense(64) -> Dense(action_space).
    The three hidden layers use ReLU; the output layer is linear and has one
    unit per action. All layers use the 'he_uniform' kernel initializer.

    Args:
        input_shape: Shape tuple handed to ``Input`` (and to the first Dense layer).
        action_space: Number of output units, i.e. number of actions.

    Returns:
        The compiled Keras ``Model`` named 'CartPole_DQN_model'. The model
        summary is printed as a side effect.
    """
    relu_he = dict(activation='relu', kernel_initializer='he_uniform')

    X_input = Input(input_shape)

    # First hidden layer (512 units); it is the only one that is also given input_shape.
    hidden = Dense(512, input_shape=input_shape, **relu_he)(X_input)

    # Remaining hidden layers: 256 units, then 64 units.
    for width in (256, 64):
        hidden = Dense(width, **relu_he)(hidden)

    # Output layer: one linear unit per action.
    q_values = Dense(action_space, activation='linear', kernel_initializer='he_uniform')(hidden)

    network = Model(inputs=X_input, outputs=q_values, name='CartPole_DQN_model')

    # NOTE(review): 'accuracy' is tracked although the loss is MSE on real-valued
    # outputs; it is presumably not informative here -- confirm before relying on it.
    optimizer = RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
    network.compile(loss='mse', optimizer=optimizer, metrics=['accuracy'])

    network.summary()
    return network

In [6]:
class DQNAgent:
    """DQN agent for CartPole-v1 using epsilon-greedy exploration and experience replay.

    A single Q-network (``self.model``) is used both to pick actions and to
    compute the bootstrap targets; there is no separate target network.

    The code uses the Gym API in which ``env.reset()`` returns the observation
    and ``env.step()`` returns ``(next_state, reward, done, info)``.
    """

    def __init__(self):
        """Create the environment, the hyper-parameters, the replay memory and the Q-network."""
        self.env = gym.make('CartPole-v1')
        # By default, CartPole-v1 has max episode steps = 500
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.EPISODES = 1000
        self.memory = deque(maxlen=2000)  # Replay buffer; oldest transitions are dropped first
        self.gamma = 0.95  # Discount rate
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.999
        self.batch_size = 64
        self.train_start = 1000  # Minimum number of stored transitions before training starts

        # Create main model
        self.model = OurModel(input_shape=(self.state_size, ), action_space=self.action_size)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon once enough experience has been collected.

        Epsilon is multiplied by ``epsilon_decay`` on every call made while the
        memory holds more than ``train_start`` transitions and epsilon is still
        above ``epsilon_min``.
        """
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.train_start and self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state):
        """Return an action index: random with probability epsilon, otherwise the greedy one."""
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        return np.argmax(self.model.predict(state))

    def replay(self):
        """Run one training step on a random minibatch drawn from the replay memory.

        Does nothing until the memory holds at least ``train_start`` transitions.
        For every sampled transition the Q-value of the action that was taken is
        replaced by ``reward`` (terminal) or ``reward + gamma * max_a' Q(s', a')``
        (non-terminal); the model is then fitted once on the whole batch.
        """
        if len(self.memory) < self.train_start:
            return

        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
        # Size everything by the number of transitions actually drawn, which can
        # be smaller than batch_size when the memory is short.
        n_samples = len(minibatch)

        state = np.zeros((n_samples, self.state_size))
        next_state = np.zeros((n_samples, self.state_size))
        action, reward, done = [], [], []

        # Unpack the transitions into batch arrays before predicting.
        for i, (s, a, r, s_next, d) in enumerate(minibatch):
            state[i] = s
            action.append(a)
            reward.append(r)
            next_state[i] = s_next
            done.append(d)

        # Batch predictions: current Q-values and next-state Q-values, both from self.model.
        target = self.model.predict(state)
        target_next = self.model.predict(next_state)

        for i in range(n_samples):
            # Only the Q-value of the action that was actually taken is corrected.
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # Standard DQN target: Q_max = max_a' Q(s', a')
                target[i][action[i]] = reward[i] + self.gamma * np.amax(target_next[i])

        # Train once per replay, after ALL targets of the batch have been built.
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)

    def load(self, name):
        """Replace ``self.model`` with the model loaded from ``name`` via ``load_model``."""
        self.model = load_model(name)

    def save(self, name):
        """Save ``self.model`` to ``name``."""
        self.model.save(name)

    def run(self):
        """Train for up to ``EPISODES`` episodes.

        Training stops early, after saving the model to 'cartpole-dqn.h5', as
        soon as one episode lasts the environment's maximum number of steps.
        """
        max_steps = self.env._max_episode_steps
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])

                # Penalise episodes that end before the step limit; finishing on
                # the last allowed step keeps the environment's own reward.
                if done and i != max_steps - 1:
                    reward = -100

                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1
                if done:
                    print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.EPISODES, i, self.epsilon))
                    if i == max_steps:
                        print("Saving trained model as cartpole-dqn.h5")
                        self.save("cartpole-dqn.h5")
                        return
                self.replay()

    def test(self):
        """Load 'cartpole-dqn.h5' and play ``EPISODES`` rendered episodes with greedy actions."""
        self.load("cartpole-dqn.h5")
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                # Step with the chosen action (not with the observation).
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.state_size])
                i += 1
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
                    break

In [None]:
if __name__ == "__main__":
    # Build the agent (its constructor also creates the Q-network) and start training.
    agent = DQNAgent()
    agent.run()

Model: "CartPole_DQN_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4)]               0         
_________________________________________________________________
dense (Dense)                (None, 512)               2560      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_2 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 130       
Total params: 150,466
Trainable params: 150,466
Non-trainable params: 0
_________________________________________________________________
episode: 0/1000, score: 44, e: 1.0
episode: 1/1000, score: 27, e: 1.0
episode: 2/1000, score: 11, e: 1.0
epi