In [2]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
import random
import numpy as np
import numpy
from collections import deque
import gym

In [3]:
class DQN:
    """Deep Q-learning agent with an epsilon-greedy policy and replay memory.

    The Q-function is approximated by a small fully connected Keras network
    that takes a ``(1, state_size)`` observation and outputs one Q-value per
    action.
    """

    def __init__(self, action_size, state_size=4):
        """Set up hyper-parameters, the replay memory and the Q-network.

        Args:
            action_size: Number of discrete actions; also the width of the
                network's output layer.
            state_size: Length of one observation vector. Defaults to 4.
        """
        self.action_size = action_size
        self.state_size = state_size
        # Bounded replay memory: once full, the oldest transitions fall out.
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95            # discount applied to the next-state value
        self.epsilon = 1.0           # probability of taking a random action
        self.epsilon_min = 0.01      # floor for epsilon
        self.epsilon_decay = 0.95    # multiplicative decay applied per replay()
        self.learning_rate = 0.001
        self.model = self.buildModel()

    def buildModel(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Model`` with two 16-unit ReLU layers followed
            by a linear layer of ``action_size`` outputs, using MSE loss and
            the Adam optimizer.
        """
        inputs = Input(shape=(self.state_size, ))
        x = Dense(16, activation='relu')(inputs)
        x = Dense(16, activation='relu')(x)
        # One linear output per action, so the network matches action_size
        # instead of a fixed width.
        outputs = Dense(self.action_size, activation='linear')(x)
        model = Model(inputs=inputs, outputs=outputs)
        # NOTE(review): some Keras releases spell this keyword
        # `learning_rate` instead of `lr` - confirm against the installed
        # version before changing it.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        Args:
            state: Observation in the batched form the model expects.

        Returns:
            int: A random action index with probability ``epsilon``,
            otherwise the index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        # Cast so both branches return a plain Python int.
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Fit the network on a random minibatch of remembered transitions.

        Does nothing (and leaves ``epsilon`` untouched) while fewer than
        ``batch_size`` transitions are stored, because ``random.sample``
        raises ``ValueError`` when asked for more items than are available.

        Args:
            batch_size: Number of transitions to sample and fit on.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrapped target: reward plus discounted best next Q-value.
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)
            # Only the taken action's Q-value is moved toward the target; the
            # other outputs keep their current predictions.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the decay step never undershoots the floor.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

            
def main(episodes, epochss, batch_size=16, render=True):
    """Train a DQN agent on the CartPole-v0 environment.

    Each step's transition is stored in the agent's replay memory, and after
    every episode the agent is trained on a random minibatch once enough
    transitions have been collected.

    Args:
        episodes: Number of episodes to play.
        epochss: Maximum number of environment steps per episode.
        batch_size: Number of stored transitions replayed after each episode.
        render: Whether to draw the environment on every step.
    """
    env = gym.make('CartPole-v0')
    agent = DQN(2)
    try:
        for episode in range(episodes):
            state = env.reset()
            # The model consumes batched input, so reshape to (1, 4).
            state = np.array(state).reshape(-1, 4)

            for epoch in range(epochss):
                if render:
                    env.render()
                action = agent.act(state)
                next_state, reward, done, _ = env.step(action)
                next_state = np.array(next_state).reshape(-1, 4)
                # Call remember(); assigning to the attribute would replace
                # the method with a tuple and store nothing.
                agent.remember(state, action, reward, next_state, done)

                state = next_state

                if done:
                    print("episode:{}/{}, score:{}".format(episode, episodes, epoch))
                    break

            # Train once per episode, but only when the memory can supply a
            # full minibatch (sampling more than is stored raises ValueError).
            if len(agent.memory) >= batch_size:
                agent.replay(batch_size)
    finally:
        # Release the environment (and any render window) even when training
        # is interrupted.
        env.close()
    
# Start training when this cell (or script) is executed directly:
# 1000 episodes, each capped at 1000 steps.
if __name__ == "__main__":
    main(episodes=1000, epochss=1000)

  result = entry_point.load(False)


episode:0/1000, score:14
episode:1/1000, score:10
episode:2/1000, score:21
episode:3/1000, score:14
episode:4/1000, score:15
episode:5/1000, score:28
episode:6/1000, score:12
episode:7/1000, score:12
episode:8/1000, score:11
episode:9/1000, score:25
episode:10/1000, score:16
episode:11/1000, score:21
episode:12/1000, score:17
episode:13/1000, score:22
episode:14/1000, score:16
episode:15/1000, score:27
episode:16/1000, score:19
episode:17/1000, score:25
episode:18/1000, score:38
episode:19/1000, score:13
episode:20/1000, score:26
episode:21/1000, score:16
episode:22/1000, score:9
episode:23/1000, score:36
episode:24/1000, score:17
episode:25/1000, score:22
episode:26/1000, score:14
episode:27/1000, score:10
episode:28/1000, score:47
episode:29/1000, score:13
episode:30/1000, score:12
episode:31/1000, score:53
episode:32/1000, score:15
episode:33/1000, score:12
episode:34/1000, score:32
episode:35/1000, score:10
episode:36/1000, score:21
episode:37/1000, score:20
episode:38/1000, score:

KeyboardInterrupt: 

In [None]:
import gym
import random
env = gym.make('CartPole-v0')

# Preview loop: repeatedly reset the environment and draw its initial state.
# It never terminates on its own; interrupt the kernel to stop it. The
# try/finally makes sure the environment is closed when that happens.
try:
    while True:
        env.reset()
        env.render()
finally:
    env.close()