In [1]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
import gym
import numpy as np 
import random
from collections import deque

In [2]:
# Hyper Parameters
ENV_NAME = 'CartPole-v0'
EPISODE = 500 # Episode limitation
STEP = 300 # Step limitation in an episode
TEST=10
# Hyper Parameters for DQN
GAMMA = 0.9 # discount factor for target Q
INITIAL_EPSILON = 0.5 # starting value of epsilon
FINAL_EPSILON = 0.01 # final value of epsilon
REPLAY_SIZE = 10000 # experience replay buffer size
BATCH_SIZE = 32 # size of minibatch

env = gym.make(ENV_NAME)

In [3]:
# env.reset()
# for _ in range(1000):
#     env.render()
#     env.step(env.action_space.sample()) # take a random action
# env.close()

In [6]:
class DQN(object):
    def __init__(self,state_size, action_size,dd=True):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=3000)
        self.gamma = 0.99    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.train_batch = 32
        self._model = self._createModel(dd)
        
    @property
    def model(self):# 定义为只读属性
        return self._model
    def _huber_loss(self, target, prediction):
        # sqrt(1+error^2)-1
        error = prediction - target
        return K.mean(K.sqrt(1+K.square(error))-1, axis=-1)
    
    def _createModel(self,dd=True):
        model = keras.Sequential()
        model.add(layers.Dense(24, activation='relu'))
        model.add(layers.Dense(24, activation='relu'))
        model.add(layers.Dense(self.action_size, activation='linear'))
        if dd:
            model.compile(loss=self._huber_loss,optimizer='adam')
        else:
            model.compile(loss='mse',optimizer='adam')
        return model
        
    def train(self):
        if len(self.memory)>=self.train_batch:
            minibatch = random.sample(self.memory,self.train_batch) 
            state_batch = np.zeros([self.train_batch,self.state_size])
            target_batch = np.zeros([self.train_batch,self.action_size]) 
            for i,(state, action, reward, next_state, done) in enumerate(minibatch):
                state_batch[i,:] = state
                target_batch[i,:] = self.predict_action(state)
                target_batch[i,action] = reward if done else reward+self.gamma*np.amax(self.predict_action(next_state)[0])
                # print(state_batch.shape,target_batch.shape)
            self.model.fit(state_batch, target_batch, epochs=1, verbose=0)
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
            
    def predict_action(self,state):# 预测动作 
        print(state.shape)
        return self.model.predict(state)
    def act(self,state):# 执行的动作，具有随机性
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        else:
            #print(self.predict_action(state)) 
            return np.argmax(self.predict_action(state)[0])
        
    def remember(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
        #self._train()     
    def save(self,name = 'models/test'):
        self.model.save(name)
        self.saveWeight(name)
    def load(self,name = 'models/test'):
        self._model= load_model(name)
    def saveWeight(self,name = 'models/test'):
        self.model.save_weights(name+'.weight')
    def loadWeight(self,name = 'models/test'):
        self.model.load_weights(name+'.weight')
agent = DQN(4,2,False)

In [7]:
saveModelName = './DQN'
for times in range(EPISODE):
        state = env.reset().reshape(1,4) 
        for i in range(199):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            reward = 0.1 if not done else -1
            next_state = next_state.reshape(1,4)
            agent.remember(state,action,reward,next_state,done)
            state = next_state
            if done:
                print('[times]:{}/{}\t\t[i]:{}\t\t[epsilon]:{}'.format(times,EPISODE,i,agent.epsilon))
                break
            if i == 198:
                print('[times]:{}/{}\t\t[i]:{}\t\t[epsilon]:{}\t#success#'.format(times,EPISODE,i,agent.epsilon))
        agent.train()
        # if (times+1)%100==0:
        #     agent.save(saveModelName+str(times+1))
        #     print('[Saved] savename: `%s`'%(saveModelName+str(times+1)))

[times]:0/500		[i]:24		[epsilon]:1.0
[times]:1/500		[i]:47		[epsilon]:1.0
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
[times]:2/500		[i]:10		[epsilon]:0.995
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4)
(1, 4

KeyboardInterrupt: 

In [16]:
agent.save('./models/')

INFO:tensorflow:Assets written to: ./models/assets


In [24]:
state=env.reset()
for _ in range(1000):
    env.render()
    state, reward, done, _ =env.step(np.argmax(agent.predict_action(state.reshape(1,4))[0])) # take a random action
    if done:
        break
env.close()