In [6]:
import gym
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout
from keras.layers.merge import Add, Multiply
from keras.optimizers import Adam
import keras.backend as K
import tensorflow as tf

import random
from collections import deque
import numpy as np

In [15]:
class ActorCritic():
    def __init__(self, env, sess):
        self.env = env
        self.sess = sess
        
        # 学习参数
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.95
        self.learning_rate = 0.001
        self.memory = deque(maxlen=2000)
        
        self.tau = 0.125
        
        self.actor_state_input, self.actorModel = self.createActorModel()
        _, self.targetActorModel = self.createActorModel()
        
        self.critic_state_input, self.critic_action_input, self.criticModel = self.createCriticModel()
        _, _, self.targetCriticModel = self.createCriticModel()
        
        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.env.action_space.shape[0]])
        self.actor_model_weights = self.actorModel.trainable_weights
        self.actor_grads = tf.gradients(self.actorModel.output, actor_model_weights, -self.actor_critic_grad)
        self.graps = zip(self.actor_grads, actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)
        
        self.critic_grads = tf.gradients(self.criticModel.output, self.critic_action_input)
    # 模型定义
    def createActorModel(self):
        state_input = Input(shape=self.env.observation_space.shape)  # self.env.observation_shape.shape为状态的大小
        x = Dense(24, activation='relu')(state_input)
        x = Dense(48, activation='relu')(x)
        x = Dense(24, activation='relu')(x)
        output = Dense(self.env.action_space.shape[0], activation='relu')(x)
        
        model = Model(inputs=state_input, outputs=output)
        adam = Adam(lr=0.001)
        model.compile(loss="mse", optimizer=adam)
        
        return state_input, model
    
    
    def createCriticModel(self):
        state_input = Input(shape=self.env.observation_space.shape)
        x = Dense(64, activation='relu')(state_input)
        x = Dense(32)(x)
        
        action_input = Input(shape=self.env.action_space.shape)
        y = Dense(32)(action_input)
        
        merged = Add()([x, y])
        merged = Dense(16, activation='relu')(merged)
        output = Dense(1, activationn='relu')(merged)
        
        model = Model(inputs=[state_input, action_input], outputs=output)
        adam = Adam(lr=0.001)
        model.compile(loss='mse', optimizer=adam)
        
        return state_input, action_input, model
        
        
    def train(self):
        batch_size = 32
        if len(self.memory) <= batch_size:
            return
        
        reward = []
        samples = self.memory.sample(self.memory, batch_size)
        self.trainCritic(samples)
        self.trainActor(samples)
    
    
    def trainCritic(self, samples):
        for sample in samples:
            state, action, reward, next_state, done = sample
            if not done:
                target_action = self.targetActorModel.predict(new_state)
                future_reward = self.targetCriticModel.predict([new_state, target_action])[0][0]
                reward += self.gamma * future_reward
            self.criticModel.fit([state, action], reward, verbose=0)
            
    
    def trainActor(self, samples):
        for sample in samples:
            state, action, reward, next_state, done = sample
            predicted_action = self.buildActorModel.predict(state)
            grads = self.sess.run(self.critic_grads, feed_dict={self.critic_state_input:state, self.critic_action_input: predicted_action})[0]
            
            self.sess.run(self.optimize, feed_dict={self.critic_state_input:state, self.actor_critic_grad: grads})
    
    
    def updateTargetActorModel(self):
        self.actor_model_weights = self.actorModel.get_weights()
        actor_target_weights = self.targetCriticModel.get_weights()
        
        for i in range(len(actor_target_weights)):
            actor_target_weights[i] = actor_model_weights[i]
        self.targetCriticModel.set_weights(actor_target_weights)
        
        
    def act(self, state):
        self.epsilon *= self.epsilon_decay
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return self.actorModel.predict(state)
    
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

In [16]:
def main():
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make('Pendulum-v0')
    actor_critic = ActorCritic(env, sess)
    
    num_trials = 10000
    trial_len = 500
    
    state = env.reset()
    action = env.action_space.sample()
    
    while True:
        env.render()
        state = np.array(state).reshape(-1, env.observation_space.shape[0])
        action = actor_critic.act(state)
        action = np.array(action).reshape(-1, env.action_space.shape[0])
        
        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state).reshape(-1, env.observation_space.shape[0])
        
        actor_critic.remember(state, action, reward, next_state, done)
        actor_critic.train()
        
        state = next_state
        

if __name__ =="__main__":
    main()

  result = entry_point.load(False)


ValueError: Layer dense_2 was called with an input that isn't a symbolic tensor. Received type: <class 'method'>. Full input: [<bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x000001B0A647B160>>]. All inputs to the layer should be tensors.

In [9]:
len([1,1,1,1,1])

5