In [1]:
import gym
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Dropout
from keras.layers.merge import Add, Multiply
from keras.optimizers import Adam
import keras.backend as K
import tensorflow as tf

import random
from collections import deque
import numpy as np

Using TensorFlow backend.


In [2]:
class ActorCritic():
    def __init__(self, env, sess):
        self.env = env
        self.sess = sess
        
        # 学习参数
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.95
        self.learning_rate = 0.001
        self.memory = deque(maxlen=2000)
        
        self.tau = 0.125
        
        self.actor_state_input, self.actorModel = self.createActorModel()
        _, self.targetActorModel = self.createActorModel()
        
        self.critic_state_input, self.critic_action_input, self.criticModel = self.createCriticModel()
        _, _, self.targetCriticModel = self.createCriticModel()
        
        self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.env.action_space.shape[0]])
        self.actor_model_weights = self.actorModel.trainable_weights
        self.actor_grads = tf.gradients(self.actorModel.output, self.actor_model_weights, -self.actor_critic_grad)
        self.grads = zip(self.actor_grads, self.actor_model_weights)
        self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(self.grads)
        
        self.critic_grads = tf.gradients(self.criticModel.output, self.critic_action_input)
    # 模型定义
    def createActorModel(self):
        state_input = Input(shape=self.env.observation_space.shape)  # self.env.observation_shape.shape为状态的大小
        x = Dense(24, activation='relu')(state_input)
        x = Dense(48, activation='relu')(x)
        x = Dense(24, activation='relu')(x)
        output = Dense(self.env.action_space.shape[0], activation='relu')(x)
        
        model = Model(inputs=state_input, outputs=output)
        adam = Adam(lr=0.001)
        model.compile(loss="mse", optimizer=adam)
        
        return state_input, model
    
    
    def createCriticModel(self):
        state_input = Input(shape=self.env.observation_space.shape)
        x = Dense(64, activation='relu')(state_input)
        x = Dense(32)(x)
        
        action_input = Input(shape=self.env.action_space.shape)
        y = Dense(32)(action_input)
        
        merged = Add()([x, y])
        merged = Dense(16, activation='relu')(merged)
        output = Dense(1, activation='relu')(merged)
        
        model = Model(inputs=[state_input, action_input], outputs=output)
        adam = Adam(lr=0.001)
        model.compile(loss='mse', optimizer=adam)
        
        return state_input, action_input, model
        
        
    def train(self):
        batch_size = 32
        if len(self.memory) <= batch_size:
            return
        
        reward = []
        samples = random.sample(self.memory, batch_size)
        self.trainCritic(samples)
        self.trainActor(samples)
    
    
    def trainCritic(self, samples):
        for sample in samples:
            state, action, reward, next_state, done = sample
            if not done:
                target_action = self.targetActorModel.predict(next_state)
                future_reward = self.targetCriticModel.predict([next_state, target_action])[0][0]
                reward += self.gamma * future_reward
            self.criticModel.fit([state, action], reward, verbose=0)
            
    
    def trainActor(self, samples):
        for sample in samples:
            state, action, reward, next_state, done = sample
            predicted_action = self.actorModel.predict(state)
            grads = self.sess.run(self.critic_grads, feed_dict={self.critic_state_input:state, self.critic_action_input: predicted_action})[0]
            
            self.sess.run(self.optimize, feed_dict={self.critic_state_input:state, self.actor_critic_grad: grads})
    
    
    def updateTargetActorModel(self):
        self.actor_model_weights = self.actorModel.get_weights()
        actor_target_weights = self.targetCriticModel.get_weights()
        
        for i in range(len(actor_target_weights)):
            actor_target_weights[i] = actor_model_weights[i]
        self.targetCriticModel.set_weights(actor_target_weights)
        
        
    def act(self, state):
        self.epsilon *= self.epsilon_decay
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return self.actorModel.predict(state)
    
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

In [3]:
def main():
    sess = tf.Session()
    K.set_session(sess)
    env = gym.make('Pendulum-v0')
    actor_critic = ActorCritic(env, sess)
    
    num_trials = 10000
    trial_len = 500
    
    state = env.reset()
    action = env.action_space.sample()
    
    while True:
        env.render()
        state = np.array(state).reshape(-1, env.observation_space.shape[0])
        action = actor_critic.act(state)
        action = np.array(action).reshape(-1, env.action_space.shape[0])
        
        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state).reshape(-1, env.observation_space.shape[0])
        
        actor_critic.remember(state, action, reward, next_state, done)
        actor_critic.train()
        
        state = next_state
        

if __name__ =="__main__":
    main()

  result = entry_point.load(False)


InvalidArgumentError: You must feed a value for placeholder tensor 'input_1' with dtype float and shape [?,3]
	 [[Node: input_1 = Placeholder[dtype=DT_FLOAT, shape=[?,3], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'input_1', defined at:
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\ipykernel\kernelapp.py", line 505, in start
    self.io_loop.start()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\asyncio\base_events.py", line 1434, in _run_once
    handle._run()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
    ret = callback()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\gen.py", line 1233, in inner
    self.run()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\ipykernel\kernelbase.py", line 370, in dispatch_queue
    yield self.process_one()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\gen.py", line 346, in wrapper
    runner = Runner(result, future, yielded)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\gen.py", line 1080, in __init__
    self.run()
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\ipykernel\kernelbase.py", line 357, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\ipykernel\kernelbase.py", line 267, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\ipykernel\kernelbase.py", line 534, in execute_request
    user_expressions, allow_stdin,
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\IPython\core\interactiveshell.py", line 2843, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\IPython\core\interactiveshell.py", line 2869, in _run_cell
    return runner(coro)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\IPython\core\interactiveshell.py", line 3044, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\IPython\core\interactiveshell.py", line 3209, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\IPython\core\interactiveshell.py", line 3291, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-3-f2af202181b8>", line 29, in <module>
    main()
  File "<ipython-input-3-f2af202181b8>", line 5, in main
    actor_critic = ActorCritic(env, sess)
  File "<ipython-input-2-51bfd8905a8e>", line 15, in __init__
    self.actor_state_input, self.actorModel = self.createActorModel()
  File "<ipython-input-2-51bfd8905a8e>", line 30, in createActorModel
    state_input = Input(shape=self.env.observation_space.shape)  # self.env.observation_shape.shape为状态的大小
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\keras\engine\input_layer.py", line 178, in Input
    input_tensor=tensor)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\keras\legacy\interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\keras\engine\input_layer.py", line 87, in __init__
    name=self.name)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\keras\backend\tensorflow_backend.py", line 517, in placeholder
    x = tf.placeholder(dtype, shape=shape, name=name)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tensorflow\python\ops\array_ops.py", line 1734, in placeholder
    return gen_array_ops.placeholder(dtype=dtype, shape=shape, name=name)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 5927, in placeholder
    "Placeholder", dtype=dtype, shape=shape, name=name)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tensorflow\python\framework\ops.py", line 3414, in create_op
    op_def=op_def)
  File "c:\users\zhchxiao\anaconda3\envs\pytorch\lib\site-packages\tensorflow\python\framework\ops.py", line 1740, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InvalidArgumentError (see above for traceback): You must feed a value for placeholder tensor 'input_1' with dtype float and shape [?,3]
	 [[Node: input_1 = Placeholder[dtype=DT_FLOAT, shape=[?,3], _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [9]:
len([1,1,1,1,1])

5