In [1]:
import matplotlib.pyplot as plt
import numpy as np
import numpy.random as nr
import sys
import random
import time
import tensorflow as tf
from collections import deque
from keras.layers import Dense, Input, Add, GaussianNoise,Concatenate
from keras.optimizers import Adam, SGD, Nadam
from keras.models import Model
from keras import backend as K
from keras import regularizers

from mlagents.envs import UnityEnvironment

%matplotlib inline

# Show which interpreter is running this notebook.
print("Python version:", sys.version, sep="\n")

# ML-Agents needs Python 3; stop immediately on anything older.
if sys.version_info < (3,):
    raise Exception("ERROR: ML-Agents Toolkit (v0.3 onwards) requires Python 3")

Using TensorFlow backend.


Python version:
3.6.6 |Anaconda, Inc.| (default, Jun 28 2018, 11:27:44) [MSC v.1900 64 bit (AMD64)]


In [2]:
# Relative path handed to UnityEnvironment(file_name=...) in the next cell.
env_name = "../env/CartPole"  # Name of the Unity environment binary to launch

In [3]:
# Start the Unity environment binary referenced by env_name.
env = UnityEnvironment(file_name=env_name)

# Set the default brain to work with: the first brain name the environment
# reports. Every env.reset()/env.step() result below is indexed by this name.
default_brain = env.brain_names[0]
# Object describing that brain (presumably its observation/action parameters,
# as printed in the startup log - verify against the ML-Agents version in use).
brain = env.brains[default_brain]

INFO:mlagents.envs:
'CartPoleAcamedy' started successfully!
Unity Academy name: CartPoleAcamedy
        Number of Brains: 1
        Number of External Brains : 1
        Reset Parameters :
		
Unity brain name: CartPoleBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space size (per agent): 5
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): [1]
        Vector Action descriptions: 


In [4]:
class OU_noise():
    """Ornstein-Uhlenbeck style noise process used for exploration.

    The internal state starts at ``mu`` and every call to :meth:`noise`
    advances it by ``theta * (mu - state) + sigma * N(0, 1)``, so successive
    samples are correlated and pulled back towards ``mu``.

    Parameters
    ----------
    action_size : int or sequence of int
        Shape of the noise array. Any shape accepted by ``np.ones`` works
        (e.g. ``4``, ``[agents, actions]`` or ``(2, 3, 4)``).
    mu : float
        Value the process reverts to (also the initial state).
    theta : float
        Strength of the pull towards ``mu``.
    sigma : float
        Scale of the Gaussian perturbation added at each step.
    """

    def __init__(self,action_size,mu=0,theta=0.1,sigma=0.1):
        self.action_size = action_size
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        # reset() is the single place that (re)creates the state array.
        self.reset()

    def reset(self):
        """Put the process back at its mean ``mu``."""
        self.state = np.ones(self.action_size)*self.mu

    def noise(self):
        """Advance the process one step and return the new state array."""
        x = self.state
        # Draw the Gaussian term with the state's own shape instead of
        # indexing action_size[0]/[1]; this keeps the 2-D case identical while
        # also supporting int and N-d shapes.
        dx = self.theta * (self.mu - x) + self.sigma * nr.randn(*np.shape(x))
        self.state = x + dx
        return self.state


class DDPGAgent:
    """DDPG-style actor-critic agent built on Keras.

    Holds an actor and a critic network plus a target copy of each, a replay
    memory shared by all parallel agents, and an OU noise process used for
    exploration.

    Parameters
    ----------
    state_size : int
        Length of one agent's observation vector.
    agent_size : int
        Number of agents stepped in parallel (rows of each observation batch).
    action_size : int
        Length of one agent's action vector.
    load_model : bool, optional
        When true (the default, matching the previous hard-coded behaviour)
        weights are loaded from ``actor_weights`` / ``critic_weights`` right
        after the networks are built. Pass ``False`` to start from scratch,
        e.g. when no checkpoint exists yet.
    actor_weights, critic_weights : str, optional
        Paths of the HDF5 files to load the weights from.
    """

    def __init__(self, state_size, agent_size,action_size, load_model=True,
                 actor_weights="3DBall_actor.h5",
                 critic_weights="3DBall_critic.h5"):
        self.state_size = state_size
        self.agent_size = agent_size
        self.action_size = action_size
        self.load_model = load_model
        self.Gausian_size = 0.01
        self.gard_clip_radious = 100.0  # bound applied to dQ/da in actor_optimizer

        # build networks
        self.actor = self.build_actor()
        self.actor_target = self.build_actor()
        self.critic = self.build_critic()
        self.critic_target = self.build_critic()
        self.actor_updater = self.actor_optimizer()

        self.memory = deque(maxlen=50000)
        self.batch_size = 256
        self.discount_factor = 0.99
        self.epsilon = 1
        self.epsilon_decay = 0.999

        # modified part: one noise value per (agent, action dimension)
        self.noiser = OU_noise([agent_size,self.action_size])

        if self.load_model:
            self.actor.load_weights(actor_weights)
            self.actor_target.load_weights(actor_weights)
            self.critic.load_weights(critic_weights)
            self.critic_target.load_weights(critic_weights)

    def build_actor(self):
        """Build the policy network: state -> 7 x Dense(512, elu) -> tanh action."""
        print("building actor network")
        state_input = Input(shape=[self.state_size])
        hidden = state_input
        for _ in range(7):
            hidden = Dense(512, activation='elu')(hidden)
        action = Dense(self.action_size, activation='tanh')(hidden)
        actor = Model(inputs=state_input, outputs=action)
        actor.summary()
        return actor

    def actor_optimizer(self):
        """Build the backend function that applies one actor update.

        The surrogate loss is ``actor_output * clip(-dQ/da)``, so minimising
        it moves the actor output in the direction that increases the critic
        value. The returned function takes
        ``[actor_states, critic_states, critic_actions]`` and returns nothing.
        """
        actions = self.actor.output
        # Gradients of Q w.r.t. both critic inputs; index 1 is the action input.
        dqda = tf.gradients(self.critic.output, self.critic.input)
        loss = actions * tf.clip_by_value(-dqda[1],-self.gard_clip_radious,self.gard_clip_radious)

        optimizer = Adam(lr=0.00001)
        # NOTE(review): positional (params, constraints, loss) call - depends on
        # the installed Keras version's get_updates signature; confirm on upgrade.
        updates = optimizer.get_updates(self.actor.trainable_weights, [], loss)
        train = K.function([self.actor.input, self.critic.input[0],
                            self.critic.input[1]], [], updates=updates)
        return train

    def build_critic(self):
        """Build the Q network: (state, action) -> scalar value.

        State and action each pass through two Dense(64) layers, the results
        are concatenated and fed through 7 x Dense(512, elu) and a linear,
        L1/L2-regularised output unit. The model is compiled with MSE loss.
        """
        print("building critic network")
        state = Input(shape=[self.state_size], name='state_input')
        action = Input(shape=[self.action_size], name='action_input')
        state_branch = Dense(64, activation='elu')(state)
        state_branch = Dense(64, activation='elu')(state_branch)
        action_branch = Dense(64, activation='elu')(action)
        action_branch = Dense(64, activation='elu')(action_branch)
        hidden = Concatenate()([state_branch, action_branch])
        for _ in range(7):
            hidden = Dense(512, activation='elu')(hidden)
        value = Dense(1, activation='linear',kernel_regularizer=regularizers.l1_l2(0.0001,0.0001))(hidden)
        critic = Model(inputs=[state, action], outputs=value)
        critic.compile(loss='mse', optimizer=Adam(lr=0.00001))
        critic.summary()
        return critic

    def _noisy_action(self, state, noise_scale):
        """Return the actor's action plus scaled OU noise, clipped to [-1.1, 1.1]."""
        action = self.actor.predict(state)
        return np.clip(action + noise_scale*self.noiser.noise(), -1.1, 1.1)

    def get_action(self, state):
        """Exploration action: decays epsilon (floor 0.05), then adds epsilon-scaled noise."""
        self.epsilon = max(self.epsilon*self.epsilon_decay,0.05)
        return self._noisy_action(state, self.epsilon)

    def gat_action_nonoise(self,state):
        """Evaluation action: only a small fixed (0.01) noise scale, epsilon untouched."""
        return self._noisy_action(state, 0.01)

    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition per agent (the first ``agent_size`` rows) in replay memory."""
        for i in range(self.agent_size):
            self.memory.append((state[i], action[i], reward[i], next_state[i], done[i]))

    def _sample_batch(self):
        """Draw ``batch_size`` transitions and return them as five stacked arrays."""
        mini_batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = (
            np.asarray(column) for column in zip(*mini_batch))
        return states, actions, rewards, next_states, dones

    def train_model(self):
        """Run one critic update and one actor update on a sampled mini-batch.

        Raises ``ValueError`` (from ``random.sample``) if the memory holds
        fewer than ``batch_size`` transitions.
        """
        # make mini-batch from replay memory
        states, actions, rewards, next_states, dones = self._sample_batch()

        # update critic network
        critic_action_input = self.actor_target.predict(next_states)
        target_q_values = self.critic_target.predict([next_states, critic_action_input])

        # TD target: r for terminal transitions, r + gamma * Q'(s', mu'(s')) otherwise.
        rewards = rewards.reshape(-1, 1)
        terminal = dones.reshape(-1, 1).astype(bool)
        targets = np.where(terminal, rewards,
                           rewards + self.discount_factor * target_q_values)

        self.critic.train_on_batch([states, actions], targets)

        # update actor network using the actor's current actions for these states
        a_for_grad = self.actor.predict(states)
        self.actor_updater([states, states, a_for_grad])

    def train_critic(self):
        """Fit the critic directly on the sampled immediate rewards (no bootstrapping)."""
        states, actions, rewards, _, _ = self._sample_batch()
        self.critic.train_on_batch([states, actions], rewards)

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        """Copy the online actor/critic weights into their target networks."""
        self.actor_target.set_weights(self.actor.get_weights())
        self.critic_target.set_weights(self.critic.get_weights())

In [5]:
# Reset the environment
env_info = env.reset(train_mode=True)[default_brain]

# Examine the state space for the default brain
print("Agent state looks like: \n{}".format(env_info.vector_observations[0]))

print("Agent shape looks like: \n{}".format(np.shape(env_info.vector_observations)))

# Show the first agent's view of every visual observation (none for this
# environment): RGB images are drawn as-is, anything else as a single channel.
for observation in env_info.visual_observations:
    print("Agent observations look like:")
    if observation.shape[3] == 3:
        plt.imshow(observation[0,:,:,:])
    else:
        plt.imshow(observation[0,:,:,0])
print("Agent shape looks like: \n{}".format(np.shape(env_info.visual_observations)))

# Size the agent from what the environment actually returned instead of
# hard-coding (5, 10): the observation batch is (number of agents, state size).
num_agents, state_size = np.shape(env_info.vector_observations)
action_size = 1  # continuous action vector length reported by the brain at startup
agent = DDPGAgent(state_size, num_agents, action_size)

Agent state looks like: 
[0.         0.58897752 0.         0.         0.        ]
Agent shape looks like: 
(10, 5)
Agent shape looks like: 
(0,)
building actor network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 5)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               3072      
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_4 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_5 (Dense)              (None, 512)

In [None]:
train_mode = False  # Whether to run the environment in training or inference mode

# Returns of the 20 most recently finished agent episodes (for the running mean).
reward_memory = deque(maxlen=20)
# Reward accumulated so far by each agent in its current episode.
agents_reward = np.zeros(agent.agent_size)
env_info = env.reset(train_mode=train_mode)[default_brain]
for episode in range(10000000):
    # The environment is not reset here: each "episode" is simply the next
    # 100 steps of the continuously running simulation.
    state = env_info.vector_observations
    for i in range(100):
        if train_mode:
            action = agent.get_action(state)
        else:
            action = agent.gat_action_nonoise(state)

        env_info = env.step(action)[default_brain]
        next_state = env_info.vector_observations
        reward = env_info.rewards
        agents_reward += reward
        done = env_info.local_done

        # Bank and clear the return of every agent whose episode just ended.
        for idx,don in enumerate(done):
            if don:
                reward_memory.append(agents_reward[idx])
                agents_reward[idx] = 0

        if train_mode:
            agent.append_sample(state,action,reward,next_state,done)
            # Start learning only once the memory holds well over one batch.
            if len(agent.memory) > agent.batch_size*10:
                agent.train_model()
        state = next_state

    agent.noiser.reset()
    agent.update_target_model()
    # Restart exploration once epsilon has decayed to its floor.
    if agent.epsilon <= 0.05:
        agent.epsilon = 0.99

    if episode%50 == 0 and episode != 0 and train_mode:
        agent.actor.save("3DBall_actor.h5")
        agent.critic.save("3DBall_critic.h5")
        print("model saved")

    # np.mean of an empty deque emits RuntimeWarnings before returning nan;
    # report nan directly until the first agent episode has finished.
    mean_reward = np.mean(reward_memory) if reward_memory else float("nan")
    print("episode_{} reward: {} episilon: {}".format(episode,mean_reward,agent.epsilon))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


episode_0 reward: nan episilon: 0.12617562129402005
episode_1 reward: nan episilon: 0.12617562129402005
episode_2 reward: 375.99501249790194 episilon: 0.12617562129402005
episode_3 reward: 375.99501249790194 episilon: 0.12617562129402005
episode_4 reward: 379.53031074404714 episilon: 0.12617562129402005
episode_5 reward: 379.53031074404714 episilon: 0.12617562129402005
episode_6 reward: 380.30878142118456 episilon: 0.12617562129402005
episode_7 reward: 380.30878142118456 episilon: 0.12617562129402005
episode_8 reward: 375.1499094069004 episilon: 0.12617562129402005
episode_9 reward: 375.1499094069004 episilon: 0.12617562129402005
episode_10 reward: 372.993072450161 episilon: 0.12617562129402005
episode_11 reward: 372.993072450161 episilon: 0.12617562129402005
episode_12 reward: 374.32981864213946 episilon: 0.12617562129402005
episode_13 reward: 374.32981864213946 episilon: 0.12617562129402005
episode_14 reward: 370.40313653945924 episilon: 0.12617562129402005
episode_15 reward: 370.403

episode_122 reward: 374.0882230460644 episilon: 0.12617562129402005
episode_123 reward: 374.0882230460644 episilon: 0.12617562129402005
episode_124 reward: 379.310762155056 episilon: 0.12617562129402005
episode_125 reward: 379.310762155056 episilon: 0.12617562129402005
episode_126 reward: 376.8824450492859 episilon: 0.12617562129402005
episode_127 reward: 376.8824450492859 episilon: 0.12617562129402005
episode_128 reward: 374.50164291262627 episilon: 0.12617562129402005
episode_129 reward: 374.50164291262627 episilon: 0.12617562129402005
episode_130 reward: 378.08746535778045 episilon: 0.12617562129402005
episode_131 reward: 378.08746535778045 episilon: 0.12617562129402005
episode_132 reward: 375.4950793325901 episilon: 0.12617562129402005
episode_133 reward: 375.4950793325901 episilon: 0.12617562129402005
episode_134 reward: 381.7012608349323 episilon: 0.12617562129402005
episode_135 reward: 381.7012608349323 episilon: 0.12617562129402005
episode_136 reward: 381.85872579813 episilon: 

episode_242 reward: 378.6995527505875 episilon: 0.12617562129402005
episode_243 reward: 380.91693609952927 episilon: 0.12617562129402005
episode_244 reward: 380.91693609952927 episilon: 0.12617562129402005
episode_245 reward: 376.3219992876053 episilon: 0.12617562129402005
episode_246 reward: 376.3219992876053 episilon: 0.12617562129402005
episode_247 reward: 374.14579213261607 episilon: 0.12617562129402005
episode_248 reward: 374.14579213261607 episilon: 0.12617562129402005
episode_249 reward: 379.37322015166285 episilon: 0.12617562129402005
episode_250 reward: 379.37322015166285 episilon: 0.12617562129402005
episode_251 reward: 380.4633460044861 episilon: 0.12617562129402005
episode_252 reward: 380.4633460044861 episilon: 0.12617562129402005
episode_253 reward: 377.8414687871933 episilon: 0.12617562129402005
episode_254 reward: 377.8414687871933 episilon: 0.12617562129402005
episode_255 reward: 379.3700046777725 episilon: 0.12617562129402005
episode_256 reward: 379.3700046777725 epis

episode_363 reward: 380.8071120917797 episilon: 0.12617562129402005
episode_364 reward: 380.8071120917797 episilon: 0.12617562129402005
episode_365 reward: 380.3968954324722 episilon: 0.12617562129402005
episode_366 reward: 380.3968954324722 episilon: 0.12617562129402005
episode_367 reward: 373.1217606782913 episilon: 0.12617562129402005
episode_368 reward: 373.1217606782913 episilon: 0.12617562129402005
episode_369 reward: 375.4556148707867 episilon: 0.12617562129402005
episode_370 reward: 375.4556148707867 episilon: 0.12617562129402005
episode_371 reward: 381.3254927456379 episilon: 0.12617562129402005
episode_372 reward: 381.3254927456379 episilon: 0.12617562129402005
episode_373 reward: 379.29953083992007 episilon: 0.12617562129402005
episode_374 reward: 379.29953083992007 episilon: 0.12617562129402005
episode_375 reward: 382.5432978451252 episilon: 0.12617562129402005
episode_376 reward: 382.5432978451252 episilon: 0.12617562129402005
episode_377 reward: 379.04965872764586 episilo

episode_483 reward: 381.1709763407707 episilon: 0.12617562129402005
episode_484 reward: 373.3446085035801 episilon: 0.12617562129402005
episode_485 reward: 373.3446085035801 episilon: 0.12617562129402005
episode_486 reward: 371.52071564793584 episilon: 0.12617562129402005
episode_487 reward: 371.52071564793584 episilon: 0.12617562129402005
episode_488 reward: 377.12317610383036 episilon: 0.12617562129402005
episode_489 reward: 377.12317610383036 episilon: 0.12617562129402005
episode_490 reward: 374.114218711853 episilon: 0.12617562129402005
episode_491 reward: 374.114218711853 episilon: 0.12617562129402005
episode_492 reward: 378.73018830418584 episilon: 0.12617562129402005
episode_493 reward: 378.73018830418584 episilon: 0.12617562129402005
episode_494 reward: 384.3487518787384 episilon: 0.12617562129402005
episode_495 reward: 384.3487518787384 episilon: 0.12617562129402005
episode_496 reward: 379.797479981184 episilon: 0.12617562129402005


In [None]:
# Debug aid: show the per-agent rewards returned by the most recent env.step()
# in the loop above (requires that loop to have run at least one step).
print(reward)

In [None]:
# Manually set the factor epsilon is multiplied by on every get_action() call.
# 0.999 is also the value DDPGAgent.__init__ assigns, so this only matters if
# the attribute was changed in the meantime.
agent.epsilon_decay =0.999