In [1]:
import tensorflow as tf
import numpy as np
import random

In [104]:
# Demo: pick one entry per row of a random 16x4 matrix by multiplying with a
# one-hot mask and summing over the last axis.
picked_columns = [0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 2, 1, 3]
a = tf.random_normal([16, 4], dtype=tf.float32)
b = tf.one_hot(tf.constant(picked_columns), depth=4)
c = tf.reduce_sum(tf.multiply(a, b), axis=-1)

# Fetch all three tensors in a single run so `z` is computed from the same `x`.
with tf.Session() as sess:
    x, y, z = sess.run([a, b, c])

In [106]:
# Show the sampled matrix first, then the per-row values selected by the mask.
for fetched in (x, z):
    print(fetched)

[[-1.50538075  0.44697276 -0.14002024 -2.03396487]
 [ 0.05067395 -0.45982233 -0.32494169 -0.12284373]
 [ 1.41731477 -0.9119519   1.51394963 -0.65013605]
 [ 0.89202952  0.62645262  0.52674538 -0.6262579 ]
 [ 0.65703481  0.63458788 -0.4508917  -0.24278168]
 [ 0.79539788 -0.87796795  0.37232098  0.8916586 ]
 [-0.24759692 -1.4219656   0.44021103 -0.90602392]
 [-0.40116554  1.2351898  -1.01278329  0.08814283]
 [ 1.27070558  1.33237183 -0.3653762  -1.50765073]
 [-0.16447587  1.90410519  1.51378405 -0.8397491 ]
 [ 0.64948243  0.33649352  2.17079663 -0.45793384]
 [-0.32103696  0.09110694  1.04220772 -1.27053106]
 [-0.16583857 -0.7216785   0.28754005 -0.11295979]
 [ 1.3425082  -1.56153238 -2.30320191 -0.55602211]
 [-0.48113894 -1.23819602 -0.13900961  1.16311073]
 [ 1.25212169  1.15608799 -0.85099274  0.41163331]]
[-1.50538075  0.05067395  1.41731477  0.89202952  0.63458788 -0.87796795
 -1.4219656  -1.01278329 -0.3653762   1.51378405 -0.45793384 -1.27053106
 -0.11295979 -2.30320191 -1.23819602  0.41163331]

In [1]:
import tensorflow as tf
import numpy as np

class QNetwork():
    """Deep Q-network pair (local + target) built on a TensorFlow 1.x graph.

    The "local" network estimates Q(s, a) and is the only one that is trained.
    The "target" network is built with non-trainable weights, evaluates
    Q(s', .) for the TD target, and is refreshed by copying the local weights
    into it (see `update_target_network`).
    """
    def __init__(self, state_size, action_size, optimizer, gamma = 0.9, seed = 42):
        """Initialize parameters and build model.
        Params
        ======
            state_size (int): Dimension of each state, example: 8
            action_size (int): Number of discrete actions, example: 4
            optimizer: object exposing `minimize(loss, var_list=...)`
            gamma (float): Discount factor applied to the bootstrapped value
            seed (int): Random seed (NumPy and TensorFlow graph-level seed)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.optim = optimizer
        self.gamma = gamma

        print("State size: %i" % self.state_size)
        print("Action size: %i" % self.action_size)
        # Start from an empty default graph and seed it before any op exists,
        # otherwise the weight initializers are not reproducible.
        tf.reset_default_graph()
        np.random.seed(seed)
        tf.set_random_seed(seed)
        self.sess = tf.Session() # Prepare a tensorflow session

        # Placeholders (these attributes are the feed_dict keys used by train/get_action)
        self.state = tf.placeholder(shape = [None, self.state_size], dtype = tf.float32) # input: S
        self.next_state = tf.placeholder(shape = [None, self.state_size], dtype = tf.float32) # input: S'
        self.action = tf.placeholder(shape = [None, 1], dtype = tf.int32) # action index
        self.reward = tf.placeholder(shape = [None, 1], dtype = tf.float32) # reward
        self.done = tf.placeholder(shape = [None, 1], dtype = tf.float32) # 1.0 if terminal else 0.0

        # Derived views are kept in locals so the placeholders above stay feedable.
        # Everything is flattened to [batch] so it lines up with the per-sample
        # Q-values below instead of broadcasting to [batch, batch].
        action_mask = tf.one_hot(tf.reshape(self.action, [-1]), depth = self.action_size) # [batch, action_size]
        reward = tf.reshape(self.reward, [-1])
        done = tf.reshape(self.done, [-1])

        # Build networks
        with tf.variable_scope("Qtable"):
            neurons_of_layers = [64, 64, 64]
            # Trained network, used to get Q(s, a)
            self.q_local = self._build_model(x = self.state,
                                             neurons_of_layers = neurons_of_layers,
                                             scope = 'local', trainable = True)

            # Fixed parameters, used to get Q(s', a)
            self.q_target = self._build_model(x = self.next_state,
                                              neurons_of_layers = neurons_of_layers,
                                              scope = 'target', trainable = False)

        # Handles to the parameters of each network (same creation order in both)
        self.localnet_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'Qtable/local')
        self.targetnet_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'Qtable/target')
        # tf.assign(ref, value): the target variable is overwritten with the local one.
        self.params_replace = [tf.assign(target_var, local_var)
                               for local_var, target_var in zip(self.localnet_params, self.targetnet_params)]

        # TD loss: mean((reward + gamma * max_a' Q_target(s', a') * (1 - done) - Q_local(s, a))^2)
        td_target = reward + self.gamma * tf.reduce_max(self.q_target, axis = -1) * (1. - done)
        td_expect = tf.reduce_sum(self.q_local * action_mask, axis = -1)
        self.loss = tf.reduce_mean(tf.squared_difference(td_target, td_expect))
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            # Only the local network's variables are optimized.
            self.update_ops = self.optim.minimize(self.loss, var_list = self.localnet_params)

        # Finally, initialize weights
        self.saver = tf.train.Saver()
        self.sess.run(tf.global_variables_initializer())
        print("Network Ready")

    def _build_model(self, x, scope, trainable, neurons_of_layers = (25, 25, 25), acti = None):
        """
        Build a fully connected network that maps `x` to one output per action.

        x: input tensor (placeholder)
        scope: variable scope name, e.g. 'local' or 'target'
        trainable: whether the created dense layers are trainable
        neurons_of_layers: number of units for each hidden layer
        acti: activation applied after each hidden layer (defaults to tf.nn.relu)
        """
        # Resolved here rather than in the signature so the default is not
        # evaluated when the class body is executed.
        if acti is None:
            acti = tf.nn.relu

        # One hidden layer: dense followed by the activation
        def mlp_block(x, name, units, activation, trainable = True):
            with tf.variable_scope(name):
                x = tf.layers.dense(x, units = units, trainable = trainable)
                x = activation(x)
            return x

        # Stack the hidden blocks, then a linear output layer of size action_size
        with tf.variable_scope(scope):
            for i, n_unit in enumerate(neurons_of_layers):
                block_name = 'block_' + str(i)
                x = mlp_block(x = x,
                    name = block_name,
                    units = n_unit,
                    activation = acti,
                    trainable = trainable)
            x = tf.layers.dense(x, units = self.action_size, trainable = trainable)
        return x

    def get_action(self, state):
        """
        Return the index of the highest Q-value of the local network.

        state: a single state; it is reshaped to [-1, state_size] before being
               fed, so a flat vector is accepted. The argmax is taken over the
               whole output, so pass one state at a time.
        """
        state = np.asarray(state, dtype = np.float32).reshape(-1, self.state_size)
        q_values = self.sess.run(self.q_local, feed_dict = {self.state: state})
        return np.argmax(q_values)

    def train(self, batch):
        """
        Run one optimization step on the local network.
        - Args:
            - batch: (state, action, reward, next_state, done), each with one
                     row per sample
        - Returns:
            - loss value computed for this batch
        """
        state, action, reward, next_state, done = batch
        # Coerce to the dtypes/shapes the placeholders declare (e.g. boolean
        # `done` flags become 0.0 / 1.0).
        feed_dict = {
            self.state: np.asarray(state, dtype = np.float32).reshape(-1, self.state_size),
            self.action: np.asarray(action, dtype = np.int32).reshape(-1, 1),
            self.reward: np.asarray(reward, dtype = np.float32).reshape(-1, 1),
            self.next_state: np.asarray(next_state, dtype = np.float32).reshape(-1, self.state_size),
            self.done: np.asarray(done, dtype = np.float32).reshape(-1, 1),
        }
        current_loss, _ = self.sess.run([self.loss, self.update_ops], feed_dict = feed_dict)
        return current_loss

    def update_target_network(self):
        """
        Copy the local network's weights into the target network by running
        the assignment ops prepared in __init__.
        """
        self.sess.run(self.params_replace)

    def save_model(self, model_name = None):
        """
        Write a checkpoint with the Saver created in __init__.

        model_name: checkpoint path; 'dqn.ckpt' when None.
        """
        self.saver.save(self.sess, 'dqn.ckpt' if model_name is None else model_name)
        print("model saved")

    def load_model(self, model_name = None):
        """
        Restore a checkpoint into the current session.

        model_name: checkpoint path; 'dqn.ckpt' when None.
        """
        self.saver.restore(self.sess, 'dqn.ckpt' if model_name is None else model_name)
        print("model loaded")

In [3]:
class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        """Initialize a ReplayBuffer object.

        Params
        ======
            action_size (int): dimension of each action
            buffer_size (int): maximum size of buffer
            batch_size (int): size of each training batch
            seed (int): random seed
        """
        self.action_size = action_size
        # A bounded deque drops the oldest experience once buffer_size is reached.
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # random.seed() returns None, so seed the module-level generator first
        # and keep the seed value itself for later inspection.
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences from memory.

        Returns a tuple (states, actions, rewards, next_states, dones); each
        element is a NumPy array with one row per sampled experience.
        random.sample raises ValueError when fewer than batch_size experiences
        are stored.
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        experiences = [e for e in drawn if e is not None]

        states = np.vstack([e.state for e in experiences])
        actions = np.vstack([e.action for e in experiences])
        rewards = np.vstack([e.reward for e in experiences])
        next_states = np.vstack([e.next_state for e in experiences])
        dones = np.vstack([e.done for e in experiences])

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)

In [5]:
import numpy as np
import random
from collections import namedtuple, deque
import sys

#from model import QNetwork
import tensorflow as tf

# Hyperparameters shared by the agent and its replay buffer.
BUFFER_SIZE = 10 ** 5   # capacity of the replay buffer
BATCH_SIZE = 64         # number of experiences per minibatch
GAMMA = 0.99            # discount factor
TAU = 0.001             # for soft update of target parameters
LR = 0.0005             # learning rate
UPDATE_EVERY = 4        # environment steps between network updates

In [6]:
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; seed first, then remember the value.
        random.seed(seed)
        self.seed = seed

        # Q-Network (the seed is forwarded so the network is seeded consistently)
        optimizer = tf.train.RMSPropOptimizer(learning_rate= LR)
        self.Qnetwork = QNetwork(state_size = state_size,
                                 action_size = action_size,
                                 optimizer = optimizer,
                                 gamma=GAMMA,
                                 seed = seed)
        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn every UPDATE_EVERY calls."""
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences)

    def act(self, state, eps=0.):
        """Returns an action for the given state as per current policy.

        Params
        ======
            state (array_like): current (single) state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        # Epsilon-greedy action selection
        if random.random() > eps:
            # The network is fed a batch of one row; get_action already
            # returns the index of the best action.
            state = np.asarray(state, dtype=np.float32).reshape(1, -1)
            return self.Qnetwork.get_action(state)
        return random.choice(np.arange(self.action_size))

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (tuple): (states, actions, rewards, next_states, dones)
                as returned by ReplayBuffer.sample(); passed unchanged to
                QNetwork.train
        """
        current_loss = self.Qnetwork.train(experiences)
        print('\rCurrent loss: %.3f' % current_loss)
        sys.stdout.flush()

        # ------------------- update target network ------------------- #
        # NOTE(review): the target weights are hard-copied after every learning
        # step and TAU is not referenced in the visible code - confirm that
        # this sync schedule is intended.
        self.Qnetwork.update_target_network()