In [1]:
import gym
import numpy as np 
from collections import namedtuple, deque 
import keras 
import keras.backend as K
from keras import layers, models, optimizers
import tensorflow as tf 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
env = gym.make('MountainCarContinuous-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
state_size = env.observation_space.shape[0]
action_size = env.action_space

In [4]:
action_size = 1 
action_low, action_high = env.action_space.low, env.action_space.high
action_low, action_high

(array([-1.], dtype=float32), array([1.], dtype=float32))

In [5]:
import random
class ReplayBuffer():
    def __init__(self, maxlen=1000, batch_size=32):
        self.memory = deque(maxlen=maxlen)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",field_names=["state", "action", "reward", "next_state", "done"])
        
    def add(self, state, action, reward, next_state, done):
        e = self.experience(state, action, reward, next_state, done)
        self.memory.append(e)
        
    def sample(self):
        return random.sample(self.memory, k=self.batch_size)
    
    def __len__(self):
        return len(self.memory)
    

In [6]:
class Actor:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.build_model()
    def build_model(self):
        states = keras.layers.Input(shape=(self.state_size, ), name='states')
        h = layers.Dense(units=64, activation='relu') (states)
        h = layers.BatchNormalization() (h)
        h = layers.Dense(units=30, activation='relu') (h)
        
        actions = layers.Dense(units=self.action_size, activation='tanh') (h)
        
        self.model = keras.models.Model(inputs=states, outputs=actions)
        
        action_gradients = layers.Input(shape=(self.action_size,))
        loss = K.mean(-action_gradients * actions)
        optimizer = optimizers.Adam()
        updates_op = optimizer.get_updates(params=self.model.trainable_weights, loss=loss)
        self.train_fn = K.function(inputs=[self.model.input, action_gradients, K.learning_phase()],
                                  outputs=[],
                                  updates=updates_op)
        
        
        

In [12]:
class Critic():
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.build_model()
    def build_model(self):
        states = layers.Input(shape=(self.state_size,), name='states')
        actions = layers.Input(shape=(self.action_size,),name='actions')
        
        s_net = layers.Dense(100, activation='relu')(states)
        s_net = layers.BatchNormalization() (s_net)
        s_net = layers.Dense(30, activation='relu')(s_net)
        s_net = layers.BatchNormalization() (s_net)
        
        a_net = layers.Dense(100, activation='relu')(actions)
        a_net = layers.BatchNormalization() (a_net)
        a_net = layers.Dense(30, activation='relu')(a_net)
        a_net = layers.BatchNormalization() (a_net)
        
        net = layers.Add()([s_net, a_net])
        net = layers.Activation('relu')(net)
        Q_values = layers.Dense(1, activation='linear')(net)
        self.model = models.Model(inputs=[states, actions], outputs=Q_values)
        
        optimizer = optimizers.Adam()
        self.model.compile(optimizer=optimizer, loss='mse')
        action_gradients = K.gradients(Q_values, actions)
        
        self.get_action_gradients = K.function(inputs=[*self.model.input, K.learning_phase()],
                                       outputs=action_gradients)

In [13]:
import copy

class OUNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, size, mu, theta, sigma):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state

In [14]:
import os 
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = 2
        self.action_size = 1
        self.action_low = -1
        self.action_high = 1

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.25
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 10000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def random_policy_practice(self, action_size, n_episodes=1,max_time_steps_per_episode=500,render=False):
        for episode in range(1, n_episodes+1):
            state = self.reset_episode()
            action = np.random.uniform(-1,1, size=action_size).reshape(1, action_size)[0]
                
            #for time_step in range(max_time_steps_per_episode):
            for time_step in range(max_time_steps_per_episode):
                if render:
                    self.task.render()
                next_state, reward, done, _ = self.task.step(action)
                self.memory.add(self.last_state, action, reward, next_state, done)
                if done:
                    break;
                action = np.random.uniform(-1,1, size=action_size).reshape(1, action_size)[0]
                self.last_state = next_state
    
    def train(self, n_episodes=10000):
        """
        n_episodes       : How many episodes the agent will play
        max_time_steps   : The maximum time_steps that the Agent able to act in one episode. If max_time_steps == -1, then
                           the Agent will act/step until reaches Terminal State
        render           : env.render() # May slow training process
        """
        self.all_rewards = []
        for episode in range(1, n_episodes+1):
            state = self.reset_episode()
            action = self.act(state)
            
            rewards = []
            while True:
                
                #self.task.render()
                next_state, reward, done, _ = self.task.step(action)
                self.step(action, reward, next_state, done)
                rewards.append(reward)
                if done:
                    episode_reward = sum(rewards)
                    print("Episode : {}/{}  Reward : {}".format(episode, n_episodes,episode_reward))
                    #store episode reward i_episode pair 
                    self.all_rewards.append((episode_reward, episode))
                    break;
                action = self.act(next_state)
            if np.max(self.all_rewards) <= episode_reward:
                self.save_weights()
                
                
    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)
        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)   

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
        
    def save_weights(self):
        if not os.path.isdir('weights'):
            os.mkdir('weights')
        self.actor_local.model.save_weights('weights/actor_local.h5')
        self.actor_target.model.save_weights('weights/actor_target.h5')
        self.critic_target.model.save_weights('weights/critic_target.h5')
        self.critic_local.model.save_weights('weights/critic_local.h5')
        print("Weights saved ! ")
    
    
    def sampling(self, n_episodes=2):
        for episode in range(1, n_episodes+1):
            state = self.task.reset() 
            episode_rewards = []
            while True:
                self.task.render()
                action = np.array(self.act(state)).reshape(1, self.action_size)[0]
                next_state, reward , done, info = self.task.step(action)
                state = next_state
                episode_rewards.append(reward)
                
                if done:
                    print(" Episode : {}/{}  Reward : {} ".format(episode, n_episodes, np.sum(episode_rewards)))
                    break;
                    
    

In [15]:
Agent = DDPG(env)

In [16]:
Agent.train(n_episodes=3000,)

Episode : 1/3000  Reward : -77.1739514208887
Episode : 2/3000  Reward : 49.649796361401826
Weights saved ! 
Episode : 3/3000  Reward : 44.865705316897674
Episode : 4/3000  Reward : 20.38555349934704
Episode : 5/3000  Reward : -149.6626071311422
Episode : 6/3000  Reward : 48.03774299649952
Episode : 7/3000  Reward : -111.82800467005673
Episode : 8/3000  Reward : 57.461363034390864
Weights saved ! 
Episode : 9/3000  Reward : -117.95821043154972
Episode : 10/3000  Reward : -120.91927614678919
Episode : 11/3000  Reward : 86.73139963423534
Weights saved ! 
Episode : 12/3000  Reward : 62.255079996744335
Episode : 13/3000  Reward : 26.814017255392642
Episode : 14/3000  Reward : -124.13638140999882
Episode : 15/3000  Reward : 28.036775439694495
Episode : 16/3000  Reward : 63.80116294464639
Episode : 17/3000  Reward : 70.01685313478866
Episode : 18/3000  Reward : -136.2489107525566
Episode : 19/3000  Reward : 68.81130254573877
Episode : 20/3000  Reward : -125.48959398174135
Episode : 21/3000  R

KeyboardInterrupt: 

In [None]:
Agent.actor_local.model.load_weights('weights/actor_local.h5')

In [None]:
Agent.sampling(n_episodes=5)