In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
import time
from gym.envs.registration import register
from IPython.display import clear_output
from itertools import count

In [2]:
random.seed(2212)
np.random.seed(2212)
tf.random.set_seed(2212)

In [3]:
tf.keras.backend.set_floatx('float64')

In [4]:
env = gym.make('Pendulum-v0')
print('observation_space:', env.observation_space.shape)
print('action_space:', env.action_space.shape)

act_limit = env.action_space.high[0]

observation_space: (3,)
action_space: (1,)


In [5]:
from collections import deque

class Agent():
    def __init__(self, env):
        self.gamma = 0.99
        
        self.env = env
        state_dim = np.squeeze(env.observation_space.shape)
        action_dim = np.squeeze(env.action_space.shape)
        
        self.state_dim, self.action_dim = state_dim, action_dim
        
        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim, action_dim)
        
        self.target_actor = Actor(state_dim, action_dim)
        self.target_critic = Critic(state_dim, action_dim)
        
        self.target_actor.model.set_weights(self.actor.model.get_weights())
        self.target_critic.model.set_weights(self.critic.model.get_weights())
        
        self.memory = deque(maxlen=100000)
        
    def experience(self, e):
        self.memory.append(e)
        
    def replay(self, size):
        samples = random.sample(self.memory, size)
        states, actions, next_states, rewards, dones = zip(*samples)
        
        states = np.asarray(states).reshape((-1, self.state_dim))
        actions = np.asarray(actions).reshape((-1, self.action_dim))
        next_states = np.asarray(next_states).reshape((-1, self.state_dim))
        rewards = np.asarray(rewards).reshape((-1, 1))
        dones = np.asarray(dones).reshape((-1, 1)).astype(np.float)
        
        states = tf.convert_to_tensor(states)
        actions = tf.convert_to_tensor(actions)
        next_states = tf.convert_to_tensor(next_states)
        rewards = tf.convert_to_tensor(rewards)
        dones = tf.convert_to_tensor(dones)
        
        return states, actions, next_states, rewards, dones
        
    @tf.function
    def train(self, states, actions, next_states, rewards, dones):
        with tf.GradientTape() as tape:
            target_actions = self.target_actor.model(next_states)
            next_values = self.target_critic.model([next_states, target_actions])
            target_values = rewards + self.gamma * next_values * (1 - dones)
            
            values = self.critic.get_value(states, actions)
            critic_loss = tf.reduce_mean(tf.square(target_values - values))
            
        critic_grad = tape.gradient(critic_loss, self.critic.model.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(critic_grad, self.critic.model.trainable_variables)
        )
        
        with tf.GradientTape() as tape:
            actions = self.actor.model(states)
            values = self.critic.model([states, actions])
            actor_loss = -tf.reduce_mean(values)
            
        actor_grad = tape.gradient(actor_loss, self.actor.model.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(actor_grad, self.actor.model.trainable_variables)
        )

In [6]:
class Actor():
    def __init__(self, state_dim, action_dim):
        
        last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)
        inputs = tf.keras.Input((state_dim,))
        hidden = tf.keras.layers.Dense(64, activation='relu')(inputs)
        #hidden = tf.keras.layers.BatchNormalization()(hidden)
        hidden = tf.keras.layers.Dense(64, activation='relu')(hidden)
        #hidden = tf.keras.layers.BatchNormalization()(hidden)
        outputs = tf.keras.layers.Dense(
            action_dim, activation='tanh', kernel_initializer=last_init
            #action_dim, activation='linear'
        )(hidden)

        outputs = outputs * act_limit
        model = tf.keras.Model(inputs, outputs)
        
        self.model = model
        self.optimizer = tf.keras.optimizers.Adam(lr=0.001)
        
    def get_policy(self, states):
        policy = self.model(tf.convert_to_tensor(states))
        return policy
        
    def get_action(self, state, noise=True):
        states = np.expand_dims(state, axis=0)
        action = self.get_policy(states)
        
        if noise:
            action += np.random.normal(0, 0.3)
            
        action = np.clip(action, -act_limit, act_limit)
        return action
    
class Critic():
    def __init__(self, state_dim, action_dim):
        state_in = tf.keras.Input((state_dim,))
        action_in = tf.keras.Input((action_dim,))
        
        concat = tf.keras.layers.concatenate([state_in, action_in])
        hidden = tf.keras.layers.Dense(64, activation='relu')(concat)
        #hidden = tf.keras.layers.BatchNormalization()(hidden)
        hidden = tf.keras.layers.Dense(64, activation='relu')(hidden)
        #hidden = tf.keras.layers.BatchNormalization()(hidden)
        output = tf.keras.layers.Dense(1, activation='linear')(hidden)
        
        model = tf.keras.Model([state_in, action_in], output)
        
        self.model = model
        self.optimizer = tf.keras.optimizers.Adam(lr=0.002)
        
    def get_value(self, states, actions):
        states, actions = tf.convert_to_tensor(states), tf.convert_to_tensor(actions)
        value = self.model([states, actions])
        return value
    
agent = Agent(env)


@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))


In [7]:
tau = 0.005

def run(agent, num_episodes, render=False):
    for ep in range(num_episodes):
        state = env.reset()
        
        done = False
        ep_reward = 0
        for step in count():
            action = agent.actor.get_action(state)[0]
            next_state, reward, done, _ = env.step(action)
            
            if render:
                env.render()
            
            ep_reward += reward
            
            agent.experience((state, action, next_state, reward, done))
            
            batch_size = 128
            if len(agent.memory) > batch_size:
                states, actions, next_states, rewards, dones = agent.replay(batch_size)
                agent.train(states, actions, next_states, rewards, dones)
                
            update_target(agent.target_actor.model.variables, agent.actor.model.variables, tau)
            update_target(agent.target_critic.model.variables, agent.critic.model.variables, tau)
            
            if done:
                break
                
            state = next_state
                
            #if step == 1: break
        print(f'Episode: {ep}, ep_reward: {ep_reward}, step: {step+1}')
        

In [8]:
run(agent, 100)

Episode: 0, ep_reward: -1432.6894552208246, step: 200
Episode: 1, ep_reward: -1537.8364227309914, step: 200
Episode: 2, ep_reward: -1373.513556611526, step: 200
Episode: 3, ep_reward: -1755.5402890398743, step: 200
Episode: 4, ep_reward: -976.3706046331565, step: 200
Episode: 5, ep_reward: -1276.3500523131042, step: 200
Episode: 6, ep_reward: -1042.453425490905, step: 200
Episode: 7, ep_reward: -986.4722966773998, step: 200
Episode: 8, ep_reward: -968.824885582649, step: 200
Episode: 9, ep_reward: -1322.797183737679, step: 200
Episode: 10, ep_reward: -1135.6764687399452, step: 200
Episode: 11, ep_reward: -1263.7223792446835, step: 200
Episode: 12, ep_reward: -1011.7006643424007, step: 200
Episode: 13, ep_reward: -515.3197731995912, step: 200
Episode: 14, ep_reward: -1232.2215217177554, step: 200
Episode: 15, ep_reward: -410.186722466114, step: 200
Episode: 16, ep_reward: -131.77884883477253, step: 200
Episode: 17, ep_reward: -128.37848905125225, step: 200
Episode: 18, ep_reward: -130.4

In [9]:
run(agent, 10, True)

Episode: 0, ep_reward: -239.48814787772773, step: 200
Episode: 1, ep_reward: -127.1561848900932, step: 200
Episode: 2, ep_reward: -232.95233327872737, step: 200
Episode: 3, ep_reward: -126.02897310010258, step: 200
Episode: 4, ep_reward: -0.9286318973483271, step: 200
Episode: 5, ep_reward: -254.75131548443818, step: 200
Episode: 6, ep_reward: -117.15766637515263, step: 200
Episode: 7, ep_reward: -258.1157172919842, step: 200
Episode: 8, ep_reward: -233.71333827836006, step: 200
Episode: 9, ep_reward: -330.0635699215766, step: 200


In [10]:
env.close()