# Synchronous PPO with PyBullet Ant

In [1]:
import datetime,gym,os,pybullet_envs,time,os,psutil,ray
import numpy as np
import tensorflow as tf
from util import gpu_sess,suppress_tf_warning,PPOBuffer,save_ppo_model_and_buffer,restore_ppo_model_and_buffer
from ppo import create_ppo_model,create_ppo_graph
np.set_printoptions(precision=2)
suppress_tf_warning() # suppress warning 
gym.logger.set_level(40) # gym logger 
print ("Packaged loaded. TF version is [%s]."%(tf.__version__))

Packaged loaded. TF version is [1.15.0].


### Rollout Worker

In [2]:
def get_env():
    import pybullet_envs,gym
    gym.logger.set_level(40) # gym logger 
    return gym.make('AntBulletEnv-v0')

def get_eval_env():
    import pybullet_envs,gym
    gym.logger.set_level(40) # gym logger
    eval_env = gym.make('AntBulletEnv-v0')
#     _ = eval_env.render(mode='human') # enable rendering
    _ = eval_env.reset()
    for _ in range(3): # dummy run for proper rendering 
        a = eval_env.action_space.sample()
        o,r,d,_ = eval_env.step(a)
        time.sleep(0.01)
    return eval_env

In [3]:
import scipy.signal

def discount_cumsum(x, discount):
    """
    magic from rllab for computing discounted cumulative sums of vectors.
    input: 
        vector x, 
        [x0, 
         x1, 
         x2]
    output:
        [x0 + discount * x1 + discount^2 * x2,  
         x1 + discount * x2,
         x2]
    """
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

In [4]:
# model
hdims = [64,64]
# graph
clip_ratio = 0.2
lr = 3e-4
ent_coef = 0.01
vf_coef = 0.5
max_grad_norm = 0.5
# buffer
steps_per_epoch = 5000
gamma = 0.99
lam = 0.95
# update
train_iters = 100
target_kl = 0.01
epochs = 1000
max_ep_len = 1000

In [5]:
class RolloutWorkerClass(object):
    """
    Worker without RAY (for update purposes)
    """
    def __init__(self,seed=1):
        self.seed = seed
        # Each worker should maintain its own environment
#         import pybullet_envs,gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40) # gym logger 
        # self.env = gym.make('AntBulletEnv-v0')
        self.env = get_env()
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim
        
        # Create PPO model and computational graph 
        self.model,self.sess = create_ppo_model(env=self.env,hdims=hdims)
        self.graph = create_ppo_graph(
            self.model,lr=lr,clip_ratio=clip_ratio,
            ent_coef=ent_coef,vf_coef=vf_coef,max_grad_norm = max_grad_norm)
        
        # Initialize model 
        tf.set_random_seed(self.seed)
        np.random.seed(self.seed)
        self.sess.run(tf.global_variables_initializer())
    
    def get_action(self,o,deterministic=False):
        act_op = self.model['mu'] if deterministic else self.model['pi']
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]

    def get_weights(self):
        """
        Get weights
        """
        weight_vals = self.sess.run(self.model['pi_vars'] + self.model['v_vars'])
        return weight_vals
 
            
@ray.remote
class RayRolloutWorkerClass(object):
    """
    Rollout Worker with RAY
    """
    def __init__(self,worker_id=0,ep_len_rollout=1000):
        # Parse
        self.worker_id = worker_id
        self.ep_len_rollout = ep_len_rollout
        # Each worker should maintain its own environment
        # import pybullet_envs,gym
        from util import suppress_tf_warning
        suppress_tf_warning() # suppress TF warnings
        gym.logger.set_level(40) # gym logger 
        # self.env = gym.make('AntBulletEnv-v0')
        self.env = get_env()
        odim,adim = self.env.observation_space.shape[0],self.env.action_space.shape[0]
        self.odim = odim
        self.adim = adim
        
        # Create PPO model
        self.model,self.sess = create_ppo_model(env=self.env,hdims=hdims)
        self.sess.run(tf.global_variables_initializer())
        self.buf = PPOBuffer(odim=odim, adim=adim, size=ep_len_rollout, gamma=gamma, lam=lam)
        print ("Ray Worker [%d] Ready."%(self.worker_id))
        
        # Flag to initialize assign operations for 'set_weights()'
        self.FIRST_SET_FLAG = True
        
        # Flag to initialize rollout
        self.FIRST_ROLLOUT_FLAG = True
        
    def get_action(self,o,deterministic=False):
        act_op = self.model['mu'] if deterministic else self.model['pi']
        return self.sess.run(act_op, feed_dict={self.model['o_ph']:o.reshape(1,-1)})[0]
                                    
    def set_weights(self,weight_vals):
        """
        Set weights without memory leakage
        """
        if self.FIRST_SET_FLAG:
            self.FIRST_SET_FLAG = False
            self.assign_placeholders = []
            self.assign_ops = []
            for w_idx,weight_tf_var in enumerate(self.model['pi_vars'] + self.model['v_vars']):
                a = weight_tf_var
                assign_placeholder = tf.placeholder(a.dtype, shape=a.get_shape())
                assign_op = a.assign(assign_placeholder)
                self.assign_placeholders.append(assign_placeholder)
                self.assign_ops.append(assign_op)
                
        for w_idx,weight_tf_var in enumerate(self.model['pi_vars'] + self.model['v_vars']):
            self.sess.run(self.assign_ops[w_idx],
                          {self.assign_placeholders[w_idx]:weight_vals[w_idx]})
            
    def rollout(self):
        """
        Rollout
        """
        if self.FIRST_ROLLOUT_FLAG:
            self.FIRST_ROLLOUT_FLAG = False
            self.o = self.env.reset() # reset environment
            
        # Loop
        for t in range(ep_len_rollout):
            a,v_t,logp_t = self.sess.run(
                self.model['get_action_ops'],feed_dict={self.model['o_ph']:self.o.reshape(1,-1)})
            o2,r,d,_ = self.env.step(a[0])
            
            self.buf.store(self.o, a, r, v_t, logp_t)
            
            # Save next state 
            self.o = o2
            
            if d:
                self.buf.finish_path(last_val=0.)
                self.o = self.env.reset() # reset when done
        
        last_val = self.sess.run(self.model['v'], feed_dict={self.model['o_ph']:self.o.reshape(1,-1)})
        
        self.buf.finish_path(last_val=last_val)
        
        return self.buf.get()

In [6]:
# vb = np.zeros((100,1))
# ab = np.zeros((100,8))

In [7]:
# a = [[0.96776974,-0.7034082,0.5083786,0.06832138,0.2833113,0.32790965,0.8092029,-0.29485628]]
# v_t = [0.00756513]

In [8]:
# vb[t,:] = v_t

In [9]:
# ab[0:3,:]

### Initilize PyBullet Ant Environment

In [10]:
# general
n_cpu = n_workers = 3
total_steps,evaluate_every,print_every = 300,5,1
ep_len_rollout = 5000
batch_size,update_count = 128,ep_len_rollout
num_eval,max_ep_len_eval = 3,1e3

### Initialize Workers

In [11]:
eval_env = get_eval_env()

In [12]:
ray.init(num_cpus=n_cpu,
         memory = 5*1024*1024*1024,
         object_store_memory = 10*1024*1024*1024,
         driver_object_store_memory = 1*1024*1024*1024)
tf.reset_default_graph()
R = RolloutWorkerClass(seed=1)
workers = [RayRolloutWorkerClass.remote(
    worker_id=i,ep_len_rollout=ep_len_rollout) 
           for i in range(n_workers)]
print ("RAY initialized with [%d] cpus and [%d] workers."%
       (n_cpu,n_workers))

2020-07-02 18:20:45,743	INFO resource_spec.py:212 -- Starting Ray with 4.98 GiB memory available for workers and up to 10.0 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-02 18:20:46,095	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


[I] Created trainer
RAY initialized with [3] cpus and [3] workers.


In [13]:
time.sleep(1)

### Loop

In [14]:
start_time = time.time()
n_env_step = 0 # number of environment steps

for t in range(int(total_steps)):
    esec = time.time()-start_time
    
    # Synchronize worker weights
    weights = R.get_weights()
    set_weights_list = [worker.set_weights.remote(weights) for worker in workers] 
    
    # Make rollout and accumulate to Buffers
    t_start = time.time()
    ops = [worker.rollout.remote() for worker in workers]
    rollout_vals = ray.get(ops)
    sec_rollout = time.time() - t_start
    
    # Get stats before update
    t_start = time.time()
    feeds_list = []
    for rollout_val in rollout_vals:
        feeds = {k:v for k,v in zip(R.model['all_phs'],rollout_val)}
        feeds_list.append(feeds)
        pi_l_old, v_l_old, ent = R.sess.run(
            [R.graph['pi_loss'],R.graph['v_loss'],R.graph['approx_ent']],feed_dict=feeds)
        
    # Update the central agent
    for i in range(train_iters):
        for r_idx,rollout_val in enumerate(rollout_vals):
            feeds = feeds_list[r_idx]
            _, kl = R.sess.run([R.graph['train_op'], R.graph['approx_kl']], feed_dict=feeds)
            if kl > 1.5 * target_kl:
                break
            
    # Get stats after update
    for r_idx,rollout_val in enumerate(rollout_vals):
        feeds = feeds_list[r_idx]
        pi_l_new,v_l_new,kl,cf = R.sess.run(
            [R.graph['pi_loss'],R.graph['v_loss'],R.graph['approx_kl'],R.graph['clipfrac']],
            feed_dict=feeds)
    sec_update = time.time() - t_start

    # Print
    if (t == 0) or (((t+1)%print_every) == 0): 
        print ("[%d/%d] rollout:[%.1f]s update:[%.1f]s."%
               (t+1,total_steps,sec_rollout,sec_update))

    # Evaluate
    if (t == 0) or (((t+1)%evaluate_every) == 0) or (t == (total_steps-1)): 
        ram_percent = psutil.virtual_memory().percent # memory usage
        print ("[Evaluate] step:[%d/%d][%.1f%%] #step:[%.1e] time:[%s] ram:[%.1f%%]."%
               (t+1,total_steps,t/total_steps*100,
                n_env_step,
                time.strftime("%H:%M:%S", time.gmtime(time.time()-start_time)),
                ram_percent))
        
        o,d,ep_ret,ep_len = eval_env.reset(),False,0,0
#         _ = eval_env.render(mode='human') 
        while not(d or (ep_len == max_ep_len_eval)):
            a = R.sess.run(R.model['mu'],feed_dict={R.model['o_ph']:o.reshape(1,-1)})
            o,r,d,_ = eval_env.step(a[0])
#             _ = eval_env.render(mode='human') 
            ep_ret += r # compute return 
            ep_len += 1
        print ("[Evaluate] ep_ret:[%.4f] ep_len:[%d]"%(ep_ret,ep_len)) 
            
#         # Save current SAC model and replay buffers 
#         npz_path = '../data/net/pybullet_ant/ppo_model_and_buffers.npz'
#         save_ppo_model_and_buffer(npz_path,R,ppobuf,VERBOSE=False)
    
print ("Done.")

[2m[36m(pid=43923)[0m Ray Worker [2] Ready.
[2m[36m(pid=43924)[0m Ray Worker [1] Ready.
[2m[36m(pid=43925)[0m Ray Worker [0] Ready.
[1/300] rollout:[9.4]s update:[1.9]s.
[Evaluate] step:[1/300][0.0%] #step:[0.0e+00] time:[00:00:11] ram:[65.1%].
[Evaluate] ep_ret:[278.6957] ep_len:[1000]
[2/300] rollout:[8.1]s update:[1.6]s.
[3/300] rollout:[7.6]s update:[1.7]s.
[4/300] rollout:[7.7]s update:[1.8]s.
[5/300] rollout:[7.9]s update:[1.8]s.
[Evaluate] step:[5/300][1.3%] #step:[0.0e+00] time:[00:00:51] ram:[67.5%].
[Evaluate] ep_ret:[586.4084] ep_len:[1000]
[6/300] rollout:[8.3]s update:[1.8]s.
[7/300] rollout:[8.6]s update:[1.9]s.
[8/300] rollout:[12.6]s update:[2.3]s.
[9/300] rollout:[11.6]s update:[2.2]s.
[10/300] rollout:[11.2]s update:[2.1]s.
[Evaluate] step:[10/300][3.0%] #step:[0.0e+00] time:[00:01:55] ram:[68.1%].
[Evaluate] ep_ret:[23.5409] ep_len:[33]
[11/300] rollout:[9.9]s update:[1.9]s.
[12/300] rollout:[9.5]s update:[1.9]s.
[13/300] rollout:[10.2]s update:[2.0]s.
[14/3

KeyboardInterrupt: 

### Close

In [None]:
eval_env.close()

In [None]:
ray.shutdown()

### Save model weights and replay buffers

In [None]:
# Path to save the npz file 
npz_path = '../data/net/pybullet_ant/ppo_model_and_buffers_final.npz'
save_ppo_model_and_buffer(npz_path,R,ppobuf,VERBOSE=False)

### Reset the worker

In [None]:
R.sess.run(tf.global_variables_initializer())

### Load and assign model weights

In [None]:
# Load npz
npz_path = '../data/net/pybullet_ant/model_and_buffers_final.npz'
restore_ppo_model_and_buffer(npz_path,R,ppobuf,VERBOSE=True)

### Test-Run

In [None]:
eval_env = get_eval_env()
o,d,ep_ret,ep_len = eval_env.reset(),False,0,0
_ = eval_env.render(mode='human') 
while not(d or (ep_len == max_ep_len_eval)):
    a = R.get_action(o,deterministic=True)
    o,r,d,_ = eval_env.step(a)
    _ = eval_env.render(mode='human') 
    ep_ret += r # compute return 
    ep_len += 1
print ("[Evaluate] ep_ret:[%.4f] ep_len:[%d]"
    %(ep_ret,ep_len))
eval_env.close() # close env