From eee7f71943dc9efd9e6c85c7c400891682b0a991 Mon Sep 17 00:00:00 2001
From: quantumiracle <1402434478@qq.com>
Date: Tue, 11 Jun 2019 16:02:47 +0100
Subject: [PATCH 1/8] change readme
---
examples/reinforcement_learning/README.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md
index dc9b412f5..3a1262ccd 100644
--- a/examples/reinforcement_learning/README.md
+++ b/examples/reinforcement_learning/README.md
@@ -113,11 +113,11 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
-The max operator in standard DQN uses the same values both to select and to evaluate an action by:
- Q(s_t, a_t) = R\_{t+1\} + gamma \* max\_{a}Q\_\{target\}(s_{t+1}, a).
+ Q(s_t, a_t) = R_{t+1} + gamma * max_{a}Q_{target}(s_{t+1}, a).
-Double DQN proposes to use following evaluation to address overestimation problem of max operator:
- Q(s_t, a_t) = R\_{t+1\} + gamma \* Q\_{target}(s\_\{t+1\}, max{a}Q(s_{t+1}, a)).
+ Q(s_t, a_t) = R_{t+1} + gamma * Q_{target}(s_{t+1}, max_{a}Q(s_{t+1}, a)).
-Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately.
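
The two README targets touched by this hunk differ only in which network picks the greedy action. A minimal NumPy sketch of that difference, using purely illustrative `q_online` / `q_target` values that are not taken from the tutorial code:

```python
import numpy as np

# Hypothetical Q-values for the next state s_{t+1}
q_online = np.array([1.0, 2.5, 0.3])   # Q(s_{t+1}, .) from the online network
q_target = np.array([3.0, 2.0, 0.5])   # Q_target(s_{t+1}, .) from the target network
reward, gamma = 1.0, 0.99

# Standard DQN: the target network both selects and evaluates the action
y_dqn = reward + gamma * q_target.max()

# Double DQN: the online network selects the action, the target network evaluates it
a_star = q_online.argmax()
y_double = reward + gamma * q_target[a_star]

print(y_dqn, y_double)  # about 3.97 vs 2.98 with these numbers: the double estimate is less prone to overestimation
```
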
From 35ee5d24c28ab3016e96cc3b51e43b234b3032e8 Mon Sep 17 00:00:00 2001
From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com>
Date: Wed, 12 Jun 2019 19:03:55 +0800
Subject: [PATCH 2/8] Add files via upload
---
.../reinforcement_learning/tutorial_DDPG.py | 37 +++++--------
.../reinforcement_learning/tutorial_DPPO.py | 51 ++++++++----------
.../reinforcement_learning/tutorial_PG.py | 51 ++++++++----------
.../reinforcement_learning/tutorial_PPO.py | 54 ++++++++-----------
.../reinforcement_learning/tutorial_TRPO.py | 48 ++++++++---------
5 files changed, 102 insertions(+), 139 deletions(-)
diff --git a/examples/reinforcement_learning/tutorial_DDPG.py b/examples/reinforcement_learning/tutorial_DDPG.py
index 43efe0ad1..0bd9cadd0 100644
--- a/examples/reinforcement_learning/tutorial_DDPG.py
+++ b/examples/reinforcement_learning/tutorial_DDPG.py
@@ -27,16 +27,14 @@
"""
-import argparse
-import os
-import time
-
-import matplotlib.pyplot as plt
-import numpy as np
-
-import gym
import tensorflow as tf
import tensorlayer as tl
+import numpy as np
+import gym
+import time
+import matplotlib.pyplot as plt
+import os
+import argparse
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
@@ -60,6 +58,7 @@
TEST_PER_EPISODES = 10 # test the model per episodes
VAR = 3 # control exploration
+
############################### DDPG ####################################
@@ -160,8 +159,8 @@ def learn(self):
indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
bt = self.memory[indices, :]
bs = bt[:, :self.s_dim]
- ba = bt[:, self.s_dim:self.s_dim + self.a_dim]
- br = bt[:, -self.s_dim - 1:-self.s_dim]
+ ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
+ br = bt[:, -self.s_dim - 1: -self.s_dim]
bs_ = bt[:, -self.s_dim:]
with tf.GradientTape() as tape:
@@ -176,7 +175,7 @@ def learn(self):
with tf.GradientTape() as tape:
a = self.actor(bs)
q = self.critic([bs, a])
- a_loss = -tf.reduce_mean(q) # maximize the q
+ a_loss = - tf.reduce_mean(q) # maximize the q
a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
self.actor_opt.apply_gradients(zip(a_grads, self.actor.trainable_weights))
@@ -260,12 +259,8 @@ def load_ckpt(self):
s = s_
ep_reward += r
if j == MAX_EP_STEPS - 1:
- print(
- '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
- i, MAX_EPISODES, ep_reward,
- time.time() - t1
- ), end=''
- )
+ print('\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'
+ .format(i, MAX_EPISODES, ep_reward, time.time() - t1), end='')
plt.show()
# test
if i and not i % TEST_PER_EPISODES:
@@ -280,12 +275,8 @@ def load_ckpt(self):
s = s_
ep_reward += r
if j == MAX_EP_STEPS - 1:
- print(
- '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
- i, MAX_EPISODES, ep_reward,
- time.time() - t1
- )
- )
+ print('\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'
+ .format(i, MAX_EPISODES, ep_reward, time.time() - t1))
reward_buffer.append(ep_reward)
diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py
index 62eb7f7fb..c9747c867 100644
--- a/examples/reinforcement_learning/tutorial_DPPO.py
+++ b/examples/reinforcement_learning/tutorial_DPPO.py
@@ -1,7 +1,7 @@
"""
Distributed Proximal Policy Optimization (DPPO)
----------------------------
-A distributed version of OpenAI's Proximal Policy Optimization (PPO).
+A distributing version of OpenAI's Proximal Policy Optimization (PPO).
Workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data.
Restart workers once PPO is updated.
@@ -29,19 +29,16 @@
"""
-import argparse
-import os
-import queue
-import threading
-import time
-
-import matplotlib.pyplot as plt
+import tensorflow as tf
import numpy as np
+import matplotlib.pyplot as plt
+import gym, threading, queue
+import time
-import gym
-import tensorflow as tf
-import tensorflow_probability as tfp
import tensorlayer as tl
+import tensorflow_probability as tfp
+import os
+import argparse
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
@@ -63,17 +60,16 @@
C_UPDATE_STEPS = 10 # critic update steps
S_DIM, A_DIM = 3, 1 # state dimension, action dimension
EPS = 1e-8 # epsilon
-METHOD = [
- dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
- dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
-][1] # choose the method for optimization
+METHOD = [dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
+ dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
+ ][1] # choose the method for optimization
N_WORKER = 4 # parallel workers
MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO
UPDATE_STEP = 10 # loop update operation n-steps
-############################### DPPO ####################################
+############################### DPPO ####################################
class PPO(object):
'''
@@ -92,6 +88,8 @@ def __init__(self):
# actor
self.actor = self._build_anet('pi', trainable=True)
self.actor_old = self._build_anet('oldpi', trainable=False)
+ self.actor_opt = tf.optimizers.Adam(A_LR)
+ self.critic_opt = tf.optimizers.Adam(C_LR)
def a_train(self, tfs, tfa, tfadv):
'''
@@ -120,13 +118,12 @@ def a_train(self, tfs, tfa, tfadv):
kl_mean = tf.reduce_mean(kl)
aloss = -(tf.reduce_mean(surr - tflam * kl))
else: # clipping method, find this is better
- aloss = -tf.reduce_mean(
- tf.minimum(surr,
- tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)
- )
+ aloss = -tf.reduce_mean(tf.minimum(
+ surr,
+ tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv))
a_gard = tape.gradient(aloss, self.actor.trainable_weights)
- tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights))
+ self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
if METHOD['name'] == 'kl_pen':
return kl_mean
@@ -151,7 +148,7 @@ def c_train(self, tfdc_r, s):
advantage = tfdc_r - self.critic(s)
closs = tf.reduce_mean(tf.square(advantage))
grad = tape.gradient(closs, self.critic.trainable_weights)
- tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights))
+ self.critic_opt.apply_gradients(zip(grad, self.critic.trainable_weights))
def cal_adv(self, tfs, tfdc_r):
'''
@@ -284,7 +281,7 @@ class Worker(object):
def __init__(self, wid):
self.wid = wid
self.env = gym.make(GAME).unwrapped
- self.env.seed(wid * 100 + RANDOMSEED)
+ self.env.seed(wid*100 + RANDOMSEED)
self.ppo = GLOBAL_PPO
def work(self):
@@ -337,12 +334,8 @@ def work(self):
GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
GLOBAL_EP += 1
- print(
- 'Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
- GLOBAL_EP, EP_MAX, self.wid, ep_r,
- time.time() - t0
- )
- )
+ print('Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'
+ .format(GLOBAL_EP, EP_MAX, self.wid, ep_r, time.time() - t0))
if __name__ == '__main__':
diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py
index 42c47aacc..7adb76d2d 100644
--- a/examples/reinforcement_learning/tutorial_PG.py
+++ b/examples/reinforcement_learning/tutorial_PG.py
@@ -27,39 +27,38 @@
"""
-import argparse
-import os
-import time
-
-import matplotlib.pyplot as plt
+import tensorflow as tf
+import tensorlayer as tl
import numpy as np
import gym
-import tensorflow as tf
-import tensorlayer as tl
+import matplotlib.pyplot as plt
+import time
+import os
+import argparse
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
parser.add_argument('--test', dest='train', action='store_false')
args = parser.parse_args()
+
##################### hyper parameters ####################
ENV_NAME = 'CartPole-v0' # environment name
RANDOMSEED = 1 # random seed
-DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold
-RENDER = False # rendering wastes time
+DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold
+RENDER = False # rendering wastes time
num_episodes = 3000
-############################### PG ####################################
+############################### PG ####################################
class PolicyGradient:
"""
PG class
"""
-
def __init__(self, n_features, n_actions, learning_rate=0.01, reward_decay=0.95):
self.n_actions = n_actions
self.n_features = n_features
@@ -76,22 +75,16 @@ def get_model(inputs_shape):
"""
with tf.name_scope('inputs'):
self.tf_obs = tl.layers.Input(inputs_shape, tf.float32, name="observations")
- self.tf_acts = tl.layers.Input([
- None,
- ], tf.int32, name="actions_num")
- self.tf_vt = tl.layers.Input([
- None,
- ], tf.float32, name="actions_value")
+ self.tf_acts = tl.layers.Input([None, ], tf.int32, name="actions_num")
+ self.tf_vt = tl.layers.Input([None, ], tf.float32, name="actions_value")
# fc1
- layer = tl.layers.Dense(
- n_units=30, act=tf.nn.tanh, W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
- b_init=tf.constant_initializer(0.1), name='fc1'
- )(self.tf_obs)
+ layer = tl.layers.Dense(n_units=30, act=tf.nn.tanh,
+ W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
+ b_init=tf.constant_initializer(0.1), name='fc1')(self.tf_obs)
# fc2
- all_act = tl.layers.Dense(
- n_units=self.n_actions, act=None, W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
- b_init=tf.constant_initializer(0.1), name='all_act'
- )(layer)
+ all_act = tl.layers.Dense(n_units=self.n_actions, act=None,
+ W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
+ b_init=tf.constant_initializer(0.1), name='all_act')(layer)
return tl.models.Model(inputs=self.tf_obs, outputs=all_act, name='PG model')
self.model = get_model([None, n_features])
@@ -198,7 +191,7 @@ def load_ckpt(self):
tl.logging.set_verbosity(tl.logging.DEBUG)
env = gym.make(ENV_NAME)
- env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance
+ env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance
env = env.unwrapped
print(env.action_space)
@@ -245,10 +238,8 @@ def load_ckpt(self):
# print("episode:", i_episode, " reward:", int(running_reward))
- print(
- "Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " %
- (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time)
- )
+ print("Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " %
+ (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time))
reward_buffer.append(running_reward)
vt = RL.learn()
diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py
index d95633234..c84effaf6 100644
--- a/examples/reinforcement_learning/tutorial_PPO.py
+++ b/examples/reinforcement_learning/tutorial_PPO.py
@@ -28,17 +28,15 @@
"""
-import argparse
-import os
-import time
-
-import matplotlib.pyplot as plt
+import tensorflow as tf
import numpy as np
-
+import matplotlib.pyplot as plt
import gym
-import tensorflow as tf
-import tensorflow_probability as tfp
import tensorlayer as tl
+import tensorflow_probability as tfp
+import time
+import os
+import argparse
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
@@ -60,13 +58,12 @@
C_UPDATE_STEPS = 10 # critic update steps
S_DIM, A_DIM = 3, 1 # state dimension, action dimension
EPS = 1e-8 # epsilon
-METHOD = [
- dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
- dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
-][1] # choose the method for optimization
+METHOD = [dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
+ dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
+ ][1] # choose the method for optimization
-############################### PPO ####################################
+############################### PPO ####################################
class PPO(object):
'''
@@ -85,6 +82,8 @@ def __init__(self):
# actor
self.actor = self._build_anet('pi', trainable=True)
self.actor_old = self._build_anet('oldpi', trainable=False)
+ self.actor_opt = tf.optimizers.Adam(A_LR)
+ self.critic_opt = tf.optimizers.Adam(C_LR)
def a_train(self, tfs, tfa, tfadv):
'''
@@ -113,13 +112,12 @@ def a_train(self, tfs, tfa, tfadv):
kl_mean = tf.reduce_mean(kl)
aloss = -(tf.reduce_mean(surr - tflam * kl))
else: # clipping method, find this is better
- aloss = -tf.reduce_mean(
- tf.minimum(surr,
- tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)
- )
+ aloss = -tf.reduce_mean(tf.minimum(
+ surr,
+ tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv))
a_gard = tape.gradient(aloss, self.actor.trainable_weights)
- tf.optimizers.Adam(A_LR).apply_gradients(zip(a_gard, self.actor.trainable_weights))
+ self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
if METHOD['name'] == 'kl_pen':
return kl_mean
@@ -146,7 +144,7 @@ def c_train(self, tfdc_r, s):
closs = tf.reduce_mean(tf.square(advantage))
# print('tfdc_r value', tfdc_r)
grad = tape.gradient(closs, self.critic.trainable_weights)
- tf.optimizers.Adam(C_LR).apply_gradients(zip(grad, self.critic.trainable_weights))
+ self.critic_opt.apply_gradients(zip(grad, self.critic.trainable_weights))
def cal_adv(self, tfs, tfdc_r):
'''
@@ -177,16 +175,14 @@ def update(self, s, a, r):
if METHOD['name'] == 'kl_pen':
for _ in range(A_UPDATE_STEPS):
kl = self.a_train(s, a, adv)
- if kl > 4 * METHOD['kl_target']: # this in in google's paper
+ if kl > 4 * METHOD['kl_target']: # this in in google's paper
break
- if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper
+ if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper
METHOD['lam'] /= 2
elif kl > METHOD['kl_target'] * 1.5:
METHOD['lam'] *= 2
- METHOD['lam'] = np.clip(
- METHOD['lam'], 1e-4, 10
- ) # sometimes explode, this clipping is MorvanZhou's solution
- else: # clipping method, find this is better (OpenAI's paper)
+ METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is MorvanZhou's solution
+ else: # clipping method, find this is better (OpenAI's paper)
for _ in range(A_UPDATE_STEPS):
self.a_train(s, a, adv)
@@ -301,12 +297,8 @@ def load_ckpt(self):
all_ep_r.append(ep_r)
else:
all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
- print(
- 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
- ep, EP_MAX, ep_r,
- time.time() - t0
- )
- )
+ print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'
+ .format(ep, EP_MAX, ep_r, time.time() - t0))
plt.ion()
plt.cla()
diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py
index f64a0a0c0..017ac086d 100644
--- a/examples/reinforcement_learning/tutorial_TRPO.py
+++ b/examples/reinforcement_learning/tutorial_TRPO.py
@@ -28,20 +28,19 @@
python tutorial_TRPO.py --train/test
"""
-import argparse
-import copy
-import os
-import time
-
-import matplotlib.pyplot as plt
import numpy as np
-import scipy.signal
-
-import gym
import tensorflow as tf
import tensorflow_probability as tfp
import tensorlayer as tl
+import gym
+import time
+
+import matplotlib.pyplot as plt
+import scipy.signal
+import copy
from gym.spaces import Box, Discrete
+import os
+import argparse
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
@@ -77,8 +76,8 @@
SAVE_FREQ = 10 # How often (in terms of gap between epochs) to save the current policy and value function
EPS = 1e-8 # epsilon
-##################### functions ####################
+##################### functions ####################
def combined_shape(length, shape=None):
"""
@@ -137,7 +136,7 @@ def input_layer_from_space(space):
if isinstance(space, Box):
return input_layer(space.shape)
elif isinstance(space, Discrete):
- return tl.layers.Input(dtype=tf.int32, shape=(None, ))
+ return tl.layers.Input(dtype=tf.int32, shape=(None,))
raise NotImplementedError
@@ -150,7 +149,7 @@ def input_layers_from_spaces(*args):
return [input_layer_from_space(space) for space in args]
-def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None):
+def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
"""
create Multi-Layer Perception
:param x: tensorlayer input layer
@@ -191,7 +190,7 @@ def gaussian_likelihood(x, mu, log_std):
:param log_std: log std
:return: gaussian likelihood
"""
- pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi))
+ pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi))
return tf.reduce_sum(pre_sum, axis=1)
@@ -202,7 +201,7 @@ def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1):
(https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions)
"""
var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1)
- pre_sum = 0.5 * (((mu1 - mu0)**2 + var0) / (var1 + EPS) - 1) + log_std1 - log_std0
+ pre_sum = 0.5 * (((mu1 - mu0) ** 2 + var0) / (var1 + EPS) - 1) + log_std1 - log_std0
all_kls = tf.reduce_sum(pre_sum, axis=1)
return tf.reduce_mean(all_kls)
@@ -222,7 +221,7 @@ def flat_concat(xs):
:param xs: a list of tensor
:return: flat tensor
"""
- return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0)
+ return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0)
def assign_params_from_flat(x, params):
@@ -334,10 +333,8 @@ def cal_outputs_1(self, states, actions, old_log_std_ph, old_mu_ph):
"""
-def mlp_actor_critic(
- x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh,
- output_activation=None
-):
+def mlp_actor_critic(x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh,
+ output_activation=None):
"""
create actor and critic
:param x: observation space
@@ -356,7 +353,6 @@ def mlp_actor_critic(
raise ValueError('action space type error')
class Critic:
-
def __init__(self, obs_space, hidden_layer_sizes, activation_funcs):
inputs = input_layer_from_space(obs_space)
self.model = tl.models.Model(inputs, mlp(inputs, list(hidden_layer_sizes) + [1], activation_funcs, None))
@@ -446,11 +442,12 @@ def get(self):
# the next two lines implement the advantage normalization trick
adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf)
self.adv_buf = (self.adv_buf - adv_mean) / adv_std
- return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf
- ] + values_as_sorted_list(self.info_bufs)
+ return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf,
+ self.logp_buf] + values_as_sorted_list(self.info_bufs)
##################### TRPO ####################
+
"""
Trust Region Policy Optimization
@@ -464,7 +461,6 @@ class TRPO:
"""
trpo class
"""
-
def __init__(self, obs_space, act_space):
obs_dim = obs_space.shape
@@ -500,7 +496,7 @@ def get_action_ops(self, states):
res0 = [pi, v, logp_pi] + values_as_sorted_list(info)
res = []
for i in res0:
- res.append(i + 0) # transfer to tensor
+ res.append(i + 0) # transfer to tensor
return res
# TRPO losses
@@ -525,7 +521,7 @@ def v_loss(self, inputs):
"""
x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs
v = self.critic.critic_cal_func(x_ph)
- v_loss = tf.reduce_mean((ret_ph - v)**2)
+ v_loss = tf.reduce_mean((ret_ph - v) ** 2)
return v_loss
def train_vf(self, inputs):
@@ -656,7 +652,7 @@ def set_and_eval(step):
# trpo augments npg with backtracking line search, hard kl
for j in range(BACKTRACK_ITERS):
- kl, pi_l_new = set_and_eval(step=BACKTRACK_COEFF**j)
+ kl, pi_l_new = set_and_eval(step=BACKTRACK_COEFF ** j)
if kl <= DELTA and pi_l_new <= pi_l_old:
# Accepting new params at step of line search
break
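
Besides the import reordering and formatting churn, the substantive change in this patch is that PPO and DPPO now build `self.actor_opt` / `self.critic_opt` once in `__init__` instead of constructing `tf.optimizers.Adam(...)` inside every `a_train` / `c_train` call. Reusing a single optimizer matters because Adam keeps per-variable moment estimates between steps. A toy sketch of the two patterns, with an illustrative variable and loss that are not part of the tutorials:

```python
import tensorflow as tf

w = tf.Variable(1.0)          # stand-in for a network's trainable weights

def one_step(opt):
    with tf.GradientTape() as tape:
        loss = tf.square(w)   # stand-in loss
    grads = tape.gradient(loss, [w])
    opt.apply_gradients(zip(grads, [w]))

# Pattern used after this patch: one optimizer object, so Adam's state accumulates.
reused = tf.optimizers.Adam(0.01)
for _ in range(3):
    one_step(reused)

# Old pattern: a fresh Adam on every call, so its moment estimates restart each step.
for _ in range(3):
    one_step(tf.optimizers.Adam(0.01))
```
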
From 2a12a04b288a270956a4da11375ea216523b5589 Mon Sep 17 00:00:00 2001
From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com>
Date: Wed, 12 Jun 2019 19:54:49 +0800
Subject: [PATCH 3/8] fix opt and make format
---
.../reinforcement_learning/tutorial_DDPG.py | 35 ++++++++-----
.../reinforcement_learning/tutorial_DPPO.py | 45 ++++++++++-------
.../reinforcement_learning/tutorial_PG.py | 50 +++++++++++--------
.../reinforcement_learning/tutorial_PPO.py | 49 ++++++++++--------
.../reinforcement_learning/tutorial_TRPO.py | 47 +++++++++--------
5 files changed, 133 insertions(+), 93 deletions(-)
diff --git a/examples/reinforcement_learning/tutorial_DDPG.py b/examples/reinforcement_learning/tutorial_DDPG.py
index 0bd9cadd0..a0079a014 100644
--- a/examples/reinforcement_learning/tutorial_DDPG.py
+++ b/examples/reinforcement_learning/tutorial_DDPG.py
@@ -27,14 +27,16 @@
"""
-import tensorflow as tf
-import tensorlayer as tl
-import numpy as np
-import gym
+import argparse
+import os
import time
+
import matplotlib.pyplot as plt
-import os
-import argparse
+import numpy as np
+
+import gym
+import tensorflow as tf
+import tensorlayer as tl
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
@@ -58,7 +60,6 @@
TEST_PER_EPISODES = 10 # test the model per episodes
VAR = 3 # control exploration
-
############################### DDPG ####################################
@@ -159,8 +160,8 @@ def learn(self):
indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
bt = self.memory[indices, :]
bs = bt[:, :self.s_dim]
- ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
- br = bt[:, -self.s_dim - 1: -self.s_dim]
+ ba = bt[:, self.s_dim:self.s_dim + self.a_dim]
+ br = bt[:, -self.s_dim - 1:-self.s_dim]
bs_ = bt[:, -self.s_dim:]
with tf.GradientTape() as tape:
@@ -259,8 +260,12 @@ def load_ckpt(self):
s = s_
ep_reward += r
if j == MAX_EP_STEPS - 1:
- print('\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'
- .format(i, MAX_EPISODES, ep_reward, time.time() - t1), end='')
+ print(
+ '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
+ i, MAX_EPISODES, ep_reward,
+ time.time() - t1
+ ), end=''
+ )
plt.show()
# test
if i and not i % TEST_PER_EPISODES:
@@ -275,8 +280,12 @@ def load_ckpt(self):
s = s_
ep_reward += r
if j == MAX_EP_STEPS - 1:
- print('\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'
- .format(i, MAX_EPISODES, ep_reward, time.time() - t1))
+ print(
+ '\rEpisode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
+ i, MAX_EPISODES, ep_reward,
+ time.time() - t1
+ )
+ )
reward_buffer.append(ep_reward)
diff --git a/examples/reinforcement_learning/tutorial_DPPO.py b/examples/reinforcement_learning/tutorial_DPPO.py
index c9747c867..abe4be035 100644
--- a/examples/reinforcement_learning/tutorial_DPPO.py
+++ b/examples/reinforcement_learning/tutorial_DPPO.py
@@ -1,7 +1,7 @@
"""
Distributed Proximal Policy Optimization (DPPO)
----------------------------
-A distributing version of OpenAI's Proximal Policy Optimization (PPO).
+A distributed version of OpenAI's Proximal Policy Optimization (PPO).
Workers in parallel to collect data, then stop worker's roll-out and train PPO on collected data.
Restart workers once PPO is updated.
@@ -29,16 +29,19 @@
"""
-import tensorflow as tf
-import numpy as np
-import matplotlib.pyplot as plt
-import gym, threading, queue
+import argparse
+import os
+import queue
+import threading
import time
-import tensorlayer as tl
+import matplotlib.pyplot as plt
+import numpy as np
+
+import gym
+import tensorflow as tf
import tensorflow_probability as tfp
-import os
-import argparse
+import tensorlayer as tl
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
@@ -60,17 +63,18 @@
C_UPDATE_STEPS = 10 # critic update steps
S_DIM, A_DIM = 3, 1 # state dimension, action dimension
EPS = 1e-8 # epsilon
-METHOD = [dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
- dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
- ][1] # choose the method for optimization
+METHOD = [
+ dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
+ dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
+][1] # choose the method for optimization
N_WORKER = 4 # parallel workers
MIN_BATCH_SIZE = 64 # minimum batch size for updating PPO
UPDATE_STEP = 10 # loop update operation n-steps
-
############################### DPPO ####################################
+
class PPO(object):
'''
PPO class
@@ -118,9 +122,10 @@ def a_train(self, tfs, tfa, tfadv):
kl_mean = tf.reduce_mean(kl)
aloss = -(tf.reduce_mean(surr - tflam * kl))
else: # clipping method, find this is better
- aloss = -tf.reduce_mean(tf.minimum(
- surr,
- tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv))
+ aloss = -tf.reduce_mean(
+ tf.minimum(surr,
+ tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)
+ )
a_gard = tape.gradient(aloss, self.actor.trainable_weights)
self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
@@ -281,7 +286,7 @@ class Worker(object):
def __init__(self, wid):
self.wid = wid
self.env = gym.make(GAME).unwrapped
- self.env.seed(wid*100 + RANDOMSEED)
+ self.env.seed(wid * 100 + RANDOMSEED)
self.ppo = GLOBAL_PPO
def work(self):
@@ -334,8 +339,12 @@ def work(self):
GLOBAL_RUNNING_R.append(GLOBAL_RUNNING_R[-1] * 0.9 + ep_r * 0.1)
GLOBAL_EP += 1
- print('Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'
- .format(GLOBAL_EP, EP_MAX, self.wid, ep_r, time.time() - t0))
+ print(
+ 'Episode: {}/{} | Worker: {} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
+ GLOBAL_EP, EP_MAX, self.wid, ep_r,
+ time.time() - t0
+ )
+ )
if __name__ == '__main__':
diff --git a/examples/reinforcement_learning/tutorial_PG.py b/examples/reinforcement_learning/tutorial_PG.py
index 7adb76d2d..c4a658a99 100644
--- a/examples/reinforcement_learning/tutorial_PG.py
+++ b/examples/reinforcement_learning/tutorial_PG.py
@@ -26,39 +26,39 @@
python tutorial_PG.py --train/test
"""
+import argparse
+import os
+import time
-import tensorflow as tf
-import tensorlayer as tl
+import matplotlib.pyplot as plt
import numpy as np
import gym
-import matplotlib.pyplot as plt
-import time
-import os
-import argparse
+import tensorflow as tf
+import tensorlayer as tl
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
parser.add_argument('--test', dest='train', action='store_false')
args = parser.parse_args()
-
##################### hyper parameters ####################
ENV_NAME = 'CartPole-v0' # environment name
RANDOMSEED = 1 # random seed
-DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold
-RENDER = False # rendering wastes time
+DISPLAY_REWARD_THRESHOLD = 400 # renders environment if total episode reward is greater then this threshold
+RENDER = False # rendering wastes time
num_episodes = 3000
-
############################### PG ####################################
+
class PolicyGradient:
"""
PG class
"""
+
def __init__(self, n_features, n_actions, learning_rate=0.01, reward_decay=0.95):
self.n_actions = n_actions
self.n_features = n_features
@@ -75,16 +75,22 @@ def get_model(inputs_shape):
"""
with tf.name_scope('inputs'):
self.tf_obs = tl.layers.Input(inputs_shape, tf.float32, name="observations")
- self.tf_acts = tl.layers.Input([None, ], tf.int32, name="actions_num")
- self.tf_vt = tl.layers.Input([None, ], tf.float32, name="actions_value")
+ self.tf_acts = tl.layers.Input([
+ None,
+ ], tf.int32, name="actions_num")
+ self.tf_vt = tl.layers.Input([
+ None,
+ ], tf.float32, name="actions_value")
# fc1
- layer = tl.layers.Dense(n_units=30, act=tf.nn.tanh,
- W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
- b_init=tf.constant_initializer(0.1), name='fc1')(self.tf_obs)
+ layer = tl.layers.Dense(
+ n_units=30, act=tf.nn.tanh, W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
+ b_init=tf.constant_initializer(0.1), name='fc1'
+ )(self.tf_obs)
# fc2
- all_act = tl.layers.Dense(n_units=self.n_actions, act=None,
- W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
- b_init=tf.constant_initializer(0.1), name='all_act')(layer)
+ all_act = tl.layers.Dense(
+ n_units=self.n_actions, act=None, W_init=tf.random_normal_initializer(mean=0, stddev=0.3),
+ b_init=tf.constant_initializer(0.1), name='all_act'
+ )(layer)
return tl.models.Model(inputs=self.tf_obs, outputs=all_act, name='PG model')
self.model = get_model([None, n_features])
@@ -191,7 +197,7 @@ def load_ckpt(self):
tl.logging.set_verbosity(tl.logging.DEBUG)
env = gym.make(ENV_NAME)
- env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance
+ env.seed(RANDOMSEED) # reproducible, general Policy gradient has high variance
env = env.unwrapped
print(env.action_space)
@@ -238,8 +244,10 @@ def load_ckpt(self):
# print("episode:", i_episode, " reward:", int(running_reward))
- print("Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " %
- (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time))
+ print(
+ "Episode [%d/%d] \tsum reward: %d \trunning reward: %f \ttook: %.5fs " %
+ (i_episode, num_episodes, ep_rs_sum, running_reward, time.time() - episode_time)
+ )
reward_buffer.append(running_reward)
vt = RL.learn()
diff --git a/examples/reinforcement_learning/tutorial_PPO.py b/examples/reinforcement_learning/tutorial_PPO.py
index c84effaf6..b20d03196 100644
--- a/examples/reinforcement_learning/tutorial_PPO.py
+++ b/examples/reinforcement_learning/tutorial_PPO.py
@@ -27,16 +27,17 @@
python tutorial_PPO.py --train/test
"""
+import argparse
+import os
+import time
-import tensorflow as tf
-import numpy as np
import matplotlib.pyplot as plt
+import numpy as np
+
import gym
-import tensorlayer as tl
+import tensorflow as tf
import tensorflow_probability as tfp
-import time
-import os
-import argparse
+import tensorlayer as tl
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
@@ -58,13 +59,14 @@
C_UPDATE_STEPS = 10 # critic update steps
S_DIM, A_DIM = 3, 1 # state dimension, action dimension
EPS = 1e-8 # epsilon
-METHOD = [dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
- dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
- ][1] # choose the method for optimization
-
+METHOD = [
+ dict(name='kl_pen', kl_target=0.01, lam=0.5), # KL penalty
+ dict(name='clip', epsilon=0.2), # Clipped surrogate objective, find this is better
+][1] # choose the method for optimization
############################### PPO ####################################
+
class PPO(object):
'''
PPO class
@@ -112,9 +114,10 @@ def a_train(self, tfs, tfa, tfadv):
kl_mean = tf.reduce_mean(kl)
aloss = -(tf.reduce_mean(surr - tflam * kl))
else: # clipping method, find this is better
- aloss = -tf.reduce_mean(tf.minimum(
- surr,
- tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv))
+ aloss = -tf.reduce_mean(
+ tf.minimum(surr,
+ tf.clip_by_value(ratio, 1. - METHOD['epsilon'], 1. + METHOD['epsilon']) * tfadv)
+ )
a_gard = tape.gradient(aloss, self.actor.trainable_weights)
self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
@@ -169,20 +172,22 @@ def update(self, s, a, r):
self.update_old_pi()
adv = self.cal_adv(s, r)
- # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful
+ # adv = (adv - adv.mean())/(adv.std()+1e-6) # sometimes helpful
# update actor
if METHOD['name'] == 'kl_pen':
for _ in range(A_UPDATE_STEPS):
kl = self.a_train(s, a, adv)
- if kl > 4 * METHOD['kl_target']: # this in in google's paper
+ if kl > 4 * METHOD['kl_target']: # this in in google's paper
break
- if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper
+ if kl < METHOD['kl_target'] / 1.5: # adaptive lambda, this is in OpenAI's paper
METHOD['lam'] /= 2
elif kl > METHOD['kl_target'] * 1.5:
METHOD['lam'] *= 2
- METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10) # sometimes explode, this clipping is MorvanZhou's solution
- else: # clipping method, find this is better (OpenAI's paper)
+ METHOD['lam'] = np.clip(
+ METHOD['lam'], 1e-4, 10
+ ) # sometimes explode, this clipping is MorvanZhou's solution
+ else: # clipping method, find this is better (OpenAI's paper)
for _ in range(A_UPDATE_STEPS):
self.a_train(s, a, adv)
@@ -297,8 +302,12 @@ def load_ckpt(self):
all_ep_r.append(ep_r)
else:
all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
- print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'
- .format(ep, EP_MAX, ep_r, time.time() - t0))
+ print(
+ 'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
+ ep, EP_MAX, ep_r,
+ time.time() - t0
+ )
+ )
plt.ion()
plt.cla()
diff --git a/examples/reinforcement_learning/tutorial_TRPO.py b/examples/reinforcement_learning/tutorial_TRPO.py
index 017ac086d..6f90b9aad 100644
--- a/examples/reinforcement_learning/tutorial_TRPO.py
+++ b/examples/reinforcement_learning/tutorial_TRPO.py
@@ -28,19 +28,20 @@
python tutorial_TRPO.py --train/test
"""
-import numpy as np
-import tensorflow as tf
-import tensorflow_probability as tfp
-import tensorlayer as tl
-import gym
+import argparse
+import copy
+import os
import time
import matplotlib.pyplot as plt
+import numpy as np
import scipy.signal
-import copy
+
+import gym
+import tensorflow as tf
+import tensorflow_probability as tfp
+import tensorlayer as tl
from gym.spaces import Box, Discrete
-import os
-import argparse
parser = argparse.ArgumentParser(description='Train or test neural net motor controller.')
parser.add_argument('--train', dest='train', action='store_true', default=True)
@@ -76,9 +77,9 @@
SAVE_FREQ = 10 # How often (in terms of gap between epochs) to save the current policy and value function
EPS = 1e-8 # epsilon
-
##################### functions ####################
+
def combined_shape(length, shape=None):
"""
combine length and shape based on shape type
@@ -136,7 +137,7 @@ def input_layer_from_space(space):
if isinstance(space, Box):
return input_layer(space.shape)
elif isinstance(space, Discrete):
- return tl.layers.Input(dtype=tf.int32, shape=(None,))
+ return tl.layers.Input(dtype=tf.int32, shape=(None, ))
raise NotImplementedError
@@ -149,7 +150,7 @@ def input_layers_from_spaces(*args):
return [input_layer_from_space(space) for space in args]
-def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
+def mlp(x, hidden_sizes=(32, ), activation=tf.tanh, output_activation=None):
"""
create Multi-Layer Perception
:param x: tensorlayer input layer
@@ -190,7 +191,7 @@ def gaussian_likelihood(x, mu, log_std):
:param log_std: log std
:return: gaussian likelihood
"""
- pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2 + 2 * log_std + np.log(2 * np.pi))
+ pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS))**2 + 2 * log_std + np.log(2 * np.pi))
return tf.reduce_sum(pre_sum, axis=1)
@@ -201,7 +202,7 @@ def diagonal_gaussian_kl(mu0, log_std0, mu1, log_std1):
(https://en.wikipedia.org/wiki/Kullback-Leibler_divergence#Multivariate_normal_distributions)
"""
var0, var1 = tf.exp(2 * log_std0), tf.exp(2 * log_std1)
- pre_sum = 0.5 * (((mu1 - mu0) ** 2 + var0) / (var1 + EPS) - 1) + log_std1 - log_std0
+ pre_sum = 0.5 * (((mu1 - mu0)**2 + var0) / (var1 + EPS) - 1) + log_std1 - log_std0
all_kls = tf.reduce_sum(pre_sum, axis=1)
return tf.reduce_mean(all_kls)
@@ -221,7 +222,7 @@ def flat_concat(xs):
:param xs: a list of tensor
:return: flat tensor
"""
- return tf.concat([tf.reshape(x, (-1,)) for x in xs], axis=0)
+ return tf.concat([tf.reshape(x, (-1, )) for x in xs], axis=0)
def assign_params_from_flat(x, params):
@@ -333,8 +334,10 @@ def cal_outputs_1(self, states, actions, old_log_std_ph, old_mu_ph):
"""
-def mlp_actor_critic(x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh,
- output_activation=None):
+def mlp_actor_critic(
+ x: 'env.observation_space', a: 'env.action_space', hidden_sizes=(64, 64), activation=tf.tanh,
+ output_activation=None
+):
"""
create actor and critic
:param x: observation space
@@ -353,6 +356,7 @@ def mlp_actor_critic(x: 'env.observation_space', a: 'env.action_space', hidden_s
raise ValueError('action space type error')
class Critic:
+
def __init__(self, obs_space, hidden_layer_sizes, activation_funcs):
inputs = input_layer_from_space(obs_space)
self.model = tl.models.Model(inputs, mlp(inputs, list(hidden_layer_sizes) + [1], activation_funcs, None))
@@ -442,8 +446,8 @@ def get(self):
# the next two lines implement the advantage normalization trick
adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf)
self.adv_buf = (self.adv_buf - adv_mean) / adv_std
- return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf,
- self.logp_buf] + values_as_sorted_list(self.info_bufs)
+ return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf
+ ] + values_as_sorted_list(self.info_bufs)
##################### TRPO ####################
@@ -461,6 +465,7 @@ class TRPO:
"""
trpo class
"""
+
def __init__(self, obs_space, act_space):
obs_dim = obs_space.shape
@@ -496,7 +501,7 @@ def get_action_ops(self, states):
res0 = [pi, v, logp_pi] + values_as_sorted_list(info)
res = []
for i in res0:
- res.append(i + 0) # transfer to tensor
+ res.append(i + 0) # transfer to tensor
return res
# TRPO losses
@@ -521,7 +526,7 @@ def v_loss(self, inputs):
"""
x_ph, a_ph, adv_ph, ret_ph, logp_old_ph, *info_values = inputs
v = self.critic.critic_cal_func(x_ph)
- v_loss = tf.reduce_mean((ret_ph - v) ** 2)
+ v_loss = tf.reduce_mean((ret_ph - v)**2)
return v_loss
def train_vf(self, inputs):
@@ -652,7 +657,7 @@ def set_and_eval(step):
# trpo augments npg with backtracking line search, hard kl
for j in range(BACKTRACK_ITERS):
- kl, pi_l_new = set_and_eval(step=BACKTRACK_COEFF ** j)
+ kl, pi_l_new = set_and_eval(step=BACKTRACK_COEFF**j)
if kl <= DELTA and pi_l_new <= pi_l_old:
# Accepting new params at step of line search
break
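
The TRPO hunks above only reflow exponent spacing, but the one-line expressions they touch are easier to check against the closed forms they implement. Under the code's own definitions (sigma = exp(log_std), with `EPS = 1e-8` added purely for numerical stability), `gaussian_likelihood` and `diagonal_gaussian_kl` compute:

```latex
% pre_sum in gaussian_likelihood, summed over dimensions i
\log p(x \mid \mu, \sigma) =
  -\frac{1}{2} \sum_i \left[ \left( \frac{x_i - \mu_i}{\sigma_i} \right)^2
  + 2 \log \sigma_i + \log 2\pi \right]

% pre_sum in diagonal_gaussian_kl, summed over dimensions, then averaged over the batch
D_{\mathrm{KL}}\left( \mathcal{N}(\mu_0, \sigma_0) \,\Vert\, \mathcal{N}(\mu_1, \sigma_1) \right) =
  \sum_i \left[ \frac{(\mu_{1,i} - \mu_{0,i})^2 + \sigma_{0,i}^2}{2 \sigma_{1,i}^2}
  - \frac{1}{2} + \log \sigma_{1,i} - \log \sigma_{0,i} \right]
```
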
From 8521836046144d51707d4d91c26a503ca25ced2c Mon Sep 17 00:00:00 2001
From: quantumiracle <1402434478@qq.com>
Date: Wed, 12 Jun 2019 21:01:05 +0100
Subject: [PATCH 4/8] readme
---
examples/reinforcement_learning/README.md | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md
index 3a1262ccd..2d57d462e 100644
--- a/examples/reinforcement_learning/README.md
+++ b/examples/reinforcement_learning/README.md
@@ -41,11 +41,11 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
| Algorithms | Observation Space | Action Space | Tutorial Env |
| --------------- | ----------------- | ------------ | -------------- |
| Q-learning | Discrete | Discrete | FrozenLake |
-| C51 | Discrete | Discrete | Pong, CartPole |
+| C51 | Continuous | Discrete | Pong, CartPole |
| DQN | Discrete | Discrete | FrozenLake |
-| Variants of DQN | Discrete | Discrete | Pong, CartPole |
-| Retrace | Discrete | Discrete | Pong, CartPole |
-| PER | Discrete | Discrete | Pong, CartPole |
+| Variants of DQN | Continuous | Discrete | Pong, CartPole |
+| Retrace | Continuous | Discrete | Pong, CartPole |
+| PER | Continuous | Discrete | Pong, CartPole |
| Actor-Critic | Continuous | Discrete | CartPole |
| A3C | Continuous | Continuous | BipedalWalker |
| DDPG | Continuous | Continuous | Pendulum |
@@ -106,6 +106,10 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
Paper: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)
+ [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581)
+
+ [Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295)
+
Description:
```
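
This patch adds the Dueling Network and Noisy Networks references for the DQN-variants tutorial. For readers unfamiliar with the dueling architecture named in the description, the standard combination from the Dueling DQN paper is Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); a minimal TensorFlow illustration of that combine step (not code from the tutorial, values are illustrative):

```python
import tensorflow as tf

value = tf.constant([[0.5]])                 # V(s), shape (batch, 1)
advantage = tf.constant([[1.0, -1.0, 0.0]])  # A(s, a), shape (batch, n_actions)

# Dueling combine: subtract the mean advantage so V and A stay identifiable
q = value + advantage - tf.reduce_mean(advantage, axis=1, keepdims=True)
print(q.numpy())  # [[ 1.5 -0.5  0.5]]
```
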
From c167c18729588e4c0aea41e82be43876157d1268 Mon Sep 17 00:00:00 2001
From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com>
Date: Thu, 13 Jun 2019 11:50:44 +0800
Subject: [PATCH 5/8] update readme
add paper links and remove observation space in table
---
examples/reinforcement_learning/README.md | 693 +++++++++++-----------
1 file changed, 344 insertions(+), 349 deletions(-)
diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md
index 2d57d462e..c8a0a82e6 100644
--- a/examples/reinforcement_learning/README.md
+++ b/examples/reinforcement_learning/README.md
@@ -1,349 +1,344 @@
-# Reinforcement Learning Tutorial with Tensorlayer
-
-
-
-
-
-

-
-
-
-
-
-This repository contains implementation of most popular reinforcement learning algorithms with Tensorlayer 2.0, supporting [Tensorflow 2.0](https://www.tensorflow.org/alpha/guide/effective_tf2). We aim to make the reinforcement learning tutorial for each algorithm simple and straight-forward to use, as this would not only benefits new learners of reinforcement learning, but also provide convenience for senior researchers to testify their new ideas quickly.
-
-## Prerequisites:
-
-* python 3.5
-* tensorflow >= 2.0.0 or tensorflow-gpu >= 2.0.0a0
-* tensorlayer >= 2.0.1
-* tensorflow-probability
-* tf-nightly-2.0-preview
-
-*** If you meet the error`AttributeError: module 'tensorflow' has no attribute 'contrib'` when running the code after installing tensorflow-probability, try:
-
-`pip install --upgrade tf-nightly-2.0-preview tfp-nightly`
-
-## Status: Beta
-
-We are currently open to any suggestions or pull requests from you to make the reinforcement learning tutorial with TensorLayer2.0 a better code repository for both new learners and senior researchers. Some of the algorithms mentioned in the this markdown may be not yet available, since we are still trying to implement more RL algorithms and optimize their performances. However, those algorithms listed above will come out in a few weeks, and the repository will keep updating more advanced RL algorithms in the future.
-
-## To Use:
-
-For each tutorial, open a terminal and run:
-
- `python ***.py --train` for training and `python ***.py --test` for testing.
-
-The tutorial algorithms follow the same basic structure, as shown in file: [`./tutorial_format.py`](https://github.com/tensorlayer/tensorlayer/blob/reinforcement-learning/examples/reinforcement_learning/tutorial_format.py)
-
-## Table of Contents:
-
-| Algorithms | Observation Space | Action Space | Tutorial Env |
-| --------------- | ----------------- | ------------ | -------------- |
-| Q-learning | Discrete | Discrete | FrozenLake |
-| C51 | Continuous | Discrete | Pong, CartPole |
-| DQN | Discrete | Discrete | FrozenLake |
-| Variants of DQN | Continuous | Discrete | Pong, CartPole |
-| Retrace | Continuous | Discrete | Pong, CartPole |
-| PER | Continuous | Discrete | Pong, CartPole |
-| Actor-Critic | Continuous | Discrete | CartPole |
-| A3C | Continuous | Continuous | BipedalWalker |
-| DDPG | Continuous | Continuous | Pendulum |
-| TD3 | Continuous | Continuous | Pendulum |
-| SAC | Continuous | Continuous | Pendulum |
-| PG | Continuous | Discrete | CartPole |
-| TRPO | Continuous | Continuous | Pendulum |
-| PPO | Continuous | Continuous | Pendulum |
-| DPPO | Continuous | Continuous | Pendulum |
-
-
-## Examples of RL Algorithms:
-
-* **Q-learning**
-
- Code: `./tutorial_Qlearning.py`
-
- Paper: [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)
-
- Description:
-
- ```
- Q-learning is a non-deep-learning method with TD Learning, Off-Policy, e-Greedy Exploration.
-
- Central formula:
- Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A))
-
- See David Silver RL Tutorial Lecture 5 - Q-Learning for more details.
- ```
-
-
-
-* **Deep Q-Network (DQN)**
-
- Code: `./tutorial_DQN.py`
-
- Paper: [Human-level control through deep reinforcementlearning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
-
- [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)
-
- Description:
-
- ```
- Deep Q-Network (DQN) is a method of TD Learning, Off-Policy, e-Greedy Exploration (GLIE).
-
- Central formula:
- Q(S, A) <- Q(S, A) + alpha * (R + lambda * Q(newS, newA) - Q(S, A)),
- delta_w = R + lambda * Q(newS, newA).
-
- See David Silver RL Tutorial Lecture 5 - Q-Learning for more details.
- ```
-
-
-
-* **Double DQN / Dueling DQN / Noisy DQN**
-
- Code: `./tutorial_DQN_variants.py`
-
- Paper: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)
-
- [Dueling Network Architectures for Deep Reinforcement Learning](https://arxiv.org/abs/1511.06581)
-
- [Noisy Networks for Exploration](https://arxiv.org/abs/1706.10295)
-
- Description:
-
- ```
- We implement Double DQN, Dueling DQN and Noisy DQN here.
-
- -The max operator in standard DQN uses the same values both to select and to evaluate an action by:
-
- Q(s_t, a_t) = R_{t+1} + gamma * max_{a}Q_{target}(s_{t+1}, a).
-
- -Double DQN proposes to use following evaluation to address overestimation problem of max operator:
-
- Q(s_t, a_t) = R_{t+1} + gamma * Q_{target}(s_{t+1}, max_{a}Q(s_{t+1}, a)).
-
- -Dueling DQN uses dueling architecture where the value of state and the advantage of each action is estimated separately.
-
- -Noisy DQN propose to explore by adding parameter noises.
-
-
- ```
-
-
-
-
-* **Prioritized Experience Replay**
-
- Code: `./tutorial_prioritized_replay.py`
-
- Paper: [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952)
-
- Description:
-
- ```
- Prioritized experience replay is an efficient replay method that replay important transitions more frequently. Segment tree data structure is used to speed up indexing.
- ```
-
-
-
-* **Distributed DQN (C51)**
-
- Code: `./tutorial_C51.py`
-
- Paper: [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf)
-
- Description:
-
- ```
- Categorical 51 distributional RL algorithm is a distrbuted DQN, where 51 means the number of atoms. In this algorithm, instead of estimating actual expected value, value distribution over a series of continuous sub-intervals (atoms) is considered.
- ```
-
-
-
-
-* **Retrace(lambda) DQN**
-
- Code: `./tutorial_Retrace.py`
-
- Paper: [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647)
-
- Description:
-
- ```
- Retrace (lambda) is an off-policy algorithm that extend the idea of eligibility trace. It apply an importance sampling ratio truncated at 1 to several behaviour policies, which suffer from the variance explosion of standard IS and lead to safe and efficient learning.
- ```
-
-
-
-
-* **Actor-Critic (AC)**
-
- Code:`./tutorial_AC.py`
-
- Paper: [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf)
-
- Description:
-
- ```
- The implementation of Advantage Actor-Critic, using TD-error as the advantage.
- ```
-
-
-
-* **Asynchronous Advantage Actor-Critic (A3C)**
-
- Code: `./tutorial_A3C.py`
-
- Paper: [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
-
- Description:
-
- ```
- The implementation of Asynchronous Advantage Actor-Critic (A3C), using multi-threading for distributed policy learning on Actor-Critic structure.
- ```
-
-
-
-* **Soft Actor-Critic (SAC)**
-
- Code: `./tutorial_SAC.py`
-
- Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf)
-
- Description:
-
- ```
- Actor policy in SAC is stochastic, with off-policy training. And 'soft' in SAC indicates the trade-off between the entropy and expected return. The additional consideration of entropy term helps with more explorative policy. And this implementation contains an automatic update for the entropy factor.
-
- This version of Soft Actor-Critic (SAC) implementation contains 5 networks:
- 2 Q-networks, 2 target Q-networks and 1 policy network.
- ```
-
-
-
-
-* **Vanilla Policy Gradient (PG or REINFORCE)**
-
- Code: `./tutorial_PG.py`
-
- Paper: [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf)
-
- Description:
-
- ```
- The policy gradient algorithm works by updating policy parameters via stochastic gradient ascent on policy performance. It's an on-policy algorithm can be used for environments with either discrete or continuous action spaces.
-
- To apply it on continuous action space, you need to change the last softmax layer and the choose_action function.
- ```
-
-
-
-* **Deep Deterministic Policy Gradient (DDPG)**
-
- Code: `./tutorial_DDPG.py`
-
- Paper: [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf)
-
- Description:
-
- ```
- An algorithm concurrently learns a Q-function and a policy.
-
- It uses off-policy data and the Bellman equation to learn the Q-function, and uses the Q-function to learn the policy.
- ```
-
-
-
-
-* **Twin Delayed DDPG (TD3)**
-
- Code: `./tutorial_TD3.py`
-
- Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf)
-
- Description:
-
- ```
- DDPG suffers from problems like overestimate of Q-values and sensitivity to hyper-parameters.
-
- Twin Delayed DDPG (TD3) is a variant of DDPG with several tricks:
-
- - Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions.
- - Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently than the Q-function.
- - Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for the policy to exploit Q-function errors by smoothing out Q along changes in action.
-
- The implementation of TD3 includes 6 networks:
- 2 Q-networks, 2 target Q-networks, 1 policy network, 1 target policy network.
-
- Actor policy in TD3 is deterministic, with Gaussian exploration noise.
- ```
-
-
-
-* **Trust Region Policy Optimization (TRPO)**
-
- Code: `./tutorial_TRPO.py`
-
- Paper: [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf)
-
- Description:
-
- ```
- PG method with a large step can crash the policy performance, even with a small step can lead a large differences in policy.
-
- TRPO constraints the step in policy space using KL divergence (rather than in parameter space), which can monotonically improve performance and avoid a collapsed update.
- ```
-
-
-
-* **Proximal Policy Optimization (PPO)**
-
- Code: `./tutorial_PPO.py`
-
- Paper: [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf)
-
- Description:
-
- ```
- A simple version of Proximal Policy Optimization (PPO) using single thread.
-
- PPO is a family of first-order methods that use a few other tricks to keep new policies close to old.
-
- PPO methods are significantly simpler to implement, and empirically seem to perform at least as well as TRPO.
-
-
- ```
-
-
-
-* **Distributed Proximal Policy Optimization (DPPO)**
-
- Code: `./tutorial_DPPO.py`
-
- Paper: [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf)
-
- Description:
-
- ```
- A distributed version of OpenAI's Proximal Policy Optimization (PPO).
-
- Distribute the workers to collect data in parallel, then stop worker's roll-out and train PPO on collected data.
- ```
-
-
-
-* **More in recent weeks**
-
-## Environment:
-
-We typically apply game environments in [Openai Gym](https://gym.openai.com/) for our tutorials. For other environment sources like [DeepMind Control Suite](https://github.com/deepmind/dm_control) and [Marathon-Envs in Unity](https://github.com/Unity-Technologies/marathon-envs), they all have wrappers to convert into format of Gym environments, see [here](https://github.com/martinseilair/dm_control2gym) and [here](https://github.com/Unity-Technologies/marathon-envs/tree/master/gym-unity).
-
-Our env wrapper: `./tutorial_wrappers.py`
-
-## Authors
-- @xxxx XXXXX : AC, A3C
-- @quantumiracle Zihan Ding: SAC, TD3.
-- @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO
-- @Officium Yanhua Huang: C51, Retrace, DQN_variants, prioritized_replay, wrappers.
-
+# Reinforcement Learning Tutorial with Tensorlayer
+
+
+
+
+
+

+
+
+
+
+
+This repository contains implementations of the most popular reinforcement learning algorithms with Tensorlayer 2.0, supporting [Tensorflow 2.0](https://www.tensorflow.org/alpha/guide/effective_tf2). We aim to make the reinforcement learning tutorial for each algorithm simple and straightforward to use, as this not only benefits new learners of reinforcement learning, but also makes it convenient for senior researchers to test their new ideas quickly.
+
+## Prerequisites:
+
+* python 3.5
+* tensorflow >= 2.0.0 or tensorflow-gpu >= 2.0.0a0
+* tensorlayer >= 2.0.1
+* tensorflow-probability
+* tf-nightly-2.0-preview
+
+*** If you encounter the error `AttributeError: module 'tensorflow' has no attribute 'contrib'` when running the code after installing tensorflow-probability, try:
+
+`pip install --upgrade tf-nightly-2.0-preview tfp-nightly`
+
+## Status: Beta
+
+We are currently open to any suggestions or pull requests from you to make the reinforcement learning tutorial with TensorLayer 2.0 a better code repository for both new learners and senior researchers. Some of the algorithms mentioned in this markdown may not be available yet, since we are still trying to implement more RL algorithms and optimize their performance. However, the algorithms listed above will come out in a few weeks, and the repository will keep updating with more advanced RL algorithms in the future.
+
+## To Use:
+
+For each tutorial, open a terminal and run:
+
+ `python ***.py --train` for training and `python ***.py --test` for testing.
+
+The tutorial algorithms follow the same basic structure, as shown in file: [`./tutorial_format.py`](https://github.com/tensorlayer/tensorlayer/blob/reinforcement-learning/examples/reinforcement_learning/tutorial_format.py)
+
+## Table of Contents:
+
+| Algorithms | Observation Space | Action Space | Tutorial Env | Papers |
+| --------------- | ----------------- | ------------ | -------------- | -------|
+| Q-learning | Discrete | Discrete | FrozenLake | [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)|
+| DQN | Discrete | Discrete | FrozenLake | [Human-level control through deep reinforcement learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) |
+| Variants of DQN | Discrete | Discrete | Pong, CartPole | [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) |
+| PER | Discrete | Discrete | Pong, CartPole | [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) |
+| C51 | Discrete | Discrete | Pong, CartPole | [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf) |
+| Retrace | Discrete | Discrete | Pong, CartPole | [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647) |
+| Actor-Critic | Continuous | Discrete | CartPole | [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) |
+| A3C | Continuous | Continuous | BipedalWalker | [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) |
+| SAC | Continuous | Continuous | Pendulum | [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) |
+| PG | Continuous | Discrete | CartPole | [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) |
+| DDPG | Continuous | Continuous | Pendulum | [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf) |
+| TD3 | Continuous | Continuous | Pendulum | [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) |
+| TRPO | Continuous | Continuous | Pendulum | [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) |
+| PPO | Continuous | Continuous | Pendulum | [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) |
+| DPPO | Continuous | Continuous | Pendulum | [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf) |
+
+
+## Examples of RL Algorithms:
+
+* **Q-learning**
+
+ Code: `./tutorial_Qlearning.py`
+
+ Paper: [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)
+
+ Description:
+
+ ```
+ Q-learning is a non-deep-learning method with TD Learning, Off-Policy, e-Greedy Exploration.
+
+ Central formula:
+  Q(S, A) <- Q(S, A) + alpha * (R + lambda * max_{newA} Q(newS, newA) - Q(S, A))
+
+ See David Silver RL Tutorial Lecture 5 - Q-Learning for more details.
+ ```
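+
+  As a rough illustration of the update above (not the tutorial code), a minimal tabular sketch on a hypothetical toy MDP with 5 states and 2 actions:
+
+  ```python
+  import numpy as np
+
+  n_states, n_actions = 5, 2
+  alpha, gamma, eps = 0.1, 0.99, 0.1      # learning rate, discount (lambda above), exploration rate
+  Q = np.zeros((n_states, n_actions))
+
+  def choose_action(s):
+      # e-greedy exploration
+      if np.random.rand() < eps:
+          return np.random.randint(n_actions)
+      return int(np.argmax(Q[s]))
+
+  def q_update(s, a, r, s_next, done):
+      # off-policy target: greedy value of the next state
+      target = r + (0.0 if done else gamma * np.max(Q[s_next]))
+      Q[s, a] += alpha * (target - Q[s, a])
+
+  q_update(s=0, a=choose_action(0), r=1.0, s_next=1, done=False)   # one hypothetical transition
+  ```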
+
+
+
+* **Deep Q-Network (DQN)**
+
+ Code: `./tutorial_DQN.py`
+
+  Paper: [Human-level control through deep reinforcement learning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
+
+ [Playing Atari with Deep Reinforcement Learning](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf)
+
+ Description:
+
+ ```
+ Deep Q-Network (DQN) is a method of TD Learning, Off-Policy, e-Greedy Exploration (GLIE).
+
+ Central formula:
+  Q(S, A) <- Q(S, A) + alpha * (R + lambda * max_{newA} Q(newS, newA) - Q(S, A)),
+  where the network weights are updated to reduce the TD error
+  delta = R + lambda * max_{newA} Q(newS, newA) - Q(S, A).
+
+ See David Silver RL Tutorial Lecture 5 - Q-Learning for more details.
+ ```
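+
+  A rough sketch of how the target is formed for the network update (hypothetical numpy arrays stand in for the outputs of a frozen target network; the tutorial itself builds the Q-network with TensorLayer):
+
+  ```python
+  import numpy as np
+
+  gamma = 0.99
+  r = np.array([1.0, 0.0, 0.5])                              # batch of rewards
+  done = np.array([0.0, 1.0, 0.0])                           # terminal flags
+  q_next = np.array([[0.2, 0.7], [0.1, 0.3], [0.4, 0.4]])    # Q_target(newS, .) per action
+
+  # TD target R + lambda * max_a Q_target(newS, a), zeroed at terminal states;
+  # the Q-network is regressed toward this value at the taken actions
+  td_target = r + gamma * (1.0 - done) * q_next.max(axis=1)
+  ```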
+
+
+
+* **Double DQN / Dueling DQN / Noisy DQN**
+
+ Code: `./tutorial_DQN_variants.py`
+
+ Paper: [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461)
+
+ Description:
+
+ ```
+ We implement Double DQN, Dueling DQN and Noisy DQN here.
+
+ -The max operator in standard DQN uses the same values both to select and to evaluate an action by:
+
+  Q(s_t, a_t) = R_{t+1} + gamma * max_{a}Q_{target}(s_{t+1}, a).
+
+  -Double DQN proposes to use the following evaluation to address the overestimation problem of the max operator:
+
+  Q(s_t, a_t) = R_{t+1} + gamma * Q_{target}(s_{t+1}, max_{a}Q(s_{t+1}, a)).
+
+  -Dueling DQN uses a dueling architecture in which the state value and the advantage of each action are estimated separately.
+
+  -Noisy DQN proposes to explore by adding noise to the network parameters.
+
+
+ ```
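+
+  A small numpy sketch contrasting the two targets above (hypothetical Q-values for a batch of two next states; not the tutorial code):
+
+  ```python
+  import numpy as np
+
+  gamma = 0.99
+  r = np.array([1.0, 0.0])
+  q_online_next = np.array([[0.2, 0.9], [0.6, 0.1]])   # Q(s_{t+1}, .) from the online network
+  q_target_next = np.array([[0.3, 0.5], [0.4, 0.2]])   # Q_target(s_{t+1}, .) from the target network
+
+  # standard DQN: the target network both selects and evaluates the action
+  dqn_target = r + gamma * q_target_next.max(axis=1)
+
+  # Double DQN: the online network selects, the target network evaluates
+  a_star = q_online_next.argmax(axis=1)
+  ddqn_target = r + gamma * q_target_next[np.arange(len(r)), a_star]
+  ```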
+
+
+
+
+* **Prioritized Experience Replay**
+
+ Code: `./tutorial_prioritized_replay.py`
+
+ Paper: [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952)
+
+ Description:
+
+ ```
+  Prioritized experience replay is an efficient replay method that replays important transitions more frequently. A segment tree data structure is used to speed up indexing.
+ ```
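+
+  A simplified proportional-priority sketch in plain numpy (the tutorial uses a segment tree for efficiency; the priorities and hyper-parameters here are hypothetical):
+
+  ```python
+  import numpy as np
+
+  alpha, beta = 0.6, 0.4                        # priority exponent, IS-correction exponent
+  priorities = np.array([2.0, 0.5, 1.0, 3.0])   # e.g. |TD error| + small epsilon
+
+  probs = priorities ** alpha
+  probs /= probs.sum()
+
+  idx = np.random.choice(len(probs), size=2, p=probs)   # important transitions are replayed more often
+  is_weights = (len(probs) * probs[idx]) ** (-beta)     # importance-sampling correction for the bias
+  is_weights /= is_weights.max()
+  ```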
+
+
+
+* **Distributed DQN (C51)**
+
+ Code: `./tutorial_C51.py`
+
+ Paper: [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf)
+
+ Description:
+
+ ```
+  The Categorical 51 (C51) algorithm is a distributional variant of DQN, where 51 is the number of atoms. Instead of estimating a single expected value, it models the distribution of returns over a fixed set of support points (atoms).
+ ```
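+
+  A minimal sketch of the categorical value representation (random probabilities stand in for the network output; not the tutorial code):
+
+  ```python
+  import numpy as np
+
+  n_atoms, v_min, v_max = 51, -10.0, 10.0
+  support = np.linspace(v_min, v_max, n_atoms)    # fixed atoms z_1 ... z_51
+
+  logits = np.random.randn(2, n_atoms)            # hypothetical output for 2 actions
+  probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
+
+  # actions are compared by the expectation of their value distribution
+  q_values = (probs * support).sum(axis=1)
+  best_action = int(np.argmax(q_values))
+  ```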
+
+
+
+
+* **Retrace(lambda) DQN**
+
+ Code: `./tutorial_Retrace.py`
+
+ Paper: [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647)
+
+ Description:
+
+ ```
+  Retrace(lambda) is an off-policy algorithm that extends the idea of eligibility traces. It applies an importance-sampling ratio truncated at 1 to data gathered from several behaviour policies, which avoids the variance explosion of ordinary importance sampling and leads to safe and efficient learning.
+ ```
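+
+  A tiny sketch of the truncated importance-sampling coefficients at the core of Retrace (hypothetical per-step action probabilities; not the tutorial code):
+
+  ```python
+  import numpy as np
+
+  lam = 1.0
+  pi_a = np.array([0.5, 0.9, 0.2])    # pi(a_t|s_t) under the target policy
+  mu_a = np.array([0.6, 0.3, 0.4])    # mu(a_t|s_t) under the behaviour policy
+
+  # trace coefficients c_t = lambda * min(1, pi/mu); truncation at 1 is what
+  # prevents the variance explosion of ordinary importance sampling
+  c = lam * np.minimum(1.0, pi_a / mu_a)
+  ```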
+
+
+
+
+* **Actor-Critic (AC)**
+
+ Code:`./tutorial_AC.py`
+
+ Paper: [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf)
+
+ Description:
+
+ ```
+ The implementation of Advantage Actor-Critic, using TD-error as the advantage.
+ ```
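+
+  A scalar sketch of how the TD error doubles as the advantage (hypothetical critic values and action probability; not the tutorial code):
+
+  ```python
+  import numpy as np
+
+  gamma = 0.99
+  v_s, v_s_next, r = 0.5, 0.8, 1.0    # critic values and reward for one transition
+  log_prob_a = np.log(0.3)            # log pi(a|s) of the sampled action
+
+  td_error = r + gamma * v_s_next - v_s    # advantage estimate
+  actor_loss = -log_prob_a * td_error      # policy-gradient term for the actor
+  critic_loss = td_error ** 2              # regress V(s) toward the TD target
+  ```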
+
+
+
+* **Asynchronous Advantage Actor-Critic (A3C)**
+
+ Code: `./tutorial_A3C.py`
+
+ Paper: [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf)
+
+ Description:
+
+ ```
+  The implementation of Asynchronous Advantage Actor-Critic (A3C), using multi-threading for distributed policy learning on top of an actor-critic structure.
+ ```
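+
+  A bare-bones sketch of the asynchronous structure only (threads pushing stand-in gradients to shared parameters; no real environment or network):
+
+  ```python
+  import threading
+  import numpy as np
+
+  global_weights = np.zeros(4)    # stand-in for the global network parameters
+  lock = threading.Lock()
+
+  def worker(worker_id, n_updates=5):
+      for _ in range(n_updates):
+          grad = np.random.randn(4) * 0.01    # stand-in for locally computed gradients
+          with lock:                          # each worker updates the global network asynchronously
+              global_weights[:] = global_weights - grad
+
+  threads = [threading.Thread(target=worker, args=(i,)) for i in range(4)]
+  for t in threads: t.start()
+  for t in threads: t.join()
+  ```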
+
+
+
+* **Soft Actor-Critic (SAC)**
+
+ Code: `./tutorial_SAC.py`
+
+ Paper: [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf)
+
+ Description:
+
+ ```
+  The actor policy in SAC is stochastic and trained off-policy. The 'soft' in SAC refers to the entropy-regularized objective, which trades off entropy against expected return; the entropy term encourages a more explorative policy. This implementation also updates the entropy coefficient automatically.
+
+  This implementation of Soft Actor-Critic (SAC) contains 5 networks:
+  2 Q-networks, 2 target Q-networks and 1 policy network.
+ ```
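+
+  A scalar sketch of the soft Bellman target the Q-networks are trained toward (hypothetical values; alpha is the entropy temperature, which the tutorial tunes automatically):
+
+  ```python
+  gamma, alpha = 0.99, 0.2
+  r, done = 1.0, 0.0
+  q1_next, q2_next = 0.8, 0.7     # target Q-networks evaluated at the next state/action
+  log_pi_next = -1.2              # log-probability of the next action under the current policy
+
+  # the -alpha * log_pi term is the entropy bonus traded off against expected return
+  y = r + gamma * (1.0 - done) * (min(q1_next, q2_next) - alpha * log_pi_next)
+  ```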
+
+
+
+
+* **Vanilla Policy Gradient (PG or REINFORCE)**
+
+ Code: `./tutorial_PG.py`
+
+ Paper: [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf)
+
+ Description:
+
+ ```
+  The policy gradient algorithm works by updating the policy parameters via stochastic gradient ascent on policy performance. It is an on-policy algorithm that can be used for environments with either discrete or continuous action spaces.
+
+  To apply it to a continuous action space, change the final softmax layer and the choose_action function.
+ ```
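+
+  A short numpy sketch of the REINFORCE loss for one hypothetical episode (log-probabilities and rewards are made up; not the tutorial code):
+
+  ```python
+  import numpy as np
+
+  gamma = 0.95
+  rewards = [1.0, 0.0, 2.0]               # rewards of one episode
+  log_probs = np.log([0.4, 0.6, 0.3])     # log pi(a_t|s_t) of the taken actions
+
+  # discounted, normalized returns
+  returns, running = np.zeros(len(rewards)), 0.0
+  for t in reversed(range(len(rewards))):
+      running = rewards[t] + gamma * running
+      returns[t] = running
+  returns = (returns - returns.mean()) / (returns.std() + 1e-8)
+
+  loss = -(log_probs * returns).sum()     # minimizing this performs gradient ascent on performance
+  ```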
+
+
+
+* **Deep Deterministic Policy Gradient (DDPG)**
+
+ Code: `./tutorial_DDPG.py`
+
+ Paper: [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf)
+
+ Description:
+
+ ```
+  DDPG is an algorithm that concurrently learns a Q-function and a deterministic policy.
+
+ It uses off-policy data and the Bellman equation to learn the Q-function, and uses the Q-function to learn the policy.
+ ```
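+
+  A scalar sketch of the Bellman target used for the critic (hypothetical values from the target actor and target critic; not the tutorial code):
+
+  ```python
+  gamma = 0.99
+  r, done = 0.5, 0.0
+  a_next = 0.7      # deterministic action from the target actor at the next state
+  q_next = 1.2      # target critic's value Q_target(s', a_next)
+
+  # critic target; the actor is then updated to maximize Q(s, actor(s))
+  y = r + gamma * (1.0 - done) * q_next
+  ```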
+
+
+
+
+* **Twin Delayed DDPG (TD3)**
+
+ Code: `./tutorial_TD3.py`
+
+ Paper: [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf)
+
+ Description:
+
+ ```
+  DDPG suffers from problems such as overestimation of Q-values and sensitivity to hyper-parameters.
+
+ Twin Delayed DDPG (TD3) is a variant of DDPG with several tricks:
+
+ - Trick One: Clipped Double-Q Learning. TD3 learns two Q-functions instead of one (hence “twin”), and uses the smaller of the two Q-values to form the targets in the Bellman error loss functions.
+ - Trick Two: “Delayed” Policy Updates. TD3 updates the policy (and target networks) less frequently than the Q-function.
+ - Trick Three: Target Policy Smoothing. TD3 adds noise to the target action, to make it harder for the policy to exploit Q-function errors by smoothing out Q along changes in action.
+
+ The implementation of TD3 includes 6 networks:
+ 2 Q-networks, 2 target Q-networks, 1 policy network, 1 target policy network.
+
+ Actor policy in TD3 is deterministic, with Gaussian exploration noise.
+ ```
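+
+  A scalar sketch of the three tricks combined into the critic target (hypothetical values; not the tutorial code):
+
+  ```python
+  import numpy as np
+
+  gamma, noise_std, noise_clip, act_limit = 0.99, 0.2, 0.5, 1.0
+  r, done = 0.5, 0.0
+  a_next = 0.3    # target policy's action at the next state
+
+  # Trick 3: target policy smoothing
+  eps = np.clip(np.random.randn() * noise_std, -noise_clip, noise_clip)
+  a_smoothed = np.clip(a_next + eps, -act_limit, act_limit)
+
+  # Trick 1: clipped double-Q target (q1/q2 would come from the two target critics at (s', a_smoothed))
+  q1, q2 = 0.9, 0.8
+  y = r + gamma * (1.0 - done) * min(q1, q2)
+
+  # Trick 2: the actor and target networks are only updated every few critic steps
+  ```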
+
+
+
+* **Trust Region Policy Optimization (TRPO)**
+
+ Code: `./tutorial_TRPO.py`
+
+ Paper: [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf)
+
+ Description:
+
+ ```
+  A policy-gradient update with too large a step can collapse the policy's performance, while even a small step in parameter space can produce a large change in the policy itself.
+
+  TRPO constrains the update step in policy space using the KL divergence (rather than in parameter space), which enables monotonic performance improvement and avoids collapsed updates.
+ ```
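+
+  A numpy sketch of the quantities TRPO works with, the surrogate objective and an approximate KL (hypothetical batch values; the actual constraint is enforced with conjugate gradient and a line search, omitted here):
+
+  ```python
+  import numpy as np
+
+  adv = np.array([0.5, -0.2, 1.0])          # advantage estimates
+  logp_old = np.array([-1.0, -0.7, -1.3])   # log pi_old(a|s)
+  logp_new = np.array([-0.9, -0.8, -1.1])   # log pi_theta(a|s)
+
+  ratio = np.exp(logp_new - logp_old)
+  surrogate = (ratio * adv).mean()          # objective to maximize ...
+  approx_kl = (logp_old - logp_new).mean()  # ... subject to a KL constraint such as KL <= 0.01
+  ```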
+
+
+
+* **Proximal Policy Optimization (PPO)**
+
+ Code: `./tutorial_PPO.py`
+
+ Paper: [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf)
+
+ Description:
+
+ ```
+  A simple, single-threaded version of Proximal Policy Optimization (PPO).
+
+  PPO is a family of first-order methods that use a few tricks, such as clipping the surrogate objective, to keep new policies close to the old one.
+
+  PPO methods are significantly simpler to implement than TRPO, and empirically seem to perform at least as well.
+ ```
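+
+  A numpy sketch of PPO's clipped surrogate loss (hypothetical advantages and log-probabilities; not the tutorial code):
+
+  ```python
+  import numpy as np
+
+  clip_eps = 0.2
+  adv = np.array([0.5, -0.2, 1.0])
+  logp_old = np.array([-1.0, -0.7, -1.3])
+  logp_new = np.array([-0.9, -0.8, -1.1])
+
+  ratio = np.exp(logp_new - logp_old)
+  clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
+  # clipping removes the incentive to move the new policy far from the old one
+  ppo_loss = -np.minimum(ratio * adv, clipped * adv).mean()
+  ```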
+
+
+
+* **Distributed Proximal Policy Optimization (DPPO)**
+
+ Code: `./tutorial_DPPO.py`
+
+ Paper: [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf)
+
+ Description:
+
+ ```
+  A distributed version of OpenAI's Proximal Policy Optimization (PPO).
+
+  Workers collect data in parallel; their roll-outs are then paused while PPO is trained on the collected data.
+ ```
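+
+  A minimal sketch of the worker/trainer split only (threads and a queue as stand-ins; the tutorial's actual coordination may differ):
+
+  ```python
+  import queue
+  import threading
+
+  rollout_queue = queue.Queue()
+
+  def worker(worker_id, n_batches=3):
+      for _ in range(n_batches):
+          # in the real tutorial each worker steps its own copy of the environment
+          rollout_queue.put((worker_id, "batch-of-transitions"))
+
+  threads = [threading.Thread(target=worker, args=(i,)) for i in range(4)]
+  for t in threads: t.start()
+  for t in threads: t.join()    # roll-outs are stopped here ...
+  data = [rollout_queue.get() for _ in range(rollout_queue.qsize())]
+  # ... and PPO is then trained on `data` before the next round of collection
+  ```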
+
+
+
+* **More in recent weeks**
+
+## Environment:
+
+We typically use game environments from [OpenAI Gym](https://gym.openai.com/) for our tutorials. Other environment sources, such as [DeepMind Control Suite](https://github.com/deepmind/dm_control) and [Marathon-Envs in Unity](https://github.com/Unity-Technologies/marathon-envs), provide wrappers that convert them to the Gym interface; see [here](https://github.com/martinseilair/dm_control2gym) and [here](https://github.com/Unity-Technologies/marathon-envs/tree/master/gym-unity).
+
+Our env wrapper: `./tutorial_wrappers.py`
+
+## Authors
+- @xxxx XXXXX : AC, A3C
+- @quantumiracle Zihan Ding: SAC, TD3.
+- @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO
+- @Officium Yanhua Huang: C51, Retrace, DQN_variants, prioritized_replay, wrappers.
From 1d2df0b2542323fb6fe2d496dccf40133252e622 Mon Sep 17 00:00:00 2001
From: Tokarev-TT-33 <34995488+Tokarev-TT-33@users.noreply.github.com>
Date: Thu, 13 Jun 2019 11:53:57 +0800
Subject: [PATCH 6/8] update readme
---
examples/reinforcement_learning/README.md | 36 +++++++++++------------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md
index c8a0a82e6..b7e48abe4 100644
--- a/examples/reinforcement_learning/README.md
+++ b/examples/reinforcement_learning/README.md
@@ -38,23 +38,23 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
## Table of Contents:
-| Algorithms | Observation Space | Action Space | Tutorial Env | Papers |
-| --------------- | ----------------- | ------------ | -------------- | -------|
-| Q-learning | Discrete | Discrete | FrozenLake | [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)|
-| DQN | Discrete | Discrete | FrozenLake | [Human-level control through deep reinforcementlearning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) |
-| Variants of DQN | Discrete | Discrete | Pong, CartPole | [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) |
-| PER | Discrete | Discrete | Pong, CartPole | [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) |
-| C51 | Discrete | Discrete | Pong, CartPole | [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf) |
-| Retrace | Discrete | Discrete | Pong, CartPole | [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647) |
-| Actor-Critic | Continuous | Discrete | CartPole | [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) |
-| A3C | Continuous | Continuous | BipedalWalker | [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) |
-| SAC | Continuous | Continuous | Pendulum | [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) |
-| PG | Continuous | Discrete | CartPole | [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) |
-| DDPG | Continuous | Continuous | Pendulum | [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf) |
-| TD3 | Continuous | Continuous | Pendulum | [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) |
-| TRPO | Continuous | Continuous | Pendulum | [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) |
-| PPO | Continuous | Continuous | Pendulum | [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) |
-| DPPO | Continuous | Continuous | Pendulum | [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf) |
+| Algorithms | Action Space | Tutorial Env | Papers |
+| --------------- | ------------ | -------------- | -------|
+| Q-learning | Discrete | FrozenLake | [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)|
+| DQN | Discrete | FrozenLake | [Human-level control through deep reinforcementlearning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) |
+| Variants of DQN | Discrete | Pong, CartPole | [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) |
+| PER | Discrete | Pong, CartPole | [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) |
+| C51 | Discrete | Pong, CartPole | [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf) |
+| Retrace | Discrete | Pong, CartPole | [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647) |
+| Actor-Critic | Discrete | CartPole | [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) |
+| A3C | Continuous | BipedalWalker | [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) |
+| SAC | Continuous | Pendulum | [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) |
+| PG | Discrete | CartPole | [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) |
+| DDPG | Continuous | Pendulum | [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf) |
+| TD3 | Continuous | Pendulum | [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) |
+| TRPO | Continuous | Pendulum | [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) |
+| PPO | Continuous | Pendulum | [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) |
+| DPPO | Continuous | Pendulum | [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf) |
## Examples of RL Algorithms:
@@ -338,7 +338,7 @@ We typically apply game environments in [Openai Gym](https://gym.openai.com/) fo
Our env wrapper: `./tutorial_wrappers.py`
## Authors
-- @xxxx XXXXX : AC, A3C
+- @zsdonghao Hao Dong: AC, A3C, Q-Learning, DQN, PG
- @quantumiracle Zihan Ding: SAC, TD3.
- @Tokarev-TT-33 Tianyang Yu @initial-h Hongming Zhang : PG, DDPG, PPO, DPPO, TRPO
- @Officium Yanhua Huang: C51, Retrace, DQN_variants, prioritized_replay, wrappers.
From 99cfc9c5b7bab30b86e5d71588c126f8e0536608 Mon Sep 17 00:00:00 2001
From: initial-h <18811472492@163.com>
Date: Thu, 13 Jun 2019 13:19:13 +0800
Subject: [PATCH 7/8] Update README.md
---
examples/reinforcement_learning/README.md | 41 ++++++++++++++---------
1 file changed, 25 insertions(+), 16 deletions(-)
diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md
index b7e48abe4..102b7a90e 100644
--- a/examples/reinforcement_learning/README.md
+++ b/examples/reinforcement_learning/README.md
@@ -37,25 +37,34 @@ For each tutorial, open a terminal and run:
The tutorial algorithms follow the same basic structure, as shown in file: [`./tutorial_format.py`](https://github.com/tensorlayer/tensorlayer/blob/reinforcement-learning/examples/reinforcement_learning/tutorial_format.py)
## Table of Contents:
-
+### value-based
+| Algorithms | Action Space | Tutorial Env | Papers |
+| --------------- | ------------ | -------------- | -------|
+| Q-learning | Discrete | FrozenLake | [Technical note: Q-learning. Watkins et al. 1992](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)|
+| Deep Q-Network (DQN)| Discrete | FrozenLake | [Human-level control through deep reinforcement learning, Mnih et al. 2015.](https://www.nature.com/articles/nature14236/) |
+| Prioritized Experience Replay | Discrete | Pong, CartPole | [Prioritized experience replay. Schaul et al. 2015.](https://arxiv.org/abs/1511.05952) |
+|Dueling DQN|Discrete | Pong, CartPole |[Dueling network architectures for deep reinforcement learning. Wang et al. 2015.](https://arxiv.org/abs/1511.06581)|
+|Double DQN| Discrete | Pong, CartPole |[Deep reinforcement learning with double q-learning. Van et al. 2016.](https://arxiv.org/abs/1509.06461)|
+|Retrace|Discrete | Pong, CartPole |[Safe and efficient off-policy reinforcement learning. Munos et al. 2016.](https://arxiv.org/pdf/1606.02647.pdf)|
+|Noisy DQN|Discrete | Pong, CartPole |[Noisy networks for exploration. Fortunato et al. 2017.](https://arxiv.org/pdf/1706.10295.pdf)|
+| Distributed DQN (C51)| Discrete | Pong, CartPole | [A distributional perspective on reinforcement learning. Bellemare et al. 2017.](https://arxiv.org/pdf/1707.06887.pdf) |
+
+### policy-based
| Algorithms | Action Space | Tutorial Env | Papers |
| --------------- | ------------ | -------------- | -------|
-| Q-learning | Discrete | FrozenLake | [Technical Note Q-Learning](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)|
-| DQN | Discrete | FrozenLake | [Human-level control through deep reinforcementlearning](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf) |
-| Variants of DQN | Discrete | Pong, CartPole | [Deep Reinforcement Learning with Double Q-learning](https://arxiv.org/abs/1509.06461) |
-| PER | Discrete | Pong, CartPole | [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) |
-| C51 | Discrete | Pong, CartPole | [A Distributional Perspective on Reinforcement Learning](https://arxiv.org/pdf/1707.06887.pdf) |
-| Retrace | Discrete | Pong, CartPole | [Safe and Efficient Off-Policy Reinforcement Learning](https://arxiv.org/abs/1606.02647) |
-| Actor-Critic | Discrete | CartPole | [Actor-Critic Algorithms](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf) |
-| A3C | Continuous | BipedalWalker | [Asynchronous Methods for Deep Reinforcement Learning](https://arxiv.org/pdf/1602.01783.pdf) |
-| SAC | Continuous | Pendulum | [Soft Actor-Critic Algorithms and Applications](https://arxiv.org/pdf/1812.05905.pdf) |
-| PG | Discrete | CartPole | [Policy Gradient Methods for Reinforcement Learning with Function Approximation](https://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation.pdf) |
-| DDPG | Continuous | Pendulum | [Continuous Control With Deep Reinforcement Learning](https://arxiv.org/pdf/1509.02971.pdf) |
-| TD3 | Continuous | Pendulum | [Addressing Function Approximation Error in Actor-Critic Methods](https://arxiv.org/pdf/1802.09477.pdf) |
-| TRPO | Continuous | Pendulum | [Trust Region Policy Optimization](https://arxiv.org/pdf/1502.05477.pdf) |
-| PPO | Continuous | Pendulum | [Proximal Policy Optimization Algorithms](https://arxiv.org/pdf/1707.06347.pdf) |
-| DPPO | Continuous | Pendulum | [Emergence of Locomotion Behaviours in Rich Environments](https://arxiv.org/pdf/1707.02286.pdf) |
+|REINFORCE(PG) |Discrete/Continuous|CartPole | [Reinforcement learning: An introduction. Sutton et al. 2011.](https://www.cambridge.org/core/journals/robotica/article/robot-learning-edited-by-jonathan-h-connell-and-sridhar-mahadevan-kluwer-boston-19931997-xii240-pp-isbn-0792393651-hardback-21800-guilders-12000-8995/737FD21CA908246DF17779E9C20B6DF6)|
+| Trust Region Policy Optimization (TRPO)| Discrete/Continuous | Pendulum | [Trust region policy optimization. Schulman et al. 2015.](https://arxiv.org/pdf/1502.05477.pdf) |
+| Proximal Policy Optimization (PPO) |Discrete/Continuous |Pendulum| [Proximal policy optimization algorithms. Schulman et al. 2017.](https://arxiv.org/abs/1707.06347) |
+|Distributed Proximal Policy Optimization (DPPO)|Discrete/Continuous |Pendulum|[Emergence of locomotion behaviours in rich environments. Heess et al. 2017.](https://arxiv.org/abs/1707.02286)|
+### actor-critic
+| Algorithms | Action Space | Tutorial Env | Papers |
+| --------------- | ------------ | -------------- | -------|
+|Actor-Critic (AC)|Discrete/Continuous|CartPole| [Actor-critic algorithms. Konda et al. 2000.](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf)|
+| Asynchronous Advantage Actor-Critic (A3C)| Discrete/Continuous | BipedalWalker| [Asynchronous methods for deep reinforcement learning. Mnih et al. 2016.](https://arxiv.org/pdf/1602.01783.pdf) |
+| DDPG|Discrete/Continuous |Pendulum| [Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 2016](https://arxiv.org/pdf/1509.02971.pdf) |
+|TD3|Discrete/Continuous |Pendulum|[Addressing function approximation error in actor-critic methods. Fujimoto et al. 2018.](https://arxiv.org/pdf/1802.09477.pdf)|
+|Soft Actor-Critic (SAC)|Discrete/Continuous |Pendulum|[Soft actor-critic algorithms and applications. Haarnoja et al. 2018.](https://arxiv.org/abs/1812.05905)|
## Examples of RL Algorithms:
From dc6d270ce2bfcaa5055ddb76a181c04710bac45e Mon Sep 17 00:00:00 2001
From: initial-h <18811472492@163.com>
Date: Thu, 13 Jun 2019 13:24:08 +0800
Subject: [PATCH 8/8] Update README.md
---
examples/reinforcement_learning/README.md | 11 +++--------
1 file changed, 3 insertions(+), 8 deletions(-)
diff --git a/examples/reinforcement_learning/README.md b/examples/reinforcement_learning/README.md
index 102b7a90e..b25275481 100644
--- a/examples/reinforcement_learning/README.md
+++ b/examples/reinforcement_learning/README.md
@@ -40,6 +40,7 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
### value-based
| Algorithms | Action Space | Tutorial Env | Papers |
| --------------- | ------------ | -------------- | -------|
+|**value-based**||||
| Q-learning | Discrete | FrozenLake | [Technical note: Q-learning. Watkins et al. 1992](http://www.gatsby.ucl.ac.uk/~dayan/papers/cjch.pdf)|
| Deep Q-Network (DQN)| Discrete | FrozenLake | [Human-level control through deep reinforcement learning, Mnih et al. 2015.](https://www.nature.com/articles/nature14236/) |
| Prioritized Experience Replay | Discrete | Pong, CartPole | [Prioritized experience replay. Schaul et al. 2015.](https://arxiv.org/abs/1511.05952) |
@@ -48,18 +49,12 @@ The tutorial algorithms follow the same basic structure, as shown in file: [`./t
|Retrace|Discrete | Pong, CartPole |[Safe and efficient off-policy reinforcement learning. Munos et al. 2016.](https://arxiv.org/pdf/1606.02647.pdf)|
|Noisy DQN|Discrete | Pong, CartPole |[Noisy networks for exploration. Fortunato et al. 2017.](https://arxiv.org/pdf/1706.10295.pdf)|
| Distributed DQN (C51)| Discrete | Pong, CartPole | [A distributional perspective on reinforcement learning. Bellemare et al. 2017.](https://arxiv.org/pdf/1707.06887.pdf) |
-
-### policy-based
-| Algorithms | Action Space | Tutorial Env | Papers |
-| --------------- | ------------ | -------------- | -------|
+|**policy-based**||||
|REINFORCE(PG) |Discrete/Continuous|CartPole | [Reinforcement learning: An introduction. Sutton et al. 2011.](https://www.cambridge.org/core/journals/robotica/article/robot-learning-edited-by-jonathan-h-connell-and-sridhar-mahadevan-kluwer-boston-19931997-xii240-pp-isbn-0792393651-hardback-21800-guilders-12000-8995/737FD21CA908246DF17779E9C20B6DF6)|
| Trust Region Policy Optimization (TRPO)| Discrete/Continuous | Pendulum | [Trust region policy optimization. Schulman et al. 2015.](https://arxiv.org/pdf/1502.05477.pdf) |
| Proximal Policy Optimization (PPO) |Discrete/Continuous |Pendulum| [Proximal policy optimization algorithms. Schulman et al. 2017.](https://arxiv.org/abs/1707.06347) |
|Distributed Proximal Policy Optimization (DPPO)|Discrete/Continuous |Pendulum|[Emergence of locomotion behaviours in rich environments. Heess et al. 2017.](https://arxiv.org/abs/1707.02286)|
-
-### actor-critic
-| Algorithms | Action Space | Tutorial Env | Papers |
-| --------------- | ------------ | -------------- | -------|
+|**actor-critic**||||
|Actor-Critic (AC)|Discrete/Continuous|CartPole| [Actor-critic algorithms. Konda et al. 2000.](https://papers.nips.cc/paper/1786-actor-critic-algorithms.pdf)|
| Asynchronous Advantage Actor-Critic (A3C)| Discrete/Continuous | BipedalWalker| [Asynchronous methods for deep reinforcement learning. Mnih et al. 2016.](https://arxiv.org/pdf/1602.01783.pdf) |
| DDPG|Discrete/Continuous |Pendulum| [Continuous Control With Deep Reinforcement Learning, Lillicrap et al. 2016](https://arxiv.org/pdf/1509.02971.pdf) |