In [2]:
!pip install pybullet

import pybullet_envs
import gym
import numpy as np
# from reinforce_tf2 import Agent
# from utils import plotLearning
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from tensorflow_probability.python.distributions import MultivariateNormalDiag
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam, SGD
import numpy as np

class PolicyGradientNetwork(keras.Model):
    """Small fully connected policy network.

    Two ReLU hidden layers feed a linear output layer with one unit per
    action dimension.

    Args:
        n_actions: Number of units in the linear output layer.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
    """

    def __init__(self, n_actions, fc1_dims=32, fc2_dims=32):
        super().__init__()
        # Keep the configuration on the instance for later inspection.
        self.fc1_dims, self.fc2_dims, self.n_actions = fc1_dims, fc2_dims, n_actions
        # Layers are created in the same order in which `call` applies them.
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        self.pi = Dense(n_actions, activation='linear')

    def call(self, state):
        """Run `state` through both hidden layers and return the linear output."""
        hidden = self.fc2(self.fc1(state))
        return self.pi(hidden)

class Agent:
    """Policy-gradient agent with a Gaussian policy over continuous actions.

    The policy network's output is used as the location of a
    `MultivariateNormalDiag`; no scale is passed, so the distribution's
    default scale is used. `learn` performs one gradient step on the
    return-weighted negative log-likelihood of the stored batch.

    Args:
        alpha: Learning rate of the Adam optimizer.
        gamma: Discount factor used for the reward-to-go.
        n_actions: Dimension of the action vector (network output size).
        layer1_size: Width of the first hidden layer of the policy network.
        layer2_size: Width of the second hidden layer of the policy network.
    """

    def __init__(self, alpha=0.003, gamma=0.99, n_actions=4,
                 layer1_size=32, layer2_size=32):

        self.gamma = gamma
        self.lr = alpha
        self.n_actions = n_actions
        # Current batch; replaced wholesale by `store_transition`.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # History of every batch already consumed by `learn`.
        self.state_memory_full = []
        self.action_memory_full = []
        self.reward_memory_full = []
        # The layer sizes are forwarded so the constructor arguments take effect.
        self.policy = PolicyGradientNetwork(n_actions=n_actions,
                                            fc1_dims=layer1_size,
                                            fc2_dims=layer2_size)
        self.policy.compile(optimizer=Adam(learning_rate=self.lr))

    def choose_action(self, observation):
        """Sample an action for a single observation.

        Args:
            observation: One environment observation; it is wrapped in a
                list so the network sees a batch of size one.

        Returns:
            NumPy array holding the sampled action with the leading batch
            axis kept. NaN/inf entries are replaced by `np.nan_to_num`.
        """
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        mean = self.policy(state)
        action = MultivariateNormalDiag(mean).sample()
        # nan_to_num also converts the tensor into a plain NumPy array.
        return np.nan_to_num(action)

    def store_transition(self, observation, action, reward):
        """Replace the current batch with the given trajectories.

        Each argument is indexed first by trajectory number (0..n_k-1) and
        then by timestep. Previously stored data is overwritten, not
        appended to.
        """
        self.state_memory = observation
        self.action_memory = action
        self.reward_memory = reward

    def compute_ilr(self):
        """Placeholder; currently does nothing and returns None."""
        return

    def _discounted_returns(self, rewards):
        """Return the discounted reward-to-go for one trajectory.

        Element t is sum_{k >= t} gamma**(k - t) * rewards[k], computed in
        a single backward pass. The result is always float64 so integer
        rewards are not truncated.
        """
        rewards = np.asarray(rewards, dtype=np.float64)
        returns = np.zeros_like(rewards)
        running = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        return returns

    def learn(self):
        """Take one policy-gradient step on the stored batch, then archive it.

        The loss is the sum over all stored steps of
        -(reward-to-go) * log pi(action | state), divided by the number of
        stored trajectories. States, actions and rewards of a trajectory
        are expected to have the same length. If nothing has been stored
        the call is a no-op.
        """
        n_k = len(self.reward_memory)
        # Nothing to learn from: avoid a division by zero and a gradient
        # request on a constant loss.
        if n_k == 0 or not any(len(self.reward_memory[j]) for j in range(n_k)):
            return

        returns = [self._discounted_returns(self.reward_memory[j])
                   for j in range(n_k)]

        with tf.GradientTape() as tape:
            loss = 0.0
            for j in range(n_k):
                horizon = len(returns[j])
                if horizon == 0:
                    continue
                # One batched forward pass per trajectory instead of one per step.
                states = tf.convert_to_tensor(
                    np.asarray(self.state_memory[j], dtype=np.float32))
                # Stored actions may carry a batch axis of size one; flatten
                # each action to a vector so it lines up with the policy output.
                actions = tf.convert_to_tensor(
                    np.asarray(self.action_memory[j], dtype=np.float32)
                    .reshape(horizon, -1))
                weights = tf.convert_to_tensor(returns[j].astype(np.float32))
                log_probs = MultivariateNormalDiag(self.policy(states)).log_prob(actions)
                loss = loss - tf.reduce_sum(weights * log_probs)
            loss = loss / n_k
        gradient = tape.gradient(loss, self.policy.trainable_variables)
        self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))

        # Archive the consumed batch and start the next one empty.
        self.state_memory_full.append(self.state_memory)
        self.action_memory_full.append(self.action_memory)
        self.reward_memory_full.append(self.reward_memory)

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []


Collecting pybullet
  Downloading pybullet-3.2.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (91.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.7/91.7 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-3.2.5
[0m

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool:
  import imp
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def randint(low, high=None, size=None, dtype=onp.int):  # pylint: disable=missing-function-docstring
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):


In [3]:
if __name__ == '__main__':
    # Train the policy-gradient agent: each iteration collects `n_k` full
    # episodes with the current policy, takes one gradient step on them and
    # logs the mean episode score. The score history is saved at the end.
    import os  # needed for the output-directory handling below

    problem = "InvertedPendulumBulletEnv-v0" # "LunarLander-v2"
    macro = 0
    n_k = 4                # episodes collected per policy update
    num_episodes = 500     # number of policy updates
    lr = 0.0007
    solved_score = 1000    # 100-iteration average score at which training stops
    for m in range(macro, macro + 1):
        seed = 2021 + m + 1
        env = gym.make(problem)
        try:
            num_states = env.observation_space.shape[0]
            num_actions = env.action_space.shape[0]
            # Seed every source of randomness used by the run.
            env.seed(seed)
            np.random.seed(seed)
            tf.random.set_seed(seed)
            agent = Agent(alpha=lr, gamma=0.99, n_actions=num_actions)
            score_history = []
            path = "VPG/{}/approx-2nd-seed-{}-n_k-{}-id-{}".format(problem, seed, n_k, m+1)
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(path, exist_ok=True)
            for i in range(num_episodes):
                score = 0
                observations = {}
                actions = {}
                rewards = {}
                for j in range(n_k):
                    observations[j] = []
                    actions[j] = []
                    rewards[j] = []
                    done = False
                    observation = env.reset()
                    while not done:
                        action = agent.choose_action(observation)
                        next_observation, reward, done, _ = env.step(action)
                        observations[j].append(observation)
                        actions[j].append(action)
                        rewards[j].append(reward)
                        observation = next_observation
                        score += reward
                agent.store_transition(observations, actions, rewards)
                mean_score = score / n_k
                score_history.append(mean_score)

                agent.learn()
                avg_score = np.mean(score_history[-100:])
                print('macro-replicate: ', m,'episode: ', i,'score: %.1f' % mean_score, 'average score %.1f' % avg_score)
                if avg_score >= solved_score:  # Condition to consider the task solved
                    print("Solved at episode {}!".format(i))
                    break
            # np.save appends ".npy" when the name lacks it, so the scores are
            # written to "<path>.npy" next to the directory created above.
            np.save(path, np.array(score_history))
        finally:
            # Release the simulator even if training is interrupted or fails.
            env.close()

pybullet build time: May 20 2022 19:43:01
2022-06-25 15:46:04.010189: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


macro-replicate:  0 episode:  0 score: 19.5 average score 19.5
macro-replicate:  0 episode:  1 score: 23.5 average score 21.5
macro-replicate:  0 episode:  2 score: 30.8 average score 24.6
macro-replicate:  0 episode:  3 score: 49.8 average score 30.9
macro-replicate:  0 episode:  4 score: 26.5 average score 30.0
macro-replicate:  0 episode:  5 score: 31.5 average score 30.2
macro-replicate:  0 episode:  6 score: 25.2 average score 29.5
macro-replicate:  0 episode:  7 score: 21.2 average score 28.5
macro-replicate:  0 episode:  8 score: 18.8 average score 27.4
macro-replicate:  0 episode:  9 score: 20.8 average score 26.8
macro-replicate:  0 episode:  10 score: 29.0 average score 27.0
macro-replicate:  0 episode:  11 score: 24.0 average score 26.7
macro-replicate:  0 episode:  12 score: 24.5 average score 26.5
macro-replicate:  0 episode:  13 score: 25.8 average score 26.5
macro-replicate:  0 episode:  14 score: 54.5 average score 28.4
macro-replicate:  0 episode:  15 score: 26.8 avera