In [2]:
import os

import gym
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_probability as tfp
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD
# from reinforce_tf2 import Agent
# from utils import plotLearning

class PolicyGradientNetwork(keras.Model):
    """Two ReLU hidden layers followed by a softmax head over the actions.

    Args:
        n_actions: number of units in the softmax output layer.
        fc1_dims: number of units in the first hidden layer.
        fc2_dims: number of units in the second hidden layer.
    """

    def __init__(self, n_actions, fc1_dims=32, fc2_dims=32):
        super().__init__()
        # Keep the sizes around as plain attributes for later inspection.
        self.fc1_dims, self.fc2_dims = fc1_dims, fc2_dims
        self.n_actions = n_actions

        # Layers are created in forward-pass order: hidden, hidden, output.
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        self.pi = Dense(self.n_actions, activation='softmax')

    def call(self, state):
        """Return the softmax output of the network for the input `state`."""
        hidden = self.fc2(self.fc1(state))
        return self.pi(hidden)


class Agent:
    """Policy-gradient agent that trains a `PolicyGradientNetwork` on stored rollouts.

    Rollouts are buffered with `store_transition` (one call per rollout) and
    consumed by `learn`, which performs a single gradient step, archives the
    buffered rollouts in the `*_memory_full` lists and empties the buffers.

    Args:
        alpha: learning rate handed to the Adam optimizer.
        gamma: discount factor used when computing returns.
        n_actions: number of discrete actions (size of the policy output).
        layer1_size: accepted for backward compatibility; it is not forwarded
            to the network, which is built with its own default layer sizes.
        layer2_size: same as `layer1_size` - currently unused.
    """

    def __init__(self, alpha=0.003, gamma=0.99, n_actions=4,
                 layer1_size=256, layer2_size=256):

        self.gamma = gamma
        self.lr = alpha
        self.n_actions = n_actions
        # Buffers for the rollouts collected since the last `learn` call;
        # entry j of each list is one whole rollout (a sequence over time).
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # Archive of every batch that `learn` has consumed so far.
        self.state_memory_full = []
        self.action_memory_full = []
        self.reward_memory_full = []
        self.policy = PolicyGradientNetwork(n_actions=n_actions)
        self.policy.compile(optimizer=Adam(learning_rate=self.lr))

    def choose_action(self, observation):
        """Sample one action from the policy's distribution for `observation`.

        The observation is wrapped in a batch of size one; the sampled action
        is returned as the first element of the resulting NumPy array.
        """
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        probs = self.policy(state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()

        return action.numpy()[0]

    def store_transition(self, observation, action, reward):
        """Append one rollout to the buffers.

        Args:
            observation: sequence of observations of the rollout.
            action: sequence of actions taken, aligned with `observation`.
            reward: sequence of rewards received, aligned with `observation`.
        """
        self.state_memory.append(observation)
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def compute_ilr(self):
        """Placeholder; currently does nothing and returns None."""
        return

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return G[t] = sum_{k >= t} gamma**(k - t) * rewards[k] as float64.

        Computed with a single backward pass. The result is always floating
        point, so integer rewards are not truncated.
        """
        rewards = np.asarray(rewards, dtype=np.float64)
        returns = np.zeros_like(rewards)
        running = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    def learn(self):
        """Take one gradient step on the buffered rollouts, then archive them.

        The loss is the negative sum over time steps of (discounted return *
        log-probability of the stored action), averaged over the buffered
        rollouts. Rollouts are processed independently, so they may differ in
        length. Does nothing when no rollout has been stored.
        """
        n_k = len(self.reward_memory)
        if n_k == 0:
            return

        with tf.GradientTape() as tape:
            loss = 0.0
            for states, actions, rewards in zip(self.state_memory,
                                                self.action_memory,
                                                self.reward_memory):
                returns = self._discounted_returns(rewards, self.gamma)
                # One batched forward pass per rollout instead of one per step.
                state_batch = tf.convert_to_tensor(
                    np.asarray(states, dtype=np.float32), dtype=tf.float32)
                # Actions stay float32, exactly as the per-step code passed them.
                action_batch = tf.convert_to_tensor(actions, dtype=tf.float32)
                return_batch = tf.convert_to_tensor(returns, dtype=tf.float32)

                probs = self.policy(state_batch)
                action_probs = tfp.distributions.Categorical(probs=probs)
                log_prob = action_probs.log_prob(action_batch)
                loss += -tf.reduce_sum(return_batch * log_prob)
            loss = loss / n_k
        gradient = tape.gradient(loss, self.policy.trainable_variables)
        self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))

        self.state_memory_full.append(self.state_memory)
        self.action_memory_full.append(self.action_memory)
        self.reward_memory_full.append(self.reward_memory)

        # Fresh lists, so the archived batches are not emptied as well.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

In [3]:
# Train one agent per macro-replicate and log the mean score of each batch of
# n_k rollouts.
problem = "Acrobot-v1" # "LunarLander-v2"
macro = 0
n_k = 4
num_episodes = 500
lr = 0.0003
for m in range(macro, macro + 5):
    seed = 2021 + m + 1
    env = gym.make(problem)
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.n
    # Seed the environment, NumPy and TensorFlow so a replicate can be re-run.
    env.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    agent = Agent(alpha=lr, gamma=0.99, n_actions=num_actions)
    score_history = []
    path = "VPG/{}/approx-2nd-seed-{}-n_k-{}-id-{}".format(problem, seed, n_k, m+1)
    # exist_ok replaces the separate exists() check and is safe on re-runs.
    os.makedirs(path, exist_ok=True)
    for i in range(num_episodes):
        score = 0
        episodes = []  # one (observations, actions, rewards) triple per rollout

        for j in range(n_k):
            observations = []
            actions = []
            rewards = []
            observation = env.reset()
            done = False
            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = observation_
                score += reward
            episodes.append((observations, actions, rewards))

        # Store each rollout as its own row so discounted returns never run
        # across an episode boundary and the loss is averaged over n_k rows.
        # Shorter rollouts are padded to a common length with zero rewards:
        # padded steps get a return of exactly 0 and so add nothing to the
        # loss, while the batch stays rectangular.
        horizon = max(len(ep_rewards) for _, _, ep_rewards in episodes)
        for ep_observations, ep_actions, ep_rewards in episodes:
            pad = horizon - len(ep_rewards)
            agent.store_transition(ep_observations + [ep_observations[-1]] * pad,
                                   ep_actions + [ep_actions[-1]] * pad,
                                   ep_rewards + [0.0] * pad)
        score_history.append(score / n_k)

        agent.learn()
        avg_score = np.mean(score_history[-100:])
        print('macro-replicate: ', m,'episode: ', i,'score: %.1f' % (score / n_k), 'average score %.1f' % avg_score)
        # Save before the early-exit check so the final score is persisted too.
        # np.save appends ".npy", so the file is written next to the directory
        # created above rather than inside it.
        np.save(path, np.array(score_history))
        # NOTE(review): the logged scores are negative, so this threshold looks
        # unreachable for this task - TODO confirm the intended value.
        if avg_score >= 1000:  # Condition to consider the task solved
            print("Solved at episode {}!".format(i))
            break
    env.close()

macro-replicate:  0 episode:  0 score: -500.0 average score -500.0
macro-replicate:  0 episode:  1 score: -500.0 average score -500.0
macro-replicate:  0 episode:  2 score: -500.0 average score -500.0
macro-replicate:  0 episode:  3 score: -500.0 average score -500.0
macro-replicate:  0 episode:  4 score: -500.0 average score -500.0
macro-replicate:  0 episode:  5 score: -500.0 average score -500.0
macro-replicate:  0 episode:  6 score: -500.0 average score -500.0
macro-replicate:  0 episode:  7 score: -500.0 average score -500.0
macro-replicate:  0 episode:  8 score: -500.0 average score -500.0
macro-replicate:  0 episode:  9 score: -500.0 average score -500.0
macro-replicate:  0 episode:  10 score: -500.0 average score -500.0
macro-replicate:  0 episode:  11 score: -500.0 average score -500.0
macro-replicate:  0 episode:  12 score: -500.0 average score -500.0
macro-replicate:  0 episode:  13 score: -500.0 average score -500.0
macro-replicate:  0 episode:  14 score: -500.0 average sco