In [None]:
import gym
import numpy as np
# from reinforce_tf2 import Agent
# from utils import plotLearning
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam, SGD
import numpy as np
import time
import matplotlib.pyplot as plt
import pymc3 as pm
from scipy.stats import beta
import scipy


class PolicyGradientNetwork(keras.Model):
    """Feed-forward policy network mapping a state batch to action probabilities.

    The network is two ReLU hidden layers followed by a softmax output layer
    with one unit per action.
    """

    def __init__(self, n_actions, fc1_dims=32, fc2_dims=32):
        """Create the three dense layers.

        Args:
            n_actions: Number of units in the softmax output layer.
            fc1_dims: Number of units in the first hidden layer.
            fc2_dims: Number of units in the second hidden layer.
        """
        super().__init__()
        self.fc1_dims, self.fc2_dims, self.n_actions = fc1_dims, fc2_dims, n_actions
        # Layers are created in forward order: fc1 -> fc2 -> pi.
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        self.pi = Dense(n_actions, activation='softmax')

    def call(self, state):
        """Return the softmax action probabilities for ``state``."""
        hidden = self.fc2(self.fc1(state))
        return self.pi(hidden)


class Agent:
    """Policy-gradient agent that reuses stored trajectories via likelihood ratios.

    Every call to :meth:`learn` appends the trajectories collected in the
    current iteration to a growing history, re-evaluates all stored
    trajectories under the current policy, selects the past iterations whose
    likelihood-ratio-weighted gradients have a mean norm of at most
    ``self.c`` times the current iteration's, and applies one optimizer step
    using a mixture-likelihood-ratio weighted loss over the selected
    iterations.
    """

    def __init__(self, alpha=0.003, gamma=0.99, n_actions=4, n_k=4, num_episodes=2000, layer1_size=256, layer2_size=256):
        """Set up the buffers, the policy network and its optimizer.

        Args:
            alpha: Learning rate of the SGD optimizer.
            gamma: Discount factor used when computing returns.
            n_actions: Number of discrete actions (policy output size).
            n_k: Number of trajectories per iteration; sizes the second axis
                of ``self.loglikelihoods``.
            num_episodes: Maximum number of ``learn`` iterations; sizes the
                first and third axes of ``self.loglikelihoods``.
            layer1_size: Accepted for interface compatibility; not read here.
            layer2_size: Accepted for interface compatibility; not read here.
        """
        # Selection threshold: a past iteration is reused when its weighted
        # gradient norm is at most ``c`` times the current gradient norm.
        self.c = 1.5
        self.gamma = gamma
        self.lr = alpha
        self.n_actions = n_actions
        # Data of the iteration currently being collected.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # History over all iterations, indexed as [iteration][trajectory].
        self.state_memory_full = []
        self.action_memory_full = []
        self.reward_memory_full = []
        self.G_memory_full = []
        # Optional weight snapshots, read by gradient_compute when model is None.
        self.model_memory = []
        # Per-iteration diagnostics appended by learn().
        self.reuses = []
        self.variance = []
        self.time_elapsed = []
        self.gradient_norm = []
        # loglikelihoods[i, j, k]: log-likelihood of trajectory j of iteration
        # i, evaluated with the policy at iteration k.
        self.loglikelihoods = np.zeros((num_episodes, n_k, num_episodes))
        self.policy = PolicyGradientNetwork(n_actions=n_actions)
        self.policy.compile(optimizer=SGD(learning_rate=self.lr, decay=0.0))
        # Scratch network used to evaluate trajectories under stored weights.
        self._policy_hist = PolicyGradientNetwork(n_actions=n_actions)

    def choose_action(self, observation):
        """Sample one action from the current policy for a single observation."""
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        probs = self.policy(state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()
        return action.numpy()[0]

    def store_transition(self, observation, action, reward):
        """Store the current iteration's data, replacing what was stored before.

        Each argument is indexed first by trajectory and then by time step
        (``learn`` reads ``reward[j][t]``).
        """
        self.state_memory = observation
        self.action_memory = action
        self.reward_memory = reward

    def compute_ilr(self):
        """Placeholder; currently does nothing and returns ``None``."""
        return

    def _discounted_returns(self, rewards):
        """Return the discounted reward-to-go for every step of one trajectory."""
        horizon = len(rewards)
        # Always float64 so that integer rewards do not truncate the returns.
        returns = np.zeros(horizon, dtype=np.float64)
        running = 0.0
        # One backward pass: G_t = r_t + gamma * G_{t+1}.
        for t in reversed(range(horizon)):
            running = rewards[t] + self.gamma * running
            returns[t] = running
        return returns

    @staticmethod
    def _flatten_gradients(grads):
        """Concatenate a list of gradient tensors into one flat numpy vector."""
        return np.concatenate([g.numpy().flatten() for g in grads])

    def _mixture_weight(self, i, j, reuse, num_iters):
        """Return the mixture likelihood ratio of trajectory ``j`` of iteration ``i``.

        This is the current-policy likelihood divided by the mean likelihood
        under the reused policies ``k >= i``. It is computed relative to the
        largest log-likelihood so the ratio stays finite when the raw
        likelihoods underflow.
        """
        mixture = [k for k in reuse if k >= i]
        log_liks = self.loglikelihoods[i, j, mixture]
        shift = np.max(log_liks)
        numerator = np.exp(self.loglikelihoods[i, j, num_iters - 1] - shift)
        denominator = np.mean(np.exp(log_liks - shift))
        return numerator / denominator

    def gradient_compute(self, model, i, j, p, n_k, H):
        """Compute the policy gradient and log-likelihood of one stored trajectory.

        Args:
            model: Network to evaluate. When ``None``, ``self._policy_hist`` is
                loaded with ``self.model_memory[p]`` and used instead.
            i: Iteration index into the ``*_memory_full`` histories.
            j: Trajectory index within iteration ``i``.
            p: Index into ``self.model_memory``; only read when ``model`` is None.
            n_k: Unused; kept for interface compatibility.
            H: Unused; kept for interface compatibility.

        Returns:
            A tuple ``(gradients, log_likelihood)``: the gradients of the loss
            ``-sum_t G_t * log pi(a_t | s_t)`` with respect to the model's
            trainable variables, and the summed log-probability of the
            trajectory's actions.
        """
        returns = self.G_memory_full[i][j]
        states = self.state_memory_full[i][j]
        actions = self.action_memory_full[i][j]
        if model is None:
            # set_weights() returns None, so bind the network itself.
            model = self._policy_hist
            if not model.built and len(states) > 0:
                # A subclassed Keras model has no weights until its first call.
                model(tf.convert_to_tensor([states[0]], dtype=tf.float32))
            model.set_weights(self.model_memory[p])
        # gradient() is called exactly once, so a non-persistent tape suffices.
        with tf.GradientTape() as tape:
            cur_likelihood = 0
            loss = 0
            for idx, (g, state) in enumerate(zip(returns, states)):
                state = tf.convert_to_tensor([state], dtype=tf.float32)
                probs = model(state)
                action_probs = tfp.distributions.Categorical(probs=probs)
                log_prob = tf.squeeze(action_probs.log_prob(actions[idx]))
                # Accumulate over every time step, not just the last one.
                loss += -g * log_prob
                cur_likelihood += log_prob
        return tape.gradient(loss, model.trainable_variables), cur_likelihood

    def mixture_gradient_compute(self, reuse, n_k, num_iters):
        """Compute the gradient of the mixture-likelihood-ratio weighted loss.

        Args:
            reuse: Indices of the iterations whose trajectories are reused.
            n_k: Number of trajectories per iteration.
            num_iters: Number of iterations stored so far; ``num_iters - 1``
                is the current policy's column in ``self.loglikelihoods``.

        Returns:
            Gradients with respect to ``self.policy.trainable_variables``.

        Raises:
            ValueError: If ``reuse`` is empty or ``n_k`` is not positive.
        """
        if len(reuse) == 0 or n_k <= 0:
            raise ValueError("mixture_gradient_compute needs at least one reused iteration and n_k > 0")
        with tf.GradientTape() as tape:
            loss = 0
            for i in reuse:
                for j in range(n_k):
                    # Plain numpy scalar; treated as a constant by the tape.
                    weight = self._mixture_weight(i, j, reuse, num_iters)
                    for idx, (g, state) in enumerate(zip(self.G_memory_full[i][j][:], self.state_memory_full[i][j])):
                        state = tf.convert_to_tensor([state], dtype=tf.float32)
                        probs = self.policy(state)
                        action_probs = tfp.distributions.Categorical(probs=probs)
                        log_prob = action_probs.log_prob(self.action_memory_full[i][j][idx])
                        loss += - weight * g * tf.squeeze(log_prob)
            loss = loss / (len(reuse) * n_k)
        return tape.gradient(loss, self.policy.trainable_variables)

    def learn(self):
        """Run one policy update from the trajectories stored by ``store_transition``.

        Steps: compute discounted returns, append the iteration to the
        history, compute per-trajectory gradients and log-likelihoods of all
        stored trajectories under the current policy, select the iterations
        to reuse, apply the mixture gradient, then clear the current buffers.

        Raises:
            ValueError: If no trajectories have been stored.
        """
        n_k = len(self.reward_memory)
        if n_k == 0:
            raise ValueError("learn() called without stored trajectories")
        G = {j: self._discounted_returns(self.reward_memory[j]) for j in range(n_k)}
        # store cur info to full
        self.G_memory_full.append(G)
        self.state_memory_full.append(self.state_memory)
        self.action_memory_full.append(self.action_memory)
        self.reward_memory_full.append(self.reward_memory)
        num_iters = len(self.reward_memory_full)
        cur = num_iters - 1

        # Gradients of the freshly collected trajectories.
        grad_agg = []
        timer1 = time.time()
        for j in range(n_k):
            grad, ll = self.gradient_compute(self.policy, cur, j, -1, n_k, len(self.reward_memory[j]))
            grad_agg.append(self._flatten_gradients(grad))
            self.loglikelihoods[cur, j, cur] = ll
        policy_param_size = grad_agg[0].size
        cur_grads = np.stack(grad_agg, axis=0)
        self.gradient_norm.append(np.mean(np.linalg.norm(cur_grads, ord=1, axis=1)))
        # Mean L2 norm of the per-trajectory gradients (recorded in self.variance).
        cur_pg_variance = np.mean(np.linalg.norm(cur_grads, ord=2, axis=1))
        self.variance.append(cur_pg_variance)

        # Re-evaluate every past trajectory under the current policy.
        timer2 = time.time()
        gradient = np.zeros((num_iters, n_k, policy_param_size))
        gradient[cur] = cur_grads
        # The current iteration was handled above, so only revisit the past.
        for i in range(cur):
            for j in range(n_k):
                grad, ll = self.gradient_compute(self.policy, i, j, -1, n_k, len(self.reward_memory_full[i][j]))
                gradient[i, j, :] = self._flatten_gradients(grad)
                self.loglikelihoods[i, j, cur] = ll

        # Keep the iterations whose likelihood-ratio weighted gradients stay
        # within a factor ``c`` of the current gradient norm.
        reuse_iter = []
        for i in range(num_iters):
            # Ratio taken in log space so underflowing likelihoods do not give 0/0.
            ratios = np.exp(self.loglikelihoods[i, :n_k, cur] - self.loglikelihoods[i, :n_k, i])
            weighted = ratios[:, None] * gradient[i]
            cur_ilr_variance = np.mean(np.linalg.norm(weighted, ord=2, axis=1))
            if cur_ilr_variance <= self.c * cur_pg_variance:
                reuse_iter.append(i)
        timer3 = time.time()
        self.reuses.append(reuse_iter)

        gradient = self.mixture_gradient_compute(reuse_iter, n_k, num_iters)
        self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))
        timer4 = time.time()
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.time_elapsed.append([timer2 - timer1, timer3 - timer2, timer4 - timer3])



if __name__ == '__main__':
    # Experiment configuration. The agent below is built from these values
    # so its buffers always match the loop that feeds it.
    index = 2
    seed = 2021 + index
    n_k = 4  # trajectories collected per training iteration
    path = "./seed-{}-n_k-{}-id-{}".format(seed, n_k, index)
    num_episodes = 2000  # number of training iterations
    problem = "CartPole-v0"  # "LunarLander-v2"
    env = gym.make(problem)
    env._max_episode_steps = 100
    num_states = env.observation_space.shape[0]
    print("Size of State Space ->  {}".format(num_states))
    num_actions = env.action_space
    print("Size of Action Space ->  {}".format(num_actions))
    # Seed every source of randomness used by the run.
    env.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # ``.n`` is the number of actions of a discrete action space.
    agent = Agent(alpha=0.01, gamma=0.99, n_actions=num_actions.n, n_k=n_k, num_episodes=num_episodes)
    score_history = []
    for i in range(num_episodes):
        score = 0
        # Per-iteration rollouts, keyed by trajectory index.
        observations = {}
        actions = {}
        rewards = {}
        for j in range(n_k):
            observations[j] = []
            actions[j] = []
            rewards[j] = []
            done = False
            observation = env.reset()
            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                observations[j].append(observation)
                actions[j].append(action)
                rewards[j].append(reward)
                observation = observation_
                score += reward
        agent.store_transition(observations, actions, rewards)
        # Record the mean undiscounted score over the n_k trajectories.
        score_history.append(score / n_k)
        agent.learn()
        agent.policy.save_weights(path + "/model-{}".format(i))
        # Update running reward to check condition for solving
        avg_score = np.mean(score_history[-100:])
        print('episode: ', i,'score: %.1f' % (score / n_k),
            'average score %.1f' % avg_score)
        if avg_score > 95:  # Condition to consider the task solved
            print("Solved at episode {}!".format(i))
            break


Size of State Space ->  4
Size of Action Space ->  Discrete(2)
episode:  0 score: 21.8 average score 21.8
episode:  1 score: 20.2 average score 21.0
episode:  2 score: 36.8 average score 26.2
episode:  3 score: 40.0 average score 29.7
episode:  4 score: 24.5 average score 28.6
episode:  5 score: 49.8 average score 32.2
episode:  6 score: 22.0 average score 30.7
episode:  7 score: 38.8 average score 31.7
episode:  8 score: 70.5 average score 36.0
episode:  9 score: 18.8 average score 34.3
episode:  10 score: 56.5 average score 36.3
episode:  11 score: 13.5 average score 34.4
episode:  12 score: 24.2 average score 33.6
episode:  13 score: 66.8 average score 36.0
episode:  14 score: 18.8 average score 34.9
episode:  15 score: 34.2 average score 34.8
episode:  16 score: 19.5 average score 33.9
episode:  17 score: 28.5 average score 33.6
episode:  18 score: 71.0 average score 35.6
episode:  19 score: 14.2 average score 34.5
episode:  20 score: 11.0 average score 33.4
episode:  21 score: 12.

In [None]:
# Save the per-iteration average scores collected by the training loop to disk.
np.save('cartpole-fixed0.01-mlr',score_history)