In [None]:
import gym
import numpy as np
# from reinforce_tf2 import Agent
# from utils import plotLearning
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam, SGD
import numpy as np
import time
import matplotlib.pyplot as plt
# import pymc3 as pm
from scipy.stats import beta
import scipy
import gc

class PolicyGradientNetwork(keras.Model):
    """Small MLP policy: two ReLU hidden layers followed by a softmax head.

    Calling the model on a batch of states returns one probability per
    action (the last layer has ``n_actions`` softmax units).
    """

    def __init__(self, n_actions, fc1_dims=32, fc2_dims=32):
        super().__init__()
        self.fc1_dims, self.fc2_dims = fc1_dims, fc2_dims
        self.n_actions = n_actions
        # Layers are created in the original order (fc1, fc2, pi) so the
        # order of the model's weights is unchanged.
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        self.pi = Dense(n_actions, activation='softmax')

    def call(self, state):
        """Forward pass: state batch -> action probabilities."""
        hidden = self.fc2(self.fc1(state))
        return self.pi(hidden)


class Agent:
    """Policy-gradient agent that reuses past trajectories via likelihood ratios.

    Every call to :meth:`learn` stores the newest batch of trajectories,
    re-evaluates all stored batches under the current policy, keeps the
    iterations whose importance-weighted gradient norm is at most ``self.c``
    times the norm measured on the newest batch, and applies one optimizer
    step with a mixture-likelihood-ratio gradient over the kept iterations.

    Bookkeeping attributes written by :meth:`learn`:
      * ``reuses``        - list of reused iteration indices, one list per call
      * ``variance``      - mean L2 norm of the per-trajectory gradients of the
                            newest batch, one value per call
      * ``gradient_norm`` - mean L2 norm over those gradients plus the applied
                            mixture gradient, one value per call
      * ``time_elapsed``  - ``[current-gradient, selection, update]`` seconds
      * ``loglikelihoods[i, j, k]`` - log-likelihood of trajectory ``j`` of
                            iteration ``i`` under the policy of iteration ``k``
    """

    def __init__(self, alpha=0.003, gamma=0.99, n_actions=4, n_k=4, num_episodes=2000, layer1_size=256, layer2_size=256):
        """Create the policy network and the (initially empty) replay storage.

        Args:
            alpha: learning rate handed to the Adam optimizer.
            gamma: discount factor used for the returns.
            n_actions: number of discrete actions (size of the softmax head).
            n_k: number of trajectories collected per iteration.
            num_episodes: initial capacity (in iterations) of the
                log-likelihood table; it grows on demand in ``learn``.
            layer1_size, layer2_size: accepted for backward compatibility
                only; the network is built with its own default widths.
        """
        self.c = 1.2  # reuse threshold on the weighted-gradient norm ratio
        self.gamma = gamma
        self.lr = alpha
        self.n_actions = n_actions
        # Buffers for the batch currently being collected.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # Full history, one entry per learn() call.
        self.state_memory_full = []
        self.action_memory_full = []
        self.reward_memory_full = []
        self.G_memory_full = []
        self.model_memory = []
        self.reuses = []
        self.variance = []
        self.time_elapsed = []
        self.gradient_norm = []
        self.loglikelihoods = np.zeros((num_episodes, n_k, num_episodes))
        self.policy = PolicyGradientNetwork(n_actions=n_actions)
        self.policy.compile(optimizer=Adam(learning_rate=self.lr))
        self._policy_hist = PolicyGradientNetwork(n_actions=n_actions)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return G_t = sum_{k>=t} gamma**(k-t) * r_k as a float array (O(H))."""
        # Force a float dtype: integer rewards would otherwise truncate returns.
        rewards = np.asarray(rewards, dtype=float)
        returns = np.zeros_like(rewards)
        running = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    @staticmethod
    def _flatten_grads(grads):
        """Concatenate a list of gradient tensors into one flat numpy vector."""
        return np.concatenate([g.numpy().flatten() for g in grads])

    @staticmethod
    def _mean_grad_norm(rows):
        """Mean L2 norm over a list of equally sized flat gradient vectors."""
        return np.mean(np.linalg.norm(np.stack(rows, axis=0), ord=2, axis=1))

    def _ensure_loglik_capacity(self, num_iters, n_k):
        """Grow ``self.loglikelihoods`` so it can index ``num_iters`` iterations."""
        cap, cap_k, _ = self.loglikelihoods.shape
        if num_iters <= cap and n_k <= cap_k:
            return
        new_cap = max(num_iters, 2 * cap)
        new_k = max(n_k, cap_k)
        grown = np.zeros((new_cap, new_k, new_cap))
        grown[:cap, :cap_k, :cap] = self.loglikelihoods
        self.loglikelihoods = grown

    def choose_action(self, observation):
        """Sample one action from the current policy for a single observation."""
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        probs = self.policy(state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()
        return action.numpy()[0]

    def store_transition(self, observation, action, reward):
        """Store one whole batch: each argument is indexed by trajectory id."""
        self.state_memory = observation
        self.action_memory = action
        self.reward_memory = reward

    def compute_ilr(self):
        """Placeholder; does nothing and returns None."""
        return

    def gradient_compute(self, model, i, j, p, n_k, H):
        """Policy gradient and log-likelihood of one stored trajectory.

        Args:
            model: network to evaluate; if None, ``self._policy_hist`` is
                loaded with ``self.model_memory[p]`` and used instead.
            i: iteration index into the ``*_memory_full`` lists.
            j: trajectory index within that iteration.
            p: index into ``self.model_memory`` (only read when model is None).
            n_k, H: unused; kept so existing call sites keep working.

        Returns:
            ``(grads, log_likelihood)`` where ``grads`` is the gradient of
            ``-sum_t G_t * log pi(a_t | s_t)`` w.r.t. the model's trainable
            variables and ``log_likelihood`` is ``sum_t log pi(a_t | s_t)``.
        """
        if model is None:
            # set_weights() returns None, so load first and then bind the model.
            # NOTE(review): assumes _policy_hist has already been built.
            self._policy_hist.set_weights(self.model_memory[p])
            model = self._policy_hist
        returns = self.G_memory_full[i][j]
        states = self.state_memory_full[i][j]
        actions = self.action_memory_full[i][j]
        with tf.GradientTape() as tape:
            cur_likelihood = 0
            loss = 0
            for g, state, action in zip(returns, states, actions):
                state = tf.convert_to_tensor([state], dtype=tf.float32)
                probs = model(state)
                action_probs = tfp.distributions.Categorical(probs=probs)
                log_prob = tf.squeeze(action_probs.log_prob(action))
                # Accumulate over the whole trajectory; assigning here would
                # keep only the final step's contribution.
                loss += -g * log_prob
                cur_likelihood += log_prob
        return tape.gradient(loss, model.trainable_variables), cur_likelihood

    def mixture_gradient_compute(self, reuse, n_k, num_iters):
        """Mixture-likelihood-ratio gradient over the reused iterations.

        Each trajectory of a reused iteration ``i`` is weighted by its
        likelihood under the current policy divided by its average likelihood
        under the policies of the reused iterations ``k >= i``.

        Args:
            reuse: list of iteration indices to reuse (must be non-empty).
            n_k: number of trajectories per iteration.
            num_iters: number of iterations stored so far; the current policy
                is column ``num_iters - 1`` of ``self.loglikelihoods``.

        Returns:
            Gradients w.r.t. ``self.policy.trainable_variables``.

        Raises:
            ValueError: if ``reuse`` is empty or ``n_k`` is not positive.
        """
        if len(reuse) == 0 or n_k <= 0:
            raise ValueError("mixture gradient needs a non-empty reuse set and n_k > 0")
        cur = num_iters - 1
        with tf.GradientTape() as tape:
            loss = 0
            for i in reuse:
                mixture = [k for k in reuse if k >= i]
                for j in range(n_k):
                    ll_cur = self.loglikelihoods[i, j, cur]
                    # exp(ll_cur) / mean_k exp(ll_k), evaluated on differences
                    # of log-likelihoods so long trajectories do not give 0/0.
                    weight = 1.0 / np.mean(np.exp(self.loglikelihoods[i, j, mixture] - ll_cur))
                    returns = self.G_memory_full[i][j]
                    states = self.state_memory_full[i][j]
                    actions = self.action_memory_full[i][j]
                    for g, state, action in zip(returns, states, actions):
                        state = tf.convert_to_tensor([state], dtype=tf.float32)
                        probs = self.policy(state)
                        action_probs = tfp.distributions.Categorical(probs=probs)
                        log_prob = tf.squeeze(action_probs.log_prob(action))
                        loss += -(weight * g) * log_prob
            loss = loss / (len(reuse) * n_k)
        return tape.gradient(loss, self.policy.trainable_variables)

    def learn(self):
        """Archive the stored batch, select reusable iterations, update the policy.

        Raises:
            ValueError: if no trajectories were stored before the call.
        """
        n_k = len(self.reward_memory)
        if n_k == 0:
            raise ValueError("learn() called without stored trajectories")
        G = {j: self._discounted_returns(self.reward_memory[j], self.gamma) for j in range(n_k)}
        H = len(self.reward_memory[n_k - 1])  # only forwarded to gradient_compute

        # Move the current batch into the full history.
        self.G_memory_full.append(G)
        self.state_memory_full.append(self.state_memory)
        self.action_memory_full.append(self.action_memory)
        self.reward_memory_full.append(self.reward_memory)
        num_iters = len(self.reward_memory_full)
        cur = num_iters - 1
        self._ensure_loglik_capacity(num_iters, n_k)

        # Per-trajectory gradients of the newest batch under the current policy.
        timer1 = time.time()
        grad_agg = []
        for j in range(n_k):
            grad, ll = self.gradient_compute(self.policy, -1, j, -1, n_k, H)
            grad_agg.append(self._flatten_grads(grad))
            self.loglikelihoods[cur, j, cur] = float(ll)
        cur_pg_variance = self._mean_grad_norm(grad_agg)
        self.variance.append(cur_pg_variance)

        # Re-evaluate every stored iteration under the current policy and keep
        # those whose importance-weighted gradient norm is small enough.
        timer2 = time.time()
        reuse_iter = []
        for i in range(num_iters):
            if i == cur:
                # Ratio is exactly 1 for the newest batch; reuse its gradients.
                weighted = grad_agg
            else:
                weighted = []
                for j in range(n_k):
                    grad, ll = self.gradient_compute(self.policy, i, j, -1, n_k, H)
                    self.loglikelihoods[i, j, cur] = float(ll)
                    ratio = np.exp(self.loglikelihoods[i, j, cur] - self.loglikelihoods[i, j, i])
                    weighted.append(ratio * self._flatten_grads(grad))
            if self._mean_grad_norm(weighted) <= self.c * cur_pg_variance:
                reuse_iter.append(i)
        if not reuse_iter:
            # Nothing passed the test (e.g. c < 1): fall back to the newest
            # batch instead of dividing by zero in the mixture gradient.
            reuse_iter.append(cur)
        timer3 = time.time()
        self.reuses.append(reuse_iter)

        gradient = self.mixture_gradient_compute(reuse_iter, n_k, num_iters)
        # NOTE(review): the logged norm averages the per-trajectory gradients
        # together with the mixture gradient, as the original code did.
        grad_agg.append(self._flatten_grads(gradient))
        self.gradient_norm.append(self._mean_grad_norm(grad_agg))
        self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))
        timer4 = time.time()

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.time_elapsed.append([timer2 - timer1, timer3 - timer2, timer4 - timer3])
        gc.collect()



if __name__ == '__main__':
    # Training driver: collect n_k trajectories per iteration, update the
    # agent, checkpoint the weights and dump diagnostics under `path`.
    import os  # local import: only this driver needs it

    index = 33
    seed = 2021 + index
    n_k = 4
    c = 1.2  # reuse threshold; also part of the output directory name
    path = "/content/drive/MyDrive/Cartpole/seed-{}-n_k-{}-id-{}-c-{}".format(seed, n_k, index, c)
    # Create the output directory up front; checkpoints and logs go there.
    os.makedirs(path, exist_ok=True)
    num_episodes = 2000  # number of training iterations
    problem = "CartPole-v0"  # "LunarLander-v2"
    env = gym.make(problem)
    # env._max_episode_steps = 200
    num_states = env.observation_space.shape[0]
    print("Size of State Space ->  {}".format(num_states))
    num_actions = env.action_space.n
    print("Size of Action Space ->  {}".format(num_actions))
    # NOTE: env.seed / 4-tuple env.step follow the classic gym API used here.
    env.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    # Size the agent from the same n_k / num_episodes the loop uses so its
    # log-likelihood table cannot be outrun by the training loop.
    agent = Agent(alpha=0.01, gamma=0.99, n_actions=num_actions, n_k=n_k, num_episodes=num_episodes)
    agent.c = c
    score_history = []
    for i in range(num_episodes):
        score = 0
        observations, actions, rewards = {}, {}, {}
        for j in range(n_k):
            observations[j] = []
            actions[j] = []
            rewards[j] = []
            done = False
            observation = env.reset()
            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                observations[j].append(observation)
                actions[j].append(action)
                rewards[j].append(reward)
                observation = observation_
                score += reward
        agent.store_transition(observations, actions, rewards)
        score_history.append(score / n_k)
        agent.learn()
        agent.policy.save_weights(path + "/model-{}".format(i))
        # Running average over the last 100 iterations decides "solved".
        avg_score = np.mean(score_history[-100:])
        print('episode: ', i,'score: %.1f' % (score / n_k),
            'average score %.1f' % avg_score)
        if avg_score >= 195:  # Condition to consider the task solved
            print("Solved at episode {}!".format(i))
            break
    # One line per iteration: comma-separated indices of the reused iterations.
    with open(path+'/reuses.txt', 'w') as f:
        for _list in agent.reuses:
            if _list:
                f.write(",".join(str(item) for item in _list) + '\n')
    np.save(path+'/variance', agent.variance)
    np.save(path+'/time_elapsed', agent.time_elapsed)
    np.save(path+"/score_history",score_history)
    np.save(path+"/gradient_norm",agent.gradient_norm)

Size of State Space ->  4
Size of Action Space ->  Discrete(2)
episode:  0 score: 17.5 average score 17.5
episode:  1 score: 25.8 average score 21.6
episode:  2 score: 43.2 average score 28.8
episode:  3 score: 12.2 average score 24.7
episode:  4 score: 27.5 average score 25.2
episode:  5 score: 20.0 average score 24.4
episode:  6 score: 54.5 average score 28.7
episode:  7 score: 34.0 average score 29.3
episode:  8 score: 35.2 average score 30.0
episode:  9 score: 42.2 average score 31.2
episode:  10 score: 40.0 average score 32.0
episode:  11 score: 41.5 average score 32.8
episode:  12 score: 50.8 average score 34.2
episode:  13 score: 37.2 average score 34.4
episode:  14 score: 39.8 average score 34.8
episode:  15 score: 55.5 average score 36.1
episode:  16 score: 70.0 average score 38.1
episode:  17 score: 63.8 average score 39.5
episode:  18 score: 66.0 average score 40.9
episode:  19 score: 57.5 average score 41.7
episode:  20 score: 23.0 average score 40.8
episode:  21 score: 40.

In [None]:
# episode:  0 score: 40.5 average score 40.5
# episode:  1 score: 18.0 average score 29.2
# episode:  2 score: 33.2 average score 30.6
# episode:  3 score: 17.0 average score 27.2
# episode:  4 score: 24.5 average score 26.6
# episode:  5 score: 32.0 average score 27.5
# episode:  6 score: 31.0 average score 28.0
# episode:  7 score: 30.5 average score 28.3
# episode:  8 score: 36.5 average score 29.2
# episode:  9 score: 36.5 average score 30.0
# episode:  10 score: 22.8 average score 29.3
# episode:  11 score: 31.5 average score 29.5
# episode:  12 score: 32.2 average score 29.7
# episode:  13 score: 38.0 average score 30.3
# episode:  14 score: 28.8 average score 30.2
# episode:  15 score: 40.5 average score 30.8
# episode:  16 score: 33.2 average score 31.0
# episode:  17 score: 52.5 average score 32.2
# episode:  18 score: 35.0 average score 32.3
# episode:  19 score: 28.8 average score 32.1
# episode:  20 score: 47.8 average score 32.9
# episode:  21 score: 19.5 average score 32.3
# episode:  22 score: 41.5 average score 32.7
# episode:  23 score: 39.2 average score 33.0
# episode:  24 score: 56.8 average score 33.9
# episode:  25 score: 29.8 average score 33.8
# episode:  26 score: 44.8 average score 34.2
# episode:  27 score: 58.2 average score 35.0
# episode:  28 score: 63.2 average score 36.0
# episode:  29 score: 41.2 average score 36.2
# episode:  30 score: 46.0 average score 36.5
# episode:  31 score: 32.5 average score 36.4
# episode:  32 score: 63.0 average score 37.2
# episode:  33 score: 47.2 average score 37.5
# episode:  34 score: 44.2 average score 37.7
# episode:  35 score: 64.8 average score 38.4
# episode:  36 score: 61.8 average score 39.0
# episode:  37 score: 84.8 average score 40.2
# episode:  38 score: 52.5 average score 40.6
# episode:  39 score: 76.2 average score 41.5
# episode:  40 score: 31.5 average score 41.2
# episode:  41 score: 35.2 average score 41.1
# episode:  42 score: 102.8 average score 42.5
# episode:  43 score: 61.5 average score 42.9
# episode:  44 score: 51.8 average score 43.1
# episode:  45 score: 117.5 average score 44.7
# episode:  46 score: 85.5 average score 45.6
# episode:  47 score: 88.8 average score 46.5
# episode:  48 score: 112.2 average score 47.9
# episode:  49 score: 64.0 average score 48.2
# episode:  50 score: 111.5 average score 49.4
# episode:  51 score: 98.8 average score 50.4
# episode:  52 score: 101.0 average score 51.3
# episode:  53 score: 153.5 average score 53.2
# episode:  54 score: 152.2 average score 55.0
# episode:  55 score: 104.2 average score 55.9
# episode:  56 score: 141.5 average score 57.4
# episode:  57 score: 115.8 average score 58.4
# episode:  58 score: 107.0 average score 59.2
# episode:  59 score: 134.0 average score 60.5
# episode:  60 score: 78.5 average score 60.8
# episode:  61 score: 186.8 average score 62.8
# episode:  62 score: 165.8 average score 64.4
# episode:  63 score: 162.2 average score 66.0
# episode:  64 score: 153.8 average score 67.3
# episode:  65 score: 168.0 average score 68.8
# episode:  66 score: 82.0 average score 69.0
# episode:  67 score: 130.0 average score 69.9
# episode:  68 score: 150.2 average score 71.1
# episode:  69 score: 150.0 average score 72.2
# episode:  70 score: 194.0 average score 73.9
# episode:  71 score: 159.0 average score 75.1
# episode:  72 score: 175.8 average score 76.5
# episode:  73 score: 169.8 average score 77.8
# episode:  74 score: 192.8 average score 79.3
# episode:  75 score: 161.8 average score 80.4
# episode:  76 score: 200.0 average score 81.9
# episode:  77 score: 174.8 average score 83.1
# episode:  78 score: 174.2 average score 84.3
# episode:  79 score: 119.8 average score 84.7
# episode:  80 score: 185.2 average score 86.0
# episode:  81 score: 192.0 average score 87.2
# episode:  82 score: 148.5 average score 88.0
# episode:  83 score: 151.8 average score 88.7
# episode:  84 score: 142.2 average score 89.4
# episode:  85 score: 135.2 average score 89.9
# episode:  86 score: 185.5 average score 91.0
# episode:  87 score: 191.8 average score 92.2
# episode:  88 score: 182.5 average score 93.2
# episode:  89 score: 200.0 average score 94.4
# episode:  90 score: 200.0 average score 95.5
# episode:  91 score: 200.0 average score 96.7
# episode:  92 score: 157.5 average score 97.3
# episode:  93 score: 179.5 average score 98.2
# episode:  94 score: 187.2 average score 99.1
# episode:  95 score: 172.8 average score 99.9
# episode:  96 score: 109.5 average score 100.0
# episode:  97 score: 155.5 average score 100.6
# episode:  98 score: 103.5 average score 100.6
# episode:  99 score: 159.0 average score 101.2
# episode:  100 score: 166.5 average score 102.4
# episode:  101 score: 120.0 average score 103.4
# episode:  102 score: 200.0 average score 105.1
# episode:  103 score: 200.0 average score 106.9
# episode:  104 score: 181.8 average score 108.5
# episode:  105 score: 187.8 average score 110.1
# episode:  106 score: 193.5 average score 111.7
# episode:  107 score: 200.0 average score 113.4
# episode:  108 score: 200.0 average score 115.0
# episode:  109 score: 200.0 average score 116.7
# episode:  110 score: 200.0 average score 118.4
# episode:  111 score: 200.0 average score 120.1
# episode:  112 score: 197.0 average score 121.8
# episode:  113 score: 191.5 average score 123.3
# episode:  114 score: 200.0 average score 125.0
# episode:  115 score: 200.0 average score 126.6
# episode:  116 score: 200.0 average score 128.3
# episode:  117 score: 158.8 average score 129.3
# episode:  118 score: 200.0 average score 131.0
# episode:  119 score: 200.0 average score 132.7
# episode:  120 score: 200.0 average score 134.2
# episode:  121 score: 200.0 average score 136.0
# episode:  122 score: 200.0 average score 137.6
# episode:  123 score: 194.0 average score 139.2
# episode:  124 score: 200.0 average score 140.6
# episode:  125 score: 200.0 average score 142.3
# episode:  126 score: 200.0 average score 143.8
# episode:  127 score: 200.0 average score 145.3
# episode:  128 score: 200.0 average score 146.6
# episode:  129 score: 195.5 average score 148.2
# episode:  130 score: 143.2 average score 149.2
# episode:  131 score: 182.5 average score 150.7
# episode:  132 score: 200.0 average score 152.0
# episode:  133 score: 190.0 average score 153.4
# episode:  134 score: 200.0 average score 155.0
# episode:  135 score: 194.8 average score 156.3
# episode:  136 score: 182.5 average score 157.5
# episode:  137 score: 180.5 average score 158.5
# episode:  138 score: 200.0 average score 159.9
# episode:  139 score: 200.0 average score 161.2
# episode:  140 score: 200.0 average score 162.9
# episode:  141 score: 200.0 average score 164.5
# episode:  142 score: 200.0 average score 165.5
# episode:  143 score: 193.5 average score 166.8
# episode:  144 score: 200.0 average score 168.3
# episode:  145 score: 163.0 average score 168.7
# episode:  146 score: 166.8 average score 169.6
# episode:  147 score: 194.0 average score 170.6
# episode:  148 score: 174.8 average score 171.2
# episode:  149 score: 163.2 average score 172.2
# episode:  150 score: 165.0 average score 172.8
# episode:  151 score: 171.5 average score 173.5
# episode:  152 score: 141.8 average score 173.9
# episode:  153 score: 176.2 average score 174.1
# episode:  154 score: 186.0 average score 174.5
# episode:  155 score: 195.5 average score 175.4
# episode:  156 score: 189.0 average score 175.8
# episode:  157 score: 173.2 average score 176.4
# episode:  158 score: 128.0 average score 176.6
# episode:  159 score: 150.0 average score 176.8
# episode:  160 score: 176.2 average score 177.8
# episode:  161 score: 150.2 average score 177.4
# episode:  162 score: 140.2 average score 177.2
# episode:  163 score: 179.5 average score 177.3
# episode:  164 score: 200.0 average score 177.8
# episode:  165 score: 200.0 average score 178.1
# episode:  166 score: 159.5 average score 178.9
# episode:  167 score: 183.8 average score 179.4
# episode:  168 score: 200.0 average score 179.9
# episode:  169 score: 200.0 average score 180.4
# episode:  170 score: 200.0 average score 180.5
# episode:  171 score: 200.0 average score 180.9
# episode:  172 score: 200.0 average score 181.1
# episode:  173 score: 200.0 average score 181.4
# episode:  174 score: 200.0 average score 181.5
# episode:  175 score: 200.0 average score 181.9
# episode:  176 score: 200.0 average score 181.9
# episode:  177 score: 200.0 average score 182.1
# episode:  178 score: 200.0 average score 182.4
# episode:  179 score: 195.8 average score 183.2
# episode:  180 score: 200.0 average score 183.3
# episode:  181 score: 200.0 average score 183.4
# episode:  182 score: 193.5 average score 183.8
# episode:  183 score: 140.5 average score 183.7
# episode:  184 score: 200.0 average score 184.3
# episode:  185 score: 156.8 average score 184.5
# episode:  186 score: 168.2 average score 184.3
# episode:  187 score: 200.0 average score 184.4
# episode:  188 score: 200.0 average score 184.6
# episode:  189 score: 188.8 average score 184.5
# episode:  190 score: 200.0 average score 184.5
# episode:  191 score: 160.0 average score 184.1
# episode:  192 score: 188.5 average score 184.4
# episode:  193 score: 200.0 average score 184.6
# episode:  194 score: 157.2 average score 184.3
# episode:  195 score: 189.0 average score 184.5
# episode:  196 score: 151.8 average score 184.9
# episode:  197 score: 103.2 average score 184.4
# episode:  198 score: 148.0 average score 184.8
# episode:  199 score: 177.0 average score 185.0
# episode:  200 score: 166.5 average score 185.0
# episode:  201 score: 172.0 average score 185.5

In [None]:
# Persist the per-iteration average scores to Google Drive.
# Expects `score_history` to exist from a previously executed training cell.
np.save('/content/drive/MyDrive/Cartpole/cartpole-lr0005-mlr',score_history)

In [None]:
import gym
import numpy as np
# from reinforce_tf2 import Agent
# from utils import plotLearning
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam, SGD
import numpy as np
import time
import matplotlib.pyplot as plt
# import pymc3 as pm
from scipy.stats import beta
import scipy
from google.colab import drive
drive.mount('/content/drive')

class PolicyGradientNetwork(keras.Model):
    """Small MLP policy: two ReLU hidden layers followed by a softmax head.

    Calling the model on a batch of states returns one probability per
    action (the last layer has ``n_actions`` softmax units).
    """

    def __init__(self, n_actions, fc1_dims=32, fc2_dims=32):
        super().__init__()
        self.fc1_dims, self.fc2_dims = fc1_dims, fc2_dims
        self.n_actions = n_actions
        # Layers are created in the original order (fc1, fc2, pi) so the
        # order of the model's weights is unchanged.
        self.fc1 = Dense(fc1_dims, activation='relu')
        self.fc2 = Dense(fc2_dims, activation='relu')
        self.pi = Dense(n_actions, activation='softmax')

    def call(self, state):
        """Forward pass: state batch -> action probabilities."""
        hidden = self.fc2(self.fc1(state))
        return self.pi(hidden)


class Agent:
    """Policy-gradient agent that reuses past trajectories via likelihood ratios.

    Every call to :meth:`learn` stores the newest batch of trajectories,
    re-evaluates all stored batches under the current policy, keeps the
    iterations whose importance-weighted gradient norm is at most ``self.c``
    times the norm measured on the newest batch, and applies one optimizer
    step with a mixture-likelihood-ratio gradient over the kept iterations.

    Bookkeeping attributes written by :meth:`learn`:
      * ``reuses``        - list of reused iteration indices, one list per call
      * ``variance``      - mean L2 norm of the per-trajectory gradients of the
                            newest batch, one value per call
      * ``gradient_norm`` - mean L2 norm over those gradients plus the applied
                            mixture gradient, one value per call
      * ``loglikelihoods[i, j, k]`` - log-likelihood of trajectory ``j`` of
                            iteration ``i`` under the policy of iteration ``k``

    ``time_elapsed`` is created but never filled by this version of the class.
    """

    def __init__(self, alpha=0.003, gamma=0.99, n_actions=4, n_k=4, num_episodes=2000, layer1_size=256, layer2_size=256):
        """Create the policy network and the (initially empty) replay storage.

        Args:
            alpha: learning rate handed to the Adam optimizer.
            gamma: discount factor used for the returns.
            n_actions: number of discrete actions (size of the softmax head).
            n_k: number of trajectories collected per iteration.
            num_episodes: initial capacity (in iterations) of the
                log-likelihood table; it grows on demand in ``learn``.
            layer1_size, layer2_size: accepted for backward compatibility
                only; the network is built with its own default widths.
        """
        self.c = 5  # reuse threshold on the weighted-gradient norm ratio
        self.gamma = gamma
        self.lr = alpha
        self.n_actions = n_actions
        # Buffers for the batch currently being collected.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        # Full history, one entry per learn() call.
        self.state_memory_full = []
        self.action_memory_full = []
        self.reward_memory_full = []
        self.G_memory_full = []
        self.model_memory = []
        self.reuses = []
        self.variance = []
        self.time_elapsed = []
        self.gradient_norm = []
        self.loglikelihoods = np.zeros((num_episodes, n_k, num_episodes))
        self.policy = PolicyGradientNetwork(n_actions=n_actions)
        self.policy.compile(optimizer=Adam(learning_rate=self.lr))
        self._policy_hist = PolicyGradientNetwork(n_actions=n_actions)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return G_t = sum_{k>=t} gamma**(k-t) * r_k as a float array (O(H))."""
        # Force a float dtype: integer rewards would otherwise truncate returns.
        rewards = np.asarray(rewards, dtype=float)
        returns = np.zeros_like(rewards)
        running = 0.0
        for t in range(len(rewards) - 1, -1, -1):
            running = rewards[t] + gamma * running
            returns[t] = running
        return returns

    @staticmethod
    def _flatten_grads(grads):
        """Concatenate a list of gradient tensors into one flat numpy vector."""
        return np.concatenate([g.numpy().flatten() for g in grads])

    @staticmethod
    def _mean_grad_norm(rows):
        """Mean L2 norm over a list of equally sized flat gradient vectors."""
        return np.mean(np.linalg.norm(np.stack(rows, axis=0), ord=2, axis=1))

    def _ensure_loglik_capacity(self, num_iters, n_k):
        """Grow ``self.loglikelihoods`` so it can index ``num_iters`` iterations."""
        cap, cap_k, _ = self.loglikelihoods.shape
        if num_iters <= cap and n_k <= cap_k:
            return
        new_cap = max(num_iters, 2 * cap)
        new_k = max(n_k, cap_k)
        grown = np.zeros((new_cap, new_k, new_cap))
        grown[:cap, :cap_k, :cap] = self.loglikelihoods
        self.loglikelihoods = grown

    def choose_action(self, observation):
        """Sample one action from the current policy for a single observation."""
        state = tf.convert_to_tensor([observation], dtype=tf.float32)
        probs = self.policy(state)
        action_probs = tfp.distributions.Categorical(probs=probs)
        action = action_probs.sample()
        return action.numpy()[0]

    def store_transition(self, observation, action, reward):
        """Store one whole batch: each argument is indexed by trajectory id."""
        self.state_memory = observation
        self.action_memory = action
        self.reward_memory = reward

    def compute_ilr(self):
        """Placeholder; does nothing and returns None."""
        return

    def gradient_compute(self, model, i, j, p, n_k, H):
        """Policy gradient and log-likelihood of one stored trajectory.

        Args:
            model: network to evaluate; if None, ``self._policy_hist`` is
                loaded with ``self.model_memory[p]`` and used instead.
            i: iteration index into the ``*_memory_full`` lists.
            j: trajectory index within that iteration.
            p: index into ``self.model_memory`` (only read when model is None).
            n_k, H: unused; kept so existing call sites keep working.

        Returns:
            ``(grads, log_likelihood)`` where ``grads`` is the gradient of
            ``-sum_t G_t * log pi(a_t | s_t)`` w.r.t. the model's trainable
            variables and ``log_likelihood`` is ``sum_t log pi(a_t | s_t)``.
        """
        if model is None:
            # set_weights() returns None, so load first and then bind the model.
            # NOTE(review): assumes _policy_hist has already been built.
            self._policy_hist.set_weights(self.model_memory[p])
            model = self._policy_hist
        returns = self.G_memory_full[i][j]
        states = self.state_memory_full[i][j]
        actions = self.action_memory_full[i][j]
        with tf.GradientTape() as tape:
            cur_likelihood = 0
            loss = 0
            for g, state, action in zip(returns, states, actions):
                state = tf.convert_to_tensor([state], dtype=tf.float32)
                probs = model(state)
                action_probs = tfp.distributions.Categorical(probs=probs)
                log_prob = tf.squeeze(action_probs.log_prob(action))
                # Accumulate over the whole trajectory; assigning here would
                # keep only the final step's contribution.
                loss += -g * log_prob
                cur_likelihood += log_prob
        return tape.gradient(loss, model.trainable_variables), cur_likelihood

    def mixture_gradient_compute(self, reuse, n_k, num_iters):
        """Mixture-likelihood-ratio gradient over the reused iterations.

        Each trajectory of a reused iteration ``i`` is weighted by its
        likelihood under the current policy divided by its average likelihood
        under the policies of the reused iterations ``k >= i``.

        Args:
            reuse: list of iteration indices to reuse (must be non-empty).
            n_k: number of trajectories per iteration.
            num_iters: number of iterations stored so far; the current policy
                is column ``num_iters - 1`` of ``self.loglikelihoods``.

        Returns:
            Gradients w.r.t. ``self.policy.trainable_variables``.

        Raises:
            ValueError: if ``reuse`` is empty or ``n_k`` is not positive.
        """
        if len(reuse) == 0 or n_k <= 0:
            raise ValueError("mixture gradient needs a non-empty reuse set and n_k > 0")
        cur = num_iters - 1
        with tf.GradientTape() as tape:
            loss = 0
            for i in reuse:
                mixture = [k for k in reuse if k >= i]
                for j in range(n_k):
                    ll_cur = self.loglikelihoods[i, j, cur]
                    # exp(ll_cur) / mean_k exp(ll_k), evaluated on differences
                    # of log-likelihoods so long trajectories do not give 0/0.
                    weight = 1.0 / np.mean(np.exp(self.loglikelihoods[i, j, mixture] - ll_cur))
                    returns = self.G_memory_full[i][j]
                    states = self.state_memory_full[i][j]
                    actions = self.action_memory_full[i][j]
                    for g, state, action in zip(returns, states, actions):
                        state = tf.convert_to_tensor([state], dtype=tf.float32)
                        probs = self.policy(state)
                        action_probs = tfp.distributions.Categorical(probs=probs)
                        log_prob = tf.squeeze(action_probs.log_prob(action))
                        loss += -(weight * g) * log_prob
            loss = loss / (len(reuse) * n_k)
        return tape.gradient(loss, self.policy.trainable_variables)

    def learn(self):
        """Archive the stored batch, select reusable iterations, update the policy.

        Raises:
            ValueError: if no trajectories were stored before the call.
        """
        n_k = len(self.reward_memory)
        if n_k == 0:
            raise ValueError("learn() called without stored trajectories")
        G = {j: self._discounted_returns(self.reward_memory[j], self.gamma) for j in range(n_k)}
        H = len(self.reward_memory[n_k - 1])  # only forwarded to gradient_compute

        # Move the current batch into the full history.
        self.G_memory_full.append(G)
        self.state_memory_full.append(self.state_memory)
        self.action_memory_full.append(self.action_memory)
        self.reward_memory_full.append(self.reward_memory)
        num_iters = len(self.reward_memory_full)
        cur = num_iters - 1
        self._ensure_loglik_capacity(num_iters, n_k)

        # Per-trajectory gradients of the newest batch under the current policy.
        grad_agg = []
        for j in range(n_k):
            grad, ll = self.gradient_compute(self.policy, -1, j, -1, n_k, H)
            grad_agg.append(self._flatten_grads(grad))
            self.loglikelihoods[cur, j, cur] = float(ll)
        cur_pg_variance = self._mean_grad_norm(grad_agg)
        self.variance.append(cur_pg_variance)

        # Re-evaluate every stored iteration under the current policy and keep
        # those whose importance-weighted gradient norm is small enough.
        reuse_iter = []
        for i in range(num_iters):
            if i == cur:
                # Ratio is exactly 1 for the newest batch; reuse its gradients.
                weighted = grad_agg
            else:
                weighted = []
                for j in range(n_k):
                    grad, ll = self.gradient_compute(self.policy, i, j, -1, n_k, H)
                    self.loglikelihoods[i, j, cur] = float(ll)
                    ratio = np.exp(self.loglikelihoods[i, j, cur] - self.loglikelihoods[i, j, i])
                    weighted.append(ratio * self._flatten_grads(grad))
            if self._mean_grad_norm(weighted) <= self.c * cur_pg_variance:
                reuse_iter.append(i)
        if not reuse_iter:
            # Nothing passed the test (e.g. c < 1): fall back to the newest
            # batch instead of dividing by zero in the mixture gradient.
            reuse_iter.append(cur)
        self.reuses.append(reuse_iter)

        gradient = self.mixture_gradient_compute(reuse_iter, n_k, num_iters)
        # NOTE(review): the logged norm averages the per-trajectory gradients
        # together with the mixture gradient, as the original code did.
        grad_agg.append(self._flatten_grads(gradient))
        self.gradient_norm.append(self._mean_grad_norm(grad_agg))
        self.policy.optimizer.apply_gradients(zip(gradient, self.policy.trainable_variables))

        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []



if __name__ == '__main__':
    # Training driver: collect n_k trajectories per iteration, update the
    # agent and dump diagnostics under `path` plus a reuse summary CSV.
    import os  # local imports: only this driver needs them
    import pandas as pd

    index = 2
    c = 5  # reuse threshold; also part of the output directory name
    seed = 2021 + index
    n_k = 4
    path = "seed-{}-n_k-{}-id-{}-c{}".format(seed, n_k, index,c)
    # Nothing else creates this directory, so the writes below would fail.
    os.makedirs(path, exist_ok=True)
    num_episodes = 2000  # number of training iterations
    problem = "CartPole-v0"  # "LunarLander-v2"
    env = gym.make(problem)
    # env._max_episode_steps = 200
    num_states = env.observation_space.shape[0]
    print("Size of State Space ->  {}".format(num_states))
    num_actions = env.action_space.n
    print("Size of Action Space ->  {}".format(num_actions))
    # NOTE: env.seed / 4-tuple env.step follow the classic gym API used here.
    env.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    agent = Agent(alpha=0.005, gamma=0.99, n_actions=num_actions, n_k=n_k, num_episodes=num_episodes)
    agent.c = c
    score_history = []
    for i in range(num_episodes):
        score = 0
        observations, actions, rewards = {}, {}, {}
        for j in range(n_k):
            observations[j] = []
            actions[j] = []
            rewards[j] = []
            done = False
            observation = env.reset()
            while not done:
                action = agent.choose_action(observation)
                observation_, reward, done, info = env.step(action)
                observations[j].append(observation)
                actions[j].append(action)
                rewards[j].append(reward)
                observation = observation_
                score += reward
        agent.store_transition(observations, actions, rewards)
        score_history.append(score / n_k)
        agent.learn()
        # Running average over the last 100 iterations decides "solved".
        avg_score = np.mean(score_history[-100:])
        print('episode: ', i,'score: %.1f' % (score / n_k),
            'average score %.1f' % avg_score)
        if avg_score > 195:  # Condition to consider the task solved
            print("Solved at episode {}!".format(i))
            break
    # One line per iteration: comma-separated indices of the reused iterations.
    with open(path+'/reuses.txt', 'w') as f:
        for _list in agent.reuses:
            if _list:
                f.write(",".join(str(item) for item in _list) + '\n')
    np.save(path+'/variance', agent.variance)
    np.save(path+'/time_elapsed', agent.time_elapsed)
    np.save(path+"/score_history",score_history)
    np.save(path+"/gradient_norm",agent.gradient_norm)
    # Reuse-set size for the first (up to) 50 iterations, one column per
    # iteration. agent.reuses holds one list of ints per iteration, so the
    # size is len(reuse_set); indexing into it and calling len() on an int
    # raised TypeError.
    # NOTE(review): the original loop looked written for several runs (one
    # row per run); this single run is written as one row - confirm layout.
    iter_reuse_len = {j: [len(reuse_set)] for j, reuse_set in enumerate(agent.reuses[:50])}
    pd.DataFrame(iter_reuse_len).to_csv('reuse.csv')