In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import json, random
import pathlib
import gym
import collections
import tqdm
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
from collections import deque

In [2]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
eps = np.finfo(np.float32).eps.item()

In [19]:
class AcModel(tf.keras.Model):
    def __init__(self, n_actions):
        super(AcModel, self).__init__()   
        self.n_actions = n_actions
        self.fc1 = tf.keras.layers.Dense(256, activation="relu")
        self.fc2 = tf.keras.layers.Dense(256, activation="relu")
        self.fc3 = tf.keras.layers.Dense(self.n_actions, activation="softmax")
        self.fc4 = tf.keras.layers.Dense(1, activation="linear")
        
    def call(self, x):
        x = tf.convert_to_tensor([x], dtype=tf.float32)
        x = self.fc1(x)
        x = self.fc2(x)
        actions = self.fc3(x)
        vals = self.fc4(x)
        return actions, vals[0]


In [14]:
class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.probs = []
        self.vals = []
        self.actions = []
        self.rewards = []
        self.dones = []

        self.batch_size = batch_size

    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_start]

        return np.array(self.states),\
                np.array(self.actions),\
                np.array(self.probs),\
                np.array(self.vals),\
                np.array(self.rewards),\
                np.array(self.dones),\
                batches

    def store_memory(self, state, action, probs, vals, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(probs)
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)

    def clear_memory(self):
        self.states = []
        self.probs = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.vals = []
    

In [52]:
class Agent:
    def __init__(self, env, batch_size):
        self.env = env
        self.n_actions = self.env.action_space.n
        self.gamma = 0.99
        self.gae_lambda=0.95
        self.policy_clip = 0.2
#         self.actor = ActorModel(self.n_actions)
#         self.critic = CriticModel()
        self.ac = AcModel(self.n_actions)
        self.batch_size = batch_size
        self.actor_opt = tf.optimizers.Adam(learning_rate=0.08)
        self.critic_opt = tf.optimizers.Adam(learning_rate=0.08)
        self.PPOMemory = PPOMemory(self.batch_size)
        
    def learn(self):
        
        states, actions, probs, vals, rewards, dones, batches_index = self.PPOMemory.generate_batches()
            
        advantage = np.zeros(len(rewards), dtype=np.float32)
        for t in range(len(rewards)-1):
            discount = 1
            a_t = 0
            for k in range(t, len(rewards)-1):
                a_t += discount*(rewards[k] + self.gamma*vals[k+1]*(1-int(dones[k])) - vals[k])
                discount *= self.gamma * self.gae_lambda 
            advantage[t] = a_t
        

        for batch in batches_index:
            with tf.GradientTape() as tape:
                dist, critic_value = self.ac(states[batch])
                print(dist)
                new_probs = tf.math.log(tf.math.reduce_max(dist[0, ]))
                print("new_probs", new_probs)
                old_probs = probs[batch]
                prob_ratio = tf.math.exp(new_probs) / tf.math.exp(old_probs)
                #prob_ratio = (new_probs - old_probs).exp()
                print('---prob_ratio--', prob_ratio)
                weighted_probs = advantages[i] * prob_ratio
                
                weighted_clipped_probs = tf.clip_by_value(prob_ratio, 1-self.policy_clip,
                        1+self.policy_clip)*advantages[i]
#                 print(np.min(weighted_probs, weighted_clipped_probs))
                actor_loss = -tf.math.reduce_min(tf.concat([weighted_probs, weighted_clipped_probs], axis=0))
#                 actor_loss =  tf.math.reduce_mean(weighted_clipped_probs) 
            grads = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_opt.apply_gradients(zip(grads, self.actor.trainable_variables))
            
            with tf.GradientTape() as tape:
                critic_value = self.critic(states[i])
                returns = advantages[i] + vals[i]
                critic_loss = (returns-critic_value)**2
                critic_loss = tf.math.reduce_mean(critic_loss)         

            grads = tape.gradient(critic_loss, self.critic.trainable_variables)
            self.critic_opt.apply_gradients(zip(grads, self.critic.trainable_variables))
        
        
        
        
    def choose_action(self, state):
       
        probs, vals = self.ac(state)
        action = tf.math.argmax(probs, axis=1)
        tmp_action = action.numpy()
        prob = tf.math.log(probs[0, tmp_action[0]])
        return action, prob, vals
        
    def train(self, episode_num=1000):
        self.episode_num = episode_num
        n_steps = 0
        learn_times = 100
        score = 0
        for i in range(self.episode_num):
            state = env.reset()
            done = False
            score = 0
            while not done:
                action, probs, vals = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action.numpy()[0])
                self.PPOMemory.store_memory(state, action, probs, vals, reward, done)
                n_steps += 1
                if n_steps % learn_times and n_steps > self.batch_size:
                    self.learn()
                state = next_state
                score += reward
            print(f"{i} time is {score}")
            

In [53]:
env = gym.make("CartPole-v0")
agent = Agent(env, batch_size=64)
agent.train()

0 time is 10.0
1 time is 19.0
2 time is 10.0
3 time is 15.0
4 time is 10.0
tf.Tensor(
[[[0.51268816 0.48731178]
  [0.5022606  0.49773934]
  [0.52461904 0.475381  ]
  [0.5224362  0.4775638 ]
  [0.5085549  0.49144506]
  [0.51027375 0.48972622]
  [0.5029648  0.49703515]
  [0.50409555 0.49590445]
  [0.51671726 0.48328272]
  [0.5006018  0.4993981 ]
  [0.5140161  0.48598394]
  [0.5088054  0.49119467]
  [0.50352484 0.49647516]
  [0.50284153 0.49715844]
  [0.51447296 0.4855271 ]
  [0.4999527  0.50004727]
  [0.51328033 0.4867197 ]
  [0.5156003  0.4843997 ]
  [0.49947566 0.5005243 ]
  [0.5028746  0.49712536]
  [0.514952   0.48504803]
  [0.51116544 0.48883456]
  [0.5095299  0.49047005]
  [0.52218574 0.47781426]
  [0.51611465 0.48388535]
  [0.499723   0.50027704]
  [0.5039852  0.4960148 ]
  [0.51919734 0.48080266]
  [0.5164462  0.48355383]
  [0.5209348  0.47906518]
  [0.5034054  0.49659458]
  [0.50234455 0.49765545]
  [0.503018   0.49698195]
  [0.50505096 0.49494898]
  [0.5038811  0.49611893]
  [0

NameError: name 'advantages' is not defined

In [None]:
env.action_space.n

In [14]:
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

In [78]:
print()

tf.Tensor([[0.]], shape=(1, 1), dtype=float32)


In [None]:
tf.reduce_min