In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import json, random
import pathlib
import gym
import collections
import tqdm
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
from collections import deque

In [2]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
eps = np.finfo(np.float32).eps.item()

In [3]:
class AcModel(tf.keras.Model):
    def __init__(self, n_actions):
        super(AcModel, self).__init__()   
        self.n_actions = n_actions
        self.fc1 = tf.keras.layers.Dense(256, activation="relu")
        self.fc2 = tf.keras.layers.Dense(256, activation="relu")
        self.fc3 = tf.keras.layers.Dense(self.n_actions, activation="softmax")
        self.fc4 = tf.keras.layers.Dense(1, activation="linear")
        
    def call(self, x):
        x = tf.convert_to_tensor([x], dtype=tf.float32)
        x = self.fc1(x)
        x = self.fc2(x)
        actions = self.fc3(x)
        vals = self.fc4(x)
        return actions, vals[0]


In [4]:
class PPOMemory:
    def __init__(self, batch_size):
        self.states = []
        self.probs = []
        self.vals = []
        self.actions = []
        self.rewards = []
        self.dones = []

        self.batch_size = batch_size

    def generate_batches(self):
        n_states = len(self.states)
        batch_start = np.arange(0, n_states, self.batch_size)
        indices = np.arange(n_states, dtype=np.int64)
        np.random.shuffle(indices)
        batches = [indices[i:i+self.batch_size] for i in batch_start]

        return np.array(self.states),\
                np.array(self.actions),\
                np.array(self.probs),\
                np.array(self.vals),\
                np.array(self.rewards),\
                np.array(self.dones),\
                batches

    def store_memory(self, state, action, probs, vals, reward, done):
        self.states.append(state)
        self.actions.append(action)
        self.probs.append(probs)
        self.vals.append(vals)
        self.rewards.append(reward)
        self.dones.append(done)

    def clear_memory(self):
        self.states = []
        self.probs = []
        self.actions = []
        self.rewards = []
        self.dones = []
        self.vals = []
    

In [5]:
class Agent:
    def __init__(self, env, batch_size):
        self.env = env
        self.n_actions = self.env.action_space.n
        self.gamma = 0.99
        self.gae_lambda=0.95
        self.policy_clip = 0.2
#         self.actor = ActorModel(self.n_actions)
#         self.critic = CriticModel()
        self.ac = AcModel(self.n_actions)
        self.batch_size = batch_size
        self.ac_opt = tf.optimizers.Adam(learning_rate=0.08)
#         self.critic_opt = tf.optimizers.Adam(learning_rate=0.08)
        self.PPOMemory = PPOMemory(self.batch_size)
        
    def learn(self):
        
        states, actions, probs, vals, rewards, dones, batches_index = self.PPOMemory.generate_batches()   
        advantage = np.zeros(len(rewards), dtype=np.float32)
        for t in range(len(rewards)-1):
            discount = 1
            a_t = 0
            for k in range(t, len(rewards)-1):
                a_t += discount*(rewards[k] + self.gamma*vals[k+1]*(1-int(dones[k])) - vals[k])
                discount *= self.gamma * self.gae_lambda 
            advantage[t] = a_t
        for batch in batches_index:
            new_probs = []
            with tf.GradientTape() as tape:
                dist, critic_value = self.ac(states[batch])
                action = actions[batch]
                for i, data in enumerate(dist[0]):
                    new_probs.append(data[action[i][0]])
                new_probs =  tf.convert_to_tensor(new_probs, dtype=tf.float32)
                old_probs = probs[batch]
                prob_ratio = tf.math.exp(new_probs) / tf.math.exp(old_probs)
                weighted_probs = advantage[batch]* prob_ratio 
                weighted_clipped_probs = tf.clip_by_value(prob_ratio, 1-self.policy_clip,
                        1+self.policy_clip)*advantage[batch]
                weighted_probs = tf.reshape(weighted_probs, (weighted_probs.shape[0],1))
                weighted_clipped_probs = tf.reshape(weighted_clipped_probs, (weighted_clipped_probs.shape[0],1))
                tmp_data = tf.concat([weighted_probs, weighted_clipped_probs], axis=1)
                actor_loss = tf.reduce_mean(-tf.math.reduce_min(tmp_data, axis=1))
                tmp_adv = tf.reshape(advantage[batch], (advantage[batch].shape[0], 1))
                returns = tmp_adv + vals[batch]
                critic_loss_tmp = (returns-critic_value)**2
                critic_loss = tf.math.reduce_mean(critic_loss_tmp)   
                total_loss = 0.5*critic_loss + actor_loss
            grads = tape.gradient(total_loss, self.ac.trainable_variables)
            self.ac_opt.apply_gradients(zip(grads, self.ac.trainable_variables))
        
        
        
        
    def choose_action(self, state):
       
        probs, vals = self.ac(state)
        action = tf.math.argmax(probs, axis=1)
        tmp_action = action.numpy()
        prob = tf.math.log(probs[0, tmp_action[0]])
        return action, prob, vals
        
    
        

In [None]:
env = gym.make("CartPole-v0")
batch_size = 5
agent = Agent(env, batch_size)
episode_num = 1000
n_steps = 0
learn_times = 100
score = 0
for i in range(episode_num):
    state = env.reset()
    done = False
    score = 0
    while not done:
        action, probs, vals = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action.numpy()[0])
        agent.PPOMemory.store_memory(state, action, probs, vals, reward, done)
        n_steps += 1
        if n_steps % learn_times and n_steps > batch_size:
            agent.learn()
        state = next_state
        score += reward
    print(f"{i} time is {score}")


0 time is 11.0
1 time is 9.0
2 time is 10.0
3 time is 11.0
4 time is 16.0
5 time is 100.0
6 time is 33.0
7 time is 29.0
8 time is 31.0
9 time is 11.0
10 time is 10.0
11 time is 19.0
12 time is 9.0
13 time is 9.0
14 time is 10.0
15 time is 11.0
16 time is 9.0
17 time is 13.0
18 time is 12.0
19 time is 12.0
20 time is 33.0
21 time is 9.0
22 time is 9.0
23 time is 10.0
24 time is 8.0
25 time is 9.0
26 time is 10.0
27 time is 9.0
28 time is 9.0
29 time is 10.0
30 time is 9.0
31 time is 9.0
32 time is 10.0
33 time is 10.0
34 time is 9.0
35 time is 9.0
36 time is 9.0
37 time is 9.0
38 time is 10.0
39 time is 9.0
40 time is 8.0
41 time is 10.0
42 time is 9.0
43 time is 9.0
44 time is 10.0
45 time is 8.0
46 time is 9.0
47 time is 10.0
48 time is 9.0
49 time is 9.0
50 time is 10.0
51 time is 9.0
52 time is 9.0
53 time is 10.0
54 time is 10.0
55 time is 10.0
56 time is 8.0
57 time is 10.0
58 time is 10.0
59 time is 10.0
60 time is 10.0
61 time is 9.0
62 time is 9.0
63 time is 8.0
64 time is 10.0