In [5]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import json, random
import pathlib
import gym
import collections
import tqdm
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
from collections import deque

In [6]:
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
eps = np.finfo(np.float32).eps.item()

In [7]:
class ActorModel(tf.keras.Model):
    def __init__(self, n_actions):
        super(ActorModel, self).__init__()   
        self.n_actions = n_actions
        
        self.fc1 = tf.keras.layers.Dense(256, activation="relu")
        self.fc2 = tf.keras.layers.Dense(256, activation="relu")
        self.fc3 = tf.keras.layers.Dense(self.n_actions, activation="softmax")
        
    def call(self, x):
        x = tf.reshape(x, (1,4))
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x


In [8]:
class CriticModel(tf.keras.Model):
    def __init__(self):
        super(CriticModel, self).__init__()
        
        self.fc1 = tf.keras.layers.Dense(256, activation="relu")
        self.fc2 = tf.keras.layers.Dense(256, activation="relu")
        self.fc3 = tf.keras.layers.Dense(1, activation="softmax")
        
    def call(self, x):
        x = tf.reshape(x, (1,4))
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

In [9]:
class Buffer:
    def __init__(self, max_length):
        self.max_length = max_length
        self.data = deque(maxlen=self.max_length)
        self.data = []
        
    def memory(self, state, action, probs, vals, reward, next_state, done):
        self.data.append((state, action, probs, vals, reward, next_state, done))
   
        
    def replay(self, batch_size):
        start = np.random.randint(0, len(self.data)-batch_size)
#         minibatch = random.sample(self.data, batch_size)
        minibatch = self.data[start: start+batch_size]
        return minibatch
    
    

In [16]:
class Agent:
    def __init__(self, env, batch_size):
        self.env = env
        n_actions = self.env.action_space.n
        self.gamma = 0.99
        self.gae_lambda=0.95
        self.policy_clip = 0.2
        self.buffer = Buffer(10000)
        self.actor = ActorModel(n_actions)
        self.critic = CriticModel()
        self.batch_size = batch_size
        self.actor_opt = tf.optimizers.Adam(learning_rate=0.08)
        self.critic_opt = tf.optimizers.Adam(learning_rate=0.08)
        
    def learn(self):
        tmp_data = self.buffer.replay(self.batch_size)
        
        
        states, actions, probs, vals, rewards, next_states, dones = [],[],[],[],[],[],[]
        
        for s, a, p, v, r, n, d in tmp_data:
            states.append(s)
            actions.append(a)
            probs.append(p)
            vals.append(v)
            rewards.append(r)
            next_states.append(n)
            dones.append(d)
            
        advantages = []
        for t in range(len(rewards)-1):
            discount = 1
            a_t = 0
            for k in range(t, len(rewards)-1):
                a_t += discount*(rewards[k] + self.gamma*vals[k+1]*(1-int(dones[k])) - vals[k])
                discount *= self.gamma * self.gae_lambda 
            advantages.append(a_t)
        advantages.append(tf.zeros((1,1)))
            
        for i in range(len(states)):
            with tf.GradientTape() as tape:
                dist = self.actor(states[i])
                critic_value = self.critic(states[i])
                new_probs = tf.math.log(tf.math.reduce_max(dist))
                old_probs = probs[i]
                prob_ratio = tf.math.exp(new_probs) / tf.math.exp(old_probs)
                #prob_ratio = (new_probs - old_probs).exp()
                weighted_probs = advantages[i] * prob_ratio
                
                weighted_clipped_probs = tf.clip_by_value(prob_ratio, 1-self.policy_clip,
                        1+self.policy_clip)*advantages[i]
#                 print(np.min(weighted_probs, weighted_clipped_probs))
                actor_loss = -tf.math.reduce_min(tf.concat([weighted_probs, weighted_clipped_probs], axis=0))
#                 actor_loss =  tf.math.reduce_mean(weighted_clipped_probs) 
            grads = tape.gradient(actor_loss, self.actor.trainable_variables)
            self.actor_opt.apply_gradients(zip(grads, self.actor.trainable_variables))
            
            with tf.GradientTape() as tape:
                critic_value = self.critic(states[i])
                returns = advantages[i] + vals[i]
                critic_loss = (returns-critic_value)**2
                critic_loss = tf.math.reduce_mean(critic_loss)         

            grads = tape.gradient(critic_loss, self.critic.trainable_variables)
            self.critic_opt.apply_gradients(zip(grads, self.critic.trainable_variables))
        
        
        
        
    def choose_action(self, state):
   
        probs = self.actor(state)
        vals = self.critic(state)
        action = tf.math.argmax(probs, axis=1)
        prob = tf.math.log(tf.math.reduce_max(probs))
        return action, prob, vals
        
    def train(self, episode_num=1000):
        self.episode_num = episode_num
        n_steps = 0
        learn_times = 20
        score = 0
        for i in range(self.episode_num):
            state = env.reset()
            done = False
            score = 0
            while not done:
                action, probs, vals = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action.numpy()[0])
                self.buffer.memory(state, action, probs, vals, reward, next_state, done)
                n_steps += 1
                if n_steps % learn_times and n_steps > self.batch_size:
                    self.learn()
                state = next_state
                score += reward
            print(f"{i} time is {score}")
            

In [None]:
env = gym.make("CartPole-v0")
agent = Agent(env, batch_size=32)
agent.train()

0 time is 10.0
1 time is 10.0
2 time is 10.0
3 time is 10.0
4 time is 9.0
5 time is 10.0
6 time is 10.0
7 time is 11.0
8 time is 9.0
9 time is 10.0
10 time is 8.0
11 time is 10.0
12 time is 9.0
13 time is 10.0
14 time is 9.0
15 time is 8.0
16 time is 10.0
17 time is 9.0
18 time is 10.0
19 time is 10.0
20 time is 9.0
21 time is 10.0
22 time is 8.0
23 time is 10.0
24 time is 10.0
25 time is 9.0
26 time is 8.0
27 time is 10.0
28 time is 8.0
29 time is 8.0
30 time is 10.0
31 time is 10.0
32 time is 10.0
33 time is 9.0
34 time is 9.0
35 time is 10.0
36 time is 10.0
37 time is 9.0
38 time is 10.0
39 time is 8.0
40 time is 10.0
41 time is 9.0
42 time is 10.0
43 time is 9.0
44 time is 10.0
45 time is 8.0
46 time is 10.0
47 time is 9.0
48 time is 8.0
49 time is 9.0
50 time is 9.0
51 time is 10.0
52 time is 10.0
53 time is 9.0
54 time is 10.0
55 time is 9.0
56 time is 11.0
57 time is 10.0


In [None]:
env.action_space.n

In [14]:
env.observation_space

Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)

In [78]:
print()

tf.Tensor([[0.]], shape=(1, 1), dtype=float32)


In [None]:
tf.reduce_min