In [43]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import json
import pathlib
import gym
import collections
import tqdm

In [31]:
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

In [32]:
env = gym.make("CartPole-v0")

In [33]:
seed = 42
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)
eps = np.finfo(np.float32).eps.item()

In [34]:
class ActorCritic(tf.keras.Model):
    def __init__(self, num_actions, num_hidden_units):
        super(ActorCritic, self).__init__()
        self.common = layers.Dense(num_hidden_units, activation="relu")
        self.actor = layers.Dense(num_actions)
        self.critic = layers.Dense(1)
    
    def call(self, x):
        x = self.common(x)
        return self.actor(x), self.critic(x)

In [35]:
num_actions = env.action_space.n
num_hidden_units = 128
model = ActorCritic(num_actions, num_hidden_units)

## collecting training data

In [36]:
def env_step(action):
    state, reward, done, _ = env.step(actio)
    return (state.astype(np.float32), np.array(reward,np.int32), np.array(done, int32))

def tf_env_step(action):
    return tf.numpy_function(env.step, [action], [tf.float32, tf.int32, tf.int32])

In [49]:
def run_episode(initial_state, model, max_steps):
    action_probs = tf.TensorArray(dtype=tf.float32,size=0, )
    values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    rewards = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
    
    initial_state_shape = initial_state.shape
    state = initial_state
    
    for t in tf.range(max_steps):
        state = tf.expand_dims(state, 0)
        action_logits_t, value = model(state)
        
        action = tf.random.categorical(action_logits_t, 1)[0,0]
        action_probs_t = tf.nn.softmax(action_logits_t)
        
        values = values.write(t, tf.squeeze(value))
        action_probs = action_probs.write(t, action_probs_t[0, action])
        
        state, reward, done = tf_env_step(action)
        state.set_shape(initial_state_shape)
        
        rewards = rewards.write(t, reward)
        
        if tf.cast(done, tf.bool):
            break
        
        action_probs = action_pros.stack()
        values = values.stack()
        reward = rewards.stack()
        
        return action_probs, values, rewards

## return 计算

In [38]:
def  get_expected_return(rewards, gamma, standardize=True):
    n = tf.shape(rewards)[0]
    returns = tf.TensorArray(dtype=tf.float32, size=n)
    rewards = tf.cast(reward[::-1], dtype=tf.float32)
    discounted_sum = tf.constant(0.0)
    discounted_sum_shape = discounted_sum.shape
    for i in tf.range(n):
        reward = rewards[i]
        discounted_sum = reward + gamma*discounted_sum
        discounted_sum.set_shape(discounted_sum_shape)
        returns = returns.write(i, discounted_sum)
    returns = returns.stack()[::-1]
    
    if standardize:
        returns = ((returns - tf.math.reduce_mean(returns)) / 
               (tf.math.reduce_std(returns) + eps))
    return returns

## loss 计算

In [40]:
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)
def compute_loss(action_probs, values, returns):
    advantage = returns - values
    action_log_probs = tf.math.log(action_probs)
    action_loss = -tf.math.reduce_sum(action_log_probs*advantage)
    
    critic_loss = huber_loss(values, returns)
    
    return actor_loss + critic_loss

In [41]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def train_step(
    initial_state, 
    model, 
    optimizer, 
    gamma ,
    max_steps_per_episode):
    
    with tf.GradientTape() as tape:
        actions_probs, values, rewards = run_episode(initial_state, model, max_steps_per_episode)
        
        returns = get_expected_return(rewards, gamma)
        
        action_probs, values, returns = [
        tf.expand_dims(x, 1) for x in [action_probs, values, returns]] 

        # Calculating loss values to update our network
        loss = compute_loss(action_probs, values, returns)

    # Compute the gradients from the loss
    grads = tape.gradient(loss, model.trainable_variables)

    # Apply the gradients to the model's parameters
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    episode_reward = tf.math.reduce_sum(rewards)

    return episode_reward


In [51]:
%%time

min_episodes_criterion = 100
max_episodes = 10000
max_steps_per_episode = 1000

# Cartpole-v0 is considered solved if average reward is >= 195 over 100 
# consecutive trials
reward_threshold = 195
running_reward = 0

# Discount factor for future rewards
gamma = 0.99

# Keep last episodes reward
episodes_reward: collections.deque = collections.deque(maxlen=min_episodes_criterion)

with tqdm.trange(max_episodes) as t:
    for i in t:
        initial_state = tf.constant(env.reset(), dtype=tf.float32)
        episode_reward = int(train_step(
            initial_state, model, optimizer, gamma, max_steps_per_episode))

        episodes_reward.append(episode_reward)
        running_reward = statistics.mean(episodes_reward)

        t.set_description(f'Episode {i}')
        t.set_postfix(
            episode_reward=episode_reward, running_reward=running_reward)

        # Show average episode reward every 10 episodes
        if i % 10 == 0:
            pass # print(f'Episode {i}: average reward: {avg_reward}')

        if running_reward > reward_threshold and i >= min_episodes_criterion:  
            break

    print(f'\nSolved at episode {i}: average reward: {running_reward:.2f}!')

  0%|                                                | 0/10000 [00:00<?, ?it/s]


TypeError: 'NoneType' object is not callable