In [None]:
import os
from collections import deque

import numpy as np
import tensorflow as tf

# Limit which GPUs CUDA exposes to this process (device index "1" here).
# NOTE(review): this runs after `import tensorflow`; presumably it still takes
# effect because no GPU has been queried yet at this point -- confirm on the
# target machine.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [None]:
# Configure the GPUs TensorFlow can see: use only the first listed device and
# let it allocate memory on demand.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to the first GPU in the list returned above
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

In [None]:
# Build the FlappyBird environment through the PLE wrapper, without opening a window.
os.environ["SDL_VIDEODRIVER"] = "dummy"  # use SDL's dummy video driver so no game window pops up
from ple.games.flappybird import FlappyBird
from ple import PLE

game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

In [None]:
# Number of consecutive game-state snapshots stacked into one model input.
NUM_STACK = 4
# Number of entries in the dictionary returned by game.getGameState().
NUM_STATE = len(game.getGameState().values())

## Actor-Critic

In [None]:
class ActorCritic(tf.keras.Model):
    """Actor-critic network with a shared fully connected trunk.

    The input is flattened and passed through five Dense+ReLU stages
    (128-256-512-256-128 units). Two linear heads sit on top of the trunk:
    an actor head producing one logit per action and a critic head producing
    a single state-value estimate.
    """

    def __init__(self, num_action):
        """Create the trunk and both heads.

        Args:
            num_action: number of discrete actions (width of the actor head).
        """
        super().__init__()

        self.hidden_layers = tf.keras.Sequential([
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(units=128),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(units=256),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(units=512),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(units=256),
            tf.keras.layers.ReLU(),
            tf.keras.layers.Dense(units=128),
            tf.keras.layers.ReLU(),
        ])

        self.actor = tf.keras.layers.Dense(units=num_action)
        self.critic = tf.keras.layers.Dense(units=1)
        # Created once here instead of instantiating a new layer object on
        # every forward pass.
        self.actor_softmax = tf.keras.layers.Softmax()

    def call(self, state):
        """Run a forward pass.

        Args:
            state: batched input tensor; everything after the batch axis is
                flattened by the trunk.

        Returns:
            A tuple ``(actor_prob, actor_logits, value)``: the softmax of the
            actor logits, the raw actor logits, and the critic output (one
            value per batch row, last dimension of size 1).
        """
        x = self.hidden_layers(state)

        actor_logits = self.actor(x)
        actor_prob = self.actor_softmax(actor_logits)

        value = self.critic(x)

        return actor_prob, actor_logits, value

In [None]:
class Agent:
    """Policy-gradient agent wrapping an ActorCritic model."""

    def __init__(self, num_action, discount_factor=0.99):
        """Build the model and the critic loss.

        Args:
            num_action: number of discrete actions the policy chooses from.
            discount_factor: reward discount stored on the agent for callers
                to use when computing returns.
        """
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.model = ActorCritic(num_action)
        # SUM reduction so the critic term is summed over the batch, like the
        # actor term below.
        self.huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

    def loss(self, state, action, returns, next_state):
        """
        The actor-critic loss

        Args:
            state: batch of states fed to the model.
            action: integer tensor with the action index taken for each row.
            returns: target return for each row.
            next_state: unused; kept so existing callers keep working.

        Returns:
            Scalar tensor: summed actor loss plus summed Huber critic loss.
        """
        _, action_logits, value = self.model(state)

        # Log-probabilities straight from the logits: log_softmax avoids the
        # log(0) = -inf that log(softmax(x)) produces when a probability
        # underflows.
        log_prob_all = tf.nn.log_softmax(action_logits)
        index = tf.stack([tf.range(tf.shape(action)[0]), action], axis=1)
        action_log_prob = tf.gather_nd(log_prob_all, index)

        # The critic head yields (batch, 1). Bring every per-sample quantity to
        # that same shape; otherwise a (batch,) tensor minus a (batch, 1)
        # tensor silently broadcasts into a (batch, batch) matrix.
        action_log_prob = tf.reshape(action_log_prob, [-1, 1])
        returns = tf.reshape(returns, [-1, 1])
        value = tf.reshape(value, [-1, 1])

        advantage = returns - value

        actor_loss = -tf.math.reduce_sum(action_log_prob * advantage)

        critic_loss = self.huber_loss(value, returns)

        return actor_loss + critic_loss

    def select_action(self, state):
        """Sample one action for a single (unbatched) state.

        Args:
            state: one state, without a batch axis.

        Returns:
            Scalar integer tensor holding the sampled action index.
        """
        # Cast to float32 (as the training path does) and add the batch axis.
        state = tf.expand_dims(tf.cast(state, tf.float32), 0)

        _, action_logits, _ = self.model(state)

        # sample next action from the action probability distribution
        action = tf.random.categorical(action_logits, 1)[0, 0]
        return action

## Training

In [None]:
# Size of the discrete action space, taken from the environment's action set.
num_action = len(env.getActionSet())

# The actor-critic agent used for both acting and training.
AC_Agent = Agent(num_action)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-7)

# @tf.function
def train_step(state, action, returns, next_state):
    """Apply one optimizer update to AC_Agent's model and return the loss.

    The arguments are forwarded unchanged to ``AC_Agent.loss``.
    """
    with tf.GradientTape() as grad_tape:
        batch_loss = AC_Agent.loss(state, action, returns, next_state)

    # Fetch the variable list once and reuse it for both calls below.
    weights = AC_Agent.model.trainable_variables
    grads = grad_tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    return batch_loss

In [None]:
class Replay_buffer():
    """Bounded FIFO store of ``(state, action, reward, next_state)`` tuples."""

    def __init__(self, gamma, buffer_size=50000):
        """
        Args:
            gamma: discount factor used when computing returns.
            buffer_size: maximum number of experiences kept; once full, the
                oldest entry is dropped for each new one.
        """
        # deque(maxlen=...) evicts the oldest entry in O(1) on append,
        # replacing the O(n) list.pop(0).
        self.experiences = deque(maxlen=buffer_size)
        self.gamma = gamma
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append one experience tuple, evicting the oldest one if full."""
        self.experiences.append(experience)

    def get_expected_returns(self):
        """
        Computing expected returns

        Returns:
            1-D float array with one standardized (zero mean, unit variance)
            discounted return per stored experience, in storage order. An
            empty buffer yields an empty array.
        """
        rewards = [e[2] for e in self.experiences]
        discounted_rewards = np.zeros(len(rewards))
        if not rewards:
            # Nothing to standardize; avoids the NaN mean/std of an empty array.
            return discounted_rewards

        # Accumulate from the last step backwards: G_t = r_t + gamma * G_{t+1}
        discounted_sum = 0.0
        for i in reversed(range(len(rewards))):
            discounted_sum = rewards[i] + self.gamma * discounted_sum
            discounted_rewards[i] = discounted_sum

        # standardize
        discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / (np.std(discounted_rewards) + 1e-7)

        return discounted_rewards

    def get_experiences(self):
        """Return the stored data as parallel sequences.

        Returns:
            ``(states, actions, returns, states_prime)`` where ``states``,
            ``actions`` and ``states_prime`` are lists and ``returns`` is the
            array from ``get_expected_returns``.
        """
        returns = self.get_expected_returns()

        states = [e[0] for e in self.experiences]
        actions = [e[1] for e in self.experiences]
        states_prime = [e[3] for e in self.experiences]

        return states, actions, returns, states_prime


In [None]:
import moviepy.editor as mpy

def make_anim(images, fps=60, true_image=False):
    """Turn a sequence of frames into a moviepy ``VideoClip``.

    Args:
        images: non-empty sequence of frame arrays.
        fps: playback frame rate; also determines the clip duration.
        true_image: if True, frames are only cast to uint8; otherwise they are
            mapped with ``(x + 1) / 2 * 255`` before the cast.

    Returns:
        The clip, with its ``fps`` attribute set.

    Raises:
        ValueError: if ``images`` is empty.
    """
    if len(images) == 0:
        raise ValueError("make_anim needs at least one frame")

    duration = len(images) / fps
    last_index = len(images) - 1

    def make_frame(t):
        # At t == duration the computed index runs one past the end; clamp it
        # to the last frame instead of hiding the IndexError behind a bare
        # except.
        index = min(int(len(images) / duration * t), last_index)
        x = images[index]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip

In [None]:
import skimage.transform

def preprocess_screen(screen):
    """Resize a screen capture to ``[IMG_WIDTH, IMG_HEIGHT, 1]``.

    NOTE(review): IMG_WIDTH and IMG_HEIGHT are not defined in the cells visible
    here; they must exist at module level before this is called -- confirm.
    """
    target_shape = [IMG_WIDTH, IMG_HEIGHT, 1]
    return skimage.transform.resize(screen, target_shape)

def frames_to_state(input_frames):
    """Stack the most recent frames into one state along the last axis.

    With fewer than four frames available, existing frames are repeated so
    that four are always concatenated:
      1 frame  -> f0, f0, f0, f0
      2 frames -> f0, f0, f1, f1
      3 frames -> f0, f1, f2, f2
      otherwise the last four frames are used.
    """
    n_frames = len(input_frames)

    if n_frames == 1:
        return np.concatenate(input_frames * 4, axis=-1)

    if n_frames == 2:
        older, newer = input_frames[0:1], input_frames[1:]
        return np.concatenate(older * 2 + newer * 2, axis=-1)

    if n_frames == 3:
        # Repeat the newest frame to fill the fourth slot.
        return np.concatenate(input_frames + input_frames[2:], axis=-1)

    return np.concatenate(input_frames[-4:], axis=-1)

In [None]:
from IPython.display import Image, display

# Training loop: play one full episode with the current policy, then do a
# single gradient update from that episode's experiences.
update_every_episode = 1  # NOTE(review): not referenced anywhere in this cell
print_every_episode = 10
save_video_every_episode = 100
NUM_EPISODE = 10000

# Videos are written below "movie_f/"; create it up front so the first
# recording (episode 0) does not fail on a missing directory.
os.makedirs("movie_f", exist_ok=True)

for episode in range(0, NUM_EPISODE + 1):

    # reset the environment
    env.reset_game()

    # record frames only on the episodes that get saved as a video
    record_video = episode % save_video_every_episode == 0
    if record_video:
        frames = [env.getScreenRGB()]

    # input frame
    # --------------------------------------------------
    # (1, 84, 84, 1)
    # input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # (NUM_STACK, NUM_STATE): the initial snapshot repeated to fill the stack
    input_states = [list(game.getGameState().values())] * NUM_STACK
    # --------------------------------------------------

    # cumulate reward for this episode
    cum_reward = 0

    # fresh buffer per episode: only this episode's experiences are trained on
    buffer = Replay_buffer(AC_Agent.discount_factor)

    t = 0
    while not env.game_over():

        # --------------------------------------------------
        # state = frames_to_state(input_frames)
        state = input_states[-NUM_STACK:]
        # --------------------------------------------------

        # select action using current policy
        action = AC_Agent.select_action(state)

        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])

        # record frame
        if record_video:
            frames.append(env.getScreenRGB())

        # record input frame
        # --------------------------------------------------
        # input_frames.append(preprocess_screen(env.getScreenGrayscale()))
        input_states.append(list(game.getGameState().values()))
        # --------------------------------------------------

        # cumulate reward
        cum_reward += reward

        # observe the result
        # --------------------------------------------------
        # state_prime = frames_to_state(input_frames) # get next state
        state_prime = input_states[-NUM_STACK:] # get next state
        # --------------------------------------------------

        # append experience for this episode
        buffer.add((state, action, reward, state_prime))

        t += 1

    # update agent
    train_states, train_actions, train_returns, train_states_prime = buffer.get_experiences()
    # --------------------------------------------------
    # train_states = np.asarray(train_states).reshape(-1, IMG_WIDTH, IMG_HEIGHT, NUM_STACK)
    # train_states_prime = np.asarray(train_states_prime).reshape(-1, IMG_WIDTH, IMG_HEIGHT, NUM_STACK)

    train_states = np.asarray(train_states).reshape(-1, NUM_STACK, NUM_STATE)
    train_states_prime = np.asarray(train_states_prime).reshape(-1, NUM_STACK, NUM_STATE)
    # --------------------------------------------------

    # convert Python object to Tensor to prevent graph re-tracing
    train_states = tf.convert_to_tensor(train_states, tf.float32)
    train_actions = tf.convert_to_tensor(train_actions, tf.int32)
    train_returns = tf.convert_to_tensor(train_returns, tf.float32)
    train_states_prime = tf.convert_to_tensor(train_states_prime, tf.float32)

    loss = train_step(train_states, train_actions, train_returns, train_states_prime)

    if episode % print_every_episode == 0:
        print(
            "[{}] time live:{}, cumulated reward: {}, loss: {}".
            format(episode, t, cum_reward, loss))

    if record_video:  # every `save_video_every_episode` episodes, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/PG-{}.mp4".format(episode), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))