In [17]:
import tensorflow as tf
import numpy as np
import time
import gym
import flappy_bird_gym

In [18]:
env = flappy_bird_gym.make("FlappyBird-v0")

In [19]:
SEED = 42
tf.random.set_seed(SEED)

In [20]:
LAYERS = [
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid') # left prob # 1 < left , 0 > right
]

model = tf.keras.Sequential(LAYERS)

In [21]:
eg = np.array([1,2,3])
eg

array([1, 2, 3])

In [22]:
eg[np.newaxis]

array([[1, 2, 3]])

In [23]:
def pg_policy(observation, model): # policy gradient -> PG
    left_probability = model.predict(observation[np.newaxis]) # probability value between 0, and 1
    action = int(np.random.rand() > left_probability) # value {0, 1} # exploration vs exploitation concept
    return action

# Policy Gradients
Optimize learnable parameters of policy by following the gradients towards higher reward (maximizing reward)

## steps
1. let the NN play the game multiple times and at every step just calculate the gradients (wrt reward) but dont apply it immidiately.
2. Once you have completed several episodes then compute the actions using discounted method.
3. result of previous step 2 can +ve or -ve  

In [24]:
tf.random.uniform([1,1])

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.6645621]], dtype=float32)>

In [25]:
def play_one_step(env, observation, model, loss_fn):
    with tf.GradientTape() as tape:
        left_prabability = model(observation[np.newaxis])
        action = (tf.random.uniform([1,1]) > left_prabability) # True and False
        y_target = tf.constant([[1.]]) - tf.cast(action, tf.float32) # 
        loss = tf.reduce_mean(loss_fn(y_target, left_prabability)) 

    grads = tape.gradient(loss, model.trainable_variables) # dc/dw
    new_observation, reward, done, info = env.step(int(action))
    # return new_observation, reward, done, grads
    return new_observation, info["score"], done, grads


In [26]:
def play_multiple_episodes(env, n_episodes, n_max_steps, model, loss_fn):
    all_rewards = list()
    all_grads = list()
    for episode in range(n_episodes):
        current_rewards = list()
        current_grads = list()
        observation = env.reset()
        for step in range(n_max_steps):
            observation, reward, done, grads = play_one_step(env, observation, model, loss_fn)
            current_rewards.append(reward)
            current_grads.append(grads)
            if done:
                break
        all_rewards.append(current_rewards)
        all_grads.append(current_grads)
    return all_rewards, all_grads

In [27]:
def discount_rewards(rewards, discount_factor):
    discounted = np.array(rewards)
    N = len(rewards)
    for step in range(N - 2, -1, -1):
        # a_n + a_n+1*gamma
        discounted[step] = discounted[step] + discounted[step + 1] * discount_factor
    return discounted

In [28]:
arr = [10, 0, -50]
discount_rewards(arr, 0.8)

array([-22, -40, -50])

In [29]:
x = np.array([1,2])
np.concatenate([x,x])

array([1, 2, 1, 2])

In [30]:
def discount_and_normalize_rewards(all_rewards, discount_factor):
    all_discounted_rewards = list()
    for reward in all_rewards:
        # discounted rewards
        drs = discount_rewards(reward, discount_factor)
        all_discounted_rewards.append(drs)

    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()

    normalize_rewards = list()
    for discounted_rewards in all_discounted_rewards:
        nrs = (discounted_rewards - reward_mean) / reward_std
        normalize_rewards.append(nrs)
    return normalize_rewards

In [31]:
n_iterations = 150
n_episodes_per_update = 10
n_max_steps = 200
discount_factor = 0.95
learning_rate = 0.01

In [32]:
obs = env.reset()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_fn = tf.keras.losses.binary_crossentropy

In [33]:
r1 = [1,2,3]
r2 = [-1,-2,3]
all_rewards_1 = [r1, r2]
list(map(sum, all_rewards_1))

[6, 0]

In [34]:
sum(map(sum, all_rewards_1))

6

In [35]:
arr = [[1,2,3], [3,4,5]]
tf.reduce_mean(arr, axis=0)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([2, 3, 4])>

In [36]:
for iteration in range(n_iterations):
    all_rewards, all_grads = play_multiple_episodes(
        env, n_episodes_per_update, n_max_steps, model, loss_fn
    )
    total_rewards = sum(map(sum, all_rewards))
    print(f"\rIteration: {iteration + 1}/{n_iterations}",
    f"mean rewards: {total_rewards/n_episodes_per_update}"
    )
    all_final_rewards = discount_and_normalize_rewards(all_rewards, discount_factor)

    all_mean_grads = list()
    # Weight of 5 hidden nodes, bias for 5 nodes, w for output node, bias for output node
    N = len(model.trainable_variables)
    for var_index in range(N):
        temp_reduce_mean = list()
        for episode_index, final_rewards in enumerate(all_final_rewards): # rewards for every episode
            for step, final_reward in enumerate(final_rewards): # several steps
                result = final_reward * all_grads[episode_index][step][var_index]
                temp_reduce_mean.append(result)
        mean_grads = tf.reduce_mean(temp_reduce_mean, axis=0)
        all_mean_grads.append(mean_grads)
    optimizer.apply_gradients(zip(all_mean_grads, model.trainable_variables))

Iteration: 1/150 mean rewards: 0.0


  nrs = (discounted_rewards - reward_mean) / reward_std


Iteration: 2/150 mean rewards: 0.0
Iteration: 3/150 mean rewards: 0.0
Iteration: 4/150 mean rewards: 0.0
Iteration: 5/150 mean rewards: 0.0
Iteration: 6/150 mean rewards: 0.0
Iteration: 7/150 mean rewards: 0.0
Iteration: 8/150 mean rewards: 0.0
Iteration: 9/150 mean rewards: 0.0
Iteration: 10/150 mean rewards: 0.0
Iteration: 11/150 mean rewards: 0.0
Iteration: 12/150 mean rewards: 0.0
Iteration: 13/150 mean rewards: 0.0
Iteration: 14/150 mean rewards: 0.0
Iteration: 15/150 mean rewards: 0.0
Iteration: 16/150 mean rewards: 0.0
Iteration: 17/150 mean rewards: 0.0
Iteration: 18/150 mean rewards: 0.0
Iteration: 19/150 mean rewards: 0.0
Iteration: 20/150 mean rewards: 0.0
Iteration: 21/150 mean rewards: 0.0
Iteration: 22/150 mean rewards: 0.0
Iteration: 23/150 mean rewards: 0.0
Iteration: 24/150 mean rewards: 0.0
Iteration: 25/150 mean rewards: 0.0
Iteration: 26/150 mean rewards: 0.0
Iteration: 27/150 mean rewards: 0.0
Iteration: 28/150 mean rewards: 0.0
Iteration: 29/150 mean rewards: 0.0


In [37]:
import re
import time

unique_name = re.sub(r"[\s+:]", "_", time.asctime())
model_name = f"model_at_{unique_name}_.h5"
model.save(model_name)
print(f"model is saved as '{model_name}'")

model is saved as 'model_at_Fri_Jul_15_17_02_43_2022_.h5'


In [38]:
r1 = [1,2,3]
r2 = [-1,-2,3]
list(zip(r1, r2))

[(1, -1), (2, -2), (3, 3)]

In [39]:
load_model = tf.keras.models.load_model("model_at_Fri_Jul_15_16_52_58_2022_.h5")



In [40]:
def show_one_episode(policy, model, n_max_steps=500, seed=42):
    env = flappy_bird_gym.make("FlappyBird-v0")
    obs = env.reset()
    for step in range(n_max_steps):
        env.render()
        action = policy(obs, model)
        obs, reward, done, info = env.step(action)
        if done:
            break
    env.close()
    return step, obs

show_one_episode(pg_policy, load_model)



(31, array([ 1.22569444, -0.31148438]))

In [41]:
def basic_policy(obs, model):
    PoleAngle = obs[2] 
    if PoleAngle < 0: # falling left
        return 0 # Move left
    return 1 

In [42]:
show_one_episode(basic_policy, model)

IndexError: index 2 is out of bounds for axis 0 with size 2

In [43]:
show_one_episode(pg_policy, model)




(31, array([ 1.22569444, -0.34078125]))