In [1]:
import numpy as np
import gym
import tensorflow as tf
import time
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, Input

  _RESOLVED_ROMS = _resolve_roms()


## Pong

Goal is to play the pong game, beating the machine using RL, the different variants that can be tried are as below - 
1. Policy Gradient
2. Deep Q Learning
3. Take action every 4 frames instead of every frame to speed up time
4. Memory buffer - use last N (say 100) games to update model instead of just using the last game
5. Linear annealing
6. Remove first 20 frames as the ball is not there

In [2]:
# function to convert each image into lesser size
def prepro(I):
    # preprocess each frame for learning
    # save some memory and computation
    # pre-process the image from a 210x160x3 uint8 frame into an (80x80) float array 
    I = I[35:195,:,:].copy() # crop the top of the image...score image doesn't matter for how to play
    I = I[::2,::2,0].copy()
    I[I == 144] = 0 # erase background (background type 1)
    I[I == 109] = 0 # erase background (background type 2)
    I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return np.array(I.copy())

  and should_run_async(code)


In [3]:
# get discounted reward - unique to policy gradient
def discount_rewards(r):
    # take 1D float array of rewards and compute discounted reward
    # gym returns a reward with every single frame.  most of those rewards are 0
    # sometimes they're 1 or -1 if we win or lose a point in that specific frame
    # we want non-0 rewards for every frame. 
    # so take each frame, figure out if we eventually won the corresponding point or not
    # if so make the reward positive, if not negative
    # but more recent actions (relative to the frame where the point is awarded) are more 
    # impactful to the score that frames a long time ago, so discount rewards...
    
    delt = 0.99 # discount factor
    nr = r.shape[0]
    # we want to change all those zeros into discounted values of the next reward (this is the value function!)
    discounted_r = np.zeros(nr)
    
    for t in range(nr):
        # start at the end
        if r[nr-t-1] > 0: # if you won a point in this frame we want a good reward
            discounted_r[nr-t-1] = 1 
        elif r[nr-t-1] < 0: # if we lost the point we want a bad reward
            discounted_r[nr-t-1] = -1
        elif t==0: # this is just for error catching...at t==0 r[nr-t-1] should have already been + or -...
            discounted_r[nr-t-1] = 0
        elif discounted_r[nr-t-1] == 0: # otherwise you want to look at the next reward value and discount it
            discounted_r[nr-t-1] = delt*discounted_r[nr-t]
    return discounted_r

In [4]:
# CNN model creation
def create_model(height,width,channels):
    # we cannot simply have 3 output nodes because we want to put a weight on each node's impact to the objective
    # that is different for each data point.  the only way to achieve this is to have 3 output layers, each having 1 node
    # the effect is the same, just the way TF/keras handles weights is different
    imp = Input(shape=(height,width,channels))
    mid = Conv2D(16,(8,8),strides=4,activation='relu')(imp)
    mid = Conv2D(32,(4,4),strides=2,activation='relu')(mid)
    mid = Flatten()(mid)
    mid = Dense(256,activation='relu')(mid)
    out0 = Dense(3,activation='softmax')(mid)
    model = Model(imp,out0) 
    
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),loss='sparse_categorical_crossentropy')
    
    return model

In [5]:
frames_to_net = 4              # how many previous frames will we feed the NN
possible_actions = [0,2,3]
mod = create_model(80,80,frames_to_net)
mod.call = tf.function(mod.call,experimental_relax_shapes=True)
mod.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 80, 80, 4)]       0         
                                                                 
 conv2d (Conv2D)             (None, 19, 19, 16)        4112      
                                                                 
 conv2d_1 (Conv2D)           (None, 8, 8, 32)          8224      
                                                                 
 flatten (Flatten)           (None, 2048)              0         
                                                                 
 dense (Dense)               (None, 256)               524544    
                                                                 
 dense_1 (Dense)             (None, 3)                 771       
                                                                 
Total params: 537,651
Trainable params: 537,651
Non-trainable

In [6]:
# first variant of playing the game
# Deep Q learning
# other feature
# 
def play1game(model,ep):
    env0 = gym.make("Pong-v0")
    pix = env0.reset()
    pix = prepro(pix)
    frames_this_game = 0
    feed = np.zeros((1,80,80,frames_to_net))
    feed[0,:,:,0] = pix.copy() # 0 is the most recent frame t; 1 the previous one t+1 and so one
    
    frame_array = []
    action_array = []
    reward_array = []
    
    score = 0
    done = False
    while not done:
        if np.random.random() < ep:
            action = np.random.choice(3)
        else:
            vf = mod(feed,training=False)
            vf = [vf[0][0,0].numpy(),vf[1][0,0].numpy(),vf[2][0,0].numpy()]
            action = np.argmax(vf)
        action0 = possible_actions[action]
        pix_new, reward, done, info = env0.step(action0)
        frame_array.append(pix)
        action_array.append(action)
        reward_array.append(reward)
        pix = prepro(pix_new)
        frames_this_game += 1

        for f in range(1,frames_to_net):
            feed[0,:,:,frames_to_net-f] = feed[0,:,:,frames_to_net-f-1].copy()
        feed[0,:,:,0] = pix.copy()
        score += reward
    return frame_array, action_array, reward_array, score

In [7]:
# def play1game(model):
#     env0 = gym.make("Pong-v0")
#     pix = env0.reset()
#     pix = prepro(pix)
#     frames_this_game = 0
#     feed = np.zeros((1,80,80,frames_to_net))
#     feed[0,:,:,0] = pix.copy()
    
#     frame_array = []
#     action_array = []
#     reward_array = []
    
#     score = 0
#     done = False
#     while not done:
#         vf = model(feed,training=False).numpy()[0]
#         action = np.random.choice(3,p=vf)
        
#         action0 = possible_actions[action]
#         pix_new, reward, done, info = env0.step(action0)
#         frame_array.append(pix)
#         action_array.append(action)
#         reward_array.append(reward)
#         pix = prepro(pix_new)
#         frames_this_game += 1

#         for f in range(1,frames_to_net):
#             feed[0,:,:,frames_to_net-f] = feed[0,:,:,frames_to_net-f-1].copy()
#         feed[0,:,:,0] = pix.copy()
#         score += reward
        
#     return frame_array, action_array, reward_array, score

In [8]:
# linear annealing with same actions for 4 games
def play2game(model, eps):
    env0 = gym.make("Pong-v0")
    pix = env0.reset()
    pix = prepro(pix)
    frames_this_game = 0
    feed = np.zeros((1,80,80,frames_to_net))
    feed[0,:,:,0] = pix.copy()
    
    frame_array = []
    action_array = []
    reward_array = []
    
    score = 0
    done = False
    while not done:
        
        # updating action every 4 frames
        # but taking that action for the 4 frames just not using the prediction for action
        if frames_this_game % 4 == 0:
            vf = model(feed,training=False).numpy()[0]
            # print(vf)
            if np.random.random() < eps:
                action = np.random.choice(3,p=vf)
                action0 = possible_actions[action]
            else:
                action = np.argmax(vf)
                action0 = possible_actions[action]
                
            # updating epsilon for picking
            # if eps > eps_e:
            #    eps -= eps_d             
        
        pix_new, reward, done, info = env0.step(action0)
        frame_array.append(pix)
        action_array.append(action)
        reward_array.append(reward)
        pix = prepro(pix_new)
        frames_this_game += 1

        for f in range(1,frames_to_net):
            feed[0,:,:,frames_to_net-f] = feed[0,:,:,frames_to_net-f-1].copy()
        feed[0,:,:,0] = pix.copy()
        score += reward
        
    return frame_array, action_array, reward_array, score

In [9]:
frames, actions, rewards, score = play1game(mod)
print(np.mean(np.array(actions)==0),np.mean(np.array(actions)==1),np.mean(np.array(actions)==2))


TypeError: play1game() missing 1 required positional argument: 'ep'

In [None]:
ngames = 20
# batches_per_epoch = 50
n_batch = 32

In [None]:
# for game in range(ngames):
#     start = time.time()
#     frames, actions, rewards, score = play1game(mod)
#     rewards = np.array(rewards)
#     actions = np.array(actions)
#     nframes = len(frames)
#     current_frames = np.zeros((nframes,80,80,frames_to_net))
    
#     disc_rewards = discount_rewards(rewards)
  
#     for grab in range(nframes):
#         for f in range(frames_to_net):
#             if grab-f > 0:
#                 current_frames[grab,:,:,f] = frames[grab-f].copy()
  
#     mod.fit(current_frames,actions,epochs=1,batch_size=n_batch,verbose=0,sample_weight=disc_rewards,use_multiprocessing=True)
    
#     # for some reason colab memory was blowing up...this may have fixed it?
# #     del rewards
# #     del actions
# #     del frames
# #     del current_frames
# #     del disc_rewards
#     stop = time.time()
#     print([game, score, stop-start])

In [None]:
ngames = 1000
epsvec = np.linspace(1,0.05,ngames)
ngames = 10
delt = 0.99
nbatch = 32

In [None]:
for game in range(ngames):
    start = time.time()
    frames, actions, rewards, score = play2game(mod,epsvec[game])

    nframes = len(frames)
    current_frames = np.zeros((nframes,80,80,frames_to_net))
    future_frames = np.zeros((nframes,80,80,frames_to_net))
  
    # creating frame - current t (for prediction) and t+1 (for truth)
    for grab in range(nframes):
        for f in range(frames_to_net):
            if grab-f > 0:
                current_frames[grab,:,:,f] = frames[grab-f].copy()
            if (grab-f+1 > 0) & (grab-f+1 < (nframes-1)):
                future_frames[grab,:,:,f] = frames[grab-f+1].copy()
    target_vf = mod.predict(future_frames)

    # vectors of truth
    y0 = np.zeros((nframes,1))
    y1 = np.zeros((nframes,1))
    y2 = np.zeros((nframes,1))
    
    # weight for training neural network based on the "truth"
    weight0 = np.zeros(nframes)
    weight1 = np.zeros(nframes)
    weight2 = np.zeros(nframes)
  

    for grab in range(nframes):
        rhs = rewards[grab]
        # terminal condition will be when we win a game
        # 
        if rhs == 0:
            rhs = delt*np.max([target_vf[0][grab],target_vf[1][grab],target_vf[2][grab]])
        if actions[grab] == 0:
            y0[grab,0] = rhs
            weight0[grab] = 1
        elif actions[grab] == 1:
            y1[grab,0] = rhs
            weight1[grab] = 1
        else:
            y2[grab,0] = rhs
            weight2[grab] = 1
  
    mod.fit(current_frames,[y0,y1,y2],epochs=1,batch_size=nbatch,verbose=0,sample_weight={'out0':weight0,'out1':weight1,'out2':weight2},use_multiprocessing=True)
    stop = time.time()
    print([game, score, epsvec[game], stop-start])

In [None]:
target_vf