In [1]:
import os
import time

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, Input
from tensorflow.keras.models import Model

## Pong

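Deep Q-learning with a replay memory buffer. Sampling probabilities are weighted to oversample the minority class (frames where a point is won), a new action is chosen every 4 frames, and the exploration rate epsilon is linearly annealed over the games.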

In [2]:
# function to convert each image into lesser size
def prepro(I):
    """Reduce a raw game frame to a small binary mask.

    The frame is cropped to rows 35..194 (dropping the score area at the top),
    every second row and column of colour channel 0 is kept, and the result is
    binarised: pixels equal to 144 or 109 (the two background colours) or 0
    become 0, every other pixel (paddles, ball) becomes 1.

    For a 210x160x3 input this yields an 80x80 array with the input's dtype.
    The input array is left unmodified.
    """
    # crop and downsample in a single strided slice of channel 0
    playfield = I[35:195:2, ::2, 0]
    is_background = (playfield == 144) | (playfield == 109) | (playfield == 0)
    # foreground -> 1, background -> 0, keeping the original dtype
    return np.logical_not(is_background).astype(playfield.dtype)

In [3]:
def create_model(height,width,channels):
    """Build and compile the convolutional Q-network.

    The network takes a (height, width, channels) input and ends in three
    separate single-unit linear heads named 'out0', 'out1' and 'out2', one per
    action. Separate heads are used instead of a single 3-unit layer so that a
    different per-sample weight can be given to each action's output during
    training; Keras handles weights per output layer, not per output node.

    Returns the compiled model (Adam, learning rate 1e-4, mean squared error).
    """
    frames_in = Input(shape=(height, width, channels))

    # shared feature extractor: two strided convolutions and a dense layer
    features = Conv2D(16, (8, 8), strides=4, activation='relu')(frames_in)
    features = Conv2D(32, (4, 4), strides=2, activation='relu')(features)
    features = Flatten()(features)
    features = Dense(256, activation='relu')(features)

    # one scalar head per action, created in the order out0, out1, out2
    q_heads = []
    for head_name in ('out0', 'out1', 'out2'):
        q_heads.append(Dense(1, activation='linear', name=head_name)(features))

    network = Model(frames_in, q_heads)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss='mean_squared_error',
    )
    return network

In [4]:
# Global configuration and the Q-network shared by the cells below.
frames_to_net = 4              # number of stacked frames (input channels) fed to the NN
# Environment action ids the agent may send to env.step; the network's action
# index 0..2 is mapped through this list.
# NOTE(review): presumably NOOP / UP / DOWN in Pong - confirm with
# env.unwrapped.get_action_meanings().
possible_actions = [0,2,3]
# 80x80 matches the frame size produced by prepro for a 210x160 input
mod = create_model(80,80,frames_to_net)
# Wrap the forward pass in tf.function so direct calls mod(x) run as a compiled graph.
mod.call = tf.function(mod.call,experimental_relax_shapes=True)

mod.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 80, 80, 4)]  0           []                               
                                                                                                  
 conv2d (Conv2D)                (None, 19, 19, 16)   4112        ['input_1[0][0]']                
                                                                                                  
 conv2d_1 (Conv2D)              (None, 8, 8, 32)     8224        ['conv2d[0][0]']                 
                                                                                                  
 flatten (Flatten)              (None, 2048)         0           ['conv2d_1[0][0]']               
                                                                                              

In [5]:
def play1game(model,ep):
    """Play one complete game of Pong with an epsilon-greedy policy and record it.

    A new action is chosen every 4th frame and repeated on the frames in
    between. With probability `ep` the action is uniformly random; otherwise it
    is the argmax over the three heads of `model` evaluated on the current
    frame stack.

    Parameters
    ----------
    model : callable Keras model
        Called as model(feed, training=False) on a (1, 80, 80, frames_to_net)
        array; must return one tensor per action, each indexable at [0, 0].
    ep : float
        Exploration probability used at every decision point.

    Returns
    -------
    frame_array : list
        Preprocessed frame observed before each environment step.
    action_array : list
        Action index (position in possible_actions) taken at each step.
    reward_array : list
        Reward returned by the environment at each step.
    score : number
        Sum of all rewards of the game.
    """
    env0 = gym.make("Pong-v0")
    try:
        pix = prepro(env0.reset())
        frames_this_game = 0
        feed = np.zeros((1,80,80,frames_to_net))
        # channel 0 holds the most recent frame t, channel 1 frame t-1, and so on
        feed[0,:,:,0] = pix

        frame_array = []
        action_array = []
        reward_array = []

        score = 0
        done = False
        while not done:
            # re-decide only every 4th frame; the chosen action is repeated in between
            if frames_this_game % 4 == 0:
                if np.random.random() < ep:
                    action = np.random.choice(len(possible_actions))
                else:
                    # use the model that was passed in, not the notebook-level global
                    q_values = [head[0,0].numpy() for head in model(feed,training=False)]
                    action = np.argmax(q_values)
            pix_new, reward, done, info = env0.step(possible_actions[action])
            frame_array.append(pix)
            action_array.append(action)
            reward_array.append(reward)
            pix = prepro(pix_new)
            frames_this_game += 1

            # age every stored frame by one channel, then insert the newest at channel 0
            feed[0,:,:,1:] = feed[0,:,:,:-1].copy()
            feed[0,:,:,0] = pix
            score += reward
    finally:
        # release the emulator even if the game loop raises; one env is created per game
        env0.close()
    return frame_array, action_array, reward_array, score

In [6]:
# Sanity check before any training: play a single game with ep=0.5 and print
# the fraction of frames on which each of the three action indices was used.
frames, actions, rewards, score = play1game(mod,0.5)
print(*[np.mean(np.array(actions)==k) for k in range(3)])


  logger.warn(
  deprecation(
  deprecation(
  deprecation(


0.5241332426920462 0.20666213460231136 0.26920462270564244


In [11]:
# discounting factor applied to the best predicted next-state value
delt = 0.99

# setting up variable for memory buffer
# ngames = 1000
ngames = 10000        # number of games to play (one training pass per game)
nbatch = 10           # batch_size passed to mod.fit
buffn = 200000        # maximum number of transitions kept in the buffer
warmupgames = 50      # games played before transitions are sampled from the buffer
len_buff = 0          # current number of transitions stored in the buffer
# parallel lists: entry i of each list describes the same environment step
buffer = {'frames':[],'actions':[], 'rewards':[]}

# eps vector for linear annealing: one exploration probability per game,
# going from 1 (fully random) down to 0.05
epsvec = np.linspace(1,0.05,ngames)

In [12]:
# Per-game log. Entries keep their sentinel values (-1 / -100) for games that
# have not been played yet.
output_qdl = {'game':[-1]*ngames, 'score':[-100]*ngames, 'time':[-1]*ngames}

overall_start = time.time()

for game in range(ngames):
    start = time.time()
    frames, actions, rewards, score = play1game(mod,epsvec[game])

    # append this game's transitions, then drop the oldest ones beyond capacity
    buffer['frames'] += frames
    buffer['actions'] += actions
    buffer['rewards'] += rewards
    len_buff = len(buffer['actions'])
    if len_buff > buffn:
        excess = len_buff - buffn
        for key in buffer:
            del buffer[key][:excess]
        len_buff = len(buffer['actions'])

    # Only train once the warm-up games have filled the buffer. Fitting during
    # warm-up would use all-zero frame stacks, because nothing is sampled yet.
    if game >= warmupgames:
        nframes = len(frames)
        current_frames = np.zeros((nframes,80,80,frames_to_net))
        future_frames = np.zeros((nframes,80,80,frames_to_net))

        # choosing the frames from memory buffer based on the reward:
        # transitions with a positive reward are 5x more likely to be drawn
        # (something like oversampling the minority class)
        buffer_rewards = np.array(buffer['rewards'])
        prob = np.ones(len_buff)
        prob[buffer_rewards > 0] = 5.0
        prob /= np.sum(prob)
        which_choose = np.random.choice(len_buff,size=nframes,replace=False,p=prob)

        rewards = buffer_rewards[which_choose]
        actions = np.array(buffer['actions'])[which_choose]
        for i, grab in enumerate(which_choose):
            # frame stacks: current state t (for prediction) and next state t+1 (for the target);
            # channel f holds the frame f steps back, left at zero if it falls outside the buffer
            for f in range(frames_to_net):
                if grab-f >= 0:
                    current_frames[i,:,:,f] = buffer['frames'][grab-f]
                if 0 <= grab-f+1 < len_buff:
                    future_frames[i,:,:,f] = buffer['frames'][grab-f+1]

        # predict returns one (nframes,1) array per action head
        target_vf = mod.predict(future_frames)
        best_next = np.max(np.concatenate(target_vf,axis=1),axis=1)

        # A non-zero reward (a point was won or lost) is treated as terminal and used
        # directly as the target; otherwise bootstrap from the discounted best next value.
        rhs = np.where(rewards == 0, delt*best_next, rewards)

        # Each head is trained only on the samples where its action was taken:
        # the target goes into that head's vector and its sample weight is 1,
        # while the other heads get weight 0 for that sample.
        targets = []
        weights = {}
        for k, head_name in enumerate(('out0','out1','out2')):
            took_k = (actions == k)
            y_k = np.zeros((nframes,1))
            y_k[took_k,0] = rhs[took_k]
            targets.append(y_k)
            weights[head_name] = took_k.astype(float)

        mod.fit(current_frames,targets,epochs=1,batch_size=nbatch,verbose=0,sample_weight=weights,use_multiprocessing=True)

    stop = time.time()

    output_qdl['game'][game] = game
    output_qdl['score'][game] = score
    # elapsed seconds for this game (play + training)
    output_qdl['time'][game] = stop - start

    if game % 10 == 0:
        print('Game number ', game, [score, epsvec[game], stop-start])

overall_end = time.time()
print('total run time, ', (overall_end - overall_start)/60, 'minutes')

Game number  0 [-21.0, 1.0, 2.0009453296661377]
Game number  10 [-19.0, 0.9990499049904991, 2.3219497203826904]
Game number  20 [-21.0, 0.9980998099809981, 1.8884546756744385]
Game number  30 [-21.0, 0.9971497149714972, 2.082519769668579]
Game number  40 [-21.0, 0.9961996199619962, 1.7345080375671387]
Game number  50 [-20.0, 0.9952495249524953, 2.332292318344116]
Game number  60 [-21.0, 0.9942994299429943, 1.905362606048584]
Game number  70 [-21.0, 0.9933493349334933, 2.1195008754730225]
Game number  80 [-21.0, 0.9923992399239924, 2.1663854122161865]
Game number  90 [-21.0, 0.9914491449144914, 2.25207781791687]
Game number  100 [-21.0, 0.9904990499049905, 2.182919502258301]
Game number  110 [-21.0, 0.9895489548954896, 2.067582368850708]
Game number  120 [-21.0, 0.9885988598859886, 2.103334426879883]
Game number  130 [-20.0, 0.9876487648764877, 2.1896626949310303]
Game number  140 [-21.0, 0.9866986698669867, 2.042039394378662]
Game number  150 [-21.0, 0.9857485748574858, 2.0125632286071

Game number  1280 [-20.0, 0.8783878387838784, 2.492640972137451]
Game number  1290 [-21.0, 0.8774377437743774, 2.0167293548583984]
Game number  1300 [-21.0, 0.8764876487648765, 2.4360134601593018]
Game number  1310 [-21.0, 0.8755375537553756, 2.2602922916412354]
Game number  1320 [-20.0, 0.8745874587458746, 2.897763967514038]
Game number  1330 [-21.0, 0.8736373637363737, 1.9661617279052734]
Game number  1340 [-21.0, 0.8726872687268727, 2.6773746013641357]
Game number  1350 [-20.0, 0.8717371737173718, 2.879460334777832]
Game number  1360 [-20.0, 0.8707870787078709, 2.511000394821167]
Game number  1370 [-21.0, 0.8698369836983698, 2.6160829067230225]
Game number  1380 [-21.0, 0.8688868886888689, 2.501507520675659]
Game number  1390 [-20.0, 0.8679367936793679, 2.9119338989257812]
Game number  1400 [-21.0, 0.866986698669867, 2.7998640537261963]
Game number  1410 [-21.0, 0.8660366036603661, 2.895714044570923]
Game number  1420 [-21.0, 0.8650865086508651, 2.0723369121551514]
Game number  1430

Game number  2540 [-20.0, 0.7586758675867586, 2.7009079456329346]
Game number  2550 [-21.0, 0.7577257725772577, 2.8020071983337402]
Game number  2560 [-19.0, 0.7567756775677568, 3.461941719055176]
Game number  2570 [-19.0, 0.7558255825582558, 3.022913694381714]
Game number  2580 [-21.0, 0.7548754875487549, 2.8801848888397217]
Game number  2590 [-21.0, 0.753925392539254, 2.565624713897705]
Game number  2600 [-19.0, 0.752975297529753, 3.6391308307647705]
Game number  2610 [-19.0, 0.7520252025202521, 3.2611217498779297]
Game number  2620 [-21.0, 0.7510751075107511, 2.280694007873535]
Game number  2630 [-19.0, 0.7501250125012502, 3.9094338417053223]
Game number  2640 [-20.0, 0.7491749174917492, 3.1237947940826416]
Game number  2650 [-20.0, 0.7482248224822483, 3.3144288063049316]
Game number  2660 [-21.0, 0.7472747274727474, 1.983276605606079]
Game number  2670 [-20.0, 0.7463246324632463, 2.7622649669647217]
Game number  2680 [-20.0, 0.7453745374537454, 2.80830454826355]
Game number  2690 [

Game number  3800 [-21.0, 0.6389638963896389, 2.4650304317474365]
Game number  3810 [-21.0, 0.638013801380138, 2.9867963790893555]
Game number  3820 [-20.0, 0.637063706370637, 4.1258933544158936]
Game number  3830 [-21.0, 0.6361136113611361, 2.716341018676758]
Game number  3840 [-21.0, 0.6351635163516351, 3.568660020828247]
Game number  3850 [-21.0, 0.6342134213421342, 2.8110389709472656]
Game number  3860 [-19.0, 0.6332633263326333, 3.940457344055176]
Game number  3870 [-15.0, 0.6323132313231323, 4.619136095046997]
Game number  3880 [-20.0, 0.6313631363136314, 3.375136375427246]
Game number  3890 [-20.0, 0.6304130413041305, 3.163637399673462]
Game number  3900 [-21.0, 0.6294629462946295, 2.9711475372314453]
Game number  3910 [-20.0, 0.6285128512851286, 3.5385637283325195]
Game number  3920 [-21.0, 0.6275627562756276, 2.67756986618042]
Game number  3930 [-21.0, 0.6266126612661267, 2.911644697189331]
Game number  3940 [-18.0, 0.6256625662566258, 4.0499749183654785]
Game number  3950 [-2

Game number  5060 [-19.0, 0.5192519251925193, 4.327677011489868]
Game number  5070 [-21.0, 0.5183018301830183, 3.3592000007629395]
Game number  5080 [-20.0, 0.5173517351735174, 4.304185390472412]
Game number  5090 [-21.0, 0.5164016401640165, 3.8084497451782227]
Game number  5100 [-17.0, 0.5154515451545154, 4.161413192749023]
Game number  5110 [-20.0, 0.5145014501450145, 3.9924027919769287]
Game number  5120 [-18.0, 0.5135513551355135, 4.244926452636719]
Game number  5130 [-20.0, 0.5126012601260126, 4.580743074417114]
Game number  5140 [-20.0, 0.5116511651165117, 3.6855127811431885]
Game number  5150 [-20.0, 0.5107010701070107, 4.817219018936157]
Game number  5160 [-20.0, 0.5097509750975098, 3.936868190765381]
Game number  5170 [-21.0, 0.5088008800880088, 3.4275898933410645]
Game number  5180 [-18.0, 0.5078507850785079, 4.304189920425415]
Game number  5190 [-20.0, 0.506900690069007, 3.5142433643341064]
Game number  5200 [-15.0, 0.505950595059506, 6.120219707489014]
Game number  5210 [-1

Game number  6320 [-17.0, 0.39953995399539954, 6.684007883071899]
Game number  6330 [-20.0, 0.3985898589858986, 4.949663877487183]
Game number  6340 [-18.0, 0.39763976397639766, 5.6228203773498535]
Game number  6350 [-12.0, 0.3966896689668967, 6.79299783706665]
Game number  6360 [-20.0, 0.3957395739573958, 4.693514585494995]
Game number  6370 [-21.0, 0.39478947894789485, 5.97660756111145]
Game number  6380 [-19.0, 0.3938393839383939, 5.22492790222168]
Game number  6390 [-17.0, 0.39288928892889297, 5.5152599811553955]
Game number  6400 [-20.0, 0.391939193919392, 5.10011625289917]
Game number  6410 [-20.0, 0.3909890989098911, 3.8796305656433105]
Game number  6420 [-17.0, 0.39003900390039004, 5.359833240509033]
Game number  6430 [-16.0, 0.3890889088908891, 6.127960920333862]
Game number  6440 [-20.0, 0.38813881388138816, 5.124273061752319]
Game number  6450 [-19.0, 0.3871887188718872, 4.8123743534088135]
Game number  6460 [-20.0, 0.3862386238623863, 4.9432268142700195]
Game number  6470 [

Game number  7580 [-18.0, 0.2798279827982799, 5.55283260345459]
Game number  7590 [-19.0, 0.278877887788779, 6.163572788238525]
Game number  7600 [-16.0, 0.2779277927792779, 6.192469358444214]
Game number  7610 [-18.0, 0.276977697769777, 5.776714086532593]
Game number  7620 [-19.0, 0.27602760276027605, 4.609721660614014]
Game number  7630 [-20.0, 0.2750775077507751, 5.044370651245117]
Game number  7640 [-20.0, 0.27412741274127417, 5.202885150909424]
Game number  7650 [-19.0, 0.27317731773177323, 5.7228498458862305]
Game number  7660 [-17.0, 0.2722272227222723, 6.76891303062439]
Game number  7670 [-20.0, 0.27127712771277135, 5.998743295669556]
Game number  7680 [-19.0, 0.2703270327032704, 9.275600910186768]
Game number  7690 [-21.0, 0.26937693769376947, 6.083612442016602]
Game number  7700 [-20.0, 0.26842684268426853, 6.247206687927246]
Game number  7710 [-19.0, 0.2674767476747675, 6.6852734088897705]
Game number  7720 [-18.0, 0.26652665266526654, 6.092255115509033]
Game number  7730 [-

Game number  8830 [-16.0, 0.16106610661066112, 9.787751913070679]
Game number  8840 [-20.0, 0.16011601160116018, 6.223628044128418]
Game number  8850 [-19.0, 0.15916591659165924, 6.697824001312256]
Game number  8860 [-18.0, 0.1582158215821583, 6.259171724319458]
Game number  8870 [-20.0, 0.15726572657265736, 6.67569899559021]
Game number  8880 [-19.0, 0.15631563156315642, 7.134058475494385]
Game number  8890 [-16.0, 0.15536553655365548, 6.789402723312378]
Game number  8900 [-20.0, 0.15441544154415443, 7.062979698181152]
Game number  8910 [-18.0, 0.1534653465346535, 6.519927978515625]
Game number  8920 [-18.0, 0.15251525152515255, 5.9857776165008545]
Game number  8930 [-15.0, 0.1515651565156516, 7.035210609436035]
Game number  8940 [-18.0, 0.15061506150615067, 9.14803671836853]
Game number  8950 [-19.0, 0.14966496649664973, 8.504143476486206]
Game number  8960 [-18.0, 0.1487148714871488, 6.241867780685425]
Game number  8970 [-17.0, 0.14776477647764785, 8.431192874908447]
Game number  89

In [13]:
# Save the entire model as a SavedModel.
# The target directory is created from Python rather than with a shell
# `mkdir -p`, so the cell behaves the same on every operating system.
os.makedirs('saved_model', exist_ok=True)
mod.save('saved_model/dql')

INFO:tensorflow:Assets written to: saved_model/dql\assets


In [19]:
import matplotlib.pyplot as plt
# Histogram of final game scores, shifted by +21.
# Games that were never played still hold the -100 sentinel that output_qdl
# was initialised with; they are left out so they do not distort the plot.
played_scores = [x+21 for x in output_qdl['score'] if x != -100]
fig, ax = plt.subplots()
ax.hist(played_scores)
ax.set_xlabel('final score + 21')
ax.set_ylabel('number of games')
ax.set_title('Distribution of game scores');

AttributeError: module 'matplotlib._api' has no attribute 'caching_module_getattr'