In [1]:
import tensorflow as tf
import numpy as np
import wrapped_flappy_bird as game
from collections import deque
import cv2
import sys
import random

In [2]:
# ---- Hyperparameters of the DQN agent ----
GAME = 'flappybird'        # game name; used as the prefix of checkpoint file names
ACTIONS = 2                # number of valid actions
GAMMA = 0.99               # discount factor applied to the best next-state Q value
BATCH = 32                 # minibatch size
REPLAY_MEMORY = 50000      # maximum number of stored transitions

# Epsilon-greedy exploration schedule
INITIAL_EPSILON = 0.1      # starting value of epsilon
FINAL_EPSILON = 0.0001     # value at which epsilon stops decreasing
OBSERVE = 1e5              # timesteps to observe before training starts
EXPLORE = 3e6              # frames over which epsilon is annealed
FRAME_PER_ACTION = 1       # an action is chosen every FRAME_PER_ACTION timesteps


In [3]:
def weight_variable(shape):
    """Return a tf.Variable of the given shape.

    The initial values are drawn from a truncated normal distribution
    with a standard deviation of 0.01.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.01))
def bias_variable(shape):
    """Return a tf.Variable of the given shape with every entry set to 0.01."""
    return tf.Variable(tf.constant(0.01, shape=shape))
def conv2d(x, W, stride):
    """Convolve `x` with kernel `W` using 'SAME' padding.

    `stride` is applied to both middle dimensions of the strides vector;
    the first and last entries are fixed to 1.
    """
    strides = [1, stride, stride, 1]
    return tf.nn.conv2d(x, W, strides=strides, padding='SAME')
def max_pool(x):
    """Apply 2x2 max pooling with stride 2 and 'SAME' padding to `x`."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

In [4]:
def createNetwork():
    """Build the convolutional Q-network.

    Three conv + ReLU + max-pool stages feed a 256-unit fully connected
    layer and a linear readout with one output per action.

    Returns:
        tuple: (s, readout, h_fc1) where
            s       -- input placeholder of shape [None, 80, 80, 4],
            readout -- linear output of shape [None, ACTIONS],
            h_fc1   -- activations of the fully connected hidden layer.
    """
    # (kernel shape, stride) for each convolution stage.
    conv_specs = (
        ([8, 8, 4, 32], 4),
        ([4, 4, 32, 64], 2),
        ([3, 3, 64, 64], 1),
    )

    # All parameters are created up front, layer by layer and weights
    # before biases. NOTE(review): variables get auto-generated names, so
    # this creation order is presumably what saved checkpoints are keyed
    # on -- do not reorder.
    conv_params = [
        (weight_variable(kernel_shape), bias_variable([kernel_shape[-1]]), stride)
        for kernel_shape, stride in conv_specs
    ]
    fc1_weights = weight_variable([256, 256])
    fc1_bias = bias_variable([256])
    fc2_weights = weight_variable([256, ACTIONS])
    fc2_bias = bias_variable([ACTIONS])

    # input layer
    s = tf.placeholder('float', [None, 80, 80, 4])

    # hidden layers; spatial sizes after each stage:
    #   conv1 20x20x32 -> pool 10x10x32
    #   conv2 5x5x64   -> pool 3x3x64
    #   conv3 3x3x64   -> pool 2x2x64 (= 256 values)
    features = s
    for kernel, bias, stride in conv_params:
        activated = tf.nn.relu(conv2d(features, kernel, stride) + bias)
        features = max_pool(activated)

    # fully connected layer
    flattened = tf.reshape(features, [-1, 256])
    h_fc1 = tf.nn.relu(tf.matmul(flattened, fc1_weights) + fc1_bias)

    # readout layer
    readout = tf.matmul(h_fc1, fc2_weights) + fc2_bias

    return s, readout, h_fc1
    

In [5]:
def _preprocess_frame(frame):
    """Resize a raw game frame to 80x80, convert it to grayscale and binarize it."""
    gray = cv2.cvtColor(cv2.resize(frame, (80, 80)), cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    return binary


def trainNetwork(s, readout, h_fc1, sess):
    """Run the DQN training loop against the game emulator.

    The loop plays epsilon-greedily, stores transitions in a bounded replay
    memory and, once more than OBSERVE timesteps have passed, performs one
    gradient step per timestep on a random minibatch. It never returns; it
    runs until the process is interrupted.

    Args:
        s: input placeholder of the network (batches of 80x80x4 states).
        readout: network output tensor with one value per action.
        h_fc1: hidden fully connected activations; not used in this
            function, kept for interface compatibility.
        sess: TensorFlow session used for every graph evaluation.
    """
    import os  # local import: only needed to create the checkpoint directory

    # define cost function
    a = tf.placeholder('float', [None, ACTIONS])
    y = tf.placeholder('float', [None])
    # `a` is a one-hot action mask, so summing over the action axis (axis=1)
    # yields one Q value per sample. Without the axis the sum collapses the
    # whole batch into a single scalar and the loss compares every target
    # with that batch-wide sum.
    readout_action = tf.reduce_sum(tf.multiply(readout, a), axis=1)
    cost = tf.reduce_mean(tf.square(y - readout_action))
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # replay memory; the deque itself discards the oldest transition once
    # REPLAY_MEMORY entries are stored
    D = deque(maxlen=REPLAY_MEMORY)

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    x_t = _preprocess_frame(x_t)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2)

    # saving and loading networks
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    checkpoint = tf.train.get_checkpoint_state('saved_networks')
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print('successfully loaded:', checkpoint.model_checkpoint_path)
    else:
        print('could not find old network weights')
    # make sure periodic saves have a directory to write into
    os.makedirs('saved_networks', exist_ok=True)

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    while True:
        # choose an action epsilon greedily
        readout_t = sess.run(readout, feed_dict={s: [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0  # index 0 ("do nothing") on frames where no action is chosen
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print('----Random Action ----')
                action_index = random.randrange(ACTIONS)
            else:
                action_index = np.argmax(readout_t)
        # a single index drives both the executed action and the log line,
        # so the printed action is always the one actually taken
        a_t[action_index] = 1

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = np.reshape(_preprocess_frame(x_t1_colored), (80, 80, 1))
        # newest frame goes first; the oldest of the four frames is dropped
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            readout_j1_batch = sess.run(readout, feed_dict={s: s_j1_batch})
            # if the transition is terminal the target equals the reward,
            # otherwise the discounted best next-state Q value is added
            y_batch = [
                r_j if terminal_j else r_j + GAMMA * np.max(q_j1)
                for (_, _, r_j, _, terminal_j), q_j1 in zip(minibatch, readout_j1_batch)
            ]

            # perform gradient step
            sess.run(train_step, feed_dict={y: y_batch,
                                            a: a_batch,
                                            s: s_j_batch})

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step=t)

        # print info
        if t <= OBSERVE:
            state = 'observe'
        elif t <= OBSERVE + EXPLORE:
            state = 'explore'
        else:
            state = 'train'

        print('timestep', t, '/state', state, '/epsilon', epsilon,
              '/action', action_index, '/reward', r_t, '/q_max', np.max(readout_t))
        

In [6]:
def main():
    """Build the Q-network in a fresh graph and run the training loop.

    The session is closed when training stops for any reason, including
    the KeyboardInterrupt used to stop the otherwise endless loop.
    """
    # Start from an empty default graph: calling main() again in the same
    # kernel would otherwise add a second set of variables with different
    # auto-generated names, which no longer match saved checkpoints.
    tf.reset_default_graph()
    sess = tf.InteractiveSession()
    try:
        s, readout, h_fc1 = createNetwork()
        trainNetwork(s, readout, h_fc1, sess)
    finally:
        # release the session so a later run can safely reset the graph
        sess.close()

In [7]:
# Entry point: start training when this code is executed directly
# (i.e. when __name__ is '__main__').
if __name__ == '__main__':
    main()

INFO:tensorflow:Restoring parameters from saved_networks/flappybird-dqn-1390000
successfully loaded: saved_networks/flappybird-dqn-1390000
timestep 1 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0152985
----Random Action ----
timestep 2 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.014329
timestep 3 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0148235
----Random Action ----
timestep 4 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0149867
timestep 5 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0146487
timestep 6 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0152616
timestep 7 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0159049
----Random Action ----
timestep 8 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.016065
timestep 9 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0162853
timestep 10 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0163233
timestep 11 /

timestep 102 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0157752
timestep 103 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0150229
----Random Action ----
timestep 104 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0146109
----Random Action ----
timestep 105 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0145725
timestep 106 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0156718
timestep 107 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0156464
----Random Action ----
timestep 108 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0167699
timestep 109 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0166109
timestep 110 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0159744
timestep 111 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0147566
timestep 112 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0146384
timestep 113 /state observe /epsilon 0.1 /action 1 

timestep 204 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0142996
timestep 205 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.014728
timestep 206 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0155023
timestep 207 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0155016
timestep 208 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0167631
timestep 209 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.016031
----Random Action ----
timestep 210 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0152073
timestep 211 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0146193
timestep 212 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0146174
----Random Action ----
timestep 213 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0142889
timestep 214 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0163142
----Random Action ----
timestep 215 /state observe /epsilon 0.1 /action 0 /r

----Random Action ----
timestep 307 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0157867
timestep 308 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0168457
timestep 309 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0159402
timestep 310 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0152187
timestep 311 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0145327
timestep 312 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0147785
timestep 313 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.014248
timestep 314 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0162952
timestep 315 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0160991
timestep 316 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0155166
timestep 317 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0148406
timestep 318 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0148961
timestep 319 /stat

timestep 410 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.016032
timestep 411 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0153851
timestep 412 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0148152
----Random Action ----
timestep 413 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0149051
----Random Action ----
timestep 414 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0155099
timestep 415 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.015879
timestep 416 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0168581
timestep 417 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0153158
timestep 418 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0156036
----Random Action ----
timestep 419 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0166545
timestep 420 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0182817
timestep 421 /state observe /epsilon 0.1 /action 1 /r

timestep 509 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0168185
timestep 510 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0162724
timestep 511 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0162733
timestep 512 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0154098
timestep 513 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0145856
timestep 514 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0159663
timestep 515 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0163652
timestep 516 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0177765
timestep 517 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0159793
timestep 518 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0168926
----Random Action ----
timestep 519 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0169625
timestep 520 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.019602
timestep 521 /stat

timestep 613 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0139901
----Random Action ----
timestep 614 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.015869
timestep 615 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0152102
timestep 616 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0155273
----Random Action ----
timestep 617 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0147595
timestep 618 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0151716
timestep 619 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0170433
timestep 620 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0183273
timestep 621 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0163552
timestep 622 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0156443
timestep 623 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0164085
timestep 624 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.014

timestep 715 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0167307
timestep 716 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0160904
timestep 717 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0146351
timestep 718 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0150272
timestep 719 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0152227
timestep 720 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0179325
timestep 721 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0153899
----Random Action ----
timestep 722 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0147922
timestep 723 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0150437
timestep 724 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0134122
timestep 725 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0127283
timestep 726 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0139777
timestep 727 /sta

timestep 815 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0152102
timestep 816 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0155273
----Random Action ----
timestep 817 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0147595
timestep 818 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0151701
timestep 819 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0169731
timestep 820 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0182998
timestep 821 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0162676
timestep 822 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0157817
timestep 823 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0162771
timestep 824 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0149685
timestep 825 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0141133
timestep 826 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0139762
----Random Action

timestep 919 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0167324
timestep 920 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0179844
timestep 921 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0159761
timestep 922 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0153089
timestep 923 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.015872
timestep 924 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0139552
timestep 925 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0133654
----Random Action ----
timestep 926 /state observe /epsilon 0.1 /action 0 /reward 0.1 /q_max 0.0131788
timestep 927 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0131564
timestep 928 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0127756
timestep 929 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.013784
timestep 930 /state observe /epsilon 0.1 /action 1 /reward 0.1 /q_max 0.0146885
timestep 931 /state

KeyboardInterrupt: 

In [None]:
# Scratch: array of shape (2, 2, 4) filled with standard-normal samples.
s = np.random.standard_normal((2, 2, 4))
print(s)
s.shape

In [None]:
# Scratch: array of shape (2, 2, 1) with uniform samples from [0, 1).
x = np.random.random_sample((2, 2, 1))
print(x)

In [None]:
# Scratch: put `x` in front of `s` along the last axis after dropping
# the first slice of `s` along that axis. Re-running this cell shifts `s` again.
s = np.concatenate((x, s[:, :, 1:]), axis=2)
print(s)

In [None]:
# Scratch: first three slices of `s` along its last axis.
s[..., :3]

In [None]:
# Scratch: demo of random.sample. The variables deliberately avoid the names
# `list` and `slice`, which would shadow the builtins for the rest of the kernel.
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
picked = random.sample(numbers, 5)  # randomly pick 5 elements from numbers, returned as a new list
print(picked)
# print(numbers)  # the original sequence is left unchanged