In [1]:
# -*- coding: utf-8 -*-
from __future__ import division, print_function
from keras.models import Sequential
from keras.layers.core import Activation, Dense, Flatten
from keras.layers.convolutional import Conv2D
from keras.optimizers import Adam

from scipy.misc import imresize
#from scipy import misc

import collections
import numpy as np
import os

import wrapped_game

Using TensorFlow backend.


In [3]:
def preprocess_images(images):
    """Convert raw game frames into one normalized 4-channel network input.

    Every frame that is used is resized to 80x80 with ``imresize``, cast to
    float and divided by 255.

    * If ``images.shape[0]`` is smaller than 4, only ``images[0]`` is used and
      it is repeated four times along the channel axis.
    * Otherwise every frame is resized and the first four are stacked along
      the channel axis (any further frames are resized but not used).

    Args:
        images: array indexed by frame along axis 0 (the training loop notes
            a shape of (1, 400, 400) for a single frame).

    Returns:
        The stacked frames with a leading batch axis added; the training loop
        notes the result shape as (1, 80, 80, 4).

    NOTE(review): ``scipy.misc.imresize`` is deprecated and absent from recent
    SciPy releases -- confirm the pinned SciPy version still provides it.
    """
    def _scale_frame(frame):
        # Resize first, then map the pixel values onto floats divided by 255.
        shrunk = imresize(frame, (80, 80))
        return shrunk.astype("float") / 255.0

    frame_count = images.shape[0]
    if frame_count >= 4:
        # multi-frame input: resize all of them, keep the first four
        scaled = [_scale_frame(images[idx]) for idx in range(frame_count)]
        channels = scaled[:4]
    else:
        # fewer than four frames: replicate the first one
        channels = [_scale_frame(images[0])] * 4

    stacked = np.stack(channels, axis=2)
    return np.expand_dims(stacked, axis=0)

In [4]:
# experience : (s_tm1, a_t, r_t, s_t, game_over)
def get_next_batch(experience, model, num_actions, gamma, batch_size):
    """Sample a random minibatch from replay memory and build Q-learning targets.

    Transitions are drawn uniformly at random *with replacement*. For each
    sampled transition the target row starts as the model's current
    prediction for the state; only the entry of the action actually taken is
    overwritten, with ``r_t`` for terminal transitions or
    ``r_t + gamma * max(Q(s_tp1))`` otherwise.

    The model is queried with two batched ``predict`` calls (current states
    and next states) instead of two calls per sampled transition.

    Args:
        experience: indexable collection of
            ``(s_t, a_t, r_t, s_tp1, game_over)`` tuples. Each state must be
            assignable into an ``(80, 80, 4)`` slot, e.g. shape (1, 80, 80, 4).
        model: object whose ``predict(batch)`` returns one row of
            ``num_actions`` values per input state.
        num_actions: number of columns in the target matrix.
        gamma: factor applied to the best next-state Q-value.
        batch_size: number of transitions to sample.

    Returns:
        ``(X, Y)`` where ``X`` has shape ``(batch_size, 80, 80, 4)`` and ``Y``
        has shape ``(batch_size, num_actions)``.

    Raises:
        ValueError: if ``experience`` is empty.
    """
    if len(experience) == 0:
        raise ValueError("cannot sample a batch from an empty experience buffer")

    batch_indices = np.random.randint(low=0, high=len(experience), size=batch_size)
    batch = [experience[i] for i in batch_indices]

    X = np.zeros((batch_size, 80, 80, 4))
    Y = np.zeros((batch_size, num_actions))
    if not batch:
        # Nothing was sampled (batch_size == 0); skip the model entirely.
        return X, Y

    # Gather current and next states so each set needs a single forward pass.
    next_states = np.zeros_like(X)
    for i, (s_t, _, _, s_tp1, _) in enumerate(batch):
        X[i] = s_t
        next_states[i] = s_tp1

    # Start from the current predictions so that untouched actions contribute
    # zero error to the loss.
    Y[:] = model.predict(X)
    best_next_q = np.max(model.predict(next_states), axis=1)

    for i, (_, a_t, r_t, _, game_over) in enumerate(batch):
        if game_over:
            # Terminal transition: there is no future value to bootstrap from.
            Y[i, a_t] = r_t
        else:
            Y[i, a_t] = r_t + gamma * best_next_q[i]

    return X, Y

In [5]:
# run configuration and hyperparameters
DATA_DIR = "."  # where results and model files are written (alternative: "../data")

# environment
NUM_ACTIONS = 3  # valid actions: left, stay, right

# Q-learning
GAMMA = 0.99  # factor applied to the best next-state Q-value when building targets
MEMORY_SIZE = 50000  # maximum number of transitions kept in replay memory
BATCH_SIZE = 32  # transitions sampled per training step

# epsilon-greedy exploration: epsilon starts at INITIAL_EPSILON and is
# decreased once per epoch while it is above FINAL_EPSILON
INITIAL_EPSILON = 0.1
FINAL_EPSILON = 0.0001

# schedule: an observation phase followed by a training phase
NUM_EPOCHS_OBSERVE = 100
NUM_EPOCHS_TRAIN = 2000
NUM_EPOCHS = NUM_EPOCHS_OBSERVE + NUM_EPOCHS_TRAIN

In [6]:
# Build the Q-network: three convolutional layers followed by two dense
# layers. The input is a stack of four 80x80 frames; the output is one
# linear (no activation) value per action.
model = Sequential()
# With "same" padding the spatial size is divided by the stride:
# 80x80 -> 20x20 -> 10x10 -> 10x10.
model.add(Conv2D(32, kernel_size=8, strides=4, kernel_initializer="normal", padding="same", input_shape=(80, 80, 4)))
model.add(Activation("relu"))
model.add(Conv2D(64, kernel_size=4, strides=2, kernel_initializer="normal", padding="same"))
model.add(Activation("relu"))
model.add(Conv2D(64, kernel_size=3, strides=1, kernel_initializer="normal", padding="same"))
model.add(Activation("relu"))
model.add(Flatten())
model.add(Dense(512, kernel_initializer="normal"))
model.add(Activation("relu"))
# One output per action. Tied to NUM_ACTIONS so the network width always
# matches the target matrix built with num_actions columns during training.
model.add(Dense(NUM_ACTIONS, kernel_initializer="normal"))

# Mean squared error between predicted values and the targets built in
# get_next_batch.
model.compile(optimizer=Adam(lr=1e-6), loss="mse")

In [7]:
# training state: environment, replay memory, results log and counters
game = wrapped_game.MyWrappedGame()

# bounded replay memory; once MEMORY_SIZE transitions are stored, appending
# a new one discards the oldest
experience = collections.deque([], MEMORY_SIZE)

# per-epoch results are appended to this file as encoded bytes (binary mode)
fout = open(os.path.join(DATA_DIR, "rl-network-results2.tsv"), mode="wb")

num_games = 0
num_wins = 0
epsilon = INITIAL_EPSILON

In [None]:
# Main loop: one epoch is one full game episode. The first NUM_EPOCHS_OBSERVE
# epochs only collect experience with random actions; afterwards actions are
# chosen epsilon-greedily and the network is trained after every step.
for e in range(NUM_EPOCHS):

    # accumulated training loss of this episode (stays 0.0 while observing)
    loss = 0.0

    # reset
    game.reset()

    # get first state
    a_0 = 1  # (0 = left, 1 = stay, 2 = right)

    # x_t: frame(s), r_0: reward, game_over: episode-finished flag
    x_t, r_0, game_over = game.step(a_0)

    # resize, cast to float and normalize
    # x_t : (1, 400, 400)
    # s_t : (1,  80,  80, 4)
    s_t = preprocess_images(x_t)

    while not game_over:

        # remember the previous state: 4 stacked frames (1, 80, 80, 4)
        s_tm1 = s_t

        # choose the action
        # `e` is 0-based, so `e < NUM_EPOCHS_OBSERVE` gives exactly
        # NUM_EPOCHS_OBSERVE observation epochs. While observing, the random
        # draw for epsilon is skipped (short-circuit `or`).
        if e < NUM_EPOCHS_OBSERVE or np.random.rand() <= epsilon:
            # observation phase or exploration: uniformly random action
            a_t = np.random.randint(low=0, high=NUM_ACTIONS, size=1)[0]  # high is exclusive
        else:
            # exploitation: run the state through the network and take the
            # index of the highest predicted value as the action
            q = model.predict(s_t)[0]
            a_t = np.argmax(q)

        # apply the action, get the next frame(s) and the reward
        # NOTE(review): the original comment states the reward is set together
        # with game over (-1 or 1) and is 0 otherwise -- confirm against
        # wrapped_game.
        x_t, r_t, game_over = game.step(a_t)
        s_t = preprocess_images(x_t)

        # if reward, increment num_wins
        if r_t == 1:
            num_wins += 1

        # store experience
        experience.append((s_tm1, a_t, r_t, s_t, game_over))

        if e >= NUM_EPOCHS_OBSERVE:
            # finished observing, now start training
            # get next batch
            X, Y = get_next_batch(experience, model, NUM_ACTIONS, GAMMA, BATCH_SIZE)
            loss += model.train_on_batch(X, Y)

    # one more episode played
    num_games += 1

    # reduce epsilon gradually
    if epsilon > FINAL_EPSILON:
        epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / NUM_EPOCHS

    print("Epoch {:04d}/{:d} | Loss {:.5f} | Win Count: {:d}".format(e + 1, NUM_EPOCHS, loss, num_wins))
    fout.write("{:04d}\t{:.5f}\t{:d}\n".format(e + 1, loss, num_wins).encode('utf-8'))
    # flush so the results survive an interrupted or crashed run
    fout.flush()

    # periodic checkpoint
    if e % 100 == 0:
        model.save(os.path.join(DATA_DIR, "rl-network2.h5"), overwrite=True)
        


Epoch 0001/2100 | Loss 0.00000 | Win Count: 0
Epoch 0002/2100 | Loss 0.00000 | Win Count: 0
Epoch 0003/2100 | Loss 0.00000 | Win Count: 0
Epoch 0004/2100 | Loss 0.00000 | Win Count: 0
Epoch 0005/2100 | Loss 0.00000 | Win Count: 0
Epoch 0006/2100 | Loss 0.00000 | Win Count: 0
Epoch 0007/2100 | Loss 0.00000 | Win Count: 0
Epoch 0008/2100 | Loss 0.00000 | Win Count: 1
Epoch 0009/2100 | Loss 0.00000 | Win Count: 1
Epoch 0010/2100 | Loss 0.00000 | Win Count: 2
Epoch 0011/2100 | Loss 0.00000 | Win Count: 2
Epoch 0012/2100 | Loss 0.00000 | Win Count: 2
Epoch 0013/2100 | Loss 0.00000 | Win Count: 2
Epoch 0014/2100 | Loss 0.00000 | Win Count: 3
Epoch 0015/2100 | Loss 0.00000 | Win Count: 3
Epoch 0016/2100 | Loss 0.00000 | Win Count: 3
Epoch 0017/2100 | Loss 0.00000 | Win Count: 3
Epoch 0018/2100 | Loss 0.00000 | Win Count: 4
Epoch 0019/2100 | Loss 0.00000 | Win Count: 4
Epoch 0020/2100 | Loss 0.00000 | Win Count: 5
Epoch 0021/2100 | Loss 0.00000 | Win Count: 5
Epoch 0022/2100 | Loss 0.00000 | W

Epoch 0177/2100 | Loss 0.37603 | Win Count: 25
Epoch 0178/2100 | Loss 0.28875 | Win Count: 26
Epoch 0179/2100 | Loss 0.37455 | Win Count: 26
Epoch 0180/2100 | Loss 0.24286 | Win Count: 26
Epoch 0181/2100 | Loss 0.36060 | Win Count: 26
Epoch 0182/2100 | Loss 0.36136 | Win Count: 26
Epoch 0183/2100 | Loss 0.46239 | Win Count: 26
Epoch 0184/2100 | Loss 0.32686 | Win Count: 26
Epoch 0185/2100 | Loss 0.36616 | Win Count: 26
Epoch 0186/2100 | Loss 0.53435 | Win Count: 26
Epoch 0187/2100 | Loss 0.32727 | Win Count: 26
Epoch 0188/2100 | Loss 0.35175 | Win Count: 27
Epoch 0189/2100 | Loss 0.30908 | Win Count: 27
Epoch 0190/2100 | Loss 0.31315 | Win Count: 27
Epoch 0191/2100 | Loss 0.30380 | Win Count: 27
Epoch 0192/2100 | Loss 0.34454 | Win Count: 27
Epoch 0193/2100 | Loss 0.31691 | Win Count: 27
Epoch 0194/2100 | Loss 0.30042 | Win Count: 27
Epoch 0195/2100 | Loss 0.34462 | Win Count: 28
Epoch 0196/2100 | Loss 0.38740 | Win Count: 28
Epoch 0197/2100 | Loss 0.33269 | Win Count: 28
Epoch 0198/21

In [None]:
# Close the per-epoch results log opened before the training loop.
fout.close()
# Save the final model.
# NOTE(review): this writes "rl-network.h5" while the periodic checkpoints in
# the training loop write "rl-network2.h5" -- confirm the differing file
# names are intentional.
model.save(os.path.join(DATA_DIR, "rl-network.h5"), overwrite=True)