In [1]:
import cv2
import sys
sys.path.append("game/")
import numpy as np
import random
from collections import deque

import torch
import torch.nn as nn
from torch.autograd import Variable

import wrapped_flappy_bird as game

from DQN import DQN

In [2]:
# Run on the GPU only when CUDA is actually usable; a hard-coded True makes the
# later `dqn.model.cuda()` call fail on CPU-only machines.
cuda = torch.cuda.is_available()

In [3]:
# Turn a raw game frame into an 80x80 black/white single-channel image.
def preprocess(observation):
    """Resize, grey-scale and binarise one frame.

    The frame is resized to 80x80, converted from BGR to grey-scale, and then
    thresholded so that every pixel with a grey value above 1 becomes 255 and
    the rest become 0.

    Args:
        observation: image array accepted by cv2.resize / cv2.cvtColor with
            the COLOR_BGR2GRAY conversion.

    Returns:
        Array of shape (80, 80, 1) holding the thresholded image.
    """
    resized = cv2.resize(observation, (80, 80))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    _, binary = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)
    return np.reshape(binary, (80, 80, 1))

In [4]:
class CFG(object):
    """Hyper-parameters handed to DQN(cfg) and read by the training cells.

    Only some attributes are used in this notebook directly; the others are
    presumably consumed inside the DQN class (not visible here).
    """

    # --- optimisation (presumably read by DQN -- TODO confirm) ---
    lr = 0.001
    batch_size = 32
    mem_size = 5000

    # Discount factor; the training loop uses it to discount the reported
    # per-episode total reward.
    gamma = 0.99

    # --- action space / mode flags (presumably read by DQN -- TODO confirm) ---
    actions = 2
    # NOTE(review): name looks like a typo of "is_training"; kept unchanged
    # because code outside this notebook may read it.
    is_traing = True
    load_weight = False

    # --- exploration schedule ---
    epsilon = 0.9
    initial_epsilon = 1.0
    final_epsilon = 0.1
    # Divisor of the per-episode epsilon decrement in the training loop.
    exploration = 50000

    # --- run length ---
    # Number of random-action steps taken before training starts.
    observation = 100
    # Number of episodes the training loop iterates over.
    max_episode = 100000
    # A checkpoint is written every `save_checkpoint_freq` episodes.
    save_checkpoint_freq = 100000

In [5]:
# Best average time step seen so far; the training loop compares evaluation
# results against it to decide when to save a "best" checkpoint.
best_time_step = 0.
# Game environment (from wrapped_flappy_bird).
flappyBird = game.GameState()
# Hyper-parameter container defined in the CFG cell.
cfg = CFG()
# Agent built from the config; later cells use dqn.model, dqn.epsilon and
# dqn.time_step.
dqn = DQN(cfg)

In [6]:
# Move the wrapped network to the GPU when the `cuda` flag is set.
if cuda:
    dqn.model = dqn.model.cuda()

In [7]:
# Step the game once to obtain a first frame.
# NOTE(review): [1, 0] is presumably the "do nothing" action -- confirm in
# wrapped_flappy_bird.
action=[1,0]
o, r, terminal = flappyBird.frame_step(action)
# NOTE(review): best_time_step was already set to 0. in the setup cell, so this
# reset is redundant.
best_time_step = 0.
# Reduce the raw frame to the (80, 80, 1) binary image produced by preprocess().
o = preprocess(o)

In [8]:
# Warm-up phase: take cfg.observation random actions and hand every resulting
# transition to the agent; no training happens in this cell.
for i in range(cfg.observation):
    action = dqn.get_action_randomly()
    o, r, terminal = flappyBird.frame_step(action)
    o = preprocess(o)
    # Presumably appends to the agent's replay memory -- confirm in DQN.
    dqn.storeTransition(o, action, r, terminal)

In [None]:
# Main training loop: one iteration per episode, an episode ending when the
# game reports a terminal frame.
# NOTE(review): test_dqn and save_checkpoint are not defined or imported in any
# visible cell; they must be brought into scope before this cell can run.
for episode in range(cfg.max_episode):
    dqn.time_step = 0
    total_reward = 0.
    while True:
        # The original called optimizer.zero_grad() here, but no `optimizer`
        # exists in the notebook namespace (NameError). Gradient handling is
        # presumably done inside dqn.trainByBatch() -- TODO confirm.
        action = dqn.get_action()
        o_next, r, terminal = flappyBird.frame_step(action)
        # Discounted return, reported for logging only.
        total_reward += cfg.gamma**dqn.time_step * r
        o_next = preprocess(o_next)
        # Same method name as the warm-up cell, which was executed successfully.
        dqn.storeTransition(o_next, action, r, terminal)
        dqn.increase_time_step()

        dqn.trainByBatch()

        if terminal:
            break

    print('episode: {}, epsilon: {:.4f}, max time step: {}, total reward: {:.6f}'.format(
            episode, dqn.epsilon, dqn.time_step, total_reward))

    # Linearly anneal epsilon, one decrement per episode, down to the floor.
    if dqn.epsilon > cfg.final_epsilon:
        delta = (cfg.initial_epsilon - cfg.final_epsilon)/cfg.exploration
        dqn.epsilon -= delta

    # Evaluate every 100 episodes; episode 0 qualifies, so ave_time is always
    # defined before it is read below.
    if episode % 100 == 0:
        ave_time = test_dqn(dqn, episode)

    if ave_time > best_time_step:
        # New best evaluation result: save and flag it as the best checkpoint.
        best_time_step = ave_time
        # dqn.model is the torch module (see the move-to-GPU cell), so its
        # state_dict() is what holds the network weights.
        save_checkpoint({
                'episode': episode,
                'epsilon': dqn.epsilon,
                'state_dict': dqn.model.state_dict(),
                'best_time_step': best_time_step,
                 }, True, 'checkpoint-episode-%d.pth.tar' %episode)
    elif episode % cfg.save_checkpoint_freq == 0:
        # Periodic checkpoint. The key is 'episode' (the original had a stray
        # colon, 'episode:'), matching the best-checkpoint dict above.
        save_checkpoint({
                'episode': episode,
                'epsilon': dqn.epsilon,
                'state_dict': dqn.model.state_dict(),
                'time_step': ave_time,
                 }, False, 'checkpoint-episode-%d.pth.tar' %episode)
    else:
        continue
    print('save checkpoint, episode={}, ave time step={:.2f}'.format(
                episode, ave_time))

In [None]:
def train_dqn(model, options, resume):
    """Train a DQN agent on Flappy Bird.

    Relies on names from the enclosing scope: `game`, `preprocess`,
    `load_checkpoint`, `test_dqn` and `save_checkpoint`.
    NOTE(review): the last three are not defined in any visible cell.

    Args:
        model: agent that is also a torch module. This function uses its
            `parameters`, `forward`, `state_dict` and `cuda` methods, the
            agent methods `set_initial_state`, `set_train`, `get_action`,
            `get_action_randomly`, `store_transition`, `increase_time_step`,
            and the attributes `replay_memory`, `epsilon`, `time_step`.
        options: settings object; the attributes read are `weight`, `lr`,
            `cuda`, `observation`, `max_episode`, `gamma`, `batch_size`,
            `init_e`, `final_e`, `exploration` and `save_checkpoint_freq`.
        resume: when true, load the checkpoint named by `options.weight`
            before training.

    Returns:
        None. Returns early, after printing a message, when `resume` is set
        but `options.weight` is None.
    """
    best_time_step = 0.
    if resume:
        if options.weight is None:
            print('when resume, you should give weight file name.')
            return
        print('load previous model weight: {}'.format(options.weight))
        _, _, best_time_step = load_checkpoint(options.weight, model)

    flappyBird = game.GameState()

    action = [1, 0]
    o, r, terminal = flappyBird.frame_step(action)
    o = preprocess(o)
    model.set_initial_state()

    if options.cuda:
        model = model.cuda()
    # Build the optimizer only after the optional move to the GPU, as the
    # torch.optim documentation recommends. `torch.optim` is reachable through
    # the existing `import torch`; the original used an un-imported `optim`.
    optimizer = torch.optim.RMSprop(model.parameters(), lr=options.lr)
    criterion = nn.MSELoss()

    # In the first `options.observation` time steps only collect transitions
    # with random actions; the model is not trained yet.
    for _ in range(options.observation):
        action = model.get_action_randomly()
        o, r, terminal = flappyBird.frame_step(action)
        o = preprocess(o)
        model.store_transition(o, action, r, terminal)

    # Start training.
    for episode in range(options.max_episode):
        model.time_step = 0
        model.set_train()
        total_reward = 0.
        # Begin an episode; it ends when the game reports a terminal frame.
        while True:
            optimizer.zero_grad()
            action = model.get_action()
            o_next, r, terminal = flappyBird.frame_step(action)
            # Discounted return, reported for logging only.
            total_reward += options.gamma**model.time_step * r
            o_next = preprocess(o_next)
            model.store_transition(o_next, action, r, terminal)
            model.increase_time_step()

            # Step 1: obtain a random minibatch from replay memory. Each entry
            # is indexed as (state, action, reward, next_state, terminal).
            minibatch = random.sample(model.replay_memory, options.batch_size)
            state_batch = np.array([data[0] for data in minibatch])
            action_batch = np.array([data[1] for data in minibatch])
            reward_batch = np.array([data[2] for data in minibatch])
            next_state_batch = np.array([data[3] for data in minibatch])
            state_batch_var = Variable(torch.from_numpy(state_batch))
            # volatile=True: no gradient is needed through the target values.
            next_state_batch_var = Variable(torch.from_numpy(next_state_batch),
                                            volatile=True)
            if options.cuda:
                state_batch_var = state_batch_var.cuda()
                next_state_batch_var = next_state_batch_var.cuda()

            # Step 2: calculate the target y = r (+ gamma * max_a' Q(s', a')
            # when the transition is not terminal).
            q_value_next = model.forward(next_state_batch_var)
            q_value = model.forward(state_batch_var)

            y = reward_batch.astype(np.float32)
            max_q, _ = torch.max(q_value_next, dim=1)

            for i, transition in enumerate(minibatch):
                if not transition[4]:
                    # The [i][0] indexing assumes torch.max(dim=1) keeps the
                    # reduced dimension, i.e. a (batch, 1) result -- TODO
                    # confirm for the installed PyTorch version.
                    y[i] += options.gamma*max_q.data[i][0]

            y = Variable(torch.from_numpy(y))
            action_batch_var = Variable(torch.from_numpy(action_batch))
            if options.cuda:
                y = y.cuda()
                action_batch_var = action_batch_var.cuda()
            # Multiplying by the stored action vectors and summing selects the
            # Q-value of the action taken (assumes one-hot actions -- confirm).
            q_value = torch.sum(torch.mul(action_batch_var, q_value), dim=1)

            loss = criterion(q_value, y)
            loss.backward()

            optimizer.step()
            # When the bird dies, the episode ends.
            if terminal:
                break

        # Log the agent's own epsilon/time_step; the original read them from
        # the global `cfg`, which is not a parameter and has no `time_step`.
        print('episode: {}, epsilon: {:.4f}, max time step: {}, total reward: {:.6f}'.format(
                episode, model.epsilon, model.time_step, total_reward))

        # Linearly anneal epsilon, one decrement per episode.
        if model.epsilon > options.final_e:
            delta = (options.init_e - options.final_e)/options.exploration
            model.epsilon -= delta

        # Evaluate every 100 episodes; episode 0 qualifies, so ave_time is
        # always defined before it is read below.
        if episode % 100 == 0:
            ave_time = test_dqn(model, episode)

        if ave_time > best_time_step:
            best_time_step = ave_time
            save_checkpoint({
                'episode': episode,
                'epsilon': model.epsilon,
                'state_dict': model.state_dict(),
                'best_time_step': best_time_step,
                 }, True, 'checkpoint-episode-%d.pth.tar' %episode)
        elif episode % options.save_checkpoint_freq == 0:
            # Periodic checkpoint. Frequency now comes from `options`, not the
            # global `cfg`; key is 'episode' (original had 'episode:').
            save_checkpoint({
                'episode': episode,
                'epsilon': model.epsilon,
                'state_dict': model.state_dict(),
                'time_step': ave_time,
                 }, False, 'checkpoint-episode-%d.pth.tar' %episode)
        else:
            continue
        print('save checkpoint, episode={}, ave time step={:.2f}'.format(
                 episode, ave_time))


In [52]:
# Show the layer structure of the network wrapped by the agent.
print(dqn.model)

Net (
  (conv1): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4), padding=(2, 2))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
  (fc1): Linear (9216 -> 256)
  (fc2): Linear (256 -> 2)
)


In [2]:
# NOTE(review): duplicates the GameState creation in the setup cell and carries
# an out-of-order execution count; looks like a leftover scratch cell.
flappyBird = game.GameState()

In [9]:
# NOTE(review): repeats the first-frame step from the earlier cell and carries
# an out-of-order execution count; looks like a leftover scratch cell.
action = [1, 0]
o, r, terminal = flappyBird.frame_step(action)

False
