In [2]:
import gym
import numpy as np

from bigdl.dataset.transformer import Sample
from bigdl.nn.layer import *
from bigdl.util.common import *
from bigdl.optim.optimizer import *
from bigdl.nn.criterion import *
from bigdl.nn.initialization_method import *

from datetime import datetime
from to_rdd import to_RDD

from rl.criterion import *

import to_rdd

# `init_engine` is not defined in this notebook, so it comes from one of the star
# imports above (presumably bigdl.util.common — TODO confirm). It runs once here,
# before any BigDL layer is constructed in the cells below.
init_engine()

In [3]:
def prepro(I):
    """Convert a raw game frame into a flat binary feature vector.

    The frame is cropped to rows 35..194, every second row and column is
    kept, and only channel 0 is used. Pixels equal to 144 or 109
    (presumably the background colours) are set to 0 and every remaining
    non-zero pixel is set to 1.

    Args:
        I: array indexable as ``I[row, col, channel]``. The input is not
            modified.

    Returns:
        1-D ``float64`` array containing only 0.0 and 1.0. A frame with
        160 columns and at least 195 rows yields 80 * 80 = 6400 values.
    """
    # Basic slicing only creates views, so take a copy before the in-place
    # masking below; otherwise the caller's frame would be overwritten.
    frame = np.array(I[35:195:2, ::2, 0])
    frame[frame == 144] = 0
    frame[frame == 109] = 0
    frame[frame != 0] = 1
    # `np.float` was removed in NumPy 1.24; it was an alias of the builtin
    # float, i.e. float64.
    return frame.astype(np.float64).ravel()

In [4]:
class PGAgent:
    """Policy-gradient agent backed by a small BigDL network.

    The network maps a flat state vector to a single sigmoid output, which
    `act` turns into one of two actions (2 or 3).
    """

    def __init__(self, state_size, action_size, gamma=0.95, epsilon=1.0,
                 epsilon_min=0.01, epsilon_decay=0.995, learning_rate=0.01,
                 load=False, load_path='/tmp/model.bigl'):
        """Build the network and optionally replace it with a saved one.

        Args:
            state_size: length of the state vector fed to the network.
            action_size: stored on the instance; the network itself always
                has a single output.
            gamma: discount rate, stored for use by callers.
            epsilon, epsilon_min, epsilon_decay: accepted for interface
                compatibility; not used by this class.
            learning_rate: stored for use by callers.
            load: when true, replace the freshly built model with the
                result of ``self.model.load(load_path)``.
            load_path: path handed to ``load``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # discount rate
        self.learning_rate = learning_rate
        self.model = self._build_model()
        if load:
            # NOTE(review): relies on the built model object exposing
            # `load` — confirm against the BigDL version in use.
            self.model = self.model.load(load_path)

    def _build_model(self):
        """Return the policy network: state -> 200 ReLU units -> 1 sigmoid."""
        model = Sequential()

        # The input width follows the configured state size instead of a
        # hard-coded 6400, so the network always matches what the agent
        # was told to expect.
        model.add(Linear(self.state_size, 200))
        model.add(ReLU())

        model.add(Linear(200, 1))
        model.add(Sigmoid())
        return model

    def act(self, state):
        """Sample an action for `state`.

        Returns 2 when the network output is below a uniform draw from
        [0, 1) and 3 otherwise, so the output acts as the probability of
        choosing action 3.
        """
        result = self.model.forward(state)
        return 2 if result < np.random.random() else 3

    def save(self, name=None):
        """Save the model under ``/tmp/``.

        Args:
            name: file name inside ``/tmp/``. When omitted, a name of the
                form ``model<YYYY-MM-DD HH:MM:SS>bigdl`` is generated.
        """
        if name is None:
            # Built at call time: a timestamp placed in the signature would
            # be evaluated only once, when the class is defined, making
            # every default save reuse the same path. strftime also avoids
            # slicing str(datetime), which drops the fractional part (and
            # so would cut into the time) when microseconds are 0.
            name = 'model' + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + 'bigdl'
        self.model.save("/tmp/" + name)

In [15]:
def play_game(agent, render=False, report_score=False):
    """Play one episode in the global `env` and record what happened.

    Each step feeds the agent the difference between the current
    preprocessed frame and the previous one (the first step is diffed
    against a zero vector of length 6400).

    Args:
        agent: object with an ``act(state)`` method returning an action.
        render: when equal to ``True``, call ``env.render()`` every step.
        report_score: when truthy, print the counts of -1 and +1 rewards
            once the episode ends.

    Returns:
        Tuple ``(states, actions, rewards)`` of equally long lists, one
        entry per step.
    """
    frame = env.reset()
    last_obs = np.zeros(6400)
    diffs, moves, rewards = [], [], []
    minus_points, plus_points = 0, 0
    finished = False

    while not finished:
        # Kept as an equality test so only values equal to True render.
        if render == True:
            env.render()

        obs = prepro(frame)
        diff = obs - last_obs
        last_obs = obs
        diffs.append(diff)

        move = agent.act(diff)
        moves.append(move)

        frame, reward, finished, _ = env.step(move)
        rewards.append(reward)
        if reward == -1:
            minus_points += 1
        elif reward == 1:
            plus_points += 1

    if report_score:
        print('Score is {} : {}'.format(minus_points, plus_points))
    return diffs, moves, rewards

In [6]:
def running_reward(actions, rewards, gamma):
    """Pair every action with its negated discounted return.

    The episode is scanned from the last step to the first. The running
    return is reset whenever a step carries a non-zero reward, then
    updated as ``running * gamma + reward``.

    Args:
        actions: sequence of actions, one per step.
        rewards: sequence of rewards aligned with `actions`.
        gamma: discount factor.

    Returns:
        Array with one ``[action, -discounted_return]`` row per step, in
        the original step order.
    """
    rows = []
    discounted = 0
    for act, rew in reversed(list(zip(actions, rewards))):
        # A non-zero reward starts a fresh accumulation.
        carried = 0 if rew != 0 else discounted
        discounted = carried * gamma + rew
        rows.append([act, (-1) * discounted])
    rows.reverse()
    return np.vstack(rows)

In [12]:
def play_n_games(agent, n=20, report_score=False):
    """Play `n` episodes and assemble one training batch from them.

    Args:
        agent: passed to ``play_game``; its ``gamma`` attribute is passed
            to ``running_reward``.
        n: number of episodes to play; must be at least 1.
        report_score: forwarded to ``play_game``.

    Returns:
        Tuple ``(X_batch, y_batch, mean_total)``: the stacked states of
        all episodes (float64, one row per step), the stacked
        ``running_reward`` rows aligned with them, and the mean of the
        per-episode reward sums.

    Raises:
        ValueError: if `n` is smaller than 1, since no batch can be built.
    """
    if n < 1:
        raise ValueError('n must be at least 1, got {}'.format(n))

    state_rows = []
    target_blocks = []
    totals = []
    for _ in range(n):
        states, actions, rewards = play_game(agent, report_score=report_score)
        state_rows.extend(states)
        target_blocks.append(running_reward(actions, rewards, agent.gamma))
        totals.append(np.sum(rewards))

    # Stack once at the end: re-stacking the growing batch after every
    # episode copies all earlier rows again each time, and needed dummy
    # first rows of a fixed width that then had to be sliced off.
    X_batch = np.vstack(state_rows).astype(np.float64, copy=False)
    y_batch = np.vstack(target_blocks)
    return X_batch, y_batch, np.mean(totals)

In [8]:
# Notebook-wide objects. `env` is read as a global by play_game, so it must
# exist before any game is played.
env = gym.make('Pong-v0')
# Reuse the running SparkContext if there is one, otherwise create it from
# the configuration returned by create_spark_conf().
sc = SparkContext.getOrCreate(create_spark_conf())
# prepro crops 160 rows and keeps every second row and column, so 6400 = 80 * 80
# assumes frames that are 160 pixels wide — TODO confirm against the environment.
state_size = 6400
# PGAgent only stores action_size; its network has a single sigmoid output.
action_size = 2
agent = PGAgent(state_size, action_size)

[2017-11-10 11:56:23,357] Making new env: Pong-v0


creating: createSequential
creating: createLinear
creating: createReLU
creating: createLinear
creating: createSigmoid


In [18]:
%%time
# Training loop: play a batch of games, stop once the mean per-game reward
# sum is positive, otherwise run one optimisation pass over the batch.
# It has no other exit condition, so it runs until that happens or the
# kernel is interrupted.
t = 0
while True:
    t +=1
    # `result` is the mean over the 10 games of each game's summed rewards.
    X_batch, y_batch, result = play_n_games(agent, n=10, report_score=True)
    print(t, result)
    # Checked before training, so the batch that triggers the exit is not
    # used for an update.
    if result > 0:
        break
    rdd_sample = to_rdd.to_RDD(X_batch, y_batch, sc)
    # A new Optimizer and a new Adam instance are built on every iteration.
    # NOTE(review): presumably Adam's internal state therefore does not carry
    # over between iterations — confirm whether that is intended.
    optimizer = Optimizer(model=agent.model,
                                  training_rdd=rdd_sample,
                                  criterion=VanillaPGCriterion(),
                                  optim_method=Adam(learningrate=agent.learning_rate),
                                  end_trigger=MaxEpoch(1),
                                  batch_size=1000)
    agent.model = optimizer.optimize()
    # Checkpoint every fifth iteration to a relative path; the second
    # argument presumably allows overwriting the previous file — TODO confirm
    # against the BigDL save signature.
    if t % 5 == 0:
        agent.model.save("PG_pong_NN", True)

Score is 21 : 3
Score is 21 : 0
Score is 21 : 0
Score is 21 : 0
Score is 21 : 0
Score is 21 : 0
Score is 21 : 1
Score is 21 : 0
Score is 21 : 2
Score is 21 : 0
1 -20.4
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer


KeyboardInterrupt: 