In [1]:
import gym
import numpy as np

from bigdl.dataset.transformer import Sample
from bigdl.nn.layer import *
from bigdl.util.common import *
from bigdl.optim.optimizer import *
from bigdl.nn.criterion import *
from bigdl.nn.initialization_method import *

from datetime import datetime

from rl.criterion import *

init_engine()

In [2]:
def to_RDD(X, y):
    """Build an RDD of BigDL ``Sample`` objects from two parallel sequences.

    ``X`` and ``y`` are each distributed with the notebook-level SparkContext
    ``sc``, zipped element-wise, and every (feature, label) pair is turned into
    a ``Sample`` via ``Sample.from_ndarray``.
    """
    def _as_sample(pair):
        # pair[0] comes from X, pair[1] from y.
        return Sample.from_ndarray(pair[0], pair[1])

    feature_rdd = sc.parallelize(X)
    label_rdd = sc.parallelize(y)
    return feature_rdd.zip(label_rdd).map(_as_sample)

In [3]:
def prepro(I):
    """Turn a raw frame into a flat binary float vector without touching ``I``.

    Steps:
      * keep rows 35..194 and take every second row and column of channel 0
        (an 80x80 grid, i.e. 6400 values, for a 210x160x3 frame);
      * map the pixel values 0, 144 and 109 to 0.0 and everything else to 1.0
        (144 and 109 are presumably the two background colours -- confirm).

    The original implementation wrote the mask into slices that are views of
    the input, silently overwriting the caller's frame; this version only
    reads from ``I``. It also used ``np.float``, which recent NumPy releases
    no longer provide, so ``np.float64`` (the same dtype) is used instead.

    Returns a new 1-D ``float64`` array.
    """
    pixels = I[35:195:2, ::2, 0]
    # Foreground = anything that is neither already zero nor one of the two
    # values the original code erased.
    foreground = (pixels != 0) & (pixels != 144) & (pixels != 109)
    return foreground.astype(np.float64).ravel()

In [4]:
class PGAgent:
    """Policy-gradient agent backed by a small BigDL feed-forward network.

    The network maps a flat state vector to a single sigmoid output, which
    ``act`` uses as the probability of choosing action 3 (otherwise action 2).
    """

    def __init__(self, state_size, action_size, gamma=0.95, epsilon=1.0,
                 epsilon_min=0.01, epsilon_decay=0.995, learning_rate=0.01,
                load=False, load_path='/tmp/model.bigl'):
        """Store hyper-parameters and build (optionally load) the model.

        state_size    -- length of the flat state vector fed to the network.
        action_size   -- stored on the instance; not used by the network.
        gamma         -- reward discount rate.
        epsilon, epsilon_min, epsilon_decay -- accepted so existing callers
                         keep working; this class does not use them.
        learning_rate -- stored for whoever builds the optimizer.
        load          -- when true, replace the fresh model with the result of
                         ``model.load(load_path)``.
        load_path     -- path handed to ``model.load``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # discount rate
        self.learning_rate = learning_rate
        self.model = self._build_model()
        if load:
            # NOTE(review): assumes the built model exposes ``load`` -- confirm
            # against the BigDL version in use.
            self.model = self.model.load(load_path)

    def _build_model(self):
        """Return the policy network: Linear -> ReLU -> Linear -> Sigmoid."""
        model = Sequential()

        # Input width follows state_size instead of a hard-coded 6400.
        model.add(Linear(self.state_size, 200))
        model.add(ReLU())

        # Single sigmoid unit: the probability consumed by ``act``.
        model.add(Linear(200, 1))
        model.add(Sigmoid())
        return model

    def act(self, state):
        """Sample an action for ``state``.

        Runs a forward pass and compares the output with a uniform random
        draw: returns 2 when the output is below the draw, otherwise 3.
        """
        result = self.model.forward(state)
        return 2 if result<np.random.random() else 3

    def save(self, name=None):
        """Save the model under ``/tmp/<name>``.

        When ``name`` is omitted, a name of the form
        ``model<YYYY-MM-DD HH:MM:SS>bigdl`` is generated at call time. The
        previous default was computed once when the class was defined, so
        every default save reused the same stale timestamp.
        """
        if name is None:
            # strftime gives the same text as str(now)[:-7] but stays correct
            # when the microsecond field happens to be zero.
            stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            name = 'model' + stamp + 'bigdl'
        self.model.save("/tmp/"+name)

In [5]:
def play_game(agent, render=False, report_score=False):
    """Play one episode in the global ``env`` and record the trajectory.

    Each step feeds the agent the difference between the current and the
    previous preprocessed frame (the first step is diffed against zeros).

    agent        -- object with an ``act(state)`` method.
    render       -- when equal to True, draw the environment every step.
    report_score -- when truthy, print how many -1 and +1 rewards occurred.

    Returns three parallel lists: (states, actions, rewards).
    """
    frame = env.reset()
    finished = False
    previous = np.zeros(6400)
    states, actions, rewards = [], [], []
    lost, won = 0, 0
    while not finished:
        if render == True:
            env.render()
        current = prepro(frame)
        diff = current - previous
        previous = current
        states.append(diff)
        action = agent.act(diff)
        actions.append(action)
        frame, reward, finished, _ = env.step(action)
        rewards.append(reward)
        # Tally points for the optional score report.
        if reward == -1:
            lost += 1
        elif reward == 1:
            won += 1
    if report_score:
        print('No. of -1 score vs. 1 score is {} : {}'.format(lost, won))
    return states, actions, rewards

In [6]:
def running_reward(actions, rewards, gamma):
    """Attach a discounted running reward to every action.

    Walks the (action, reward) pairs from last to first. The running value is
    reset whenever a non-zero reward is met, then updated as
    ``running * gamma + reward``.

    Returns a 2-column array with one ``[action, running_reward]`` row per
    step, in the original step order.
    """
    paired = list(zip(actions, rewards))
    rows = [None] * len(paired)
    discounted = 0
    for idx in reversed(range(len(paired))):
        action, reward = paired[idx]
        if reward != 0:
            # A non-zero reward starts a fresh discount chain.
            discounted = 0
        discounted = discounted * gamma + reward
        rows[idx] = [action, discounted]
    return np.vstack(rows)

In [7]:
def play_n_games(agent, n=20, report_score=False):
    """Play ``n`` episodes and merge them into one training batch.

    agent        -- passed to ``play_game``; its ``gamma`` attribute is used
                    for reward discounting.
    n            -- number of episodes to play; must be at least 1.
    report_score -- forwarded to ``play_game``.

    Returns ``(X_batch, y_batch, mean_total_reward)`` where ``X_batch`` stacks
    every recorded state row-wise, ``y_batch`` stacks the matching
    ``running_reward`` rows, and the last item is the mean of the per-episode
    reward sums.

    Raises ValueError when ``n`` is smaller than 1 (the previous code returned
    mis-shaped placeholder arrays in that case).
    """
    if n < 1:
        raise ValueError("n must be at least 1, got {}".format(n))

    all_states = []
    target_chunks = []
    episode_totals = []
    for _ in range(n):
        states, actions, rewards = play_game(agent, report_score=report_score)
        all_states.extend(states)
        target_chunks.append(running_reward(actions, rewards, agent.gamma))
        episode_totals.append(np.sum(rewards))

    # Stack once at the end instead of re-copying the whole batch after every
    # game; this also removes the need for placeholder first rows.
    return np.vstack(all_states), np.vstack(target_chunks), np.mean(episode_totals)

In [8]:
# Dimensions handed to the agent: flat state length and number of actions.
state_size = 6400
action_size = 2

# Game environment and Spark context shared by the cells below.
env = gym.make('Pong-v0')
sc = SparkContext.getOrCreate(create_spark_conf())

agent = PGAgent(state_size, action_size)

[2017-11-13 14:04:55,221] Making new env: Pong-v0


creating: createSequential
creating: createLinear
creating: createReLU
creating: createLinear
creating: createSigmoid


In [9]:
%%time
t = 0
core_num = 8 #set core number as optimization requires batch_size to be multiple of core_num
while True:
    t +=1
    X_batch, y_batch, result = play_n_games(agent, n=10, report_score=True)
    print "Result of update no.",t,"result is",result
    #print(t, result)
    if result > 0:
        break
    rdd_sample = to_RDD(X_batch, y_batch)
    
    #calculate batch size as the latest multiple of core_num that's less than no.of samples
    #batch_size = 1000
    batch_size = X_batch.shape[0] - X_batch.shape[0]%core_num
    print "using batch_size = ",batch_size
    
    optimizer = Optimizer(model=agent.model,
                                  training_rdd=rdd_sample,
                                  criterion=VanillaPGCriterion(clipping=False,size_average=True),
                                  optim_method=Adam(learningrate=agent.learning_rate),
                                  end_trigger=MaxIteration(1),
                                  batch_size=batch_size)
    agent.model = optimizer.optimize()
    if t % 5 == 0:
        agent.model.save("PG_pong_NN.model", True)

No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 2
No. of -1 score vs. 1 score is 21 : 1
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 2
No. of -1 score vs. 1 score is 21 : 3
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
Result of update no. 1 result is -20.2
using batch_size =  14384
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createOptimizer
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 1
No. of -1 score vs. 1 score is 21 : 1
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
Result of update no. 2 result is -20.8
using batch_size =  12656

using batch_size =  10176
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createOptimizer
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
Result of update no. 16 result is -21.0
using batch_size =  10192
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createOptimizer
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No. of -1 score vs. 1 score is 21 : 0
No

KeyboardInterrupt: 