# Policy gradient algrorithm with BigDL

In [1]:
import random
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from bigdl.dataset.transformer import Sample
from bigdl.nn.layer import *
from bigdl.util.common import *
from bigdl.optim.optimizer import *
from bigdl.nn.criterion import *

from rl.criterion import *

init_engine()

In [2]:
class PGAgent:
    def __init__(self, state_size, action_size, gamma=0.95,learning_rate=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = gamma  # discount rate
        self.learning_rate = learning_rate
        self.model = self._build_model()

    def _build_model(self):
        model = Sequential()

        model.add(Linear(self.state_size, 24))
        model.add(ReLU())

        model.add(Linear(24, 24))
        model.add(ReLU())

        model.add(Linear(24, 1))
        model.add(Sigmoid())
        return model

    def act(self, state):
        result = self.model.forward(state)
        return 1 if result > np.random.random() else 0

In [3]:
import to_rdd

In [4]:
def play_game(agent, render=False):
    state = env.reset()
    memory = np.array([0,0,0,0])
    actions = np.array([])
    rewards = np.array([])
    for time in range(500):
        if render:
            env.render()
        memory = np.vstack((memory, state))
        action = agent.act(state)
        actions = np.append(actions, action)
        state, reward, done, _ = env.step(action)
        if done:
            reward = -10
        rewards = np.append(rewards, reward)
        if done or time == 498:
            break
    return memory[1:], actions, rewards, time

In [5]:
def running_reward(actions, rewards, gamma):
    result = []
    run_rew = 0
    for action, reward in list(zip(actions, rewards))[::-1]:
        run_rew = run_rew*gamma + reward
        result.append([action, (-1) * run_rew])
    return np.vstack(result[::-1])

In [6]:
def play_n_games(agent, n=20):
    X_batch = np.array([0,0,0,0])
    y_batch = np.array([0,0])
    results = []
    for i in range(n):
        a, b, c, d = play_game(agent)
        X_batch = np.vstack((X_batch, a))
        y_batch = np.vstack((y_batch, running_reward(b, c, agent.gamma)))
        results.append(d)
    return X_batch[1:], y_batch[1:], np.mean(d)

In [7]:
env = gym.make('CartPole-v1')
sc = SparkContext.getOrCreate(create_spark_conf())
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = PGAgent(state_size, action_size)

[2017-11-09 11:04:58,247] Making new env: CartPole-v1


creating: createSequential
creating: createLinear
creating: createReLU
creating: createLinear
creating: createReLU
creating: createLinear
creating: createSigmoid


In [8]:
criterion = VanillaPGCriterion()

creating: createVanillaPGCriterion


In [10]:
%%time
t = 0
while True:
    t +=1
    X_batch, y_batch, result = play_n_games(agent)
    print(t, result)
    if result == 498:
        break
    rdd_sample = to_rdd.to_RDD(X_batch, y_batch, sc)
    optimizer = Optimizer(model=agent.model,
                                  training_rdd=rdd_sample,
                                  criterion=VanillaPGCriterion(),
                                  optim_method=Adam(learningrate=agent.learning_rate),
                                  end_trigger=MaxEpoch(1),
                                  batch_size=40)
    agent.model = optimizer.optimize()

1 19.0
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
2 33.0
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
3 50.0
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
4 25.0
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
5 83.0
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
6 134.0
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
7 158.0
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
8 137.0
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
9 498.0
CPU times: user 1min 49s, sys: 19 s, total: 2min 8s
Wall time: 3min 12s
