# Policy gradient algorithm with BigDL

In [2]:
import random
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from bigdl.dataset.transformer import Sample
from bigdl.nn.layer import *
from bigdl.util.common import *
from bigdl.optim.optimizer import *
from bigdl.nn.criterion import *

from rl.criterion import *

init_engine()

In [3]:
class PGAgent:
    """Policy-gradient agent backed by a small BigDL feed-forward network.

    The network ends in a single sigmoid unit; `act` returns action 1 with
    probability equal to that output and action 0 otherwise.

    Attributes:
        state_size: Number of inputs fed to the first Linear layer.
        action_size: Stored as given; not read by any method of this class.
        memory: Bounded deque (maxlen=2000); not read by any method of this class.
        gamma: Discount rate, stored for callers.
        learning_rate: Learning rate, stored for callers.
        model: The BigDL Sequential network returned by `_build_model`.
    """

    def __init__(self, state_size, action_size, gamma=0.95, learning_rate=0.01):
        """Store the hyper-parameters and build the policy network."""
        self.state_size, self.action_size = state_size, action_size
        self.memory = deque(maxlen=2000)
        self.gamma, self.learning_rate = gamma, learning_rate  # gamma: discount rate
        # Built last because _build_model reads self.state_size.
        self.model = self._build_model()

    def _build_model(self):
        """Return a new Sequential: state_size -> 24 -> 24 -> 1 with ReLU, ReLU, Sigmoid."""
        # (inputs, outputs, activation class) for each Linear layer, in order.
        layer_specs = (
            (self.state_size, 24, ReLU),
            (24, 24, ReLU),
            (24, 1, Sigmoid),
        )
        network = Sequential()
        for n_in, n_out, activation in layer_specs:
            network.add(Linear(n_in, n_out))
            network.add(activation())
        return network

    def act(self, state):
        """Sample an action: 1 if the model output exceeds a uniform draw in [0, 1), else 0."""
        probability = self.model.forward(state)
        if probability > np.random.random():
            return 1
        return 0

In [4]:
import to_rdd

In [5]:
def play_game(agent, render=False, environment=None, max_steps=499):
    """Play one episode and record the visited states, actions and rewards.

    Args:
        agent: Object exposing `act(state)`, which returns the action to take.
        render: When true, `environment.render()` is called before every step.
        environment: Object with `reset()`, `step(action)` and `render()`.
            Defaults to the notebook-level global `env`.
        max_steps: Maximum number of steps to play. The default of 499 matches
            the previous hard-coded cut-off (last step index 498).

    Returns:
        A tuple `(states, actions, rewards, last_step)` where `states` is a
        float64 array with one row per step, `actions` and `rewards` are
        float64 vectors with one entry per step, and `last_step` is the
        zero-based index of the final step played.

    Raises:
        ValueError: If `max_steps` is smaller than 1.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")
    if environment is None:
        environment = env  # fall back to the notebook-level environment

    state = environment.reset()
    # Accumulate in Python lists and convert once at the end; this avoids the
    # quadratic cost of re-allocating arrays on every step and removes the
    # dummy 4-wide first row that tied the function to 4-dimensional states.
    states, actions, rewards = [], [], []
    for time in range(max_steps):
        if render:
            environment.render()
        # Copy so later in-place changes by the environment cannot alter history.
        states.append(np.array(state, dtype=np.float64))
        action = agent.act(state)
        actions.append(action)
        state, reward, done, _ = environment.step(action)
        if done:
            # Replace the reward of the terminating step with a penalty.
            reward = -10
        rewards.append(reward)
        if done:
            break
    return (
        np.array(states, dtype=np.float64),
        np.array(actions, dtype=np.float64),
        np.array(rewards, dtype=np.float64),
        time,
    )

In [145]:
def running_reward(actions, rewards, gamma):
    """Pair every action with the negated discounted return from that step onward.

    The return is accumulated backwards through the episode as
    `G_t = reward_t + gamma * G_{t+1}`.

    Args:
        actions: Sequence of actions, one per step.
        rewards: Sequence of rewards, one per step.
        gamma: Discount factor applied to the return of the following step.

    Returns:
        Array with one row `[action, -G_t]` per step, in chronological order.
    """
    steps = list(zip(actions, rewards))
    rows = [None] * len(steps)
    discounted = 0
    # Walk from the last step to the first, filling rows at their final position.
    for idx in reversed(range(len(steps))):
        taken, gained = steps[idx]
        discounted = discounted * gamma + gained
        rows[idx] = [taken, (-1) * discounted]
    return np.vstack(rows)

In [7]:
def play_n_games(agent, n=20):
    """Play `n` games and stack their states and training targets into one batch.

    Args:
        agent: Agent passed to `play_game`; its `gamma` is used for discounting.
        n: Number of games to play (at least 1).

    Returns:
        A tuple `(X_batch, y_batch, mean_score)`: the states of all games
        stacked row-wise, the matching `running_reward` rows, and the mean of
        the last-step indices over all `n` games.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    state_batches, target_batches, results = [], [], []
    for _ in range(n):
        states, actions, rewards, last_step = play_game(agent)
        state_batches.append(states)
        target_batches.append(running_reward(actions, rewards, agent.gamma))
        results.append(last_step)
    # Average over every game played; previously only the final game's score
    # was returned, even though all scores were being collected.
    return np.vstack(state_batches), np.vstack(target_batches), np.mean(results)

In [8]:
# Create the CartPole environment, the Spark context and the agent.
# `env` is read as a global by play_game; `sc` is passed to to_rdd.to_RDD
# in the training loop.
env = gym.make('CartPole-v1')
sc = SparkContext.getOrCreate(create_spark_conf())
# Size the agent from the environment's observation and action spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = PGAgent(state_size, action_size)

[2017-11-09 08:45:43,108] Making new env: CartPole-v1


creating: createSequential
creating: createLinear
creating: createReLU
creating: createLinear
creating: createReLU
creating: createLinear
creating: createSigmoid


In [61]:
# Stand-alone criterion instance used by the manual criterion.backward check
# further down; the training loop constructs its own VanillaPGCriterion.
criterion = VanillaPGCriterion()

creating: createVanillaPGCriterion


In [158]:
# Alternate between collecting a batch of games with the current policy and
# running one optimisation epoch on that batch. The loop stops once
# play_n_games reports a score of 498, the last step index play_game can reach.
t = 0
while True:
    t +=1
    X_batch, y_batch, result = play_n_games(agent)
    print(t, result)
    if result == 498:
        break
    # Helper from the local to_rdd module; presumably wraps the numpy batch
    # in a Spark RDD suitable for the Optimizer — TODO confirm.
    rdd_sample = to_rdd.to_RDD(X_batch, y_batch, sc)
    # A new Optimizer, criterion and Adam instance are created every iteration.
    optimizer = Optimizer(model=agent.model,
                                  training_rdd=rdd_sample,
                                  criterion=VanillaPGCriterion(),
                                  optim_method=Adam(learningrate=agent.learning_rate),
                                  end_trigger=MaxEpoch(1),
                                  batch_size=40)
    agent.model = optimizer.optimize()

1 498.0


In [156]:
# Manual single-episode rollout that mirrors play_game step for step, but also
# records the model's raw output and prints it next to the random threshold
# and the chosen action. The resulting `predictions`, `actions` and `rewards`
# are used by the criterion.backward cell below.
state = env.reset()
# Dummy first row so np.vstack has something to stack onto; removed at the end.
memory = np.array([0,0,0,0])
actions = np.array([])
rewards = np.array([])
predictions = []
render = False
for time in range(500):
    if render:
        env.render()
    memory = np.vstack((memory, state))
    prediction = agent.model.forward(state)
    predictions.append(prediction)
    # Same sampling rule as PGAgent.act: action 1 when the output exceeds a uniform draw.
    k = np.random.random()
    action = 1 if prediction > k else 0
    print(prediction, k, action)
    actions = np.append(actions, action)
    state, reward, done, _ = env.step(action)
    if done:
        # The terminating step's reward is replaced with a penalty.
        reward = -10
    rewards = np.append(rewards, reward)
    if done or time == 498:
        break
# Drop the dummy first row.
memory = memory[1:]

[ 0.72940034] 0.787392367493248 0
[ 0.95861894] 0.02909845795646504 1
[ 0.71000075] 0.4856671110154632 1
[ 0.12406699] 0.9778946725339449 0
[ 0.66692138] 0.09056372498809906 1
[ 0.11225829] 0.6092574333416381 0
[ 0.6092304] 0.8671392725577934 0
[ 0.93427151] 0.2690439965758562 1
[ 0.56644166] 0.36105587138772655 1
[ 0.09500801] 0.1375497279211364 0
[ 0.47769654] 0.3690678351247927 1
[ 0.08301767] 0.9021037853985928 0
[ 0.3757984] 0.6055395559501513 0
[ 0.88155466] 0.05894982336233312 1
[ 0.30747992] 0.58193581248729 0
[ 0.85935169] 0.3862830310842337 1
[ 0.26007017] 0.2801344497364676 0
[ 0.83243537] 0.08214456956413563 1
[ 0.22644944] 0.5283224570462257 0
[ 0.80030948] 0.9445480884792506 0
[ 0.97256875] 0.0530843831310116 1
[ 0.7881701] 0.7622891979429022 1
[ 0.19571149] 0.08528917419085469 1
[ 0.04878268] 0.26715410156087904 0
[ 0.16996904] 0.2928407522353137 0
[ 0.69508678] 0.34222553988500537 1
[ 0.15360638] 0.4968368155998144 0
[ 0.63472927] 0.10220746808807835 1
[ 0.13813166] 0.4

In [131]:
# Manually evaluate the criterion's backward pass on the recorded rollout:
# stacked model outputs versus the [action, -discounted return] rows.
criterion.backward(np.array(predictions), running_reward(actions, rewards, agent.gamma))

array([  3.48283251e-08,  -4.53051996e-07,  -9.78830712e-07,
        -1.91535241e-06,  -2.11486213e-06,  -2.04607841e-06,
        -1.68681140e-06,  -1.01327896e-06,  -1.19209290e-06], dtype=float32)

In [132]:
# Builds and displays a fresh network. The return value is not assigned, so
# agent.model is left unchanged by this cell.
# NOTE(review): if the intent was to reset the agent, assign the result to agent.model — confirm.
agent._build_model()

creating: createSequential
creating: createLinear
creating: createReLU
creating: createLinear
creating: createReLU
creating: createLinear
creating: createSigmoid


<bigdl.nn.layer.Sequential at 0x7fc3c2387438>

In [137]:
# Inspect entry 0 of the model's weight list (the 24x4 array in the set_weights call below).
agent.model.get_weights()[0]

array([[ 0.00676675,  0.11883467,  0.59841168,  0.20527022],
       [ 0.08665612,  0.63947171,  0.34233144,  0.04209755],
       [ 0.49580178,  0.37925962,  0.84736603,  0.74442291],
       [ 0.39595661,  0.82160586,  0.67433298,  0.64266759],
       [ 0.99089086,  0.38935891,  0.37352961,  0.80974621],
       [ 0.26870316,  0.85447091,  0.93012381,  0.99730349],
       [ 0.86939889,  0.23188262,  0.65970862,  0.85801119],
       [ 0.20092945,  0.53392005,  0.67173058,  0.58332789],
       [ 0.27892038,  0.36428255,  0.81974107,  0.52398491],
       [ 0.45531234,  0.63030338,  0.49679488,  0.38100365],
       [ 0.6047911 ,  0.50894207,  0.75917244,  0.99744695],
       [ 0.1084374 ,  0.0564434 ,  0.76158839,  0.76929349],
       [ 0.03228432,  0.76585191,  0.84930855,  0.87490374],
       [ 0.58457541,  0.23394585,  0.95764619,  0.08835996],
       [ 0.20682086,  0.75392765,  0.77005661,  0.58849925],
       [ 0.40815654,  0.86748993,  0.42850819,  0.03669885],
       [ 0.40675861,  0.

In [138]:
# Inspect entry 4 of the model's weight list (the 1x24 array in the set_weights call below).
agent.model.get_weights()[4]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

In [139]:
# Inspect entry 5 of the model's weight list (the single-element array in the set_weights call below).
agent.model.get_weights()[5]

array([-5.], dtype=float32)

In [146]:
# Overwrite all six weight arrays by hand: uniform-random values for the first
# four entries, zeros for the last two. With entries 4 and 5 at zero the final
# Linear layer (24 -> 1) presumably outputs 0 for every state, i.e. 0.5 after
# the Sigmoid — TODO confirm the entry order against BigDL's get_weights.
agent.model.set_weights([np.random.rand(24,4), np.random.rand(24), np.random.rand(24,24), 
                         np.random.rand(24), np.zeros([1,24]), np.array([0])])

In [121]:
# Inspect entry 0 of the model's weight list (24x4 in the set_weights call above).
agent.model.get_weights()[0]

array([[ 0.10786043,  0.06879324,  0.18279141,  0.93403041],
       [ 0.67401427,  0.60587656,  0.54568642,  0.5862323 ],
       [ 0.79791135,  0.3021982 ,  0.23653647,  0.29738259],
       [ 0.82383907,  0.37030986,  0.21494901,  0.05929726],
       [ 0.77699906,  0.59251946,  0.64333212,  0.48012295],
       [ 0.54541343,  0.72466719,  0.6437974 ,  0.02818192],
       [ 0.34222278,  0.78598249,  0.57677299,  0.62431824],
       [ 0.11532093,  0.20792405,  0.04566034,  0.91239226],
       [ 0.77339685,  0.2493919 ,  0.52919704,  0.1662911 ],
       [ 0.94910836,  0.49553111,  0.98146671,  0.68857253],
       [ 0.04357319,  0.77741307,  0.53424758,  0.36113578],
       [ 0.10966367,  0.58220541,  0.72671372,  0.35166398],
       [ 0.8036623 ,  0.49881777,  0.65883517,  0.09157259],
       [ 0.43272579,  0.70624197,  0.90560836,  0.00630194],
       [ 0.19146028,  0.26711771,  0.20438041,  0.05637224],
       [ 0.7591815 ,  0.80192637,  0.1486169 ,  0.43165982],
       [ 0.0507346 ,  0.

In [147]:
# Inspect entry 4 of the model's weight list (set to zeros([1,24]) in the set_weights call above).
agent.model.get_weights()[4]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

In [148]:
# Inspect entry 5 of the model's weight list (set to array([0]) in the set_weights call above).
agent.model.get_weights()[5]

array([ 0.], dtype=float32)

In [155]:
# NOTE(review): the recorded output for this cell is an exception,
# "Error unknown input type <class 'list'>", so plain Python lists are rejected here.
# Presumably numpy arrays are expected, with an input matching the model's
# state_size — TODO confirm, then fix or remove this cell.
agent.model.backward([[0.3]], [1])

Exception: Error unknown input type <class 'list'>