# Policy gradient algorithm with BigDL

In [1]:
import random
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from bigdl.dataset.transformer import Sample
from bigdl.nn.layer import *
from bigdl.util.common import *
from bigdl.optim.optimizer import *
from bigdl.nn.criterion import *

from rl.criterion import *

# Start the BigDL engine once, before any BigDL layers or optimizers are built.
# NOTE(review): `init_engine` comes from one of the star imports above
# (presumably `bigdl.util.common`) -- confirm.
init_engine()

In [2]:
class PGAgent:
    """Policy-gradient agent with a learned action-value baseline (BigDL).

    The agent owns two networks:

    * ``model`` -- the policy. It ends in a single sigmoid unit whose output
      is used by :meth:`act` as the probability of picking action ``1``, so
      the policy itself only supports two-action environments.
    * ``value_func`` -- an action-value network with one output per action,
      trained by :meth:`train_val_func` on transitions sampled from
      ``memory``.

    Args:
        state_size: Number of features in one observation.
        action_size: Number of discrete actions (width of the value head).
        batch_size: Number of transitions sampled per value-function update.
        gamma: Discount rate applied to future rewards.
        learning_rate: Learning rate intended for the policy optimizer
            (read by the training code outside this class).
    """

    def __init__(self, state_size, action_size, batch_size=32, gamma=0.95,
                 learning_rate=0.01):
        self.state_size = state_size
        self.action_size = action_size
        # Replay buffer of (state, action, reward, next_state, done) tuples.
        self.memory = deque(maxlen=2000)
        self.gamma = gamma  # discount rate
        self.learning_rate = learning_rate
        self.model = self._build_model()
        self.value_func = self._build_vf()
        self.batch_size = batch_size

    def _build_model(self):
        """Build the policy network: state -> probability of action ``1``."""
        model = Sequential()

        model.add(Linear(self.state_size, 24))
        model.add(ReLU())

        model.add(Linear(24, 24))
        model.add(ReLU())

        model.add(Linear(24, 1))
        model.add(Sigmoid())
        return model

    def _build_vf(self):
        """Build the action-value network: state -> one value per action."""
        model = Sequential()

        model.add(Linear(self.state_size, 24))
        model.add(ReLU())

        model.add(Linear(24, 24))
        model.add(ReLU())

        # One output per action instead of a hard-coded width of 2.
        model.add(Linear(24, self.action_size))
        return model

    def train_val_func(self):
        """Fit the value network on one minibatch sampled from ``memory``.

        Does nothing until ``memory`` holds at least ``batch_size``
        transitions. For each sampled transition the target for the taken
        action is ``reward`` when the episode ended, otherwise
        ``reward + gamma * max(value_func(next_state))``; the other action
        values keep the network's current prediction.

        Relies on the module-level names ``to_rdd`` and ``sc`` being defined
        by the time it is called.
        """
        # Use this instance's own buffer (not a global `agent`).
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        data_batch = []
        target_batch = []
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.value_func.forward(next_state)))
            target_f = self.value_func.forward(state)
            target_f[action] = target

            data_batch.append(state)
            target_batch.append(target_f)
        batch_rdd = to_rdd.to_RDD(np.array(data_batch), np.array(target_batch), sc)

        # `optimize()` returns the trained model, which replaces the old one.
        # The optimizer batch size follows the sampled minibatch size.
        self.value_func = Optimizer(model=self.value_func,
                                    training_rdd=batch_rdd,
                                    criterion=MSECriterion(),
                                    optim_method=Adam(learningrate=0.001),
                                    end_trigger=MaxEpoch(1),
                                    batch_size=self.batch_size).optimize()

    def act(self, state):
        """Sample an action: ``1`` with the policy's probability, else ``0``."""
        result = self.model.forward(state)
        return 1 if result > np.random.random() else 0

In [3]:
import to_rdd

In [4]:
def play_game(agent, render=False, max_steps=499):
    """Play one episode in the module-level ``env`` and record the trajectory.

    Every transition is also appended to ``agent.memory`` as
    ``(state, action, reward, next_state, done)``. When the environment
    reports ``done`` the reward of that step is replaced by ``-10``.

    Args:
        agent: Object exposing ``act(state)`` and an appendable ``memory``.
        render: If true, call ``env.render()`` before every step.
        max_steps: Maximum number of steps to play (the default of 499
            matches the previous hard-coded cap, i.e. a last index of 498).

    Returns:
        A tuple ``(states, actions, rewards, last_step)`` where ``states`` is
        a float array with one row per visited state, ``actions`` and
        ``rewards`` are float arrays with one entry per step, and
        ``last_step`` is the zero-based index of the final step played.

    Raises:
        ValueError: If ``max_steps`` is smaller than 1.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    state = env.reset()
    # Collect in lists and convert once: avoids the quadratic cost of
    # np.vstack/np.append per step and the hard-coded 4-wide placeholder row.
    states, actions, rewards = [], [], []
    step = 0
    for step in range(max_steps):
        if render:
            env.render()
        states.append(state)
        action = agent.act(state)
        actions.append(action)
        next_state, reward, done, _ = env.step(action)
        if done:
            reward = -10
        rewards.append(reward)
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state
        if done:
            break
    return (np.array(states, dtype=np.float64),
            np.array(actions, dtype=np.float64),
            np.array(rewards, dtype=np.float64),
            step)

In [5]:
def running_reward(actions, rewards, st_values, gamma):
    """Compute per-step ``[action, advantage]`` rows for one episode.

    The discounted return is accumulated backwards through the episode
    (``G_t = reward_t + gamma * G_{t+1}``) and the matching baseline from
    ``st_values`` is subtracted from it.

    Args:
        actions: Actions taken, one per step.
        rewards: Rewards received, one per step.
        st_values: Baseline value per step; each entry may be a plain number
            or a one-element array.
        gamma: Discount rate.

    Returns:
        A float array of shape ``(steps, 2)`` whose columns are the action
        and the advantage; shape ``(0, 2)`` for an empty episode.

    Raises:
        ValueError: If a baseline entry holds more than one element.
    """
    steps = list(zip(actions, rewards, st_values))
    result = np.empty((len(steps), 2), dtype=np.float64)
    run_rew = 0.0
    for idx in range(len(steps) - 1, -1, -1):
        action, reward, st_value = steps[idx]
        run_rew = run_rew * gamma + reward
        # Collapse 1-element arrays to scalars so rows are never ragged
        # (recent NumPy refuses to build arrays from [scalar, array] rows).
        baseline = np.asarray(st_value, dtype=np.float64).item()
        result[idx] = (action, run_rew - baseline)
    return result

In [6]:
def get_values(agent, states):
    """Return the policy-weighted state value for every state in ``states``.

    For each state the two action values from ``agent.value_func`` are
    averaged using the output of ``agent.model`` as the weight of action
    ``1`` (and its complement as the weight of action ``0``).

    Args:
        agent: Object exposing ``value_func.forward`` and ``model.forward``.
        states: Iterable of states to evaluate.

    Returns:
        A list with one weighted value per state, in input order.
    """
    def expected_value(observation):
        # Query the value network first, then the policy, for each state.
        action_values = agent.value_func.forward(observation)
        p_one = agent.model.forward(observation)
        return action_values[0]*(1-p_one) + action_values[1]*p_one

    return [expected_value(observation) for observation in states]

In [7]:
def play_n_games(agent, n=20):
    """Play ``n`` episodes and assemble one policy-gradient training batch.

    After every episode the state baselines are computed with
    ``get_values``, turned into ``[action, advantage]`` rows by
    ``running_reward``, and the agent's value function is trained once via
    ``agent.train_val_func()``.

    Args:
        agent: Agent exposing ``gamma``, ``state_size``, ``train_val_func``
            and whatever ``play_game``/``get_values`` need.
        n: Number of episodes to play.

    Returns:
        A tuple ``(X_batch, y_batch, results)``: the stacked states of all
        episodes, the stacked ``[action, advantage]`` rows, and the list of
        last step indices, one per episode.
    """
    # Gather per-episode arrays and stack once at the end; this removes the
    # hard-coded 4-wide placeholder row and the repeated re-copying.
    state_batches = []
    target_batches = []
    results = []
    for _ in range(n):
        states, actions, rewards, last_step = play_game(agent)
        state_batches.append(states)
        st_values = get_values(agent, states)
        target_batches.append(
            running_reward(actions, rewards, st_values, agent.gamma))
        agent.train_val_func()
        results.append(last_step)

    if not state_batches:
        # No games played: return correctly shaped empty batches.
        return np.empty((0, agent.state_size)), np.empty((0, 2)), results
    return np.vstack(state_batches), np.vstack(target_batches), results

In [8]:
# Notebook-wide setup. The names created here (`env`, `sc`, `agent`) are read
# as globals by the functions and cells in the rest of the notebook.
env = gym.make('CartPole-v1')
# Spark context that is passed to `to_rdd.to_RDD` when building training RDDs.
sc = SparkContext.getOrCreate(create_spark_conf())
# Size the agent's networks from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = PGAgent(state_size, action_size)

[2017-11-15 10:33:10,504] Making new env: CartPole-v1


creating: createSequential
creating: createLinear
creating: createReLU
creating: createLinear
creating: createReLU
creating: createLinear
creating: createSigmoid
creating: createSequential
creating: createLinear
creating: createReLU
creating: createLinear
creating: createReLU
creating: createLinear


In [18]:
%%time
# Main training loop: play a batch of games, then take one policy-gradient
# optimisation step on the collected (state, [action, advantage]) samples.
t = 0
while True:
    t += 1
    X_batch, y_batch, result = play_n_games(agent, n=10)
    print(t, np.mean(result), np.var(result), result)
    # `result` is a list of per-game last-step indices, so `result == 498`
    # was always False and the loop never ended. Stop once every game in the
    # batch reached the step cap (last index 498).
    if min(result) >= 498:
        break
    rdd_sample = to_rdd.to_RDD(X_batch, y_batch, sc)
    # Round the batch size down to a multiple of 4.
    # NOTE(review): presumably to satisfy BigDL's batch-size/core-count
    # constraint -- confirm against the cluster configuration.
    batch_size = X_batch.shape[0] - X_batch.shape[0] % 4
    optimizer = Optimizer(model=agent.model,
                          training_rdd=rdd_sample,
                          criterion=VanillaPGCriterion(),
                          optim_method=Adam(learningrate=agent.learning_rate),
                          end_trigger=MaxIteration(1),
                          batch_size=batch_size)
    agent.model = optimizer.optimize()

creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: create

KeyboardInterrupt: 

In [11]:
# Debug rollout: play one episode step by step and print, for every step, the
# largest action value, the policy output, the random draw and the action
# chosen from them. Nothing is stored in the agent's memory here.
state = env.reset()
# NOTE: `memory`, `actions` and `rewards` are only used by the commented-out
# lines below; they are otherwise unused in this cell.
memory = np.array([0,0,0,0])
actions = np.array([])
rewards = np.array([])
for time in range(500):
    #memory = np.vstack((memory, state))
    value = np.max(agent.value_func.forward(state))
    prediction = agent.model.forward(state)
    rand = np.random.random()
    # Same sampling rule as `PGAgent.act`, inlined so `rand` can be printed.
    action = 1 if prediction > rand else 0
    next_state, reward, done, _ = env.step(action)
    print(value, prediction, rand, action)
    if done:
        reward = -10
    #rewards = np.append(rewards, reward)
    #agent.memory.append((state, action, reward, next_state, done))
    state = next_state
    # Stop at episode end or at the same step cap used by `play_game`.
    if done or time == 498:
        break

51.0934 [ 0.42833605] 0.4279940782013488 1
49.7476 [ 0.12392011] 0.7789476004137755 0
51.0649 [ 0.42470047] 0.30633722450750445 1
49.7196 [ 0.12322719] 0.9560686003830586 0
51.0081 [ 0.41189745] 0.14535913244265175 1
49.6873 [ 0.12105967] 0.4575701714053001 0
50.9219 [ 0.39002028] 0.21197378601991868 1
49.6491 [ 0.11747598] 0.7887638214675793 0
50.8044 [ 0.36144838] 0.9008229620168732 0
53.4443 [ 0.77365273] 0.7676517917556165 1
50.7337 [ 0.34014592] 0.6540382134687941 0
53.3416 [ 0.75980932] 0.30984334123408697 1
50.6714 [ 0.32281071] 0.8890152912967643 0
53.2521 [ 0.74725145] 0.08158086154851873 1
50.6159 [ 0.30926046] 0.6637517760399962 0
53.1734 [ 0.73581314] 0.6535097354239021 1
50.5716 [ 0.29750511] 0.5443647334479281 0
53.1167 [ 0.72530746] 0.4996315926456858 1
50.5313 [ 0.28714424] 0.6518687938475007 0
53.0698 [ 0.71553653] 0.9067403724962855 0
57.6769 [ 0.92455024] 0.7451614613615545 1
53.0826 [ 0.71836007] 0.006543745739630347 1
50.5592 [ 0.28573141] 0.2409468830451984 1
49.6

In [86]:
# One manual training step: play 5 games, print their lengths, then run one
# epoch of policy optimisation on the collected samples.
X_batch, y_batch, result = play_n_games(agent, n = 5)
# `t` is the iteration counter left over from the training-loop cell.
print(t, np.mean(result), result)

rdd_sample = to_rdd.to_RDD(X_batch, y_batch, sc)
optimizer = Optimizer(model=agent.model,
                              training_rdd=rdd_sample,
                              criterion=VanillaPGCriterion(),
                              optim_method=Adam(learningrate=agent.learning_rate),
                              end_trigger=MaxEpoch(1),
                              batch_size=4)
# `optimize()` returns the trained model, which replaces the agent's policy.
agent.model = optimizer.optimize()

creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
creating: createMSECriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer
6 73.4 [104, 37, 78, 98, 50]
creating: createVanillaPGCriterion
creating: createAdam
creating: createMaxEpoch
creating: createOptimizer


In [87]:
# Print each state's current weighted value next to its [action, advantage]
# training row from the last batch.
for a, b in zip(get_values(agent, X_batch), y_batch):
    print(a, b)

[ 0.0827365] [  1.          19.81244469]
[ 0.10586917] [  1.          19.79413605]
[ 0.14731486] [  1.          19.75632858]
[ 0.18169846] [  0.          19.72058105]
[ 0.15204923] [  0.          19.73444366]
[ 0.11716782] [  0.          19.74938965]
[ 0.09595046] [  0.          19.74822617]
[ 0.10437021] [  1.          19.73132896]
[ 0.10042933] [  0.          19.72294617]
[ 0.10275005] [  0.          19.70770264]
[ 0.12931788] [  1.          19.68036461]
[ 0.10118809] [  1.         19.6823349]
[ 0.10844115] [  1.          19.66575623]
[ 0.14100969] [  0.          19.63124084]
[ 0.11370825] [  0.         19.6317997]
[ 0.10198044] [  0.          19.61933708]
[ 0.11848997] [  0.          19.59321022]
[ 0.15901318] [  1.          19.54726982]
[ 0.11368705] [  1.         19.5583744]
[ 0.1055854] [  0.          19.54346275]
[ 0.10970016] [  1.          19.51876068]
[ 0.10851577] [  0.          19.49891472]
[ 0.10673793] [  0.          19.47466278]
[ 0.13623157] [  1.          19.43192101]


In [75]:
# Display the replay buffer: (state, action, reward, next_state, done) tuples.
agent.memory

deque([(array([ 0.02573658,  0.01537697, -0.04826151, -0.03206758]),
        0,
        1.0,
        array([ 0.02604412, -0.17902086, -0.04890286,  0.24500627]),
        False),
       (array([ 0.02604412, -0.17902086, -0.04890286,  0.24500627]),
        1,
        1.0,
        array([ 0.02246371,  0.01676422, -0.04400274, -0.06269197]),
        False),
       (array([ 0.02246371,  0.01676422, -0.04400274, -0.06269197]),
        1,
        1.0,
        array([ 0.02279899,  0.21248854, -0.04525658, -0.3689269 ]),
        False),
       (array([ 0.02279899,  0.21248854, -0.04525658, -0.3689269 ]),
        0,
        1.0,
        array([ 0.02704876,  0.01803786, -0.05263511, -0.09085042]),
        False),
       (array([ 0.02704876,  0.01803786, -0.05263511, -0.09085042]),
        1,
        1.0,
        array([ 0.02740952,  0.2138732 , -0.05445212, -0.39966418]),
        False),
       (array([ 0.02740952,  0.2138732 , -0.05445212, -0.39966418]),
        1,
        1.0,
        array([ 0

In [17]:
# Sanity check: print a marker for every training row whose second column
# (the advantage) is positive.
for a in y_batch:
    if a[1] > 0:
        print('a')