In [5]:
import random
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from bigdl.dataset.transformer import Sample
from bigdl.nn.layer import *
from bigdl.util.common import *
from bigdl.optim.optimizer import *
from bigdl.nn.criterion import *

from rl.criterion import *

# Call BigDL's engine initialisation once, in the import cell, so it runs before
# any layer or optimizer object is constructed further down the notebook.
# NOTE(review): `init_engine` comes from one of the bigdl star-imports above
# (presumably bigdl.util.common) -- confirm.
init_engine()

In [6]:
class PGAgent(object):
    """Policy-gradient agent backed by a small BigDL feed-forward policy network.

    The network maps a state vector of length ``state_size`` to a probability
    distribution over ``action_size`` discrete actions (SoftMax output).

    Parameters
    ----------
    state_size : int
        Number of features in one observation.
    action_size : int
        Number of discrete actions; also the width of the output layer.
    gamma : float, optional
        Discount rate stored on the agent for use by the return computation.
    """

    def __init__(self, state_size, action_size, gamma=0.95):
        self.state_size = state_size
        self.action_size = action_size
        # Bounded buffer kept as a public attribute; nothing in this class reads it.
        self.memory = deque(maxlen=2000)
        self.gamma = gamma  # discount rate
        self.model = self._build_model()

    def _build_model(self):
        """Build the policy network: two 24-unit ReLU layers and a SoftMax head."""
        model = Sequential()

        model.add(Linear(self.state_size, 24))
        model.add(ReLU())

        model.add(Linear(24, 24))
        model.add(ReLU())

        # Output width follows action_size instead of a hard-coded 2 so the
        # constructor argument is actually honoured.
        model.add(Linear(24, self.action_size))
        model.add(SoftMax())
        return model

    def action_sampler(self, out):
        """Sample an action index from the probability vector ``out``.

        ``out`` is flattened and converted to float64, then renormalised so that
        small rounding error in the network output (e.g. float32 SoftMax) does
        not make ``np.random.choice`` reject it. Returns an integer in
        ``[0, len(out))``.
        """
        probs = np.asarray(out, dtype=np.float64).ravel()
        total = probs.sum()
        if total > 0:
            probs = probs / total
        # Actions are the indices 0..k-1 where k is the number of probabilities.
        return np.random.choice(probs.size, p=probs)

    def act(self, state):
        """Run the policy network on ``state`` and sample one action from its output."""
        out = self.model.forward(state)
        return self.action_sampler(out)
        

In [7]:
def to_RDD(X, y):
    """Pair each row of ``X`` with the matching row of ``y`` as an RDD of Samples.

    Both inputs are parallelized through the global ``sc``, zipped element-wise,
    and every (feature, label) pair is converted with ``Sample.from_ndarray``.
    """
    def _as_sample(pair):
        features, label = pair[0], pair[1]
        return Sample.from_ndarray(features, label)

    feature_rdd = sc.parallelize(X)
    label_rdd = sc.parallelize(y)
    return feature_rdd.zip(label_rdd).map(_as_sample)

In [8]:
def play_game(agent, render=False, max_steps=499):
    """Play one episode in the global ``env`` using ``agent`` to pick actions.

    Parameters
    ----------
    agent : object
        Must expose ``act(state)`` returning the action to pass to ``env.step``.
    render : bool, optional
        When true, ``env.render()`` is called before every step.
    max_steps : int, optional
        Upper bound on the number of steps played. The default of 499 matches
        the previous hard-coded cut-off (loop index 498).

    Returns
    -------
    states : ndarray, one row per step
        The observation seen *before* each action was taken.
    actions : 1-D float ndarray
        The action chosen at each step.
    rewards : 1-D float ndarray
        The reward of each step; the step on which the environment reports
        ``done`` is overwritten with -10.
    last_step : int
        Zero-based index of the final step played (number of steps minus one).

    Raises
    ------
    ValueError
        If ``max_steps`` is smaller than 1.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    state = env.reset()
    # Plain lists avoid re-copying the whole history on every step and remove
    # the fixed 4-wide dummy row that tied this function to one state size.
    states, actions, rewards = [], [], []
    step = 0
    for step in range(max_steps):
        if render:
            env.render()
        states.append(state)
        action = agent.act(state)
        actions.append(action)
        state, reward, done, _ = env.step(action)
        if done:
            # Penalise the terminating step instead of using the env reward.
            reward = -10
        rewards.append(reward)
        if done:
            break

    # float64 matches the dtype the previous vstack/append accumulation produced.
    return (np.vstack(states).astype(np.float64),
            np.array(actions, dtype=np.float64),
            np.array(rewards, dtype=np.float64),
            step)

In [9]:
def running_reward(actions, rewards, gamma):
    """Attach the discounted return-to-go to every action of an episode.

    Walking the episode backwards, the return of step t is
    ``reward[t] + gamma * return[t + 1]`` (with the return after the last step
    taken as 0). The result has one row per step: column 0 is the action,
    column 1 is that step's discounted return.
    """
    steps = list(zip(actions, rewards))
    rows = [None] * len(steps)
    discounted = 0
    for idx in range(len(steps) - 1, -1, -1):
        act, rew = steps[idx]
        discounted = discounted * gamma + rew
        rows[idx] = [act, discounted]
    return np.vstack(rows)

In [10]:
def get_baseline(rewards):
    """Return the baseline for a set of returns: their arithmetic mean.

    ``rewards`` must provide a ``mean()`` method (e.g. a NumPy array).
    """
    baseline = rewards.mean()
    return baseline

In [22]:
def get_advantage(actions, rewards, gamma):
    """Build the (action, advantage) table for one episode.

    The discounted returns from ``running_reward`` are shifted by the baseline
    from ``get_baseline`` computed over those same returns. Column 0 of the
    result holds the actions, column 1 the baseline-subtracted returns.
    """
    table = running_reward(actions, rewards, gamma)
    returns = table[:, 1]
    # Explicit slice assignment (not an in-place subtract) keeps the original
    # casting behaviour of writing the shifted values back into the table.
    table[:, 1] = returns - get_baseline(returns)
    return table

In [17]:
def normalize(advantages, smallEps=1e-8):
    """Standardise ``advantages`` to zero mean and (approximately) unit spread.

    ``smallEps`` is added to the standard deviation so a constant input does
    not divide by zero.
    """
    centered = advantages - advantages.mean()
    spread = advantages.std() + smallEps
    return centered / spread

In [18]:
def play_n_games(agent, n=20):
    """Play ``n`` episodes and assemble one training batch from all of them.

    Parameters
    ----------
    agent : object
        Passed to ``play_game``; its ``gamma`` attribute is used as the
        discount rate for ``get_advantage``.
    n : int, optional
        Number of episodes to play; must be at least 1.

    Returns
    -------
    X_batch : ndarray
        Every visited state, one row per step, episodes concatenated in order.
    y_batch : ndarray
        One row per step: column 0 is the action taken, column 1 the advantage,
        normalised across the whole batch with ``normalize``.
    score_mean, score_var : float
        Mean and variance of the per-episode score returned by ``play_game``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # Collect per-episode pieces and stack once at the end: this drops the
    # fixed-width dummy first row (which assumed 4-dimensional states) and
    # avoids re-copying the growing batch after every game.
    state_chunks, target_chunks, scores = [], [], []
    for _ in range(n):
        ep_states, ep_actions, ep_rewards, ep_score = play_game(agent)
        state_chunks.append(ep_states)
        target_chunks.append(get_advantage(ep_actions, ep_rewards, agent.gamma))
        scores.append(ep_score)

    X_batch = np.vstack(state_chunks)
    y_batch = np.vstack(target_chunks)
    # Per-episode advantages are put on a common scale over the full batch.
    y_batch[:, 1] = normalize(y_batch[:, 1])

    scores = np.array(scores)
    return X_batch, y_batch, scores.mean(), scores.var()

In [19]:
# Global environment used by play_game(); state/action sizes are read from it
# and handed to the agent so the network dimensions follow the environment.
env = gym.make('CartPole-v1')
# NOTE(review): `sc` is used by to_RDD() but its creation is commented out here;
# presumably the kernel (e.g. a pyspark-launched notebook) already provides it.
# Confirm, otherwise a fresh Restart & Run All will fail with a NameError.
#sc = SparkContext.getOrCreate(create_spark_conf())
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = PGAgent(state_size, action_size)

creating: createSequential
creating: createLinear
creating: createReLU
creating: createLinear
creating: createReLU
creating: createLinear
creating: createSoftMax


In [25]:
# Training loop: 100 policy updates, each computed from 20 freshly played games.
# `record` collects [score_mean, score_variance] per update (plotted in a later
# cell); `exe_times` collects [seconds spent playing, seconds spent training].
#%%time
import timeit

record = []
exe_times = []


for i in range(100):

    # --- collect one batch of experience and time it ---
    start_of_play = timeit.default_timer()
    X_batch, y_batch, result_mean,result_variance = play_n_games(agent,n=20)
    end_of_play = timeit.default_timer()

    print "Running update iteration",i+1,"score_mean =",result_mean,"score_variance =",result_variance
    record.append([result_mean, result_variance])

    # Disabled early-stop check (refers to a `result` name that is not defined here).
    #if result == 498:
    #    break

    # Action ids in column 0 are shifted from 0-based to 1-based in place;
    # per the original author's note, BigDL labels start from 1.
    y_batch[:,0] = y_batch[:,0] + 1
    #print X_batch,y_batch
    #print y_batch.shape

    rdd_sample = to_RDD(X_batch, y_batch)


    # Round the batch size down to a multiple of 8.
    # NOTE(review): 8 is presumably the total core count BigDL requires the batch
    # size to be divisible by -- confirm. Fewer than 8 samples would give 0 here.
    batch_size = X_batch.shape[0] - X_batch.shape[0]%8
    print "using batch_size = ",batch_size

    start_of_train = timeit.default_timer()

    # A brand-new Optimizer (with a new Adam instance and criterion) is built on
    # every pass and run for a single iteration.
    # NOTE(review): this likely discards Adam's accumulated state between updates;
    # the commented-out branch below is the author's sketch for reusing one
    # optimizer instead -- verify against the BigDL Optimizer API before enabling.
    #if i == 0:
    optimizer = Optimizer(model=agent.model,
                                  training_rdd=rdd_sample,
                                  criterion=RFPGCriterion(),
                                  optim_method=Adam(learningrate=0.01),
                                  end_trigger=MaxIteration(1),
                                  batch_size=batch_size)
    #else:
        #optimizer.set_traindata(training_rdd=rdd_sample, batch_size=batch_size)
        #optimizer.set_criterion(RFPGCriterion())
    # The model returned by optimize() replaces the agent's policy for the next games.
    agent.model = optimizer.optimize()
    end_of_train = timeit.default_timer()
    exe_time_game_play = end_of_play-start_of_play
    exe_time_train = end_of_train-start_of_train
    exe_times.append([exe_time_game_play,exe_time_train])

Running update iteration 1 score_mean = 8.55 score_variance = 0.6475
using batch_size =  184
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 2 score_mean = 8.1 score_variance = 0.59
using batch_size =  176
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 3 score_mean = 8.3 score_variance = 0.51
using batch_size =  184
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 4 score_mean = 7.95 score_variance = 0.6475
using batch_size =  176
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 5 score_mean = 8.55 score_variance = 0.4475
using batch_size =  184
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: cre

Running update iteration 42 score_mean = 8.5 score_variance = 0.65
using batch_size =  184
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 43 score_mean = 8.2 score_variance = 0.46
using batch_size =  184
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 44 score_mean = 8.0 score_variance = 0.5
using batch_size =  176
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 45 score_mean = 8.7 score_variance = 1.41
using batch_size =  192
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 46 score_mean = 8.05 score_variance = 0.5475
using batch_size =  176
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: creat

Running update iteration 83 score_mean = 8.55 score_variance = 0.7475
using batch_size =  184
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 84 score_mean = 8.7 score_variance = 0.81
using batch_size =  192
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 85 score_mean = 8.05 score_variance = 0.5475
using batch_size =  176
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 86 score_mean = 8.55 score_variance = 1.0475
using batch_size =  184
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creating: createDistriOptimizer
Running update iteration 87 score_mean = 8.35 score_variance = 1.3275
using batch_size =  184
creating: createRFPGCriterion
creating: createAdam
creating: createMaxIteration
creat

In [None]:
%matplotlib inline
# Plot the per-update statistics gathered in `record` by the training loop:
# score mean on the left y-axis (solid blue) and score variance on a second,
# right-hand y-axis (dashed green), both against the update index.
# NOTE(review): seaborn is imported but not referenced below; presumably kept
# for its import-time styling side effect in older versions -- confirm.
import seaborn
plt.style.use('bmh')
arr = np.asarray(record)
print arr.shape
fig, ax1 = plt.subplots()
# One x position per recorded update.
ind=np.arange(0,arr.shape[0],1)
ax1.plot(ind,arr[:,0], 'b-')
ax1.set_xlabel('no. of updates')
# Label every fifth update only, to keep the x-axis readable.
ax1.set_xticks(ind[::5])
# Make the y-axis label, ticks and tick labels match the line color.
ax1.set_ylabel('score mean', color='b')
ax1.tick_params('y', colors='b')

# Second y-axis sharing the same x-axis, used for the variance curve.
ax2 = ax1.twinx()
ax2.plot(ind,arr[:,1], 'g--')
ax2.set_ylabel('score variance', color='g')
ax2.tick_params('y', colors='g')
plt.grid(True)
#fig.tight_layout()
# Trailing semicolon suppresses the Text repr in the notebook output.
plt.title('BigDL Vanilla PG (Cart-Pole v1)');