In [1]:
import argparse
import gym
import numpy as np
from itertools import count
from syft.controller import tensors, models


import syft
import syft.interfaces.torch.actual_torch as actual_torch
import syft.interfaces.torch as torch
import syft.interfaces.torch.nn as nn
import syft.interfaces.torch.nn.functional as F
import syft.interfaces.torch.optim as optim
from syft.interfaces.torch.autograd import Variable
from syft.interfaces.torch.distributions import Categorical

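# Stock-PyTorch equivalents of the syft.interfaces.torch imports above.
# NOTE(review): presumably these are swapped in (and the syft imports commented
# out) to run the notebook against real PyTorch - confirm.
# import torch as torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# from torch.autograd import Variable
# from torch.distributions import Categorical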

# Discount factor applied when accumulating returns (R = r + gamma * R).
gamma = 0.9
# Seed handed to both the environment and the torch interface below.
seed = 543
# When True, the rollout loops call env.render() after every step.
render = False
# The training loop prints a progress line every `log_interval` episodes.
log_interval = 100

# Build the CartPole environment and seed both sources of randomness.
env = gym.make('CartPole-v0')
env.seed(seed)
torch.manual_seed(seed)

class Policy(nn.Module):
    """Single linear layer (4 inputs -> 2 outputs) followed by a softmax.

    Besides the layer itself, the instance carries two per-episode buffers
    that the rest of the notebook appends to and later clears:

    * ``saved_log_probs`` - log-probabilities of the actions that were taken
    * ``rewards``         - rewards returned by the environment
    """

    def __init__(self):
        super().__init__()
        self.affine = nn.Linear(4, 2)

        # Episode buffers; filled by the rollout code, emptied after an update.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x):
        """Return softmax (over dim 1) of the linear layer applied to ``x``."""
        return F.softmax(self.affine(x), dim=1)
    
# Single policy instance shared by select_action and the training cells below.
policy = Policy()

# Fixed action sequence that select_action replays in place of sampled actions
# (testing aid - see the override inside select_action).
cached_actions = [1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0]
# Every action chosen so far; its length is used as the index into cached_actions.
actions = list()

def select_action(state):
    """Choose an action for ``state`` and record its log-probability.

    The observation is converted to a float tensor with a leading batch
    dimension, passed through ``policy`` and used to build a ``Categorical``
    distribution from which an action is sampled.

    While fewer than ``len(cached_actions)`` actions have been chosen, the
    sampled action is replaced by the next entry of ``cached_actions`` so the
    first steps are reproducible. Once the cached sequence is exhausted the
    sampled action is used instead of raising ``IndexError``, so the training
    loop can keep running past the replayed prefix.

    Args:
        state: NumPy array observation (it is passed to ``torch.from_numpy``).

    Returns:
        ``action.data[0]`` - the first element of the chosen action's data.

    Side effects:
        Appends the action to the global ``actions`` list and its
        log-probability to ``policy.saved_log_probs``.
    """
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(Variable(state))
    m = Categorical(probs)
    action = m.sample()

    # testing purposes only - comment out when using actual pytorch
    # Replay the cached sequence only while entries remain; `actions` is never
    # reset between episodes, so an unconditional lookup overruns the list.
    step = len(actions)
    if step < len(cached_actions):
        action = Variable(torch.IntTensor([cached_actions[step]]))

    actions.append(action)
    policy.saved_log_probs.append(m.log_prob(action))
    return action.data[0]

# SGD over the policy's parameters; finish_episode() calls zero_grad() and step() on it.
optimizer = optim.SGD(policy.parameters(), lr=0.15)

[2017-12-30 08:38:36,316] Making new env: CartPole-v0


In [2]:
running_reward = 10

# Roll out one episode, storing each reward on the policy.
state = env.reset()
for t in range(10000):  # Don't infinite loop while learning
    action = select_action(state)
    state, reward, done, _ = env.step(action)
    if render:
        env.render()
    policy.rewards.append(reward)
    if done:
        break

# Moving average of the episode length (t is the last step index reached).
running_reward = 0.99 * running_reward + 0.01 * t

# Discounted return for every step: accumulate from the last reward backwards,
# then flip the list so it lines up with policy.saved_log_probs.
R = 0
rewards = []
for r in reversed(policy.rewards):
    R = r + gamma * R
    rewards.append(R)
rewards.reverse()

# Normalise the returns; the float32 epsilon guards against a zero std.
rewards = torch.Tensor(rewards)
rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)

# One loss term per step; kept as a list so it can be inspected in the next cell.
policy_loss = []
for log_prob, reward in zip(policy.saved_log_probs, rewards):
    policy_loss.append(-log_prob * reward)

In [5]:
# Display the list of per-step loss terms built in the previous cell.
policy_loss

[[[ 1.085435]]
 [syft.FloatTensor:1081 grad:None size:1x1 c:[] p:[1080, 1082] init:mul_scalar]
 
 	-----------creators-----------
 	[syft.FloatTensor:1080 grad:None size:1x1 c:[1081] p:[919] init:neg]
 	[syft.FloatTensor:1082 grad:None size:1 c:[] p:[] init:mul_scalar]
 	------------------------------
 
 , [[ 0.6652827]]
 [syft.FloatTensor:1084 grad:None size:1x1 c:[] p:[1083, 1085] init:mul_scalar]
 
 	-----------creators-----------
 	[syft.FloatTensor:1083 grad:None size:1x1 c:[1084] p:[928] init:neg]
 	[syft.FloatTensor:1085 grad:None size:1 c:[] p:[] init:mul_scalar]
 	------------------------------
 
 , [[ 0.930799]]
 [syft.FloatTensor:1087 grad:None size:1x1 c:[] p:[1086, 1088] init:mul_scalar]
 
 	-----------creators-----------
 	[syft.FloatTensor:1086 grad:None size:1x1 c:[1087] p:[937] init:neg]
 	[syft.FloatTensor:1088 grad:None size:1 c:[] p:[] init:mul_scalar]
 	------------------------------
 
 , [[ 0.5522543]]
 [syft.FloatTensor:1090 grad:None size:1x1 c:[] p:[1089, 1091]

In [18]:
def finish_episode():
    """Apply one policy-gradient update from the episode stored on ``policy``.

    Reads ``policy.rewards`` and ``policy.saved_log_probs``, computes the
    normalised discounted return for each step, sums ``-log_prob * return``
    over the episode, backpropagates that sum and takes one ``optimizer``
    step. Both episode buffers are emptied in place before returning.
    """
    # Discounted returns, accumulated from the final step backwards and then
    # flipped into chronological order.
    discounted = []
    running_return = 0
    for step_reward in reversed(policy.rewards):
        running_return = step_reward + gamma * running_return
        discounted.append(running_return)
    discounted.reverse()

    # Normalise; the float32 epsilon guards against a zero standard deviation.
    returns = torch.Tensor(discounted)
    returns = (returns - returns.mean()) / (returns.std() + np.finfo(np.float32).eps)

    step_losses = [
        -log_prob * step_return
        for log_prob, step_return in zip(policy.saved_log_probs, returns)
    ]

    optimizer.zero_grad()
    episode_loss = torch.cat(step_losses).sum()
    episode_loss.backward()
    optimizer.step()

    # Clear in place so the lists held by `policy` are reused next episode.
    del policy.rewards[:]
    del policy.saved_log_probs[:]


# Moving average of episode length; compared against the env's reward threshold.
running_reward = 10
progress_template = 'Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'
solved_template = ("Solved! Running reward is now {} and "
                   "the last episode runs to {} time steps!")

for i_episode in count(1):
    # Roll out one episode, recording rewards on the policy.
    state = env.reset()
    for t in range(10000):  # Don't infinite loop while learning
        action = select_action(state)
        state, reward, done, _ = env.step(action)
        if render:
            env.render()
        policy.rewards.append(reward)
        if done:
            break

    running_reward = 0.99 * running_reward + 0.01 * t
    finish_episode()

    if i_episode % log_interval == 0:
        print(progress_template.format(i_episode, t, running_reward))
    if running_reward > env.spec.reward_threshold:
        print(solved_template.format(running_reward, t))
        break


[2017-12-25 14:32:57,551] Making new env: CartPole-v0


Episode 100	Last length:    98	Average length: 66.12
Episode 200	Last length:    17	Average length: 85.61
Episode 300	Last length:    49	Average length: 80.06
Episode 400	Last length:   169	Average length: 138.29
Episode 500	Last length:    41	Average length: 135.51
Episode 600	Last length:    88	Average length: 126.97
Episode 700	Last length:   177	Average length: 163.88
Episode 800	Last length:   126	Average length: 179.32
Episode 900	Last length:    63	Average length: 138.71
Episode 1000	Last length:   199	Average length: 139.44
Episode 1100	Last length:   199	Average length: 176.30
Episode 1200	Last length:   199	Average length: 189.80
Solved! Running reward is now 195.00467036348365 and the last episode runs to 199 time steps!
