In [1]:
import gym
import random
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import matplotlib.pyplot as plt
from itertools import count
import numpy as np

In [2]:
# Shared CartPole environment instance; the training loop and the evaluation
# cell further down both reset and step this same object.
env = gym.make('CartPole-v0')

In [3]:
HIDDEN_LAYER = 256  # NN hidden layer size
LR = 0.001  # learning rate handed to the optimizer
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def FloatTensor(x):
    """Return a new float32 tensor on ``device`` holding the values of ``x``.

    Args:
        x: a scalar, a (nested) sequence of numbers, a NumPy array, a list of
            NumPy arrays, or an existing tensor.

    Returns:
        torch.Tensor: a freshly allocated ``torch.float`` tensor on ``device``
        that does not share memory or autograd history with ``x``.
    """
    if isinstance(x, torch.Tensor):
        # Explicit copy: avoids the warning torch.tensor() raises when it is
        # handed a tensor, while keeping the same "detached copy" semantics.
        return x.detach().clone().to(device=device, dtype=torch.float)
    # Callers pass ``[observation]`` where the observation is a NumPy array.
    # Building a tensor straight from a list of ndarrays is PyTorch's slow
    # path (and triggers a UserWarning), so collapse it to one ndarray first.
    return torch.tensor(np.asarray(x), device=device, dtype=torch.float)

class Network(nn.Module):
    """Small MLP policy: observation -> hidden ReLU layer -> action probabilities.

    Args:
        hidden_size: width of the hidden layer. ``None`` (the default) uses the
            module-level ``HIDDEN_LAYER`` constant.
        obs_dim: number of features in one observation (default 4).
        n_actions: number of discrete actions (default 2).
    """

    def __init__(self, hidden_size=None, obs_dim=4, n_actions=2):
        super().__init__()
        if hidden_size is None:
            hidden_size = HIDDEN_LAYER
        self.l1 = nn.Linear(obs_dim, hidden_size)
        self.l2 = nn.Linear(hidden_size, n_actions)
        # Normalise over the action axis explicitly. Without ``dim`` PyTorch
        # warns and guesses the axis from the input rank (dim 0 for 3-D input),
        # which would normalise across the wrong dimension.
        self.sm = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for ``x``; the last axis sums to 1."""
        hidden = F.relu(self.l1(x))
        return self.sm(self.l2(hidden))

    def get_act_logp(self, x):
        """Sample an action from the policy and return ``(action, log_prob)``.

        The log-probability stays attached to the autograd graph so it can be
        used directly in a policy-gradient loss.
        """
        act_p = self.forward(x)
        m = torch.distributions.Categorical(probs=act_p)
        act = m.sample()
        return act, m.log_prob(act)

    def act(self, x):
        """Sample an action without tracking gradients (for evaluation rollouts)."""
        with torch.no_grad():
            act, _ = self.get_act_logp(x)
        return act

# Build the policy network, move its parameters to the selected device, and
# attach an Adam optimizer that uses the configured learning rate.
model = Network()
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)

In [4]:
def select_action(state):
    """Sample an action for ``state`` from the global policy ``model``.

    Returns:
        tuple: ``(action, log_prob)`` exactly as produced by
        ``model.get_act_logp``.
    """
    return model.get_act_logp(state)

In [5]:
episode_durations = []  # one entry per finished training episode (filled by the training loop)
PLOT_MEAN = 10  # window length of the moving average drawn over the raw curve


def plot_durations():
    """Redraw figure 2 with the per-episode durations recorded so far.

    Plots the raw values from ``episode_durations`` and, once at least
    ``PLOT_MEAN`` episodes are available, a ``PLOT_MEAN``-episode moving
    average on top of them.
    """
    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')

    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    plt.plot(durations_t.numpy())

    if len(durations_t) >= PLOT_MEAN:
        # Sliding windows of PLOT_MEAN episodes, averaged per window. The first
        # PLOT_MEAN - 1 points are zero-padded so both curves share an x-axis.
        windows = durations_t.unfold(0, PLOT_MEAN, 1)
        window_means = windows.mean(1).view(-1)
        padded_means = torch.cat((torch.zeros(PLOT_MEAN - 1), window_means))
        plt.plot(padded_means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated

In [6]:
gamma = 1  # default discount factor; 1 means undiscounted returns


def Qsa(episode_rewards, discount=None):
    """Return the discounted reward-to-go for every step of one episode.

    Entry ``j`` of the result is ``sum(discount ** i * r_{j+i})`` over the
    remaining rewards, i.e. the Monte-Carlo estimate of Q(s_j, a_j).

    Args:
        episode_rewards: sequence of per-step rewards in time order.
        discount: discount factor; ``None`` (the default) uses the
            module-level ``gamma``.

    Returns:
        list: one return per step, same length as ``episode_rewards``
        (empty for an empty episode).
    """
    if discount is None:
        discount = gamma
    # Walk the episode backwards so each return reuses the one after it:
    # G_t = r_t + discount * G_{t+1}. This is linear in the episode length,
    # where summing every suffix separately is quadratic.
    returns = []
    running = 0.0
    for reward in reversed(episode_rewards):
        running = reward + discount * running
        returns.append(running)
    returns.reverse()
    return returns

In [40]:
def run_episode(episode = 50):
    """Train the global ``model`` with REINFORCE for ``episode`` episodes.

    Each episode is rolled out to termination with actions sampled from the
    policy, then a single gradient step is taken on the Monte-Carlo policy
    gradient loss. The episode length is appended to ``episode_durations`` and
    the live plot is refreshed after every episode.

    Args:
        episode: number of training episodes to run.
    """
    plt.ion()

    for e in range(episode):
        state = FloatTensor([env.reset()])

        logps = []
        rewards = []

        for step in count():
            action, logp = select_action(state)
            next_state, reward, done, _ = env.step(action.item())
            rewards.append(reward)
            logps.append(logp)
            state = FloatTensor([next_state])

            if done:
                break

        # ``step`` is the zero-based index of the last action, so the number
        # of steps actually taken is one more than that.
        duration = len(rewards)

        # REINFORCE loss: mean over *all* steps of -log pi(a_t | s_t) * G_t.
        # Flattening after stack works whether each log-prob is 0-d or (1,).
        returns = FloatTensor(Qsa(rewards))
        log_probs = torch.stack(logps).reshape(-1)
        policy_loss = -(log_probs * returns).mean()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        # Record first, then draw, so the plot includes the episode that just ended.
        episode_durations.append(duration)
        plot_durations()

        if e % 50 == 0:
            print("{2} Episode {0} finished after {1} steps".format(e, duration, '\033[92m' if duration >= 195 else '\033[0m'))

    print('Complete')
    env.close()
    plt.ioff()
    plt.show()

In [43]:
# Switch matplotlib to a GUI backend for the duration of training so that
# plot_durations() can redraw its figure in a separate window.
%matplotlib
run_episode(500)
# Go back to inline rendering for the cells that follow.
%matplotlib inline

Using matplotlib backend: TkAgg
[0m Episode 0 finished after 34 steps
[0m Episode 10 finished after 18 steps
[0m Episode 20 finished after 24 steps
[0m Episode 30 finished after 36 steps
[0m Episode 40 finished after 11 steps
[0m Episode 50 finished after 21 steps
[0m Episode 60 finished after 15 steps
[0m Episode 70 finished after 18 steps
[0m Episode 80 finished after 13 steps
[0m Episode 90 finished after 21 steps
[0m Episode 100 finished after 21 steps
[0m Episode 110 finished after 13 steps
[0m Episode 120 finished after 32 steps
[0m Episode 130 finished after 17 steps
[0m Episode 140 finished after 20 steps
[0m Episode 150 finished after 17 steps
[0m Episode 160 finished after 43 steps
[0m Episode 170 finished after 95 steps
[0m Episode 180 finished after 25 steps
[0m Episode 190 finished after 16 steps
[0m Episode 200 finished after 19 steps
[0m Episode 210 finished after 68 steps
[0m Episode 220 finished after 11 steps
[0m Episode 230 finished after 36 s

In [44]:
# Evaluate the trained policy: run 100 episodes with actions sampled from the
# policy (no gradient tracking) and report the mean episode length.
steps = []
for t in range(100):

    obs = FloatTensor([env.reset()])
    for step in range(300):
        act = model.act(obs)
        nobs, r, done, _ = env.step(act.item())

        obs = FloatTensor([nobs])

        if done:
            break

    # ``step`` is the zero-based index of the last action, so the episode
    # lasted step + 1 steps. Recording here, not inside ``if done``, also
    # counts episodes that reach the 300-step cap without terminating.
    steps.append(step + 1)
print("100 times, average : ", np.average(steps), "steps")

100 times, average :  184.33 steps
