In [1]:
import torch 
import gym
import cv2
import numpy as np
from tensorboardX import SummaryWriter
import random
import argparse
import torch.nn as nn
import torch.optim as optim
import gym.spaces
import torchvision.utils as vutils
from collections import namedtuple

In [2]:
HIDDEN_SIZE = 248
PERCENTILE = 20
BATCH_SIZE = 32
n_actions = 2
n_states = 4
EPOCHS = 100

In [3]:
log = gym.logger
log.set_level(gym.logger.INFO)
device = "cuda"

In [4]:
Episode_Step = namedtuple('Episode_Step',['observation', 'action'])
Episode = namedtuple('Episode', ['reward', 'steps'])

In [5]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.pipe = nn.Sequential(
        nn.Linear(in_features = n_states,out_features = HIDDEN_SIZE),
        nn.ReLU(),
        nn.Linear(in_features = HIDDEN_SIZE,out_features = HIDDEN_SIZE),
        nn.ReLU(),
        nn.Linear(in_features = HIDDEN_SIZE,out_features = HIDDEN_SIZE),
        nn.ReLU(),
        nn.Linear(in_features = HIDDEN_SIZE,out_features = HIDDEN_SIZE),
        nn.ReLU(),
        nn.Linear(in_features = HIDDEN_SIZE, out_features = n_actions ),
        
        )
    def forward(self, x):
        return self.pipe(x)


In [6]:
def get_batch(env, policy):
    Batch = []
    obs = env.reset()
    sample_step = []
    sample_reward = 0
    sm = nn.Softmax(dim=1)
    while True:
        obs_v = torch.FloatTensor([obs]).to(device)
        env.render()
        act_probs_v = sm(policy(obs_v)).to("cpu")
        act_probs = act_probs_v.data.numpy()[0]
        action = np.random.choice(len(act_probs), p = act_probs)
        sample_step.append(Episode_Step(observation = obs, action = action))
        
        next_obs, reward, done, _ = env.step(action)
        sample_reward += reward
        obs = next_obs

        if done:
            Batch.append(Episode(steps = sample_step, reward  = sample_reward))
            obs = env.reset()
            sample_obs = []
            sample_action = []
            sample_reward = 0
        
        if len(Batch) == BATCH_SIZE:
            yield Batch
            Batch = []



In [7]:
def filter_batch(batch, percentile):
    rewards = list(map(lambda s: s.reward, batch))
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))
    train_obs = []
    train_act = []
    for example in batch:
        if example.reward < reward_bound:
            continue
        train_obs.extend(map(lambda step: step.observation,example.steps))
        train_act.extend(map(lambda step: step.action,example.steps))
    train_obs_v = torch.FloatTensor(train_obs)
    train_act_v = torch.LongTensor(train_act)
    return train_obs_v, train_act_v, reward_mean

In [8]:
def training(epochs):
    env = gym.make("CartPole-v0").unwrapped
    policy = Policy().to(device)
    reward_update = []
    policy_losses = []
    writer = SummaryWriter()
    batches = get_batch(env, policy)
    loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(params = policy.parameters())
    env.render()
    for e in range(epochs):
        batch = next(batches)
        train_obs, train_act, reward_mean = filter_batch(batch, PERCENTILE)
        
        optimizer.zero_grad()
        policy_act = policy(train_obs.to(device))
        policy_loss = loss(policy_act,train_act.to(device))
        policy_loss.backward()
        optimizer.step()
        policy_losses.append(policy_loss)
        reward_update.append(reward_mean)
        writer.add_scalar("Loss", policy_loss, e)
        writer.add_scalar("Reward", reward_mean, e)
        log.info("Iter %d: Loss=%.3e, Reward=%.3e", e,policy_loss,reward_mean)

    
    env.close()



In [9]:
training(EPOCHS)

INFO: Making new env: CartPole-v0
INFO: Iter 0: Loss=6.932e-01, Reward=2.050e+01
INFO: Iter 1: Loss=6.935e-01, Reward=2.188e+01
INFO: Iter 2: Loss=6.920e-01, Reward=2.519e+01
INFO: Iter 3: Loss=6.919e-01, Reward=2.281e+01
INFO: Iter 4: Loss=6.914e-01, Reward=2.347e+01
INFO: Iter 5: Loss=6.915e-01, Reward=2.612e+01
INFO: Iter 6: Loss=6.911e-01, Reward=2.159e+01
INFO: Iter 7: Loss=6.906e-01, Reward=2.178e+01
INFO: Iter 8: Loss=6.909e-01, Reward=2.119e+01
INFO: Iter 9: Loss=6.910e-01, Reward=2.050e+01
INFO: Iter 10: Loss=6.911e-01, Reward=2.056e+01
INFO: Iter 11: Loss=6.914e-01, Reward=2.328e+01
INFO: Iter 12: Loss=6.911e-01, Reward=1.922e+01
INFO: Iter 13: Loss=6.907e-01, Reward=2.194e+01
INFO: Iter 14: Loss=6.907e-01, Reward=2.241e+01
INFO: Iter 15: Loss=6.906e-01, Reward=1.997e+01
INFO: Iter 16: Loss=6.904e-01, Reward=2.622e+01
INFO: Iter 17: Loss=6.904e-01, Reward=1.916e+01


KeyboardInterrupt: 

In [10]:
env.close()

NameError: name 'env' is not defined