## CartPole

In [1]:
from torch import nn
from torch import optim
import torch
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter
import gym

In [9]:
# Hyperparameters shared by the cells below.
HIDDEN_SIZE = 128  # hidden_size handed to Net when the policy network is built
BATCH_SIZE = 16  # number of finished episodes iterate_batches collects per batch
PERCENTILE = 70  # reward percentile filter_batch uses as its cutoff

# Policy network: a fully connected model with a single hidden layer.
class Net(nn.Module):
    """Map an observation vector to one raw (un-normalised) score per action.

    Args:
        obs_size: number of input features of the first linear layer.
        hidden_size: width of the hidden layer.
        n_actions: number of outputs of the last linear layer.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Linear -> ReLU -> Linear; no softmax is applied inside the network.
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.pipe = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its output."""
        scores = self.pipe(x)
        return scores

In [12]:
# One finished episode: its total reward and the list of EpisodeStep records.
Episode = namedtuple("Episode", field_names=["reward", "steps"])
# One step of an episode: the observation the agent saw and the action it took.
EpisodeStep = namedtuple("EpisodeStep", field_names=["observation", "action"])


def iterate_batches(env, net, batch_size):
    """Play episodes forever and yield them in batches.

    The generator never terminates on its own; the caller decides when to
    stop iterating.

    Args:
        env: environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(next_obs, reward, is_done, info)``.
        net: callable mapping a ``1 x n`` float tensor of observations to a
            ``1 x n_actions`` tensor of raw action scores.
        batch_size: number of finished episodes per yielded batch.

    Yields:
        A list of ``batch_size`` ``Episode`` tuples. Each yielded list is a
        fresh object, so callers may safely keep it.
    """
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        # Turn obs into a 2-dimensional 1 x n tensor (from 1-dimensional n).
        obs_v = torch.FloatTensor([obs])
        # Softmax converts the network's raw scores into action probabilities.
        act_probs_v = sm(net(obs_v))
        # Convert to a numpy array and drop the batch dimension.
        act_probs = act_probs_v.data.numpy()[0]

        # Sample the action from the probabilities *before* stepping the
        # environment, so the executed action is the one that gets recorded.
        # NOTE: sampling (instead of a deterministic np.argmax) introduces
        #       proper randomness and helps exploration.
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)

        episode_reward += reward  # accumulate the reward of every step
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                # Rebind rather than clear(): clearing would empty the very
                # list object the caller has just been handed.
                batch = []

        obs = next_obs

In [13]:
def filter_batch(batch, percentile):
    """Select the best episodes of a batch and flatten them into training data.

    Args:
        batch: sequence of episodes, each with a ``reward`` and a ``steps``
            list whose items carry ``observation`` and ``action``.
        percentile: percentile of the episode rewards used as the cutoff.

    Returns:
        A tuple ``(train_obs_v, train_act_v, reward_bound, reward_mean)``:
        a float tensor of observations and a long tensor of actions taken
        from every episode whose reward is not below the bound, the bound
        itself, and the mean reward of the whole batch (for monitoring).
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))

    # Episodes strictly below the bound are dropped; the rest are kept.
    elite = [episode for episode in batch if not episode.reward < reward_bound]
    train_obs = [step.observation for episode in elite for step in episode.steps]
    train_act = [step.action for episode in elite for step in episode.steps]

    return (
        torch.FloatTensor(train_obs),
        torch.LongTensor(train_act),
        reward_bound,
        reward_mean,
    )

In [8]:
# Build the environment and size the policy network from its spaces.
env = gym.make("CartPole-v0")
#env=gym.wrappers.Monitor(env,directory="mon",force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
net = Net(obs_size, HIDDEN_SIZE, n_actions)

# The network outputs raw scores, and the loss is applied to them directly.
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)
writer = SummaryWriter()

# try/finally guarantees the writer is closed even if training raises or
# the cell is interrupted.
try:
    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        # Keep only the best episodes and train the network to reproduce
        # the actions taken in them.
        obs_v, acts_v, reward_b, reward_m = filter_batch(batch, PERCENTILE)
        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, acts_v)
        loss_v.backward()
        optimizer.step()

        loss = loss_v.item()
        print(f"{iter_no}: loss={round(loss,3)}, reward_mean={round(reward_m,1)}, reward_bound={round(reward_b,1)}")
        writer.add_scalar("loss", loss, iter_no)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)

        # Stop once the mean episode reward of a batch exceeds 199.
        if reward_m > 199:
            print("Solved!")
            break
finally:
    writer.close()

0: loss=0.677, reward_mean=19.3, reward_bound=26.0
1: loss=0.676, reward_mean=16.1, reward_bound=19.0
2: loss=0.676, reward_mean=32.4, reward_bound=32.0
3: loss=0.665, reward_mean=29.9, reward_bound=33.0
4: loss=0.655, reward_mean=21.8, reward_bound=25.5
5: loss=0.637, reward_mean=38.3, reward_bound=51.0
6: loss=0.646, reward_mean=33.1, reward_bound=37.0
7: loss=0.621, reward_mean=40.8, reward_bound=45.5
8: loss=0.626, reward_mean=47.9, reward_bound=57.0
9: loss=0.617, reward_mean=44.4, reward_bound=55.0
10: loss=0.597, reward_mean=45.4, reward_bound=45.5
11: loss=0.612, reward_mean=54.8, reward_bound=60.5
12: loss=0.593, reward_mean=67.0, reward_bound=85.0
13: loss=0.587, reward_mean=50.6, reward_bound=55.0
14: loss=0.586, reward_mean=66.0, reward_bound=65.5
15: loss=0.596, reward_mean=66.4, reward_bound=74.0
16: loss=0.562, reward_mean=60.2, reward_bound=61.5
17: loss=0.58, reward_mean=53.2, reward_bound=59.5
18: loss=0.572, reward_mean=58.1, reward_bound=61.0
19: loss=0.571, reward_