In [13]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter
import random

In [14]:
# One finished episode: its accumulated reward and the ordered list of steps taken.
Episode = namedtuple('Episode', field_names='reward steps')
# One environment step: the observation the agent acted on and the action it chose.
Episode_Step = namedtuple('Episode_Step', field_names='observation action')

In [15]:
class DiscreteOneHot(gym.ObservationWrapper):
    """Observation wrapper that one-hot encodes a ``Discrete`` observation space.

    The wrapped environment must expose a ``gym.spaces.Discrete`` observation
    space with ``n`` states. This wrapper advertises a float32 ``Box`` of shape
    ``(n,)`` bounded by [0.0, 1.0] in its place.
    """

    def __init__(self, env):
        super(DiscreteOneHot, self).__init__(env)
        source_space = env.observation_space
        assert isinstance(source_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=(source_space.n,), dtype=np.float32
        )

    def observation(self, observation):
        """Return a copy of the Box lower bound with index ``observation`` set to 1.0."""
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

In [16]:
class Player(nn.Module):
    """Small fully connected policy network: Linear -> ReLU -> Linear.

    Args:
        in_units: size of each input vector.
        hidden_units: width of the single hidden layer.
        out_units: number of output scores per input.
    """

    def __init__(self, in_units, hidden_units, out_units):
        super(Player, self).__init__()
        layers = [
            nn.Linear(in_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, out_units),
        ]
        # Kept under the attribute name `pipe` so parameter names stay `pipe.0.*` / `pipe.2.*`.
        self.pipe = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw output scores for ``x``; no softmax is applied here."""
        scores = self.pipe(x)
        return scores

In [17]:
# --- Policy network dimensions, passed to Player(...) inside train() ---
IN_UNITS = 16        # width of the observation vector fed to the network
OUT_UNITS = 4        # number of action scores the network emits
HIDDEN_UNITS = 128   # width of the single hidden layer

# --- Sampling / elite-selection settings ---
BATCH_SIZE = 100     # completed episodes get_batch() gathers before yielding
GAMMA = 0.9          # discount base: filter_batch() scales a reward by GAMMA ** episode_length
PERCENTILE = 30      # percentile of discounted rewards an episode must strictly exceed to be kept

In [18]:
def get_batch(env, policy):
    """Endlessly play episodes with ``policy`` and yield them in fixed-size batches.

    Each action is sampled from the softmax of the policy's output for the
    current observation. Whenever ``BATCH_SIZE`` episodes have finished, the
    list of ``Episode`` records is yielded and a new list is started. The
    generator never stops on its own; the consumer decides when to stop.

    Args:
        env: environment using the API seen in this function: ``reset()``
            returns an observation and ``step(action)`` returns a 4-tuple
            ``(next_obs, reward, done, info)``.
        policy: callable taking a float tensor holding one observation
            (batch of one) and returning one row of action scores.

    Yields:
        list[Episode]: ``BATCH_SIZE`` episodes, each carrying its summed
        (undiscounted) reward and its list of ``Episode_Step`` records.
    """
    batch = []
    episode_steps = []
    episode_reward = 0.0
    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        # Sampling never backpropagates, so skip autograd bookkeeping. Converting
        # through a single ndarray avoids the slow list-of-ndarrays tensor path.
        with torch.no_grad():
            obs_v = torch.as_tensor(np.array([obs], dtype=np.float32))
            act_probs = sm(policy(obs_v)).numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)

        next_obs, reward, done, _ = env.step(action)
        episode_reward += reward
        # Record the observation the action was chosen from, not the one it led to.
        episode_steps.append(Episode_Step(observation=obs, action=action))

        if done:
            batch.append(Episode(steps=episode_steps, reward=episode_reward))
            episode_steps = []
            episode_reward = 0.0
            # Start the next episode; its first observation replaces the terminal one.
            next_obs = env.reset()

            if len(batch) == BATCH_SIZE:
                yield batch
                batch = []
        obs = next_obs

In [19]:
def filter_batch(batch, percentile, gamma=None):
    """Keep the episodes whose length-discounted reward beats a percentile bound.

    Each episode's score is ``reward * gamma ** len(steps)``, so a rewarded
    episode with fewer steps scores higher. Episodes scoring strictly above the
    ``percentile``-th percentile of all scores are kept, and their steps are
    flattened into training tensors.

    Args:
        batch: sequence of episode records exposing ``.reward`` and ``.steps``;
            each step exposes ``.observation`` and ``.action``.
        percentile: percentile (0-100) handed to ``np.percentile``.
        gamma: discount base; defaults to the module-level ``GAMMA`` when None.

    Returns:
        tuple: ``(elite_batch, train_obs_v, train_act_v, reward_bound)`` where
        ``elite_batch`` is the list of kept episodes in input order,
        ``train_obs_v`` is a float32 tensor of their observations,
        ``train_act_v`` is an int64 tensor of the matching actions, and
        ``reward_bound`` is the percentile threshold. When nothing is kept,
        both tensors are empty.
    """
    if gamma is None:
        gamma = GAMMA
    disc_rewards = [episode.reward * (gamma ** len(episode.steps)) for episode in batch]
    reward_bound = np.percentile(disc_rewards, percentile)

    train_obs = []
    train_act = []
    elite_batch = []
    for example, discounted_reward in zip(batch, disc_rewards):
        # Strict comparison: episodes sitting exactly on the bound are dropped.
        if discounted_reward > reward_bound:
            train_obs.extend(step.observation for step in example.steps)
            train_act.extend(step.action for step in example.steps)
            elite_batch.append(example)

    # Stack into one ndarray first; building a tensor from a list of ndarrays
    # copies element by element and is far slower.
    train_obs_v = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
    train_act_v = torch.as_tensor(np.asarray(train_act, dtype=np.int64))
    return elite_batch, train_obs_v, train_act_v, reward_bound

In [20]:
# Reuse gym's logger module for progress messages and raise its verbosity to INFO.
log = gym.logger
log.set_level(log.INFO)

In [21]:
def train():
    """Train a ``Player`` policy on one-hot-encoded FrozenLake-v0 from elite episodes.

    Each iteration takes one batch from ``get_batch``, merges it with the elite
    episodes kept so far, and re-filters the union with ``filter_batch``. It
    then takes one Adam step that minimises cross-entropy between the policy's
    scores and the actions taken in the surviving episodes. At most the last
    500 elite episodes are carried forward. Loss, mean batch reward and the
    reward bound are written to TensorBoard.

    Training stops once the mean (undiscounted) reward of a freshly sampled
    batch reaches 0.8. The summary writer and the environment are closed on
    every exit path, including exceptions and a manual interrupt.
    """
    env = DiscreteOneHot(gym.make("FrozenLake-v0"))
    writer = SummaryWriter()
    try:
        player = Player(IN_UNITS, HIDDEN_UNITS, OUT_UNITS)
        loss = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params=player.parameters(), lr=0.001)
        full_batch = []
        for ite, batch in enumerate(get_batch(env, player)):
            # Mean reward of the fresh batch only; the carried-over elite is excluded.
            reward_mean = float(np.mean([episode.reward for episode in batch]))
            full_batch, train_obs, train_act, reward_bound = filter_batch(full_batch + batch, PERCENTILE)
            if not full_batch:
                # No episode beat the bound, so there is nothing to learn from this round.
                continue

            # Cap the elite pool carried into the next iteration.
            full_batch = full_batch[-500:]

            optimizer.zero_grad()
            actions = player(train_obs)
            player_loss = loss(actions, train_act)
            player_loss.backward()
            optimizer.step()

            # Convert once to a plain float so logging does not hold on to the tensor.
            loss_value = player_loss.item()
            writer.add_scalar("Loss", loss_value, ite)
            writer.add_scalar("Reward Mean", reward_mean, ite)
            writer.add_scalar("Reward Bound", reward_bound, ite)
            log.info("Iter %d: Loss=%.3f, Reward=%.3f", ite, loss_value, reward_mean)
            if reward_mean >= 0.8:
                break
    finally:
        # The sampling loop is endless, so cleanup must not depend on reaching the goal.
        writer.close()
        env.close()

In [12]:
# Run the training loop; train() only returns once a batch's mean reward reaches 0.8.
train()

INFO: Making new env: FrozenLake-v0
INFO: Iter 0: Loss=1.402, Reward=0.010
INFO: Iter 1: Loss=1.393, Reward=0.010
INFO: Iter 2: Loss=1.383, Reward=0.020
INFO: Iter 3: Loss=1.386, Reward=0.020
INFO: Iter 4: Loss=1.392, Reward=0.020
INFO: Iter 5: Loss=1.389, Reward=0.010
INFO: Iter 6: Loss=1.385, Reward=0.020
INFO: Iter 7: Loss=1.383, Reward=0.020
INFO: Iter 8: Loss=1.385, Reward=0.010
INFO: Iter 9: Loss=1.384, Reward=0.000
INFO: Iter 10: Loss=1.382, Reward=0.000
INFO: Iter 11: Loss=1.378, Reward=0.050
INFO: Iter 12: Loss=1.377, Reward=0.020
INFO: Iter 13: Loss=1.378, Reward=0.010
INFO: Iter 14: Loss=1.377, Reward=0.020
INFO: Iter 15: Loss=1.376, Reward=0.000
INFO: Iter 16: Loss=1.375, Reward=0.010
INFO: Iter 17: Loss=1.374, Reward=0.030
INFO: Iter 18: Loss=1.373, Reward=0.000
INFO: Iter 19: Loss=1.373, Reward=0.030
INFO: Iter 20: Loss=1.370, Reward=0.020
INFO: Iter 21: Loss=1.372, Reward=0.030
INFO: Iter 22: Loss=1.370, Reward=0.020
INFO: Iter 23: Loss=1.369, Reward=0.000
INFO: Iter 24:

In [88]:
# Scratch inspection of the sampler. `env` and `player` are only locals of
# train(), so build a fresh pair here to keep this cell runnable on a clean kernel.
env = DiscreteOneHot(gym.make("FrozenLake-v0"))
player = Player(IN_UNITS, HIDDEN_UNITS, OUT_UNITS)
batch = get_batch(env, player)

In [90]:
# Advance the get_batch generator once to obtain its first list of episodes.
b1 = next(batch)

In [96]:
# Show the reward recorded for every episode in the sampled batch.
for b in b1:
    print(b.reward)

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [69]:
# Reset the environment and keep its initial observation for the manual probes below.
obs = env.reset()

In [80]:
# Softmax along dim 1, used below to turn a row of policy scores into probabilities.
sm = nn.Softmax(dim = 1)

In [81]:
# Wrap the single observation into a batch of one and run it through the policy;
# `act` holds the raw (pre-softmax) scores, displayed as the cell output.
act = player(torch.tensor([obs]))
act

tensor([[ 0.1388,  0.0958, -0.0763,  0.0586]], grad_fn=<AddmmBackward>)

In [82]:
# Convert the raw scores into a probability row over the actions.
sm(act)

tensor([[0.2712, 0.2598, 0.2187, 0.2503]], grad_fn=<SoftmaxBackward>)

In [None]:
# step() requires an action argument; take one randomly sampled valid action.
env.step(env.action_space.sample())