## FrozenLake

In [1]:
from torch import nn
from torch import optim
import torch
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter
import gym

# Register a nonslippery frozen-lake

from gym.envs.registration import register
# Register a FrozenLake variant on the 4x4 map with slipping turned off.
register(
    id="FrozenLakeNotSlippery-v0",
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    max_episode_steps=100,
    # NOTE(review): the original comment here read "optimum = .8196"; that
    # figure presumably belongs to the slippery variant -- confirm.
    reward_threshold=0.78,
    kwargs={"map_name": "4x4", "is_slippery": False},
)

In [2]:
# Hyper-parameters shared by the cells below.
HIDDEN_SIZE = 128  # number of units in the network's hidden layer
BATCH_SIZE = 100   # number of episodes collected per training batch
PERCENTILE = 30    # percentile of discounted rewards used as the elite cutoff
GAMMA = 0.9        # per-step discount applied to an episode's reward


# NN: a simple DNN with one hidden layer
class Net(nn.Module):
    """Feed-forward network: Linear -> ReLU -> Linear.

    Args:
        obs_size: number of input features per observation.
        hidden_size: number of units in the hidden layer.
        n_actions: number of output scores (one per action).
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.pipe = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the final linear layer (no softmax is applied here)."""
        return self.pipe(x)

In [3]:
# Episode: the summed (undiscounted) reward of one episode and its list of steps.
Episode = namedtuple("Episode", field_names=["reward", "steps"])
# EpisodeStep: the observation the agent acted on and the action it chose.
EpisodeStep = namedtuple("EpisodeStep", field_names=["observation", "action"])


def iterate_batches(env, net, batch_size):
    """Play episodes endlessly with ``net`` as the policy and yield them in batches.

    Args:
        env: environment exposing ``reset()`` -> observation and
            ``step(action)`` -> ``(observation, reward, is_done, info)``.
        net: callable mapping a ``(1, n)`` float tensor to a ``(1, n_actions)``
            tensor of action scores.
        batch_size: number of finished episodes per yielded batch.

    Yields:
        A list of ``batch_size`` ``Episode`` tuples. Every yield hands out a
        fresh list, so the caller may keep it without it being modified later.
    """
    batch = []
    episode_reward = 0.
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        # Add a leading batch dimension: (n,) -> (1, n).
        obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)
        # Sampling an action needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            # Softmax turns the scores into action probabilities; [0] drops
            # the batch dimension again.
            act_probs = sm(net(obs_v))[0].numpy()
        # Sample the action from the policy distribution instead of taking the
        # argmax, so that the agent keeps exploring.
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, _ = env.step(action)

        episode_reward += reward  # accumulate reward for every step
        # Store the observation the action was chosen for, not the resulting one.
        episode_steps.append(EpisodeStep(observation=obs, action=action))

        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                # Rebind instead of clearing in place: the consumer may still
                # hold a reference to the list that was just yielded.
                batch = []

        obs = next_obs

In [4]:
def filter_batch(batch, percentile):
    """Select the episodes whose discounted reward exceeds a percentile bound.

    Each episode is scored as ``reward * GAMMA ** len(steps)``, so among
    episodes with the same positive reward the shorter one scores higher.

    Args:
        batch: list of ``Episode`` tuples.
        percentile: percentile (0-100) of the scores used as the cutoff.

    Returns:
        ``(elite_batch, train_obs, train_act, reward_bound)``: the selected
        episodes, their observations and actions flattened in episode order,
        and the cutoff value.
    """
    scores = [ep.reward * (GAMMA ** len(ep.steps)) for ep in batch]
    reward_bound = np.percentile(scores, percentile)

    # Strictly greater: episodes scoring exactly at the bound are dropped.
    elite_batch = [ep for ep, score in zip(batch, scores) if score > reward_bound]
    train_obs = [step.observation for ep in elite_batch for step in ep.steps]
    train_act = [step.action for ep in elite_batch for step in ep.steps]
    return elite_batch, train_obs, train_act, reward_bound

In [5]:
class DiscreteOneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that one-hot encodes a ``Discrete`` observation.

    The wrapped environment must have a ``gym.spaces.Discrete`` observation
    space; the wrapper advertises a float32 ``Box`` in [0, 1] with one entry
    per discrete value.
    """

    def __init__(self, env):
        super().__init__(env)
        source_space = env.observation_space
        assert isinstance(source_space, gym.spaces.Discrete)
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, (source_space.n,), dtype=np.float32
        )

    def observation(self, observation):
        """Return a copy of the space's lower bound with entry ``observation`` set to 1.0."""
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

In [7]:
# Non-slippery frozen lake env with one-hot encoded observations
env = DiscreteOneHotWrapper(gym.make("FrozenLakeNotSlippery-v0"))

# Monitor wrapper configured with output directory "mon_fl"
env = gym.wrappers.Monitor(env, directory="mon_fl", force=True)
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n
net = Net(obs_size, HIDDEN_SIZE, n_actions)

# The network outputs raw scores; CrossEntropyLoss is applied to them directly.
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.001)
writer = SummaryWriter(comment="-nonslippery")

# Elite episodes carried over from earlier iterations.
full_batch = []

try:
    for iter_no, batch in enumerate(iterate_batches(env, net, BATCH_SIZE)):
        # Mean undiscounted reward of the freshly played episodes only.
        reward_m = float(np.mean([episode.reward for episode in batch]))
        # Re-filter the retained elite episodes together with the new batch.
        full_batch, obs, acts, reward_b = filter_batch(full_batch + batch, PERCENTILE)
        if not full_batch:
            # No episode beat the bound, so there is nothing to train on.
            continue
        obs_v = torch.FloatTensor(obs)
        acts_v = torch.LongTensor(acts)
        # Keep at most the 500 most recent elite episodes for the next round.
        full_batch = full_batch[-500:]

        optimizer.zero_grad()
        action_scores_v = net(obs_v)
        loss_v = objective(action_scores_v, acts_v)
        loss_v.backward()
        optimizer.step()

        print(f"{iter_no}: loss={round(loss_v.item(),3)}, reward_mean={round(reward_m,1)}, reward_bound={round(reward_b,1)}")
        writer.add_scalar("loss", loss_v.item(), iter_no)
        writer.add_scalar("reward_bound", reward_b, iter_no)
        writer.add_scalar("reward_mean", reward_m, iter_no)

        if reward_m > 0.8:
            print("Solved!")
            break
finally:
    # Close the writer even if training raises or is interrupted.
    writer.close()

0: loss=1.404, reward_mean=0.0, reward_bound=0.0
1: loss=1.386, reward_mean=0.0, reward_bound=0.0
2: loss=1.382, reward_mean=0.0, reward_bound=0.0
3: loss=1.376, reward_mean=0.0, reward_bound=0.0
4: loss=1.372, reward_mean=0.0, reward_bound=0.0
5: loss=1.365, reward_mean=0.0, reward_bound=0.0
6: loss=1.364, reward_mean=0.0, reward_bound=0.0
7: loss=1.36, reward_mean=0.0, reward_bound=0.0
8: loss=1.359, reward_mean=0.0, reward_bound=0.0
9: loss=1.355, reward_mean=0.0, reward_bound=0.0
10: loss=1.354, reward_mean=0.0, reward_bound=0.0
11: loss=1.349, reward_mean=0.0, reward_bound=0.0
12: loss=1.343, reward_mean=0.0, reward_bound=0.0
13: loss=1.342, reward_mean=0.0, reward_bound=0.0
14: loss=1.338, reward_mean=0.0, reward_bound=0.0
15: loss=1.345, reward_mean=0.0, reward_bound=0.0
16: loss=1.345, reward_mean=0.1, reward_bound=0.0
17: loss=1.341, reward_mean=0.0, reward_bound=0.0
18: loss=1.332, reward_mean=0.1, reward_bound=0.0
19: loss=1.326, reward_mean=0.0, reward_bound=0.0
20: loss=1.