<a href="https://colab.research.google.com/github/wesley34/comp3414_course_material/blob/master/ch_9_basic_reinforcement_learning/FrozenLakeV1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorboardX

Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)
[K     |█                               | 10kB 20.5MB/s eta 0:00:01[K     |██▏                             | 20kB 6.3MB/s eta 0:00:01[K     |███▏                            | 30kB 7.6MB/s eta 0:00:01[K     |████▎                           | 40kB 7.1MB/s eta 0:00:01[K     |█████▎                          | 51kB 6.7MB/s eta 0:00:01[K     |██████▍                         | 61kB 7.3MB/s eta 0:00:01[K     |███████▍                        | 71kB 7.7MB/s eta 0:00:01[K     |████████▌                       | 81kB 8.2MB/s eta 0:00:01[K     |█████████▌                      | 92kB 7.8MB/s eta 0:00:01[K     |██████████▋                     | 102kB 8.1MB/s eta 0:00:01[K     |███████████▊                    | 112kB 8.1MB/s eta 0:00:01[K     |████████████▊                   | 122kB 8.

In [None]:
import gym
import gym.envs.toy_text.frozen_lake
import gym.wrappers
import torch
import torch.nn as nn
import torch.optim as optim
from collections import namedtuple
import numpy as np
from tensorboardX import SummaryWriter

In [None]:
# Width of the policy network's single hidden layer (passed to Net as hidden_size).
HIDDEN_SIZE = 128
# Number of finished episodes collected before iteration_batch yields a batch.
BATCH_SIZE = 100
# Percentile of the discounted episode rewards used as the elite cut-off in filter_batch.
PERCENTILE = 30
# Per-step discount: filter_batch scales an episode's reward by GAMMA ** (number of steps).
GAMMA = 0.9

In [None]:
### Observation wrapper: discrete state index -> one-hot float vector
class DiscreteOneHotWrapper(gym.ObservationWrapper):
  """Present a ``Discrete(n)`` observation space as a ``Box`` of shape ``(n,)``.

  Each integer observation coming from the wrapped environment is replaced by
  a float32 vector that equals the Box lower bound everywhere except for a
  1.0 at the observed index.
  """

  def __init__(self,env):
    super().__init__(env)
    source_space = env.observation_space
    # Only discrete observation spaces can be one-hot encoded.
    assert isinstance(source_space,gym.spaces.Discrete)
    self.observation_space = gym.spaces.Box(
        0.0,1.0,(source_space.n,),dtype=np.float32)

  def observation(self,observation):
    """Return the one-hot encoding of the integer ``observation``."""
    # Copy so the space's own ``low`` array is never mutated.
    one_hot = np.copy(self.observation_space.low)
    one_hot[observation] = 1.0
    return one_hot

In [None]:
### Policy network: one hidden ReLU layer producing raw action scores
class Net(nn.Module):
  """Fully connected network: Linear -> ReLU -> Linear.

  Args:
    obs_size: number of input features.
    hidden_size: number of units in the hidden layer.
    action_size: number of output scores (one per action).
  """

  def __init__(self,obs_size,hidden_size,action_size):
    super().__init__()
    layers = [
        nn.Linear(obs_size,hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size,action_size),
    ]
    self.network = nn.Sequential(*layers)

  def forward(self,x):
    """Return the unnormalised scores for ``x``; no softmax is applied here."""
    scores = self.network(x)
    return scores

In [None]:
# Record types (namedtuples) for the collected experience.
# Episode: the reward recorded for one finished episode plus its list of steps.
Episode = namedtuple("Episode",["reward","step"])
# Episode_step: the action that was sampled and the observation it was sampled from.
# NOTE: the variable is Episode_step while the namedtuple's typename is "EpisodeStep".
Episode_step = namedtuple("EpisodeStep",["action","observation"])

In [None]:
### Play episodes with the current policy and yield them in fixed-size batches
def iteration_batch(net,env,batch_size):
  """Endlessly generate batches of episodes sampled from the policy ``net``.

  At every step the current observation is fed to ``net`` as a batch of one,
  the scores are turned into probabilities with a softmax, and an action is
  drawn from that distribution. Because ``net`` is queried on every step,
  weight updates made by the caller between two yielded batches are used for
  the following episodes.

  Args:
    net: callable mapping a ``(1, obs_size)`` float tensor to a 2-D tensor of
      action scores.
    env: environment whose ``reset()`` returns an observation and whose
      ``step(action)`` returns ``(next_obs, reward, is_done, info)``.
    batch_size: number of finished episodes per yielded batch.

  Yields:
    A list of ``batch_size`` ``Episode`` tuples. ``Episode.reward`` is the sum
    of all step rewards of that episode and ``Episode.step`` is the list of
    ``Episode_step(action, observation)`` records. The generator never
    terminates on its own; the caller decides when to stop iterating.
  """
  episode_reward = 0.0
  batch = []
  episode_step = []
  obs = env.reset()
  softmax = nn.Softmax(dim=1)
  while True:
    # Sampling needs no gradients. The no_grad scope is deliberately limited to
    # the forward pass so it is never active while the generator is suspended
    # at ``yield`` (which would otherwise disable autograd in the caller).
    with torch.no_grad():
      obs_vector = torch.as_tensor(obs,dtype=torch.float32).unsqueeze(0)
      action_probability = softmax(net(obs_vector)).numpy()[0]
    action = np.random.choice(len(action_probability),p=action_probability)
    next_obs, reward, is_done, _ = env.step(action)
    episode_reward += reward
    # Store the observation the action was chosen from, not the resulting one.
    episode_step.append(Episode_step(action=action,observation=obs))
    if is_done:
      # Record the accumulated return of the episode rather than only the
      # reward of its last step; for environments that reward only the
      # terminal step the two values coincide.
      batch.append(Episode(reward=episode_reward,step=episode_step))
      episode_reward = 0.0
      episode_step = []
      next_obs = env.reset() # start the next episode from a fresh state
      if len(batch) == batch_size:
        yield batch
        batch = []

    obs = next_obs
    

In [None]:
## Keep only the "elite" episodes whose discounted reward beats the percentile bound
def filter_batch(batch,percentile,gamma=None):
  """Select the elite episodes of ``batch`` and flatten them into training data.

  Every episode is scored as ``reward * gamma ** len(step)``, so among
  episodes with the same reward the shorter one scores higher. Episodes whose
  score is strictly greater than the ``percentile``-th percentile of all
  scores are kept; if every score is equal, nothing is kept.

  Args:
    batch: list of episodes, each exposing ``reward`` and ``step`` (a list of
      records with ``observation`` and ``action``).
    percentile: percentile (0-100) of the scores used as the cut-off.
    gamma: per-step discount factor. ``None`` (the default) uses the
      module-level ``GAMMA``.

  Returns:
    ``(train_elite, train_obs, train_action, reward_bound)``: the kept
    episodes, their observations and actions flattened in episode order, and
    the cut-off value. An empty ``batch`` returns ``([], [], [], 0.0)``.
  """
  # np.percentile is undefined for an empty sequence, so answer directly.
  if not batch:
    return [],[],[],0.0
  if gamma is None:
    gamma = GAMMA

  reward = [sample.reward * (gamma**len(sample.step)) for sample in batch]
  reward_bound = np.percentile(reward,percentile)

  train_obs = []
  train_action = []
  train_elite = []

  for example,discount_reward in zip(batch,reward):
    # Strict comparison on purpose: episodes sitting exactly on the bound
    # (e.g. all-zero rewards early in training) are not treated as elite.
    if discount_reward > reward_bound:
      train_obs.extend(step.observation for step in example.step)
      train_action.extend(step.action for step in example.step)
      train_elite.append(example)

  return train_elite,train_obs,train_action,reward_bound

  

In [None]:
## training phase
# Cross-entropy-method training: sample a batch of episodes, keep the elite
# ones (together with elites carried over from earlier batches), and fit the
# policy to reproduce the actions taken in them.
full_batch = []
env = gym.envs.toy_text.frozen_lake.FrozenLakeEnv(is_slippery=False)
env = gym.wrappers.TimeLimit(env,max_episode_steps=100)
env = DiscreteOneHotWrapper(env)
net = Net(env.observation_space.shape[0],HIDDEN_SIZE,env.action_space.n)
optimizer = optim.Adam(params=net.parameters(),lr=0.01)
objective = nn.CrossEntropyLoss()
softmax = nn.Softmax(dim=1)
writer = SummaryWriter(comment="-frozenlake-nonslippery")

# The writer is closed exactly once, in ``finally``, so it is also closed when
# the loop exits through ``break`` or is interrupted.
try:
  for iter_no , batch in enumerate(iteration_batch(net,env,BATCH_SIZE)):
    # Mean reward of the freshly sampled episodes only (carried-over elites excluded).
    reward_mean = float(np.mean([episode.reward for episode in batch]))
    full_batch,train_obs,train_action,reward_bound = filter_batch(full_batch+batch,PERCENTILE)
    if not full_batch:
      # No episode beat the bound; sample another batch.
      continue
    # Stack into one ndarray first: building a tensor from a list of arrays is slow.
    obs_vector = torch.FloatTensor(np.array(train_obs))
    action_vector = torch.LongTensor(train_action)
    # Cap the number of elite episodes carried into the next iteration.
    full_batch = full_batch[-500:]

    optimizer.zero_grad()
    # Raw scores from the network; they are passed straight to CrossEntropyLoss.
    action_probability = net(obs_vector)

    loss_vector = objective(action_probability,action_vector)
    loss_vector.backward()
    optimizer.step()
    print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f" % (
            iter_no, loss_vector.item(), reward_mean, reward_bound))

    writer.add_scalar("loss", loss_vector.item(), iter_no)
    writer.add_scalar("reward_bound", reward_bound, iter_no)
    writer.add_scalar("reward_mean", reward_mean, iter_no)
    # Stop once the sampled episodes reach a mean reward above 0.89.
    if reward_mean > 0.89:
      print("Solved!")
      break
finally:
  writer.close()

  

0: loss=1.383, reward_mean=0.0, reward_bound=0.0
1: loss=1.352, reward_mean=0.0, reward_bound=0.0
2: loss=1.331, reward_mean=0.0, reward_bound=0.0
3: loss=1.297, reward_mean=0.0, reward_bound=0.0
4: loss=1.273, reward_mean=0.1, reward_bound=0.0
5: loss=1.228, reward_mean=0.1, reward_bound=0.0
6: loss=1.208, reward_mean=0.1, reward_bound=0.0
7: loss=1.159, reward_mean=0.2, reward_bound=0.0
8: loss=1.114, reward_mean=0.2, reward_bound=0.0
9: loss=1.080, reward_mean=0.2, reward_bound=0.0
10: loss=1.047, reward_mean=0.2, reward_bound=0.0
11: loss=0.970, reward_mean=0.4, reward_bound=0.1
12: loss=0.878, reward_mean=0.5, reward_bound=0.2
13: loss=0.737, reward_mean=0.6, reward_bound=0.3
14: loss=0.652, reward_mean=0.6, reward_bound=0.3
15: loss=0.552, reward_mean=0.7, reward_bound=0.4
16: loss=0.400, reward_mean=0.8, reward_bound=0.4
17: loss=0.223, reward_mean=0.8, reward_bound=0.5
19: loss=0.246, reward_mean=0.9, reward_bound=0.5
Solved!
