<a href="https://colab.research.google.com/github/pabair/rl-course-ss21/blob/main/solutions/S6_LunarLander_PolicyBased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [1]:
# source: https://medium.com/coinmonks/landing-a-rocket-with-simple-reinforcement-learning-3a0265f8b58c

In [2]:
!pip3 install box2d-py

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/06/bd/6cdc3fd994b0649dcf5d9bad85bd9e26172308bbe9a421bfc6fdbf5081a6/box2d_py-2.3.8-cp36-cp36m-manylinux1_x86_64.whl (448kB)
[K     |████████████████████████████████| 450kB 4.9MB/s 
[?25hInstalling collected packages: box2d-py
Successfully installed box2d-py-2.3.8


In [3]:
import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
from collections import deque

torch.manual_seed(1)
np.random.seed(1)

# Neural Network

In [4]:
class Net(nn.Module):
    """Fully connected network with one hidden ReLU layer.

    Maps a batch of observations of width ``obs_size`` to ``n_actions`` raw
    (unnormalised) action scores. No softmax is applied here; callers
    normalise the scores themselves.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        # Layer attribute names are fc1/fc2; they determine the parameter names.
        self.fc1 = nn.Linear(in_features=obs_size, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=n_actions)

    def forward(self, x):
        """Return the action scores for the observation batch ``x``."""
        hidden = self.fc1(x).relu()
        scores = self.fc2(hidden)
        return scores

# Generate Episodes

In [5]:
def generate_batch(env, batch_size, t_max=5000, policy=None):
    """Play ``batch_size`` episodes with the current policy and record them.

    Actions are sampled from the softmax of the policy's output scores.

    Args:
        env: environment exposing ``reset()`` (returns the first observation)
            and ``step(action)`` (returns ``(observation, reward, done, info)``).
        batch_size: number of episodes to play.
        t_max: maximum number of steps per episode.
        policy: callable mapping a ``(1, obs_size)`` float tensor to action
            scores. Defaults to the module-level ``net``.

    Returns:
        ``(batch_states, batch_actions, batch_rewards)``: per-episode lists of
        visited states, per-episode lists of chosen actions, and the total
        reward of each episode. Each list has exactly ``batch_size`` entries.
    """
    if policy is None:
        # Fall back to the network created in the training cell.
        policy = net

    batch_states, batch_actions, batch_rewards = [], [], []

    for _ in range(batch_size):
        states, actions = [], []
        total_reward = 0.0
        s = env.reset()

        for _ in range(t_max):
            # Go through a single ndarray: building a tensor from a list of
            # arrays is very slow.
            s_v = torch.as_tensor(np.asarray(s), dtype=torch.float32).unsqueeze(0)
            # Sampling only; no gradients are needed while collecting data.
            with torch.no_grad():
                act_probs = torch.softmax(policy(s_v), dim=1).numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)

            new_s, r, done, info = env.step(a)

            # Store the state the action was chosen in, not the resulting one.
            states.append(s)
            actions.append(a)
            total_reward += r

            s = new_s
            if done:
                break

        # Record the episode even when it was cut off at t_max; otherwise the
        # batch silently shrinks and long-running episodes never show up in
        # the reward statistics.
        batch_states.append(states)
        batch_actions.append(actions)
        batch_rewards.append(total_reward)

    return batch_states, batch_actions, batch_rewards

# Training

In [6]:
def filter_batch(states_batch, actions_batch, rewards_batch, percentile):
    """Select the state/action pairs of the best-rewarded episodes.

    Args:
        states_batch: list of episodes, each a list of states.
        actions_batch: list of episodes, each a list of actions aligned with
            the states of the same episode.
        rewards_batch: total reward of each episode.
        percentile: reward percentile (0-100) an episode must reach to be
            treated as elite.

    Returns:
        ``(elite_states, elite_actions)``: flat lists with the states and
        actions of every elite episode, in episode order.
    """
    if len(rewards_batch) == 0:
        # Nothing to rank; avoid taking a percentile of an empty sequence.
        return [], []

    reward_threshold = np.percentile(rewards_batch, percentile)

    elite_states = []
    elite_actions = []

    for states, actions, reward in zip(states_batch, actions_batch, rewards_batch):
        # ">=" rather than ">": when the top rewards tie (e.g. every episode
        # has the same return) a strict comparison would select nothing.
        if reward >= reward_threshold:
            elite_states.extend(states)
            elite_actions.extend(actions)

    return elite_states, elite_actions

In [7]:
# Hyperparameters
batch_size = 100        # episodes played per training iteration
session_size = 500      # maximum number of training iterations
percentile = 80         # reward percentile used to pick elite episodes
hidden_size = 200       # width of the hidden layer of Net
completion_score = 100  # stop once the batch mean reward exceeds this
learning_rate = 0.01    # step size for Adam

env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n

# Policy network producing one score per action.
net = Net(n_states, hidden_size, n_actions)
# Cross-entropy between the predicted scores and the actions taken in elite episodes.
objective = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

for i in range(session_size):
    # Play a fresh batch of episodes with the current policy.
    batch_states, batch_actions, batch_rewards = generate_batch(env, batch_size, t_max=500)

    elite_states, elite_actions = filter_batch(batch_states, batch_actions, batch_rewards, percentile)

    if elite_states:
        optimizer.zero_grad()
        # Convert through one ndarray; building tensors from a list of
        # arrays is very slow.
        tensor_states = torch.as_tensor(np.asarray(elite_states), dtype=torch.float32)
        tensor_actions = torch.as_tensor(np.asarray(elite_actions), dtype=torch.long)
        action_scores_v = net(tensor_states)
        loss_v = objective(action_scores_v, tensor_actions)
        loss_v.backward()
        optimizer.step()
        loss_value = loss_v.item()
    else:
        # No episode passed the filter: there is nothing to fit, so skip the
        # update instead of feeding an empty batch to the network.
        loss_value = float("nan")

    # Report progress for this iteration.
    mean_reward = np.mean(batch_rewards)
    threshold = np.percentile(batch_rewards, percentile)
    print("%d: loss=%.3f, reward_mean=%.1f, reward_threshold=%.1f" % (
            i, loss_value, mean_reward, threshold))

    # Stop as soon as the average episode reward clears the target score.
    if mean_reward > completion_score:
        print("Environment has been successfullly completed!")
        break


0: loss=1.387, reward_mean=-188.8, reward_threshold=-87.9
1: loss=1.368, reward_mean=-242.0, reward_threshold=-119.4
2: loss=1.350, reward_mean=-227.0, reward_threshold=-99.5
3: loss=1.323, reward_mean=-201.6, reward_threshold=-101.7
4: loss=1.303, reward_mean=-161.2, reward_threshold=-82.3
5: loss=1.274, reward_mean=-131.2, reward_threshold=-86.0
6: loss=1.270, reward_mean=-131.8, reward_threshold=-78.2
7: loss=1.246, reward_mean=-118.1, reward_threshold=-77.3
8: loss=1.221, reward_mean=-107.6, reward_threshold=-59.0
9: loss=1.189, reward_mean=-94.6, reward_threshold=-54.7
10: loss=1.164, reward_mean=-99.1, reward_threshold=-57.4
11: loss=1.134, reward_mean=-99.3, reward_threshold=-39.6
12: loss=1.116, reward_mean=-89.9, reward_threshold=-20.7
13: loss=1.108, reward_mean=-151.7, reward_threshold=-28.3
14: loss=1.102, reward_mean=-180.5, reward_threshold=-54.3
15: loss=1.089, reward_mean=-146.1, reward_threshold=-37.9
16: loss=1.063, reward_mean=-113.2, reward_threshold=-20.9
17: loss=

# Evaluation

In [8]:
# Install a virtual X server (xvfb) and X utilities at the OS level.
!apt-get install -y xvfb x11-utils

# Python-side display / OpenGL packages, pinned to a minor version series.
!pip install pyvirtualdisplay==0.2.* \
             PyOpenGL==3.1.* \
             PyOpenGL-accelerate==3.1.*

# Pin gym (with the Box2D extra) to the 0.17 series.
# NOTE(review): gym was already imported and used for training above, so a
# version change made here would presumably only take effect after a kernel
# restart - confirm the runtime already provides gym 0.17.
!pip install gym[box2d]==0.17.*

import pyvirtualdisplay

# Start an invisible 1400x900 virtual display; presumably this is what lets
# env.render() in the evaluation cell run without a physical screen.
_display = pyvirtualdisplay.Display(visible=False, size=(1400, 900))
_ = _display.start()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libxxf86dga1
Suggested packages:
  mesa-utils
The following NEW packages will be installed:
  libxxf86dga1 x11-utils xvfb
0 upgraded, 3 newly installed, 0 to remove and 13 not upgraded.
Need to get 993 kB of archives.
After this operation, 2,981 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libxxf86dga1 amd64 2:1.1.4-1 [13.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 x11-utils amd64 7.7+3build1 [196 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 xvfb amd64 2:1.19.6-1ubuntu4.8 [784 kB]
Fetched 993 kB in 1s (1,197 kB/s)
Selecting previously unselected package libxxf86dga1:amd64.
(Reading database ... 146374 files and directories currently installed.)
Preparing to unpack .../libxxf86dga1_2%3a1.1.4-1_amd64.deb ...
Unpacking libxxf86dga1:amd64 (2:

In [9]:
import time

FPS = 25                 # target frame rate used to pace the render loop
record_folder = "video"  # directory handed to the Monitor wrapper

env = gym.make('LunarLander-v2')
env = gym.wrappers.Monitor(env, record_folder, force=True)

total_reward = 0.0

activation = nn.Softmax(dim=1)

# Play a single episode with the trained policy, sampling actions from its
# softmax distribution exactly as during training.
try:
    state = env.reset()

    while True:
        start_ts = time.time()
        env.render()

        s_v = torch.as_tensor(np.asarray(state), dtype=torch.float32).unsqueeze(0)
        # Inference only; no gradients are needed.
        with torch.no_grad():
            act_probs = activation(net(s_v)).numpy()[0]
        a = np.random.choice(len(act_probs), p=act_probs)

        state, reward, done, _ = env.step(a)
        total_reward += reward
        if done:
            break

        # Sleep for whatever is left of this frame's time budget.
        delta = 1/FPS - (time.time() - start_ts)
        if delta > 0:
            time.sleep(delta)

    print("Total reward: %.2f" % total_reward)
finally:
    # Always close the wrapped environment, even if the episode loop raised,
    # so the recorder is not left open.
    env.close()

Total reward: 241.31
