## Lunar Lander
Using reinforcement learning to teach an agent to play the Lunar Lander game

In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, action_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [3]:
def generate_batch(env,batch_size, t_max=1000):
    
    activation = nn.Softmax(dim=1)
    batch_actions,batch_states, batch_rewards = [],[],[]
    
    for b in range(batch_size):
        states,actions = [],[]
        total_reward = 0
        s = env.reset()
        for t in range(t_max):
            s_v = torch.FloatTensor([s])
            act_probs_v = activation(net(s_v.to(device)))
            act_probs = act_probs_v.cpu().data.numpy()[0]
            a = np.random.choice(len(act_probs), p=act_probs)
            new_s, r, done, info = env.step(a)
            states.append(s)
            actions.append(a)
            total_reward += r
            s = new_s
            if done:
                batch_actions.append(actions)
                batch_states.append(states)
                batch_rewards.append(total_reward)
                break
    return batch_states, batch_actions, batch_rewards

In [4]:
def filter_batch(states_batch, actions_batch, rewards_batch, percentile=50):

    reward_threshold = np.percentile(rewards_batch, percentile)

    elite_states = []
    elite_actions = []

    for i in range(len(rewards_batch)):
        if rewards_batch[i] > reward_threshold:
            for j in range(len(states_batch[i])):
                elite_states.append(states_batch[i][j])
                elite_actions.append(actions_batch[i][j])

    return elite_states, elite_actions

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
device

device(type='cpu')

In [7]:
batch_size = 100
session_size = 100
percentile = 80
hidden_size = 200
learning_rate = 0.0025
completion_score = 200
env = gym.make("LunarLander-v2")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
#neural network
net = Net(n_states, hidden_size, n_actions)
net = net.to(device)
#loss function
objective = nn.CrossEntropyLoss()
#optimisation function
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)
for i in range(session_size):
    #generate new sessions
    batch_states, batch_actions, batch_rewards = generate_batch(
        env, batch_size, t_max=5000)
    elite_states, elite_actions = filter_batch(batch_states, batch_actions,
                                               batch_rewards, percentile)

#     elite_states = torch.Tensor(elite_states).to(device)
#     elite_actions = torch.Tensor(elite_actions).to(device)
    optimizer.zero_grad()
    tensor_states = torch.FloatTensor(elite_states).to(device)
    tensor_actions = torch.LongTensor(elite_actions).to(device)
    action_scores_v = net(tensor_states)
    loss_v = objective(action_scores_v, tensor_actions)
    loss_v.backward()
    optimizer.step()
    #show results
    mean_reward, threshold = np.mean(batch_rewards), np.percentile(batch_rewards, percentile)
    print("%d: loss=%.3f, reward_mean=%.1f, reward_threshold=%.1f" %
          (i, loss_v.item(), mean_reward, threshold))

    #check if
    if np.mean(batch_rewards) > completion_score:
        print("Environment has been successfullly completed!")

0: loss=1.380, reward_mean=-205.7, reward_threshold=-103.0
1: loss=1.377, reward_mean=-191.1, reward_threshold=-98.1
2: loss=1.374, reward_mean=-201.2, reward_threshold=-103.1
3: loss=1.361, reward_mean=-188.4, reward_threshold=-86.1
4: loss=1.355, reward_mean=-219.1, reward_threshold=-91.1
5: loss=1.361, reward_mean=-192.4, reward_threshold=-79.2
6: loss=1.349, reward_mean=-192.4, reward_threshold=-90.2
7: loss=1.349, reward_mean=-165.0, reward_threshold=-79.1
8: loss=1.345, reward_mean=-166.8, reward_threshold=-71.5
9: loss=1.337, reward_mean=-176.0, reward_threshold=-77.6
10: loss=1.344, reward_mean=-171.4, reward_threshold=-82.8
11: loss=1.345, reward_mean=-140.2, reward_threshold=-69.9
12: loss=1.312, reward_mean=-147.6, reward_threshold=-64.8
13: loss=1.323, reward_mean=-137.3, reward_threshold=-79.1
14: loss=1.318, reward_mean=-157.8, reward_threshold=-62.4
15: loss=1.306, reward_mean=-137.2, reward_threshold=-66.1
16: loss=1.321, reward_mean=-125.5, reward_threshold=-60.8
17: l

In [9]:
# save the model
torch.save(net.state_dict(), "model1")

In [None]:
# import gym.wrappers
# env = gym.wrappers.Monitor(
#     gym.make("LunarLander-v2"), directory="videos", force=True)
# generate_batch(env, 1, t_max=5000)
# env.close()

In [12]:
# lets run and see

s = env.reset()
totalReward = 0
activation = nn.Softmax(dim=1)
for _ in range(5000):
    env.render()
    s_v = torch.FloatTensor([s])
    act_probs_v = activation(net(s_v.to(device)))
    act_probs = act_probs_v.cpu().data.numpy()[0]
    a = np.argmax(act_probs)
    new_s, r, done, info = env.step(a)
    totalReward += r
    
    if done:
        break

print("Total reward is : {}".format(totalReward))
    

Total reward is : -554.8111864924354
