In [1]:
import gym
import tensorflow as tf
import numpy as np

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [3]:
# Create the LunarLander-v2 environment; the cells below all operate on this `env`.
env = gym.make('LunarLander-v2')

[2017-08-30 22:17:05,988] Making new env: LunarLander-v2


In [4]:
# Size of the discrete action set and the shape of a single observation.
# NOTE: despite its name, `n_states` holds a shape tuple, not a count.
n_actions = env.action_space.n
n_states = env.observation_space.shape
print(n_actions, n_states, sep='\n')

4
(8,)


In [115]:
# Smoke test: play one rendered episode with uniformly sampled actions and
# print the index of the last step that was executed.
s = env.reset()
for i in range(10000):
    random_action = env.action_space.sample()
    new_s, reward, done, _ = env.step(random_action)
    env.render()
    if done:
        break
print(i)
env.close()

96


In [120]:
from sklearn.neural_network import MLPClassifier

# Small tanh MLP used as the policy. warm_start keeps progress between
# .fit(...) calls and max_iter=1 makes every .fit(...) do a single iteration.
agent = MLPClassifier(
    hidden_layer_sizes=(20, 20),
    activation='tanh',
    warm_start=True,
    max_iter=1,
)

# Initialize the agent to the dimension of the state and the number of
# actions: one copy of a freshly reset observation per action label.
initial_state = env.reset()
agent.fit([initial_state] * n_actions, range(n_actions));



In [121]:
t_max = 10000  # upper bound on the number of steps in one sampled episode


def generate_sample():
    """Play one episode, sampling actions from the current policy.

    Reads the notebook-level ``env``, ``agent``, ``n_actions`` and ``t_max``.
    At every step the action is drawn from the probabilities returned by
    ``agent.predict_proba`` for the current observation.

    Returns:
        tuple: ``(batch_s, batch_a, total_reward)`` -- the observations seen
        before each step, the actions taken (plain ``int``) and the sum of
        the rewards collected during the episode.
    """
    s = env.reset()
    batch_s = []
    batch_a = []
    total_reward = 0

    for _step in range(t_max):
        # reshape(1, -1): a one-row batch, whatever the observation size is.
        probs = agent.predict_proba(s.reshape(1, -1))
        # Draw a scalar (no size argument) so int() never receives a
        # 1-element array; that conversion is deprecated by NumPy.
        # NOTE(review): assumes column k of predict_proba is action k.
        a = int(np.random.choice(n_actions, p=probs[0]))
        new_s, r, done, _ = env.step(a)
        # Store the observation the action was chosen from, not the next one.
        batch_s.append(s)
        batch_a.append(a)
        s = new_s
        total_reward = total_reward + r
        if done:
            break
    # Kept from the original: the shared env is closed after every episode;
    # the next call starts again with env.reset().
    env.close()
    return batch_s, batch_a, total_reward

In [125]:
iterations = 100  # number of sample-then-fit rounds
percentile = 70   # sessions at or above this reward percentile are kept
samples = 250     # sessions generated per round

for i in range(iterations):
    # Each session is the (states, actions, total_reward) tuple returned by
    # generate_sample().
    population = [generate_sample() for _ in range(samples)]
    batch_states, batch_actions, batch_rewards = zip(*population)
    batch_rewards = np.array(batch_rewards)
    threshold = np.percentile(batch_rewards, percentile)

    # Sessions may have different lengths, so they stay plain sequences
    # instead of being packed into a (possibly ragged) NumPy array.
    # ">=" guarantees at least one elite session even when rewards tie at
    # the threshold; an empty list would make np.concatenate raise.
    is_elite = batch_rewards >= threshold
    elite_states = np.concatenate(
        [states for states, keep in zip(batch_states, is_elite) if keep])
    elite_actions = np.concatenate(
        [actions for actions, keep in zip(batch_actions, is_elite) if keep])

    agent.fit(X=elite_states, y=elite_actions)
    print('Iteration: {0}, Mean Reward: {1:.2f}, Threshold: {2:.2f}'.format(i + 1, np.mean(batch_rewards), threshold))

Iteration: 1, Mean Reward: -269.72, Threshold: -203.86
Iteration: 2, Mean Reward: -214.88, Threshold: -174.57
Iteration: 3, Mean Reward: -193.35, Threshold: -163.07
Iteration: 4, Mean Reward: -181.44, Threshold: -150.71
Iteration: 5, Mean Reward: -179.10, Threshold: -150.28
Iteration: 6, Mean Reward: -164.94, Threshold: -146.30
Iteration: 7, Mean Reward: -166.23, Threshold: -143.38
Iteration: 8, Mean Reward: -164.63, Threshold: -142.32
Iteration: 9, Mean Reward: -158.53, Threshold: -137.96
Iteration: 10, Mean Reward: -151.76, Threshold: -135.14
Iteration: 11, Mean Reward: -146.64, Threshold: -131.94
Iteration: 12, Mean Reward: -143.42, Threshold: -128.11
Iteration: 13, Mean Reward: -141.43, Threshold: -125.82
Iteration: 14, Mean Reward: -137.67, Threshold: -121.26
Iteration: 15, Mean Reward: -134.23, Threshold: -119.86
Iteration: 16, Mean Reward: -136.16, Threshold: -120.03
Iteration: 17, Mean Reward: -136.39, Threshold: -120.20
Iteration: 18, Mean Reward: -133.29, Threshold: -117.34
I

In [155]:
import os

from PIL import Image

# Play one episode with the current policy and save every 5th rendered frame
# as ./renders/<step>.jpg.
render_dir = './renders/'
# Image.save does not create missing directories, so make sure it exists.
os.makedirs(render_dir, exist_ok=True)

s = env.reset()
try:
    for i in range(1000):
        render = env.render('rgb_array')
        if i % 5 == 0:
            img = Image.fromarray(render, 'RGB')
            img.save(''.join([render_dir, str(i), '.jpg']))
        # reshape(1, -1): a one-row batch, whatever the observation size is.
        probs = agent.predict_proba(s.reshape(1, -1))
        # Scalar draw: avoids int() on a 1-element array (deprecated by NumPy).
        a = int(np.random.choice(n_actions, p=probs[0]))
        new_s, reward, done, _ = env.step(a)
        s = new_s
        if done:
            break
finally:
    # Close the environment even if the loop is interrupted or raises.
    env.close()

In [12]:
# Policy network: two hidden ReLU layers of 20 units and a softmax output
# with one probability per action. This rebinds `agent`, replacing the
# classifier defined earlier in the notebook.
agent = Sequential()
agent.add(Dense(20, input_shape=n_states, activation='relu'))
agent.add(Dense(20, activation='relu'))
agent.add(Dense(n_actions, activation='softmax'))
# The targets are one-hot action vectors and the output is a softmax over
# mutually exclusive actions, so the matching loss is categorical (not
# binary) cross-entropy; with it, 'accuracy' also measures arg-max agreement.
agent.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Upper bound on the number of steps in one sampled episode.
t_max = 10000
# One-hot table: row k is the target vector recorded for action k.
action_lookup = np.eye(n_actions)
def generate_sample():
    """Play one episode, sampling actions from the current policy network.

    Reads the notebook-level ``env``, ``agent``, ``n_actions``, ``t_max`` and
    ``action_lookup``. At every step the action is drawn from the
    probabilities returned by ``agent.predict`` for the current observation.

    Returns:
        tuple: ``(batch_s, batch_a, total_reward)`` -- the observations seen
        before each step, the one-hot rows of ``action_lookup`` for the
        actions taken, and the sum of the rewards collected in the episode.
    """
    s = env.reset()
    batch_s = []
    batch_a = []
    total_reward = 0

    for _step in range(t_max):
        # reshape(1, -1): a one-row batch, whatever the observation size is.
        probs = agent.predict(s.reshape(1, -1))
        # Draw a scalar (no size argument) so int() never receives a
        # 1-element array; that conversion is deprecated by NumPy.
        a = int(np.random.choice(n_actions, p=probs[0]))
        new_s, r, done, _ = env.step(a)
        # Store the observation the action was chosen from, not the next one.
        batch_s.append(s)
        batch_a.append(action_lookup[a])
        s = new_s
        total_reward = total_reward + r
        if done:
            break
    # Kept from the original: the shared env is closed after every episode;
    # the next call starts again with env.reset().
    env.close()
    return batch_s, batch_a, total_reward

In [None]:
iterations = 100  # number of sample-then-fit rounds
percentile = 70   # sessions at or above this reward percentile are kept
samples = 250     # sessions generated per round

for i in range(iterations):
    # Each session is the (states, one-hot actions, total_reward) tuple
    # returned by generate_sample().
    population = [generate_sample() for _ in range(samples)]
    batch_states, batch_actions, batch_rewards = zip(*population)
    batch_rewards = np.array(batch_rewards)
    threshold = np.percentile(batch_rewards, percentile)

    # Sessions may have different lengths, so they stay plain sequences
    # instead of being packed into a (possibly ragged) NumPy array.
    # ">=" guarantees at least one elite session even when rewards tie at
    # the threshold; an empty list would make np.concatenate raise.
    is_elite = batch_rewards >= threshold
    elite_states = np.concatenate(
        [states for states, keep in zip(batch_states, is_elite) if keep])
    elite_actions = np.concatenate(
        [actions for actions, keep in zip(batch_actions, is_elite) if keep])

    agent.fit(epochs=1, x=elite_states, y=elite_actions)
    print('Iteration: {0}, Mean Reward: {1:.2f}, Threshold: {2:.2f}'.format(i + 1, np.mean(batch_rewards), threshold))

Epoch 1/1
Iteration: 1, Mean Reward: -199.68, Threshold: -161.76
Epoch 1/1
Iteration: 2, Mean Reward: -185.53, Threshold: -156.72
Epoch 1/1
Iteration: 3, Mean Reward: -174.82, Threshold: -150.20
Epoch 1/1
Iteration: 4, Mean Reward: -185.84, Threshold: -156.56
Epoch 1/1
Iteration: 5, Mean Reward: -175.31, Threshold: -151.17
Epoch 1/1
Iteration: 6, Mean Reward: -166.26, Threshold: -147.15
Epoch 1/1
Iteration: 7, Mean Reward: -163.43, Threshold: -142.53
Epoch 1/1
Iteration: 8, Mean Reward: -161.76, Threshold: -139.74
Epoch 1/1
Iteration: 9, Mean Reward: -164.68, Threshold: -142.95
Epoch 1/1
Iteration: 10, Mean Reward: -151.01, Threshold: -133.05
Epoch 1/1
Iteration: 11, Mean Reward: -154.18, Threshold: -130.90
Epoch 1/1
Iteration: 12, Mean Reward: -145.78, Threshold: -125.39
Epoch 1/1
Iteration: 13, Mean Reward: -139.95, Threshold: -125.05
Epoch 1/1
Iteration: 14, Mean Reward: -133.46, Threshold: -119.73
Epoch 1/1
Iteration: 15, Mean Reward: -125.79, Threshold: -109.67
Epoch 1/1
Iteration