In [3]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Softmax, Input, BatchNormalization, Dropout

from sklearn.model_selection import train_test_split

import tensorflow as tf
from load_policy import load_policy
import pickle
import numpy as np
import gym

import tf_util


Using TensorFlow backend.


In [4]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

In [5]:
expert_policy_file = "experts/Reacher-v1.pkl"
env_name = "Reacher-v1"
rounds = 1000
max_steps = 1000

In [6]:
data = pickle.load(open(expert_policy_file, "rb"))
policy_fn = load_policy(expert_policy_file)

obs (1, 11) (1, 11)


In [7]:
observations = []
actions = []
rewards = []

with tf.Session() as sess:
    tf_util.initialize()
    env = gym.make(env_name)
    for i in range(rounds):
        obs = env.reset()
        r = 0
        for s in range(max_steps):
            action = policy_fn(obs[None,:])
            observations.append(obs)
            actions.append(action)
            obs, r_, done, _ = env.step(action)
            rewards.append(r_)
            r += r_
            if done:
                break
        if i and i%100 == 0:
            print(i)
            

[2018-06-06 18:38:13,879] Making new env: Reacher-v1


100
200
300
400
500
600
700
800
900


In [8]:
observations = np.array(observations)
actions = np.vstack(actions)
rewards = np.array(rewards)



In [9]:
actions.shape

(1000000, 2)

In [10]:
def linear_model():
    input_dim = observations.shape[-1]
    dg = Sequential()
    dg.add(Dense(32, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    dg.add(Dense(512, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    dg.add(Dense(32, kernel_initializer='normal', activation='relu'))
    dg.add(Dense(2, kernel_initializer='normal'))
    dg.compile(optimizer='adam',loss='mse', metrics=['accuracy', 'mae'])
    return dg

In [24]:
def run_model(m, m_rounds, max_steps):
    env = gym.make(env_name)
    obs_ = []
    m_rewards = []
    for i in range(m_rounds):
        obs = env.reset()
        r = 0
        for s in range(max_steps):
            action = m.predict(obs[None,:])
#             observations.append(obs)
#             actions.append(action)
            obs, r_, done, _ = env.step(action)
            obs_.append(obs)
            m_rewards.append(r_)
#             r += r_
            if done:
                break
    return obs_, m_rewards

In [37]:
dagger_rounds = 10
m_rounds = 20
m_steps = 1000

obs_bag = [o for o in observations]
act_bag = [a for a in actions]
rew = []
for i in range(dagger_rounds):
    print("Starting dagger round {}".format(i))
    print("total data: {}".format(len(obs_bag)))
    obs_train, obs_valid, act_train, act_valid = \
        train_test_split(obs_bag, act_bag, test_size = 0.2, random_state = 1998)

    m = linear_model()    
    hist = m.fit(np.array(obs_train), np.array(act_train), \
          validation_data=[np.array(obs_valid), np.array(act_valid)],  epochs=10, batch_size= 512, verbose = 0)
    print (hist.history['val_mean_absolute_error'][-3:])
    new_obs, new_rewards = run_model(m, m_rounds, m_steps)
    
    with tf.Session(): 
        new_actions = policy_fn(np.array(new_obs))
    obs_bag.extend(new_obs)
    act_bag.extend([a for a in new_actions])
    rew.append(new_rewards)

    print("Dagger round {} average rewards {}". format(i, sum(new_rewards)/m_rounds))
    



Starting dagger round 0
total data: 1000000


[2018-06-07 00:22:44,934] Making new env: Reacher-v1


[0.0009314697754383087, 0.0008881637313775719, 0.0009968089822307228]
Dagger round 0 average rewards -30.90386774738525
Starting dagger round 1
total data: 1020000


[2018-06-07 00:23:33,701] Making new env: Reacher-v1


[0.0010214183843128529, 0.0008538694817335436, 0.0009724201460588066]
Dagger round 1 average rewards -30.080359987450844
Starting dagger round 2
total data: 1040000


[2018-06-07 00:24:23,451] Making new env: Reacher-v1


[0.0011672708051135906, 0.000922916311913958, 0.0009913429842115596]
Dagger round 2 average rewards -24.942037867612235
Starting dagger round 3
total data: 1060000


[2018-06-07 00:25:14,121] Making new env: Reacher-v1


[0.0014171209406494251, 0.0012305618628754087, 0.0010067926708187136]
Dagger round 3 average rewards -30.307623734609752
Starting dagger round 4
total data: 1080000


[2018-06-07 00:26:05,662] Making new env: Reacher-v1


[0.0014900445213058481, 0.0009119868006350265, 0.0011463030982003719]
Dagger round 4 average rewards -25.0500981223255


In [38]:
o, r = run_model(m, 1000, 1000)

[2018-06-07 00:28:02,186] Making new env: Reacher-v1


In [40]:
sum(r)/1000

-22.506101679346383

In [34]:
bc = linear_model()

In [35]:
bc.fit(obs_train, act_train, validation_data=[obs_valid, act_valid],  epochs=10, batch_size= 512)

Train on 800000 samples, validate on 200000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f656c1b3da0>

In [11]:
rewards.sum()/1000

-22.77505284176671

In [20]:
o = np.random.random(11)[None, :]
bc.predict(o)

array([[-0.30525178, -0.2506469 ]], dtype=float32)

In [21]:
with tf.Session():
    print(policy_fn(o))

[[0.8586951 1.2538567]]


In [22]:
m_rewards = []



In [28]:
mr = 1000
mstep = 1000
run_model(bc, mr, mstep)

[2018-06-06 14:28:00,214] Making new env: Reacher-v1


In [29]:
sum(m_rewards)/mr

-124.8846213507004

In [27]:
len(m_rewards)

20000

In [30]:
2

2