In [1]:
import gym
import tensorflow as tf
# from VPG import VPG_agent
from PPO import PPO_agent
import time
import gym_snake
import json
import os
import sys

%matplotlib notebook

In [2]:
# Select which run configuration to load; the JSON file is expected at
# configs/<run_name>.json relative to the notebook's working directory.
run_name = 'cartpole'
cfg_fp = os.path.join('configs', run_name + '.json')

# Read the config as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform's locale encoding, which can mis-decode a JSON file that
# contains non-ASCII characters.
with open(cfg_fp, 'r', encoding='utf-8') as f:
    config = json.load(f)

In [3]:
# Gym environment id for this run, taken from the loaded config.
env_name = config['env']

env = gym.make(env_name)
# Honour the *value* of the `use_raw_env` flag rather than the mere presence
# of the key, so that `"use_raw_env": false` in the config keeps the
# environment exactly as gym.make() returned it.
if config.get('use_raw_env', False):
    # `.env` is the object wrapped by what gym.make() returned
    # (presumably this drops the outermost wrapper -- confirm against the
    # installed gym version).
    env = env.env

In [4]:
# Sanity check: show one initial observation and the action space, one per
# line, so the network input/output sizes can be compared against them.
print(env.reset(), env.action_space, sep='\n')

[-0.02350919  0.02410208  0.04724527 -0.04920646]
Discrete(2)


In [5]:
def _build_network(input_shape, hidden_units, n_outputs, output_activation,
                   conv_filters=()):
    """Assemble a feed-forward Keras ``Sequential`` model.

    Args:
        input_shape: Shape of a single observation, without the batch axis.
        hidden_units: Widths of the ReLU ``Dense`` layers, in order.
        n_outputs: Width of the final ``Dense`` layer.
        output_activation: Activation of the final layer (``'softmax'`` for
            the policy head, ``None`` for the linear value head).
        conv_filters: Filter counts of optional 3x3 ReLU ``Conv2D`` layers
            placed before the dense layers; when non-empty, a ``Flatten``
            layer follows them.

    Returns:
        A ``tf.keras.Sequential`` model.
    """
    # `tf.keras.Input(shape=...)` is used for every network so that all
    # branches declare their input the same way (the original mixed
    # `InputLayer(input_shape=...)`, `InputLayer(shape=...)` and
    # `tf.keras.Input(shape=...)`).
    layers = [tf.keras.Input(shape=input_shape)]
    for filters in conv_filters:
        layers.append(tf.keras.layers.Conv2D(filters, (3, 3), activation='relu'))
    if conv_filters:
        layers.append(tf.keras.layers.Flatten())
    for units in hidden_units:
        layers.append(tf.keras.layers.Dense(units, activation='relu'))
    layers.append(tf.keras.layers.Dense(n_outputs, activation=output_activation))
    return tf.keras.Sequential(layers)


# Per-environment architecture table.  TODO: move this into the JSON config.
# `n_actions` is the width of the policy's softmax output; the value network
# shares the same trunk layout and ends in a single linear unit.
NETWORK_SPECS = {
    "CartPole-v0": dict(input_shape=(4,), hidden_units=(16, 16), n_actions=2),
    "MountainCar-v0": dict(input_shape=(2,), hidden_units=(16, 16), n_actions=3),
    "Acrobot-v1": dict(input_shape=(6,), hidden_units=(32, 48, 16), n_actions=3),
    "gym_snake:snake-v0": dict(
        input_shape=(15, 15, 3),
        conv_filters=(32, 64),
        hidden_units=(256, 64, 32),
        n_actions=4,
    ),
    "Taxi-v3": dict(input_shape=(1,), hidden_units=(16, 16), n_actions=6),
}

# Fail loudly for an unknown environment instead of leaving `model`/`value`
# undefined (or silently reusing stale ones from an earlier run of this cell).
if env_name not in NETWORK_SPECS:
    raise ValueError(
        "No network architecture defined for env {!r}; known envs: {}".format(
            env_name, sorted(NETWORK_SPECS)
        )
    )

_trunk = {k: v for k, v in NETWORK_SPECS[env_name].items() if k != 'n_actions'}

# Policy network: softmax over the discrete actions.
model = _build_network(
    n_outputs=NETWORK_SPECS[env_name]['n_actions'],
    output_activation='softmax',
    **_trunk
)
# Value network: same trunk, single linear output.  Built for every
# environment, including MountainCar-v0, which previously had none.
value = _build_network(n_outputs=1, output_activation=None, **_trunk)

In [6]:
# Build the PPO agent from the policy and value networks plus the
# hyperparameters stored in the run config.
# NOTE(review): this reads config['env_name'], whereas the gym id used to
# create `env` comes from config['env'] -- confirm both keys are intended.
agent = PPO_agent(
    model, value,
    env=env,
    env_name=config['env_name'],
    minibatch_size=config['minibatch_size'],
    learning_rate=config['learning_rate'],
)

In [7]:
# Step limit read from the run config; it is passed to agent.train() as
# `t_max` and also bounds the number of iterations of the evaluation loop.
t_max = config['t_max']

In [8]:
# Train the agent for config['train_epochs'] epochs with the step limit above.
# NOTE(review): the recorded output of this cell ends in a ValueError about an
# input of shape (0,) after the checkpoint at [49] -- presumably an empty
# batch reached one of the networks inside PPO_agent.train; investigate there.
agent.train(epochs=config['train_epochs'], t_max=t_max)

Training epochs:   0%|          | 0/200 [00:00<?, ?it/s]

[4] Average reward: 26.66
Saving to checkpoint...
[9] Average reward: 32.72
Saving to checkpoint...
[14] Average reward: 41.01
Saving to checkpoint...
[19] Average reward: 56.69
Saving to checkpoint...
[24] Average reward: 88.86
Saving to checkpoint...
[29] Average reward: 123.8
Saving to checkpoint...
[34] Average reward: 181.85
Saving to checkpoint...
[39] Average reward: 219.55
Saving to checkpoint...
[44] Average reward: 285.04
Saving to checkpoint...
[49] Average reward: 382.14
Saving to checkpoint...


ValueError: Input 0 of layer sequential is incompatible with the layer: : expected min_ndim=2, found ndim=1. Full shape received: (0,)

In [9]:
# Roll out one rendered episode with the greedy policy and report the
# accumulated reward on stderr.
#
# The whole rollout sits in try/finally so that env.close() runs even when a
# step raises; previously any exception in this cell skipped the close and
# left the environment (and its render window) open.
try:
    obs = agent.preprocess(env.reset())
    reward = 0
    for _ in range(t_max):
        # Show the action probabilities for the current observation.
        print(agent.get_policy(obs))
        act = agent.get_action(obs, greedy=True)[0]
        obs, r, dn, info = env.step(agent.action_wrapper(act))
        env.render()
        time.sleep(0.005)  # brief pause so the rendering is watchable
        obs = agent.preprocess(obs)
        reward += r
        if dn:
            break

    print("Total reward: {}".format(reward), file=sys.stderr)
finally:
    env.close()

tf.Tensor([[0.46772587 0.5322741 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.87525237 0.12474764]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.4485132 0.5514868]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.8690124  0.13098763]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.4354331  0.56456697]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.8648063  0.13519375]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.4279967 0.5720033]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.86266935 0.1373306 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.4261678 0.5738323]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.8626061  0.13739385]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.4297937  0.57020634]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.86460334 0.13539664]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.4389122 0.5610878]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.8686104  0.13138969]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.45342585 0.54657406]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.8742

tf.Tensor([[0.7080575  0.29194254]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.20677412 0.7932259 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.6889784  0.31102157]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.1937609  0.80623907]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.6678337  0.33216634]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.18026273 0.8197373 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.6440017 0.3559983]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.16623448 0.8337655 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.6167827  0.38321722]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.15158676 0.8484132 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.5854098 0.4145902]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.13630778 0.8636922 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.5490856 0.4509144]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.1208953  0.87910473]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.5068595  0.49314043]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.10

tf.Tensor([[0.8140797  0.18592022]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.38514403 0.614856  ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.8014497  0.19855031]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.36909294 0.63090706]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.78988165 0.2101183 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.35535562 0.6446443 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.7792595  0.22074045]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.34347185 0.65652823]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.76944166 0.23055832]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.3330483  0.66695166]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.7602705 0.2397295]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.3237473 0.6762527]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.7515781  0.24842189]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.3152756 0.6847244]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.7433841  0.25661585]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.30

tf.Tensor([[0.7904165  0.20958354]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.37235868 0.6276413 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.7899071  0.21009293]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.37306115 0.6269389 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.79120755 0.20879246]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.37629175 0.62370825]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.7942753  0.20572478]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.3820837 0.6179162]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.79907835 0.20092165]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.39055318 0.6094468 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.80559164 0.19440836]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.4018986  0.59810144]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.8137899  0.18621013]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.41640148 0.58359855]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.82363766 0.17636237]], shape=(1, 2), dtype=float32)
tf.Tensor([[

tf.Tensor([[0.58978885 0.41021118]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.19193538 0.80806464]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.5963402  0.40365985]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.19396165 0.8060384 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.59653544 0.4034645 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.19221435 0.8077857 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.5904196 0.4095804]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.18674791 0.8132521 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.5777886 0.4222114]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.17769827 0.8223017 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.5582102 0.4417898]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.16530384 0.8346962 ]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.5310746  0.46892536]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.14993736 0.85006267]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.49570316 0.50429684]], shape=(1, 2), dtype=float32)
tf.Tensor([[0.82

NameError: name 'sys' is not defined