In [2]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

# Gym environment id: passed to gym.make() and embedded in the saved weights filename.
ENV_NAME = 'CartPole-v0'

In [3]:
# Instantiate the Gym environment, then seed both NumPy's global RNG and the
# environment with the same fixed value (123).
env = gym.make(ENV_NAME)
for seed_rng in (np.random.seed, env.seed):
    seed_rng(123)

# Problem dimensions read off the environment's spaces: the number of discrete
# actions and the length of the first observation axis.
nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [7]:
# Policy network: flatten the (1, observation) input, run it through the
# hidden Dense+ReLU stages listed in `hidden_units`, and finish with a softmax
# over the available actions.
#
# Option 1 (simple model): hidden_units = []   -> Flatten, Dense, softmax only
# Option 2 (deep network): hidden_units = [16, 16, 16]
hidden_units = [16, 16, 16]

model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for units in hidden_units:
    # One hidden stage: linear projection followed by a separate ReLU layer.
    model.add(Dense(units))
    model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

In [None]:
# Show the architecture. `summary()` prints the table itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
model.summary()

# Memory handed to the CEM agent. window_length=1 matches the leading axis of
# length 1 in the model's input shape.
memory = EpisodeParameterMemory(limit=1000, window_length=1)

# Cross-entropy-method agent wrapping the policy network.
# NOTE(review): the hyper-parameter meanings below are assumed from their names;
# confirm against the keras-rl CEMAgent documentation:
#   nb_steps_warmup=2000 - steps collected before the first update,
#   train_interval=50 / batch_size=50 - update cadence and sample count,
#   elite_frac=0.05 - fraction of best samples kept for each update.
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Train for 100000 environment steps without rendering; verbose=2 logs one
# line per finished episode.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)

# After training is done, we save the best weights.
cem.save_weights('cem_{}_params.h5f'.format(ENV_NAME), overwrite=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
_________________________________________________________________
activation_4 (Activation)    (None, 16)                0         
__________

   740/100000: episode: 38, duration: 0.024s, episode steps: 23, steps per second: 967, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.652 [0.000, 1.000], mean observation: -0.089 [-2.459, 1.345], mean_best_reward: --
   752/100000: episode: 39, duration: 0.019s, episode steps: 12, steps per second: 630, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.117 [-2.292, 1.337], mean_best_reward: --
   772/100000: episode: 40, duration: 0.018s, episode steps: 20, steps per second: 1113, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.079 [-0.820, 1.647], mean_best_reward: --
   789/100000: episode: 41, duration: 0.016s, episode steps: 17, steps per second: 1082, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.412 [0.000, 1.000], mean observation: 0.071 [-0.935, 1.450], mean_best_reward: --
   810/100000: episo

  1330/100000: episode: 74, duration: 0.022s, episode steps: 20, steps per second: 899, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.350 [0.000, 1.000], mean observation: 0.069 [-1.166, 1.985], mean_best_reward: --
  1362/100000: episode: 75, duration: 0.037s, episode steps: 32, steps per second: 867, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.062 [-0.847, 1.719], mean_best_reward: --
  1397/100000: episode: 76, duration: 0.032s, episode steps: 35, steps per second: 1099, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.486 [0.000, 1.000], mean observation: 0.018 [-0.650, 1.262], mean_best_reward: --
  1409/100000: episode: 77, duration: 0.013s, episode steps: 12, steps per second: 934, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.118 [-0.816, 1.376], mean_best_reward: --
  1423/100000: episode:

  2198/100000: episode: 119, duration: 0.038s, episode steps: 35, steps per second: 919, episode reward: 35.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.400 [0.000, 1.000], mean observation: 0.047 [-1.502, 2.089], mean_best_reward: --
  2210/100000: episode: 120, duration: 0.015s, episode steps: 12, steps per second: 820, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.113 [-2.055, 1.183], mean_best_reward: --
  2220/100000: episode: 121, duration: 0.013s, episode steps: 10, steps per second: 759, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.147 [-1.351, 2.228], mean_best_reward: --
  2232/100000: episode: 122, duration: 0.013s, episode steps: 12, steps per second: 900, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.917 [0.000, 1.000], mean observation: -0.081 [-2.966, 1.977], mean_best_reward: --
  2248/100000: epi

  2712/100000: episode: 153, duration: 0.017s, episode steps: 12, steps per second: 702, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.113 [-1.159, 1.989], mean_best_reward: --
  2742/100000: episode: 154, duration: 0.031s, episode steps: 30, steps per second: 983, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.567 [0.000, 1.000], mean observation: -0.054 [-1.652, 0.799], mean_best_reward: --
  2752/100000: episode: 155, duration: 0.010s, episode steps: 10, steps per second: 966, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.127 [-1.414, 2.253], mean_best_reward: --
  2785/100000: episode: 156, duration: 0.027s, episode steps: 33, steps per second: 1213, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.051 [-2.825, 1.767], mean_best_reward: --
  2802/100000: ep

  3322/100000: episode: 188, duration: 0.015s, episode steps: 11, steps per second: 736, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.727 [0.000, 1.000], mean observation: -0.118 [-1.880, 1.136], mean_best_reward: --
  3338/100000: episode: 189, duration: 0.025s, episode steps: 16, steps per second: 633, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.076 [-2.553, 1.555], mean_best_reward: --
  3358/100000: episode: 190, duration: 0.019s, episode steps: 20, steps per second: 1037, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.040 [-1.511, 2.347], mean_best_reward: --
  3377/100000: episode: 191, duration: 0.018s, episode steps: 19, steps per second: 1035, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.684 [0.000, 1.000], mean observation: -0.056 [-2.208, 1.392], mean_best_reward: --
  3394/100000: 

  3924/100000: episode: 224, duration: 0.015s, episode steps: 12, steps per second: 824, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.119 [-1.200, 2.080], mean_best_reward: --
  3947/100000: episode: 225, duration: 0.026s, episode steps: 23, steps per second: 879, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.077 [-1.052, 0.403], mean_best_reward: --
  3957/100000: episode: 226, duration: 0.011s, episode steps: 10, steps per second: 908, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.136 [-1.965, 3.058], mean_best_reward: --
  3982/100000: episode: 227, duration: 0.023s, episode steps: 25, steps per second: 1077, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.085 [-1.036, 0.560], mean_best_reward: --
  3994/100000: ep

  4786/100000: episode: 267, duration: 0.023s, episode steps: 17, steps per second: 743, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.647 [0.000, 1.000], mean observation: -0.083 [-2.014, 1.168], mean_best_reward: --
  4806/100000: episode: 268, duration: 0.021s, episode steps: 20, steps per second: 949, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.650 [0.000, 1.000], mean observation: -0.085 [-2.300, 1.356], mean_best_reward: --
  4819/100000: episode: 269, duration: 0.014s, episode steps: 13, steps per second: 953, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.109 [-1.786, 0.951], mean_best_reward: --
  4830/100000: episode: 270, duration: 0.012s, episode steps: 11, steps per second: 899, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.818 [0.000, 1.000], mean observation: -0.110 [-2.246, 1.353], mean_best_reward: --
  4854/100000: e

  5536/100000: episode: 304, duration: 0.030s, episode steps: 20, steps per second: 658, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.067 [-1.689, 0.994], mean_best_reward: --
  5560/100000: episode: 305, duration: 0.039s, episode steps: 24, steps per second: 620, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.056 [-0.860, 0.579], mean_best_reward: --
  5617/100000: episode: 306, duration: 0.066s, episode steps: 57, steps per second: 867, episode reward: 57.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.526 [0.000, 1.000], mean observation: -0.092 [-1.673, 0.857], mean_best_reward: --
  5627/100000: episode: 307, duration: 0.010s, episode steps: 10, steps per second: 1026, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: -0.120 [-1.672, 0.988], mean_best_reward: --
  5650/100000: 

  6355/100000: episode: 346, duration: 0.014s, episode steps: 12, steps per second: 871, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.167 [0.000, 1.000], mean observation: 0.121 [-1.585, 2.521], mean_best_reward: --
  6366/100000: episode: 347, duration: 0.013s, episode steps: 11, steps per second: 818, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.818 [0.000, 1.000], mean observation: -0.136 [-2.347, 1.418], mean_best_reward: --
  6390/100000: episode: 348, duration: 0.021s, episode steps: 24, steps per second: 1151, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.107 [-1.544, 2.766], mean_best_reward: --
  6401/100000: episode: 349, duration: 0.011s, episode steps: 11, steps per second: 1009, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.091 [0.000, 1.000], mean observation: 0.135 [-1.762, 2.781], mean_best_reward: --
  6415/100000: ep

  7013/100000: episode: 382, duration: 0.017s, episode steps: 16, steps per second: 944, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.250 [0.000, 1.000], mean observation: 0.085 [-1.561, 2.470], mean_best_reward: --
  7035/100000: episode: 383, duration: 0.022s, episode steps: 22, steps per second: 989, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.409 [0.000, 1.000], mean observation: 0.069 [-1.143, 1.782], mean_best_reward: --
  7061/100000: episode: 384, duration: 0.022s, episode steps: 26, steps per second: 1161, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.423 [0.000, 1.000], mean observation: 0.041 [-1.023, 1.564], mean_best_reward: --
  7084/100000: episode: 385, duration: 0.020s, episode steps: 23, steps per second: 1138, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.522 [0.000, 1.000], mean observation: -0.085 [-1.146, 0.584], mean_best_reward: --
  7096/100000: ep

  7772/100000: episode: 421, duration: 0.029s, episode steps: 17, steps per second: 587, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.588 [0.000, 1.000], mean observation: -0.062 [-1.416, 0.833], mean_best_reward: --
  7784/100000: episode: 422, duration: 0.020s, episode steps: 12, steps per second: 603, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.833 [0.000, 1.000], mean observation: -0.135 [-2.596, 1.538], mean_best_reward: --
  7804/100000: episode: 423, duration: 0.025s, episode steps: 20, steps per second: 794, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: -0.050 [-2.352, 1.514], mean_best_reward: --
  7822/100000: episode: 424, duration: 0.020s, episode steps: 18, steps per second: 896, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.081 [-1.110, 0.616], mean_best_reward: --
  7841/100000: e

  8626/100000: episode: 460, duration: 0.014s, episode steps: 13, steps per second: 898, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.231 [0.000, 1.000], mean observation: 0.109 [-1.351, 2.304], mean_best_reward: --
  8642/100000: episode: 461, duration: 0.017s, episode steps: 16, steps per second: 933, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.438 [0.000, 1.000], mean observation: 0.090 [-0.770, 1.282], mean_best_reward: --
  8681/100000: episode: 462, duration: 0.032s, episode steps: 39, steps per second: 1233, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.075 [-0.755, 1.234], mean_best_reward: --
  8695/100000: episode: 463, duration: 0.013s, episode steps: 14, steps per second: 1091, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.643 [0.000, 1.000], mean observation: -0.078 [-1.794, 1.198], mean_best_reward: --
  8728/100000: ep

  9546/100000: episode: 505, duration: 0.025s, episode steps: 22, steps per second: 883, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.076 [-0.583, 1.114], mean_best_reward: --
  9576/100000: episode: 506, duration: 0.029s, episode steps: 30, steps per second: 1042, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.104 [-0.787, 1.573], mean_best_reward: --
  9584/100000: episode: 507, duration: 0.009s, episode steps: 8, steps per second: 910, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.146 [-1.611, 2.547], mean_best_reward: --
  9595/100000: episode: 508, duration: 0.011s, episode steps: 11, steps per second: 1005, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.091 [0.000, 1.000], mean observation: 0.141 [-1.749, 2.894], mean_best_reward: --
  9635/100000: episo

 10466/100000: episode: 546, duration: 0.049s, episode steps: 50, steps per second: 1014, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: -0.023 [-1.378, 0.601], mean_best_reward: --
 10479/100000: episode: 547, duration: 0.012s, episode steps: 13, steps per second: 1092, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.846 [0.000, 1.000], mean observation: -0.089 [-2.673, 1.776], mean_best_reward: --
 10489/100000: episode: 548, duration: 0.011s, episode steps: 10, steps per second: 926, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.154 [-1.720, 2.716], mean_best_reward: --
 10507/100000: episode: 549, duration: 0.016s, episode steps: 18, steps per second: 1108, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.066 [-0.991, 0.630], mean_best_reward: --
 10526/100000:

 11102/100000: episode: 585, duration: 0.012s, episode steps: 11, steps per second: 922, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.727 [0.000, 1.000], mean observation: -0.121 [-1.923, 1.184], mean_best_reward: --
 11117/100000: episode: 586, duration: 0.021s, episode steps: 15, steps per second: 707, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.267 [0.000, 1.000], mean observation: 0.078 [-1.416, 2.310], mean_best_reward: --
 11129/100000: episode: 587, duration: 0.011s, episode steps: 12, steps per second: 1070, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.097 [-0.813, 1.221], mean_best_reward: --
 11144/100000: episode: 588, duration: 0.014s, episode steps: 15, steps per second: 1088, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.267 [0.000, 1.000], mean observation: 0.082 [-1.388, 2.139], mean_best_reward: --
 11162/100000: ep

 11722/100000: episode: 627, duration: 0.016s, episode steps: 14, steps per second: 864, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.571 [0.000, 1.000], mean observation: -0.115 [-1.221, 0.552], mean_best_reward: --
 11733/100000: episode: 628, duration: 0.015s, episode steps: 11, steps per second: 748, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.110 [-1.220, 1.765], mean_best_reward: --
 11748/100000: episode: 629, duration: 0.014s, episode steps: 15, steps per second: 1051, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.267 [0.000, 1.000], mean observation: 0.089 [-1.414, 2.290], mean_best_reward: --
 11764/100000: episode: 630, duration: 0.015s, episode steps: 16, steps per second: 1103, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.312 [0.000, 1.000], mean observation: 0.074 [-1.329, 2.132], mean_best_reward: --
 11794/100000: ep

 12416/100000: episode: 664, duration: 0.020s, episode steps: 21, steps per second: 1048, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.619 [0.000, 1.000], mean observation: -0.052 [-1.774, 0.996], mean_best_reward: --
 12432/100000: episode: 665, duration: 0.019s, episode steps: 16, steps per second: 853, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.312 [0.000, 1.000], mean observation: 0.064 [-1.219, 2.028], mean_best_reward: --
 12456/100000: episode: 666, duration: 0.020s, episode steps: 24, steps per second: 1200, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.089 [-0.357, 0.907], mean_best_reward: --
 12478/100000: episode: 667, duration: 0.019s, episode steps: 22, steps per second: 1156, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.094 [-0.758, 1.453], mean_best_reward: --
 12487/100000: e

 13043/100000: episode: 701, duration: 0.013s, episode steps: 12, steps per second: 916, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.833 [0.000, 1.000], mean observation: -0.123 [-2.560, 1.582], mean_best_reward: 69.000000
 13061/100000: episode: 702, duration: 0.021s, episode steps: 18, steps per second: 869, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.081 [-2.139, 1.164], mean_best_reward: --
 13077/100000: episode: 703, duration: 0.015s, episode steps: 16, steps per second: 1089, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.625 [0.000, 1.000], mean observation: -0.115 [-1.567, 0.753], mean_best_reward: --
 13090/100000: episode: 704, duration: 0.012s, episode steps: 13, steps per second: 1052, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.231 [0.000, 1.000], mean observation: 0.118 [-1.332, 2.306], mean_best_reward: --
 13100/1

 13716/100000: episode: 740, duration: 0.021s, episode steps: 19, steps per second: 891, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.263 [0.000, 1.000], mean observation: 0.069 [-1.712, 2.719], mean_best_reward: --
 13729/100000: episode: 741, duration: 0.014s, episode steps: 13, steps per second: 955, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.385 [0.000, 1.000], mean observation: 0.118 [-0.768, 1.418], mean_best_reward: --
 13748/100000: episode: 742, duration: 0.016s, episode steps: 19, steps per second: 1155, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: 0.100 [-0.624, 1.006], mean_best_reward: --
 13758/100000: episode: 743, duration: 0.010s, episode steps: 10, steps per second: 1003, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.127 [-1.613, 2.606], mean_best_reward: --
 13769/100000: epi

 14364/100000: episode: 785, duration: 0.028s, episode steps: 30, steps per second: 1086, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.367 [0.000, 1.000], mean observation: 0.058 [-1.537, 2.512], mean_best_reward: --
 14374/100000: episode: 786, duration: 0.013s, episode steps: 10, steps per second: 751, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.151 [-1.963, 3.089], mean_best_reward: --
 14383/100000: episode: 787, duration: 0.010s, episode steps: 9, steps per second: 932, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.141 [-2.268, 1.362], mean_best_reward: --
 14400/100000: episode: 788, duration: 0.015s, episode steps: 17, steps per second: 1101, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.706 [0.000, 1.000], mean observation: -0.078 [-2.219, 1.376], mean_best_reward: --
 14417/100000: epi

 14997/100000: episode: 824, duration: 0.014s, episode steps: 14, steps per second: 1001, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.143 [0.000, 1.000], mean observation: 0.093 [-1.941, 3.011], mean_best_reward: --
 15013/100000: episode: 825, duration: 0.022s, episode steps: 16, steps per second: 731, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.089 [-0.951, 1.759], mean_best_reward: --
 15023/100000: episode: 826, duration: 0.010s, episode steps: 10, steps per second: 976, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.114 [-2.485, 1.552], mean_best_reward: --
 15038/100000: episode: 827, duration: 0.014s, episode steps: 15, steps per second: 1111, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.109 [-0.984, 1.797], mean_best_reward: --
 15051/100000: ep

 15669/100000: episode: 862, duration: 0.047s, episode steps: 43, steps per second: 920, episode reward: 43.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.087 [-0.768, 0.944], mean_best_reward: --
 15690/100000: episode: 863, duration: 0.019s, episode steps: 21, steps per second: 1094, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.714 [0.000, 1.000], mean observation: -0.036 [-2.724, 1.770], mean_best_reward: --
 15700/100000: episode: 864, duration: 0.010s, episode steps: 10, steps per second: 977, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.103 [-2.255, 1.415], mean_best_reward: --
 15751/100000: episode: 865, duration: 0.041s, episode steps: 51, steps per second: 1230, episode reward: 51.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.510 [0.000, 1.000], mean observation: 0.082 [-0.460, 0.835], mean_best_reward: --
 15785/100000: e

 16596/100000: episode: 902, duration: 0.027s, episode steps: 25, steps per second: 920, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.440 [0.000, 1.000], mean observation: -0.109 [-1.012, 0.606], mean_best_reward: --
 16621/100000: episode: 903, duration: 0.024s, episode steps: 25, steps per second: 1045, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.440 [0.000, 1.000], mean observation: 0.084 [-0.575, 1.304], mean_best_reward: --
 16633/100000: episode: 904, duration: 0.012s, episode steps: 12, steps per second: 1026, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.083 [0.000, 1.000], mean observation: 0.105 [-1.954, 3.029], mean_best_reward: --
 16651/100000: episode: 905, duration: 0.017s, episode steps: 18, steps per second: 1076, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.079 [-0.749, 1.420], mean_best_reward: --
 16662/100000: e

 17223/100000: episode: 939, duration: 0.032s, episode steps: 38, steps per second: 1195, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.368 [0.000, 1.000], mean observation: 0.009 [-1.940, 2.889], mean_best_reward: --
 17239/100000: episode: 940, duration: 0.021s, episode steps: 16, steps per second: 759, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.070 [-1.191, 1.829], mean_best_reward: --
 17249/100000: episode: 941, duration: 0.010s, episode steps: 10, steps per second: 989, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.155 [-2.594, 1.526], mean_best_reward: --
 17273/100000: episode: 942, duration: 0.020s, episode steps: 24, steps per second: 1187, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.542 [0.000, 1.000], mean observation: -0.086 [-1.179, 0.594], mean_best_reward: --
 17285/100000: e

 17888/100000: episode: 973, duration: 0.013s, episode steps: 9, steps per second: 704, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.142 [-1.762, 2.841], mean_best_reward: --
 17906/100000: episode: 974, duration: 0.023s, episode steps: 18, steps per second: 774, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.444 [0.000, 1.000], mean observation: 0.079 [-0.830, 1.252], mean_best_reward: --
 17920/100000: episode: 975, duration: 0.013s, episode steps: 14, steps per second: 1043, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.429 [0.000, 1.000], mean observation: 0.105 [-0.767, 1.185], mean_best_reward: --
 17935/100000: episode: 976, duration: 0.014s, episode steps: 15, steps per second: 1104, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.267 [0.000, 1.000], mean observation: 0.086 [-1.393, 2.304], mean_best_reward: --
 17947/100000: episo

 18812/100000: episode: 1017, duration: 0.033s, episode steps: 26, steps per second: 792, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.577 [0.000, 1.000], mean observation: -0.046 [-1.493, 0.813], mean_best_reward: --
 18823/100000: episode: 1018, duration: 0.011s, episode steps: 11, steps per second: 983, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.636 [0.000, 1.000], mean observation: -0.115 [-1.813, 1.160], mean_best_reward: --
 18856/100000: episode: 1019, duration: 0.027s, episode steps: 33, steps per second: 1214, episode reward: 33.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.125 [-0.617, 1.905], mean_best_reward: --
 18873/100000: episode: 1020, duration: 0.015s, episode steps: 17, steps per second: 1097, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.706 [0.000, 1.000], mean observation: -0.062 [-2.290, 1.406], mean_best_reward: --
 18889/1000

 19471/100000: episode: 1052, duration: 0.015s, episode steps: 14, steps per second: 926, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.214 [0.000, 1.000], mean observation: 0.106 [-1.564, 2.608], mean_best_reward: --
 19481/100000: episode: 1053, duration: 0.015s, episode steps: 10, steps per second: 679, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.100 [0.000, 1.000], mean observation: 0.137 [-1.526, 2.516], mean_best_reward: --
 19495/100000: episode: 1054, duration: 0.013s, episode steps: 14, steps per second: 1057, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.214 [0.000, 1.000], mean observation: 0.095 [-1.566, 2.457], mean_best_reward: --
 19508/100000: episode: 1055, duration: 0.013s, episode steps: 13, steps per second: 1016, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.081 [-1.220, 1.910], mean_best_reward: --
 19521/100000:

 20107/100000: episode: 1092, duration: 0.016s, episode steps: 14, steps per second: 851, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.214 [0.000, 1.000], mean observation: 0.105 [-1.589, 2.693], mean_best_reward: --
 20118/100000: episode: 1093, duration: 0.014s, episode steps: 11, steps per second: 780, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.130 [-1.586, 2.511], mean_best_reward: --
 20129/100000: episode: 1094, duration: 0.011s, episode steps: 11, steps per second: 1020, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.115 [-1.167, 1.873], mean_best_reward: --
 20139/100000: episode: 1095, duration: 0.010s, episode steps: 10, steps per second: 994, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.146 [-1.980, 3.062], mean_best_reward: --
 20151/100000: 

 20731/100000: episode: 1132, duration: 0.024s, episode steps: 21, steps per second: 867, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.714 [0.000, 1.000], mean observation: -0.033 [-2.842, 1.978], mean_best_reward: --
 20765/100000: episode: 1133, duration: 0.030s, episode steps: 34, steps per second: 1150, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.124 [-1.384, 0.612], mean_best_reward: --
 20776/100000: episode: 1134, duration: 0.011s, episode steps: 11, steps per second: 982, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.182 [0.000, 1.000], mean observation: 0.114 [-1.418, 2.279], mean_best_reward: --
 20788/100000: episode: 1135, duration: 0.012s, episode steps: 12, steps per second: 1037, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.083 [0.000, 1.000], mean observation: 0.130 [-1.956, 3.055], mean_best_reward: --
 20813/10000

 21643/100000: episode: 1169, duration: 0.036s, episode steps: 38, steps per second: 1062, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.553 [0.000, 1.000], mean observation: -0.022 [-1.558, 0.959], mean_best_reward: --
 21654/100000: episode: 1170, duration: 0.014s, episode steps: 11, steps per second: 809, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.818 [0.000, 1.000], mean observation: -0.140 [-2.269, 1.327], mean_best_reward: --
 21693/100000: episode: 1171, duration: 0.035s, episode steps: 39, steps per second: 1121, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.513 [0.000, 1.000], mean observation: 0.094 [-0.395, 1.377], mean_best_reward: --
 21743/100000: episode: 1172, duration: 0.043s, episode steps: 50, steps per second: 1152, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.460 [0.000, 1.000], mean observation: -0.145 [-0.973, 0.821], mean_best_reward: --
 21756/100

 22838/100000: episode: 1215, duration: 0.018s, episode steps: 15, steps per second: 812, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: 0.091 [-1.139, 1.724], mean_best_reward: --
 22847/100000: episode: 1216, duration: 0.015s, episode steps: 9, steps per second: 606, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.141 [-2.793, 1.719], mean_best_reward: --
 22862/100000: episode: 1217, duration: 0.017s, episode steps: 15, steps per second: 861, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.267 [0.000, 1.000], mean observation: 0.096 [-1.382, 2.390], mean_best_reward: --
 22874/100000: episode: 1218, duration: 0.013s, episode steps: 12, steps per second: 937, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.750 [0.000, 1.000], mean observation: -0.125 [-2.057, 1.175], mean_best_reward: --
 22884/100000: e

 23451/100000: episode: 1253, duration: 0.017s, episode steps: 13, steps per second: 744, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.846 [0.000, 1.000], mean observation: -0.118 [-2.969, 1.922], mean_best_reward: --
 23467/100000: episode: 1254, duration: 0.019s, episode steps: 16, steps per second: 858, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.688 [0.000, 1.000], mean observation: -0.092 [-1.983, 1.134], mean_best_reward: --
 23478/100000: episode: 1255, duration: 0.011s, episode steps: 11, steps per second: 977, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.818 [0.000, 1.000], mean observation: -0.110 [-2.353, 1.418], mean_best_reward: --
 23497/100000: episode: 1256, duration: 0.017s, episode steps: 19, steps per second: 1112, episode reward: 19.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.421 [0.000, 1.000], mean observation: 0.108 [-0.770, 1.660], mean_best_reward: --
 23526/10000

 24370/100000: episode: 1296, duration: 0.013s, episode steps: 11, steps per second: 852, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.818 [0.000, 1.000], mean observation: -0.107 [-2.298, 1.414], mean_best_reward: --
 24387/100000: episode: 1297, duration: 0.020s, episode steps: 17, steps per second: 869, episode reward: 17.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.765 [0.000, 1.000], mean observation: -0.069 [-2.753, 1.766], mean_best_reward: --
 24400/100000: episode: 1298, duration: 0.012s, episode steps: 13, steps per second: 1050, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.692 [0.000, 1.000], mean observation: -0.093 [-1.779, 1.037], mean_best_reward: --
 24410/100000: episode: 1299, duration: 0.010s, episode steps: 10, steps per second: 983, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.130 [-2.519, 1.600], mean_best_reward: --
 24468/1000

 25261/100000: episode: 1336, duration: 0.021s, episode steps: 24, steps per second: 1157, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: 0.073 [-0.749, 1.455], mean_best_reward: --
 25299/100000: episode: 1337, duration: 0.037s, episode steps: 38, steps per second: 1028, episode reward: 38.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.605 [0.000, 1.000], mean observation: 0.105 [-1.842, 1.758], mean_best_reward: --
 25326/100000: episode: 1338, duration: 0.022s, episode steps: 27, steps per second: 1225, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: 0.111 [-0.374, 1.288], mean_best_reward: --
 25353/100000: episode: 1339, duration: 0.023s, episode steps: 27, steps per second: 1175, episode reward: 27.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.119 [-0.603, 0.970], mean_best_reward: --
 25364/10000

 26297/100000: episode: 1370, duration: 0.034s, episode steps: 37, steps per second: 1080, episode reward: 37.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.514 [0.000, 1.000], mean observation: 0.123 [-1.114, 1.146], mean_best_reward: --
 26317/100000: episode: 1371, duration: 0.022s, episode steps: 20, steps per second: 927, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.700 [0.000, 1.000], mean observation: -0.040 [-2.502, 1.596], mean_best_reward: --
 26349/100000: episode: 1372, duration: 0.027s, episode steps: 32, steps per second: 1205, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.105 [-0.680, 1.245], mean_best_reward: --
 26385/100000: episode: 1373, duration: 0.030s, episode steps: 36, steps per second: 1217, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.528 [0.000, 1.000], mean observation: 0.038 [-0.608, 0.927], mean_best_reward: --
 26396/10000

 27283/100000: episode: 1411, duration: 0.039s, episode steps: 47, steps per second: 1203, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.532 [0.000, 1.000], mean observation: -0.038 [-1.706, 0.976], mean_best_reward: --
 27299/100000: episode: 1412, duration: 0.018s, episode steps: 16, steps per second: 883, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.078 [-0.966, 1.686], mean_best_reward: --
 27322/100000: episode: 1413, duration: 0.020s, episode steps: 23, steps per second: 1159, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: 0.110 [-0.433, 1.094], mean_best_reward: --
 27337/100000: episode: 1414, duration: 0.014s, episode steps: 15, steps per second: 1090, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.600 [0.000, 1.000], mean observation: -0.111 [-1.302, 0.571], mean_best_reward: --
 27367/1000

 28223/100000: episode: 1450, duration: 0.019s, episode steps: 20, steps per second: 1040, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.094 [-0.831, 1.228], mean_best_reward: --
 28248/100000: episode: 1451, duration: 0.026s, episode steps: 25, steps per second: 976, episode reward: 25.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.480 [0.000, 1.000], mean observation: 0.095 [-0.432, 1.275], mean_best_reward: 69.500000
 28287/100000: episode: 1452, duration: 0.031s, episode steps: 39, steps per second: 1244, episode reward: 39.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.410 [0.000, 1.000], mean observation: -0.063 [-1.294, 1.564], mean_best_reward: --
 28323/100000: episode: 1453, duration: 0.029s, episode steps: 36, steps per second: 1233, episode reward: 36.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.639 [0.000, 1.000], mean observation: 0.020 [-2.804, 2.174], mean_best_reward: --
 2834

 29016/100000: episode: 1491, duration: 0.057s, episode steps: 20, steps per second: 348, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.450 [0.000, 1.000], mean observation: 0.088 [-0.813, 1.279], mean_best_reward: --
 29036/100000: episode: 1492, duration: 0.040s, episode steps: 20, steps per second: 500, episode reward: 20.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.350 [0.000, 1.000], mean observation: 0.086 [-1.144, 2.027], mean_best_reward: --
 29048/100000: episode: 1493, duration: 0.024s, episode steps: 12, steps per second: 500, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.083 [0.000, 1.000], mean observation: 0.102 [-1.987, 3.023], mean_best_reward: --
 29061/100000: episode: 1494, duration: 0.023s, episode steps: 13, steps per second: 568, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.154 [0.000, 1.000], mean observation: 0.094 [-1.755, 2.681], mean_best_reward: --
 29076/100000: e

 29869/100000: episode: 1527, duration: 0.020s, episode steps: 22, steps per second: 1106, episode reward: 22.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.068 [-0.769, 1.351], mean_best_reward: --
 29918/100000: episode: 1528, duration: 0.048s, episode steps: 49, steps per second: 1027, episode reward: 49.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: 0.050 [-0.593, 1.567], mean_best_reward: --
 29942/100000: episode: 1529, duration: 0.021s, episode steps: 24, steps per second: 1164, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.417 [0.000, 1.000], mean observation: 0.067 [-0.785, 1.560], mean_best_reward: --
 29983/100000: episode: 1530, duration: 0.034s, episode steps: 41, steps per second: 1197, episode reward: 41.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.512 [0.000, 1.000], mean observation: 0.112 [-0.373, 0.900], mean_best_reward: --
 30031/10000

 30992/100000: episode: 1568, duration: 0.046s, episode steps: 47, steps per second: 1022, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.158 [-0.894, 0.411], mean_best_reward: --
 31039/100000: episode: 1569, duration: 0.054s, episode steps: 47, steps per second: 873, episode reward: 47.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.511 [0.000, 1.000], mean observation: 0.099 [-0.581, 1.169], mean_best_reward: --
 31067/100000: episode: 1570, duration: 0.024s, episode steps: 28, steps per second: 1173, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.464 [0.000, 1.000], mean observation: -0.101 [-0.927, 0.368], mean_best_reward: --
 31093/100000: episode: 1571, duration: 0.022s, episode steps: 26, steps per second: 1163, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.423 [0.000, 1.000], mean observation: 0.031 [-0.794, 1.347], mean_best_reward: --
 31167/1000

 32177/100000: episode: 1604, duration: 0.100s, episode steps: 68, steps per second: 683, episode reward: 68.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.485 [0.000, 1.000], mean observation: 0.030 [-0.570, 1.428], mean_best_reward: --
 32193/100000: episode: 1605, duration: 0.023s, episode steps: 16, steps per second: 695, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.375 [0.000, 1.000], mean observation: 0.065 [-1.030, 1.566], mean_best_reward: --
 32208/100000: episode: 1606, duration: 0.032s, episode steps: 15, steps per second: 472, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.109 [-1.882, 0.958], mean_best_reward: --
 32222/100000: episode: 1607, duration: 0.028s, episode steps: 14, steps per second: 504, episode reward: 14.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.357 [0.000, 1.000], mean observation: 0.119 [-0.802, 1.584], mean_best_reward: --
 32249/100000: 

 34010/100000: episode: 1646, duration: 0.090s, episode steps: 87, steps per second: 969, episode reward: 87.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.483 [0.000, 1.000], mean observation: -0.084 [-1.098, 0.803], mean_best_reward: --
 34093/100000: episode: 1647, duration: 0.076s, episode steps: 83, steps per second: 1094, episode reward: 83.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.458 [0.000, 1.000], mean observation: -0.340 [-1.902, 0.891], mean_best_reward: --
 34104/100000: episode: 1648, duration: 0.013s, episode steps: 11, steps per second: 831, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.364 [0.000, 1.000], mean observation: 0.108 [-0.810, 1.300], mean_best_reward: --
 34122/100000: episode: 1649, duration: 0.027s, episode steps: 18, steps per second: 678, episode reward: 18.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.556 [0.000, 1.000], mean observation: -0.071 [-1.171, 0.635], mean_best_reward: --
 34242/10000

 35184/100000: episode: 1680, duration: 0.052s, episode steps: 62, steps per second: 1186, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: -0.105 [-0.835, 0.690], mean_best_reward: --
 35277/100000: episode: 1681, duration: 0.078s, episode steps: 93, steps per second: 1196, episode reward: 93.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.033 [-0.619, 1.278], mean_best_reward: --
 35311/100000: episode: 1682, duration: 0.029s, episode steps: 34, steps per second: 1182, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.140 [-1.113, 0.204], mean_best_reward: --
 35327/100000: episode: 1683, duration: 0.015s, episode steps: 16, steps per second: 1085, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.071 [-0.812, 1.324], mean_best_reward: --
 35374/100

In [14]:
# Evaluation: reload the parameters stored under the environment-specific
# weights file, then run 5 rendered test episodes with the CEM agent.
# The call to `cem.test` stays the last expression so its return value is
# still shown as the cell output.
weights_path = 'cem_{}_params.h5f'.format(ENV_NAME)
cem.load_weights(weights_path)
cem.test(env, visualize=True, nb_episodes=5)

Testing for 5 episodes ...
Episode 1: reward: 11.000, steps: 11
Episode 2: reward: 10.000, steps: 10
Episode 3: reward: 10.000, steps: 10
Episode 4: reward: 11.000, steps: 11
Episode 5: reward: 9.000, steps: 9


<keras.callbacks.History at 0x7fa55c7267b8>