In [None]:
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
import numpy as np
import _pickle as pickle
import gym

# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False

# model initialization
D = 80 * 80 # input dimensionality: 80x80 grid
if resume:
    # NOTE: unpickling can execute arbitrary code -- only resume from a 'save.p' you trust.
    # The context manager closes the checkpoint file as soon as the weights are loaded.
    with open('save.p', 'rb') as checkpoint:
        model = pickle.load(checkpoint)
else:
    model = {}
    model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
    model['W2'] = np.random.randn(H) / np.sqrt(H)

# one zero-filled array per weight matrix, with the same shape as that matrix
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
    """Logistic "squashing" function: maps x (scalar or array) into the interval [0,1]."""
    denominator = np.exp(-x) + 1.0
    return 1.0 / denominator

def prepro(I):
    """ prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector

    I: frame array indexed as [row, column, channel]; only channel 0 is read.
       The input array is left untouched.
    Returns a flat float64 vector holding 1.0 for foreground pixels and 0.0 otherwise
    (6400 entries for a 210x160 frame).
    """
    # crop rows 35..194 and downsample by a factor of 2 in one slice (a view, no copy yet)
    pixels = I[35:195:2, ::2, 0]
    # values 144 and 109 are the two background types; those and existing zeros map to 0,
    # everything else (paddles, ball) maps to 1. Building a fresh mask instead of assigning
    # into the view keeps the caller's frame unmodified.
    foreground = (pixels != 0) & (pixels != 144) & (pixels != 109)
    # np.float64 replaces the np.float alias, which was removed from NumPy in 1.24
    return foreground.astype(np.float64).ravel()

def discount_rewards(r, discount=None):
    """ take 1D array of rewards and compute discounted reward

    r: rewards indexed by time step along the first axis (1D, or a single column of
       shape (N, 1)). Non-float input is promoted to float64 so the discounted sums
       are not truncated.
    discount: per-step discount factor; when omitted the module-level `gamma` is used.
    Returns a float array with the same shape as r where entry t is the discounted sum
    of the rewards from step t onward, restarted at every non-zero reward.
    """
    if discount is None:
        discount = gamma
    r = np.asarray(r)
    if not np.issubdtype(r.dtype, np.floating):
        r = r.astype(np.float64) # zeros_like on an integer array would round every partial sum
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(r.size)):
        if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * discount + r[t]
        discounted_r[t] = running_add
    return discounted_r

def policy_forward(x):
    """Forward pass of the two-layer policy network stored in the global `model`.

    x: input vector fed to the first layer.
    Returns (p, h): p is the sigmoid of the output score, which the caller treats as the
    probability of taking action 2, and h is the hidden state after the ReLU.
    """
    pre_activation = model['W1'].dot(x)
    # ReLU nonlinearity: negative entries become 0, everything else passes through
    hidden = np.where(pre_activation < 0, 0, pre_activation)
    score = model['W2'].dot(hidden)
    return sigmoid(score), hidden

def policy_backward(eph, epdlogp, inputs=None):
    """ backward pass. (eph is array of intermediate hidden states)

    eph: hidden states, stacked one row per time step.
    epdlogp: gradient on the output score for each time step
             (assumed to be a column with one row per step -- TODO confirm against caller).
    inputs: network inputs, stacked one row per time step. When omitted, the module-level
            `epx` is used, which is what this function always read implicitly before.
    Returns a dict with the gradients for 'W1' and 'W2'.
    """
    if inputs is None:
        inputs = epx # legacy path: fall back to the global set by the training loop
    dW2 = np.dot(eph.T, epdlogp).ravel()
    dh = np.outer(epdlogp, model['W2'])
    dh[eph <= 0] = 0 # backprop through ReLU: no gradient where the hidden unit was inactive
    dW1 = np.dot(dh.T, inputs)
    return {'W1':dW1, 'W2':dW2}

# Environment setup and per-episode bookkeeping for the training loop below.
env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None # used in computing the difference frame
xs,hs,dlogps,drs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0

# Main loop: play one frame per iteration; at the end of each episode compute the policy
# gradient, and every batch_size episodes apply an RMSProp update. Runs until interrupted.
while True:
    if render: env.render()

    # preprocess the observation, set input to network to be difference image
    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
        episode_number += 1

        # stack together all inputs, hidden states, action gradients, and rewards for this episode
        epx = np.vstack(xs) # NOTE: policy_backward reads this global when no inputs are passed
        eph = np.vstack(hs)
        epdlogp = np.vstack(dlogps)
        epr = np.vstack(drs)
        xs,hs,dlogps,drs = [],[],[],[] # reset array memory

        # compute the discounted reward backwards through time
        discounted_epr = discount_rewards(epr)
        # standardize the rewards to be unit normal (helps control the gradient estimator variance)
        discounted_epr -= np.mean(discounted_epr)
        reward_std = np.std(discounted_epr)
        if reward_std > 0: # constant returns would divide 0 by 0 and fill the weights with NaN
            discounted_epr /= reward_std

        epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
        grad = policy_backward(eph, epdlogp)
        for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

        # perform rmsprop parameter update every batch_size episodes
        if episode_number % batch_size == 0:
            for k,v in model.items():
                g = grad_buffer[k] # gradient
                rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
                model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
                grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

        # boring book-keeping
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
        if episode_number % 100 == 0:
            # the context manager flushes and closes the checkpoint before training continues
            with open('save.p', 'wb') as checkpoint:
                pickle.dump(model, checkpoint)
        reward_sum = 0
        observation = env.reset() # reset env
        prev_x = None

#     if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
#         print ('ep %d: game finished, reward: %f' % (episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

[2017-12-25 12:49:00,687] Making new env: Pong-v0


resetting env. episode reward total was -21.000000. running mean: -21.000000
resetting env. episode reward total was -20.000000. running mean: -20.990000
resetting env. episode reward total was -20.000000. running mean: -20.980100
resetting env. episode reward total was -20.000000. running mean: -20.970299
resetting env. episode reward total was -19.000000. running mean: -20.950596
resetting env. episode reward total was -21.000000. running mean: -20.951090
resetting env. episode reward total was -21.000000. running mean: -20.951579
resetting env. episode reward total was -21.000000. running mean: -20.952063
resetting env. episode reward total was -21.000000. running mean: -20.952543
resetting env. episode reward total was -21.000000. running mean: -20.953017
resetting env. episode reward total was -21.000000. running mean: -20.953487
resetting env. episode reward total was -21.000000. running mean: -20.953952
resetting env. episode reward total was -21.000000. running mean: -20.954413

resetting env. episode reward total was -20.000000. running mean: -20.620780
resetting env. episode reward total was -20.000000. running mean: -20.614572
resetting env. episode reward total was -21.000000. running mean: -20.618426
resetting env. episode reward total was -19.000000. running mean: -20.602242
resetting env. episode reward total was -21.000000. running mean: -20.606220
resetting env. episode reward total was -21.000000. running mean: -20.610157
resetting env. episode reward total was -21.000000. running mean: -20.614056
resetting env. episode reward total was -21.000000. running mean: -20.617915
resetting env. episode reward total was -21.000000. running mean: -20.621736
resetting env. episode reward total was -19.000000. running mean: -20.605519
resetting env. episode reward total was -21.000000. running mean: -20.609464
resetting env. episode reward total was -19.000000. running mean: -20.593369
resetting env. episode reward total was -20.000000. running mean: -20.587435

resetting env. episode reward total was -21.000000. running mean: -20.477520
resetting env. episode reward total was -21.000000. running mean: -20.482744
resetting env. episode reward total was -21.000000. running mean: -20.487917
resetting env. episode reward total was -21.000000. running mean: -20.493038
resetting env. episode reward total was -19.000000. running mean: -20.478107
resetting env. episode reward total was -21.000000. running mean: -20.483326
resetting env. episode reward total was -20.000000. running mean: -20.478493
resetting env. episode reward total was -21.000000. running mean: -20.483708
resetting env. episode reward total was -17.000000. running mean: -20.448871
resetting env. episode reward total was -21.000000. running mean: -20.454382
resetting env. episode reward total was -21.000000. running mean: -20.459839
resetting env. episode reward total was -21.000000. running mean: -20.465240
resetting env. episode reward total was -20.000000. running mean: -20.460588

resetting env. episode reward total was -21.000000. running mean: -20.360166
resetting env. episode reward total was -21.000000. running mean: -20.366564
resetting env. episode reward total was -19.000000. running mean: -20.352898
resetting env. episode reward total was -21.000000. running mean: -20.359369
resetting env. episode reward total was -21.000000. running mean: -20.365776
resetting env. episode reward total was -20.000000. running mean: -20.362118
resetting env. episode reward total was -20.000000. running mean: -20.358497
resetting env. episode reward total was -20.000000. running mean: -20.354912
resetting env. episode reward total was -20.000000. running mean: -20.351363
resetting env. episode reward total was -18.000000. running mean: -20.327849
resetting env. episode reward total was -21.000000. running mean: -20.334571
resetting env. episode reward total was -21.000000. running mean: -20.341225
resetting env. episode reward total was -21.000000. running mean: -20.347813

resetting env. episode reward total was -20.000000. running mean: -20.423796
resetting env. episode reward total was -20.000000. running mean: -20.419558
resetting env. episode reward total was -21.000000. running mean: -20.425363
resetting env. episode reward total was -21.000000. running mean: -20.431109
resetting env. episode reward total was -21.000000. running mean: -20.436798
resetting env. episode reward total was -21.000000. running mean: -20.442430
resetting env. episode reward total was -17.000000. running mean: -20.408006
resetting env. episode reward total was -20.000000. running mean: -20.403926
resetting env. episode reward total was -21.000000. running mean: -20.409886
resetting env. episode reward total was -21.000000. running mean: -20.415788
resetting env. episode reward total was -20.000000. running mean: -20.411630
resetting env. episode reward total was -19.000000. running mean: -20.397513
resetting env. episode reward total was -20.000000. running mean: -20.393538

resetting env. episode reward total was -21.000000. running mean: -20.410588
resetting env. episode reward total was -20.000000. running mean: -20.406482
resetting env. episode reward total was -21.000000. running mean: -20.412418
resetting env. episode reward total was -21.000000. running mean: -20.418293
resetting env. episode reward total was -21.000000. running mean: -20.424111
resetting env. episode reward total was -20.000000. running mean: -20.419869
resetting env. episode reward total was -21.000000. running mean: -20.425671
resetting env. episode reward total was -21.000000. running mean: -20.431414
resetting env. episode reward total was -20.000000. running mean: -20.427100
resetting env. episode reward total was -21.000000. running mean: -20.432829
resetting env. episode reward total was -20.000000. running mean: -20.428501
resetting env. episode reward total was -21.000000. running mean: -20.434216
resetting env. episode reward total was -21.000000. running mean: -20.439873

resetting env. episode reward total was -21.000000. running mean: -20.394500
resetting env. episode reward total was -20.000000. running mean: -20.390555
resetting env. episode reward total was -20.000000. running mean: -20.386649
resetting env. episode reward total was -20.000000. running mean: -20.382783
resetting env. episode reward total was -21.000000. running mean: -20.388955
resetting env. episode reward total was -20.000000. running mean: -20.385065
resetting env. episode reward total was -18.000000. running mean: -20.361215
resetting env. episode reward total was -21.000000. running mean: -20.367603
resetting env. episode reward total was -21.000000. running mean: -20.373926
resetting env. episode reward total was -19.000000. running mean: -20.360187
resetting env. episode reward total was -21.000000. running mean: -20.366585
resetting env. episode reward total was -20.000000. running mean: -20.362919
resetting env. episode reward total was -19.000000. running mean: -20.349290

resetting env. episode reward total was -21.000000. running mean: -20.272747
resetting env. episode reward total was -21.000000. running mean: -20.280019
resetting env. episode reward total was -20.000000. running mean: -20.277219
resetting env. episode reward total was -21.000000. running mean: -20.284447
resetting env. episode reward total was -18.000000. running mean: -20.261602
resetting env. episode reward total was -21.000000. running mean: -20.268986
resetting env. episode reward total was -21.000000. running mean: -20.276296
resetting env. episode reward total was -20.000000. running mean: -20.273533
resetting env. episode reward total was -21.000000. running mean: -20.280798
resetting env. episode reward total was -20.000000. running mean: -20.277990
resetting env. episode reward total was -20.000000. running mean: -20.275210
resetting env. episode reward total was -21.000000. running mean: -20.282458
resetting env. episode reward total was -21.000000. running mean: -20.289634

resetting env. episode reward total was -21.000000. running mean: -20.199574
resetting env. episode reward total was -19.000000. running mean: -20.187578
resetting env. episode reward total was -21.000000. running mean: -20.195702
resetting env. episode reward total was -21.000000. running mean: -20.203745
resetting env. episode reward total was -20.000000. running mean: -20.201708
resetting env. episode reward total was -20.000000. running mean: -20.199691
resetting env. episode reward total was -19.000000. running mean: -20.187694
resetting env. episode reward total was -20.000000. running mean: -20.185817
resetting env. episode reward total was -21.000000. running mean: -20.193959
resetting env. episode reward total was -21.000000. running mean: -20.202019
resetting env. episode reward total was -21.000000. running mean: -20.209999
resetting env. episode reward total was -21.000000. running mean: -20.217899
resetting env. episode reward total was -20.000000. running mean: -20.215720

resetting env. episode reward total was -21.000000. running mean: -20.229556
resetting env. episode reward total was -20.000000. running mean: -20.227260
resetting env. episode reward total was -19.000000. running mean: -20.214988
resetting env. episode reward total was -21.000000. running mean: -20.222838
resetting env. episode reward total was -21.000000. running mean: -20.230609
resetting env. episode reward total was -21.000000. running mean: -20.238303
resetting env. episode reward total was -20.000000. running mean: -20.235920
resetting env. episode reward total was -21.000000. running mean: -20.243561
resetting env. episode reward total was -20.000000. running mean: -20.241125
resetting env. episode reward total was -21.000000. running mean: -20.248714
resetting env. episode reward total was -21.000000. running mean: -20.256227
resetting env. episode reward total was -20.000000. running mean: -20.253665
resetting env. episode reward total was -20.000000. running mean: -20.251128

resetting env. episode reward total was -20.000000. running mean: -20.349650
resetting env. episode reward total was -19.000000. running mean: -20.336154
resetting env. episode reward total was -21.000000. running mean: -20.342792
resetting env. episode reward total was -21.000000. running mean: -20.349364
resetting env. episode reward total was -20.000000. running mean: -20.345870
resetting env. episode reward total was -18.000000. running mean: -20.322412
resetting env. episode reward total was -20.000000. running mean: -20.319188
resetting env. episode reward total was -21.000000. running mean: -20.325996
resetting env. episode reward total was -20.000000. running mean: -20.322736
resetting env. episode reward total was -21.000000. running mean: -20.329508
resetting env. episode reward total was -19.000000. running mean: -20.316213
resetting env. episode reward total was -21.000000. running mean: -20.323051
resetting env. episode reward total was -20.000000. running mean: -20.319821

resetting env. episode reward total was -20.000000. running mean: -20.295577
resetting env. episode reward total was -21.000000. running mean: -20.302621
resetting env. episode reward total was -20.000000. running mean: -20.299595
resetting env. episode reward total was -21.000000. running mean: -20.306599
resetting env. episode reward total was -19.000000. running mean: -20.293533
resetting env. episode reward total was -21.000000. running mean: -20.300598
resetting env. episode reward total was -20.000000. running mean: -20.297592
resetting env. episode reward total was -21.000000. running mean: -20.304616
resetting env. episode reward total was -20.000000. running mean: -20.301570
resetting env. episode reward total was -20.000000. running mean: -20.298554
resetting env. episode reward total was -21.000000. running mean: -20.305568
resetting env. episode reward total was -20.000000. running mean: -20.302513
resetting env. episode reward total was -21.000000. running mean: -20.309487

resetting env. episode reward total was -20.000000. running mean: -20.302854
resetting env. episode reward total was -21.000000. running mean: -20.309825
resetting env. episode reward total was -20.000000. running mean: -20.306727
resetting env. episode reward total was -17.000000. running mean: -20.273660
resetting env. episode reward total was -21.000000. running mean: -20.280923
resetting env. episode reward total was -21.000000. running mean: -20.288114
resetting env. episode reward total was -20.000000. running mean: -20.285233
resetting env. episode reward total was -21.000000. running mean: -20.292380
resetting env. episode reward total was -21.000000. running mean: -20.299457
resetting env. episode reward total was -20.000000. running mean: -20.296462
resetting env. episode reward total was -19.000000. running mean: -20.283497
resetting env. episode reward total was -19.000000. running mean: -20.270662
resetting env. episode reward total was -20.000000. running mean: -20.267956

resetting env. episode reward total was -21.000000. running mean: -20.304504
resetting env. episode reward total was -21.000000. running mean: -20.311459
resetting env. episode reward total was -21.000000. running mean: -20.318344
resetting env. episode reward total was -19.000000. running mean: -20.305161
resetting env. episode reward total was -21.000000. running mean: -20.312109
resetting env. episode reward total was -19.000000. running mean: -20.298988
resetting env. episode reward total was -21.000000. running mean: -20.305998
resetting env. episode reward total was -20.000000. running mean: -20.302938
resetting env. episode reward total was -20.000000. running mean: -20.299909
resetting env. episode reward total was -21.000000. running mean: -20.306910
resetting env. episode reward total was -20.000000. running mean: -20.303840
resetting env. episode reward total was -21.000000. running mean: -20.310802
resetting env. episode reward total was -21.000000. running mean: -20.317694

resetting env. episode reward total was -21.000000. running mean: -20.307162
resetting env. episode reward total was -21.000000. running mean: -20.314090
resetting env. episode reward total was -20.000000. running mean: -20.310949
resetting env. episode reward total was -21.000000. running mean: -20.317840
resetting env. episode reward total was -18.000000. running mean: -20.294662
resetting env. episode reward total was -21.000000. running mean: -20.301715
resetting env. episode reward total was -18.000000. running mean: -20.278698
resetting env. episode reward total was -18.000000. running mean: -20.255911
resetting env. episode reward total was -21.000000. running mean: -20.263352
resetting env. episode reward total was -20.000000. running mean: -20.260718
resetting env. episode reward total was -21.000000. running mean: -20.268111
resetting env. episode reward total was -20.000000. running mean: -20.265430
resetting env. episode reward total was -18.000000. running mean: -20.242776

resetting env. episode reward total was -21.000000. running mean: -20.233501
resetting env. episode reward total was -20.000000. running mean: -20.231166
resetting env. episode reward total was -20.000000. running mean: -20.228854
resetting env. episode reward total was -21.000000. running mean: -20.236566
resetting env. episode reward total was -18.000000. running mean: -20.214200
resetting env. episode reward total was -19.000000. running mean: -20.202058
resetting env. episode reward total was -21.000000. running mean: -20.210037
resetting env. episode reward total was -18.000000. running mean: -20.187937
resetting env. episode reward total was -19.000000. running mean: -20.176058
resetting env. episode reward total was -21.000000. running mean: -20.184297
resetting env. episode reward total was -19.000000. running mean: -20.172454
resetting env. episode reward total was -21.000000. running mean: -20.180730
resetting env. episode reward total was -21.000000. running mean: -20.188922

resetting env. episode reward total was -21.000000. running mean: -20.206297
resetting env. episode reward total was -21.000000. running mean: -20.214234
resetting env. episode reward total was -21.000000. running mean: -20.222092
resetting env. episode reward total was -21.000000. running mean: -20.229871
resetting env. episode reward total was -19.000000. running mean: -20.217572
resetting env. episode reward total was -21.000000. running mean: -20.225396
resetting env. episode reward total was -21.000000. running mean: -20.233142
resetting env. episode reward total was -20.000000. running mean: -20.230811
resetting env. episode reward total was -20.000000. running mean: -20.228503
resetting env. episode reward total was -20.000000. running mean: -20.226218
resetting env. episode reward total was -21.000000. running mean: -20.233956
resetting env. episode reward total was -20.000000. running mean: -20.231616
resetting env. episode reward total was -20.000000. running mean: -20.229300

resetting env. episode reward total was -19.000000. running mean: -20.206597
resetting env. episode reward total was -20.000000. running mean: -20.204531
resetting env. episode reward total was -21.000000. running mean: -20.212486
resetting env. episode reward total was -21.000000. running mean: -20.220361
resetting env. episode reward total was -21.000000. running mean: -20.228157
resetting env. episode reward total was -20.000000. running mean: -20.225876
resetting env. episode reward total was -21.000000. running mean: -20.233617
resetting env. episode reward total was -19.000000. running mean: -20.221281
resetting env. episode reward total was -20.000000. running mean: -20.219068
resetting env. episode reward total was -19.000000. running mean: -20.206877
resetting env. episode reward total was -20.000000. running mean: -20.204808
resetting env. episode reward total was -20.000000. running mean: -20.202760
resetting env. episode reward total was -20.000000. running mean: -20.200733

resetting env. episode reward total was -19.000000. running mean: -20.221635
resetting env. episode reward total was -19.000000. running mean: -20.209419
resetting env. episode reward total was -21.000000. running mean: -20.217325
resetting env. episode reward total was -20.000000. running mean: -20.215152
resetting env. episode reward total was -21.000000. running mean: -20.223000
resetting env. episode reward total was -20.000000. running mean: -20.220770
resetting env. episode reward total was -20.000000. running mean: -20.218562
resetting env. episode reward total was -21.000000. running mean: -20.226377
resetting env. episode reward total was -20.000000. running mean: -20.224113
resetting env. episode reward total was -20.000000. running mean: -20.221872
resetting env. episode reward total was -20.000000. running mean: -20.219653
resetting env. episode reward total was -21.000000. running mean: -20.227457
resetting env. episode reward total was -21.000000. running mean: -20.235182

resetting env. episode reward total was -20.000000. running mean: -20.273070
resetting env. episode reward total was -19.000000. running mean: -20.260340
resetting env. episode reward total was -21.000000. running mean: -20.267736
resetting env. episode reward total was -20.000000. running mean: -20.265059
resetting env. episode reward total was -20.000000. running mean: -20.262408
resetting env. episode reward total was -21.000000. running mean: -20.269784
resetting env. episode reward total was -21.000000. running mean: -20.277086
resetting env. episode reward total was -21.000000. running mean: -20.284316
resetting env. episode reward total was -21.000000. running mean: -20.291472
resetting env. episode reward total was -21.000000. running mean: -20.298558
resetting env. episode reward total was -21.000000. running mean: -20.305572
resetting env. episode reward total was -21.000000. running mean: -20.312516
resetting env. episode reward total was -19.000000. running mean: -20.299391

resetting env. episode reward total was -21.000000. running mean: -20.096665
resetting env. episode reward total was -21.000000. running mean: -20.105699
resetting env. episode reward total was -20.000000. running mean: -20.104642
resetting env. episode reward total was -21.000000. running mean: -20.113595
resetting env. episode reward total was -19.000000. running mean: -20.102459
resetting env. episode reward total was -20.000000. running mean: -20.101435
resetting env. episode reward total was -21.000000. running mean: -20.110420
resetting env. episode reward total was -21.000000. running mean: -20.119316
resetting env. episode reward total was -20.000000. running mean: -20.118123
resetting env. episode reward total was -20.000000. running mean: -20.116942
resetting env. episode reward total was -21.000000. running mean: -20.125772
resetting env. episode reward total was -21.000000. running mean: -20.134514
resetting env. episode reward total was -21.000000. running mean: -20.143169

resetting env. episode reward total was -21.000000. running mean: -20.291196
resetting env. episode reward total was -21.000000. running mean: -20.298284
resetting env. episode reward total was -20.000000. running mean: -20.295301
resetting env. episode reward total was -21.000000. running mean: -20.302348
resetting env. episode reward total was -20.000000. running mean: -20.299325
resetting env. episode reward total was -19.000000. running mean: -20.286331
resetting env. episode reward total was -21.000000. running mean: -20.293468
resetting env. episode reward total was -21.000000. running mean: -20.300533
resetting env. episode reward total was -21.000000. running mean: -20.307528
resetting env. episode reward total was -20.000000. running mean: -20.304453
resetting env. episode reward total was -20.000000. running mean: -20.301408
resetting env. episode reward total was -20.000000. running mean: -20.298394
resetting env. episode reward total was -20.000000. running mean: -20.295410

resetting env. episode reward total was -18.000000. running mean: -20.195548
resetting env. episode reward total was -20.000000. running mean: -20.193592
resetting env. episode reward total was -20.000000. running mean: -20.191657
resetting env. episode reward total was -20.000000. running mean: -20.189740
resetting env. episode reward total was -21.000000. running mean: -20.197843
resetting env. episode reward total was -21.000000. running mean: -20.205864
resetting env. episode reward total was -20.000000. running mean: -20.203805
resetting env. episode reward total was -21.000000. running mean: -20.211767
resetting env. episode reward total was -21.000000. running mean: -20.219650
resetting env. episode reward total was -21.000000. running mean: -20.227453
resetting env. episode reward total was -21.000000. running mean: -20.235179
resetting env. episode reward total was -20.000000. running mean: -20.232827
resetting env. episode reward total was -20.000000. running mean: -20.230499

resetting env. episode reward total was -21.000000. running mean: -20.171598
resetting env. episode reward total was -21.000000. running mean: -20.179882
resetting env. episode reward total was -20.000000. running mean: -20.178083
resetting env. episode reward total was -20.000000. running mean: -20.176302
resetting env. episode reward total was -21.000000. running mean: -20.184539
resetting env. episode reward total was -20.000000. running mean: -20.182694
resetting env. episode reward total was -21.000000. running mean: -20.190867
resetting env. episode reward total was -21.000000. running mean: -20.198958
resetting env. episode reward total was -20.000000. running mean: -20.196969
resetting env. episode reward total was -19.000000. running mean: -20.184999
resetting env. episode reward total was -20.000000. running mean: -20.183149
resetting env. episode reward total was -20.000000. running mean: -20.181318
resetting env. episode reward total was -20.000000. running mean: -20.179504

resetting env. episode reward total was -19.000000. running mean: -20.149168
resetting env. episode reward total was -20.000000. running mean: -20.147676
resetting env. episode reward total was -20.000000. running mean: -20.146199
resetting env. episode reward total was -20.000000. running mean: -20.144737
resetting env. episode reward total was -21.000000. running mean: -20.153290
resetting env. episode reward total was -21.000000. running mean: -20.161757
resetting env. episode reward total was -20.000000. running mean: -20.160139
resetting env. episode reward total was -19.000000. running mean: -20.148538
resetting env. episode reward total was -19.000000. running mean: -20.137053
resetting env. episode reward total was -20.000000. running mean: -20.135682
resetting env. episode reward total was -20.000000. running mean: -20.134325
resetting env. episode reward total was -19.000000. running mean: -20.122982
resetting env. episode reward total was -21.000000. running mean: -20.131752

resetting env. episode reward total was -21.000000. running mean: -20.070999
resetting env. episode reward total was -21.000000. running mean: -20.080289
resetting env. episode reward total was -21.000000. running mean: -20.089486
resetting env. episode reward total was -21.000000. running mean: -20.098591
resetting env. episode reward total was -18.000000. running mean: -20.077605
resetting env. episode reward total was -21.000000. running mean: -20.086829
resetting env. episode reward total was -20.000000. running mean: -20.085961
resetting env. episode reward total was -21.000000. running mean: -20.095101
resetting env. episode reward total was -21.000000. running mean: -20.104150
resetting env. episode reward total was -20.000000. running mean: -20.103109
resetting env. episode reward total was -21.000000. running mean: -20.112078
resetting env. episode reward total was -21.000000. running mean: -20.120957
resetting env. episode reward total was -20.000000. running mean: -20.119747

resetting env. episode reward total was -20.000000. running mean: -20.185329
resetting env. episode reward total was -20.000000. running mean: -20.183476
resetting env. episode reward total was -19.000000. running mean: -20.171641
resetting env. episode reward total was -21.000000. running mean: -20.179924
resetting env. episode reward total was -20.000000. running mean: -20.178125
resetting env. episode reward total was -20.000000. running mean: -20.176344
resetting env. episode reward total was -21.000000. running mean: -20.184580
resetting env. episode reward total was -21.000000. running mean: -20.192735
resetting env. episode reward total was -21.000000. running mean: -20.200807
resetting env. episode reward total was -21.000000. running mean: -20.208799
resetting env. episode reward total was -19.000000. running mean: -20.196711
resetting env. episode reward total was -20.000000. running mean: -20.194744
resetting env. episode reward total was -20.000000. running mean: -20.192797

resetting env. episode reward total was -20.000000. running mean: -20.080019
resetting env. episode reward total was -21.000000. running mean: -20.089219
resetting env. episode reward total was -19.000000. running mean: -20.078326
resetting env. episode reward total was -21.000000. running mean: -20.087543
resetting env. episode reward total was -19.000000. running mean: -20.076668
resetting env. episode reward total was -18.000000. running mean: -20.055901
resetting env. episode reward total was -19.000000. running mean: -20.045342
resetting env. episode reward total was -20.000000. running mean: -20.044889
resetting env. episode reward total was -20.000000. running mean: -20.044440
resetting env. episode reward total was -20.000000. running mean: -20.043995
resetting env. episode reward total was -20.000000. running mean: -20.043555
resetting env. episode reward total was -20.000000. running mean: -20.043120
resetting env. episode reward total was -20.000000. running mean: -20.042689

resetting env. episode reward total was -21.000000. running mean: -20.054196
resetting env. episode reward total was -20.000000. running mean: -20.053654
resetting env. episode reward total was -19.000000. running mean: -20.043117
resetting env. episode reward total was -21.000000. running mean: -20.052686
resetting env. episode reward total was -21.000000. running mean: -20.062159
resetting env. episode reward total was -21.000000. running mean: -20.071538
resetting env. episode reward total was -21.000000. running mean: -20.080822
resetting env. episode reward total was -18.000000. running mean: -20.060014
resetting env. episode reward total was -21.000000. running mean: -20.069414
resetting env. episode reward total was -21.000000. running mean: -20.078720
resetting env. episode reward total was -20.000000. running mean: -20.077933
resetting env. episode reward total was -19.000000. running mean: -20.067153
resetting env. episode reward total was -21.000000. running mean: -20.076482

resetting env. episode reward total was -21.000000. running mean: -20.101354
resetting env. episode reward total was -20.000000. running mean: -20.100340
resetting env. episode reward total was -21.000000. running mean: -20.109337
resetting env. episode reward total was -21.000000. running mean: -20.118243
resetting env. episode reward total was -20.000000. running mean: -20.117061
resetting env. episode reward total was -18.000000. running mean: -20.095890
resetting env. episode reward total was -20.000000. running mean: -20.094931
resetting env. episode reward total was -19.000000. running mean: -20.083982
resetting env. episode reward total was -17.000000. running mean: -20.053142
resetting env. episode reward total was -20.000000. running mean: -20.052611
resetting env. episode reward total was -20.000000. running mean: -20.052085
resetting env. episode reward total was -21.000000. running mean: -20.061564
resetting env. episode reward total was -21.000000. running mean: -20.070948

resetting env. episode reward total was -21.000000. running mean: -20.138817
resetting env. episode reward total was -21.000000. running mean: -20.147429
resetting env. episode reward total was -19.000000. running mean: -20.135954
resetting env. episode reward total was -19.000000. running mean: -20.124595
resetting env. episode reward total was -21.000000. running mean: -20.133349
resetting env. episode reward total was -18.000000. running mean: -20.112015
resetting env. episode reward total was -21.000000. running mean: -20.120895
resetting env. episode reward total was -19.000000. running mean: -20.109686
resetting env. episode reward total was -20.000000. running mean: -20.108589
resetting env. episode reward total was -21.000000. running mean: -20.117503
resetting env. episode reward total was -21.000000. running mean: -20.126328
resetting env. episode reward total was -20.000000. running mean: -20.125065
resetting env. episode reward total was -21.000000. running mean: -20.133814

resetting env. episode reward total was -19.000000. running mean: -20.003191
resetting env. episode reward total was -20.000000. running mean: -20.003159
resetting env. episode reward total was -20.000000. running mean: -20.003128
resetting env. episode reward total was -20.000000. running mean: -20.003096
resetting env. episode reward total was -19.000000. running mean: -19.993065
resetting env. episode reward total was -20.000000. running mean: -19.993135
resetting env. episode reward total was -21.000000. running mean: -20.003203
resetting env. episode reward total was -20.000000. running mean: -20.003171
resetting env. episode reward total was -21.000000. running mean: -20.013140
resetting env. episode reward total was -21.000000. running mean: -20.023008
resetting env. episode reward total was -20.000000. running mean: -20.022778
resetting env. episode reward total was -20.000000. running mean: -20.022550
resetting env. episode reward total was -19.000000. running mean: -20.012325

resetting env. episode reward total was -21.000000. running mean: -19.920908
resetting env. episode reward total was -21.000000. running mean: -19.931699
resetting env. episode reward total was -20.000000. running mean: -19.932382
resetting env. episode reward total was -21.000000. running mean: -19.943058
resetting env. episode reward total was -19.000000. running mean: -19.933628
resetting env. episode reward total was -20.000000. running mean: -19.934291
resetting env. episode reward total was -19.000000. running mean: -19.924948
resetting env. episode reward total was -19.000000. running mean: -19.915699
resetting env. episode reward total was -21.000000. running mean: -19.926542
resetting env. episode reward total was -20.000000. running mean: -19.927277
resetting env. episode reward total was -20.000000. running mean: -19.928004
resetting env. episode reward total was -21.000000. running mean: -19.938724
resetting env. episode reward total was -20.000000. running mean: -19.939337

resetting env. episode reward total was -21.000000. running mean: -19.974508
resetting env. episode reward total was -21.000000. running mean: -19.984763
resetting env. episode reward total was -21.000000. running mean: -19.994916
resetting env. episode reward total was -21.000000. running mean: -20.004966
resetting env. episode reward total was -19.000000. running mean: -19.994917
resetting env. episode reward total was -19.000000. running mean: -19.984968
resetting env. episode reward total was -21.000000. running mean: -19.995118
resetting env. episode reward total was -17.000000. running mean: -19.965167
resetting env. episode reward total was -20.000000. running mean: -19.965515
resetting env. episode reward total was -20.000000. running mean: -19.965860
resetting env. episode reward total was -20.000000. running mean: -19.966201
resetting env. episode reward total was -20.000000. running mean: -19.966539
resetting env. episode reward total was -20.000000. running mean: -19.966874

resetting env. episode reward total was -21.000000. running mean: -19.945625
resetting env. episode reward total was -21.000000. running mean: -19.956169
resetting env. episode reward total was -19.000000. running mean: -19.946607
resetting env. episode reward total was -21.000000. running mean: -19.957141
resetting env. episode reward total was -21.000000. running mean: -19.967570
resetting env. episode reward total was -19.000000. running mean: -19.957894
resetting env. episode reward total was -18.000000. running mean: -19.938315
resetting env. episode reward total was -20.000000. running mean: -19.938932
resetting env. episode reward total was -18.000000. running mean: -19.919543
resetting env. episode reward total was -21.000000. running mean: -19.930347
resetting env. episode reward total was -21.000000. running mean: -19.941044
resetting env. episode reward total was -20.000000. running mean: -19.941633
resetting env. episode reward total was -21.000000. running mean: -19.952217

resetting env. episode reward total was -21.000000. running mean: -20.148901
resetting env. episode reward total was -19.000000. running mean: -20.137412
resetting env. episode reward total was -19.000000. running mean: -20.126038
resetting env. episode reward total was -20.000000. running mean: -20.124777
resetting env. episode reward total was -21.000000. running mean: -20.133530
resetting env. episode reward total was -19.000000. running mean: -20.122194
resetting env. episode reward total was -21.000000. running mean: -20.130972
resetting env. episode reward total was -21.000000. running mean: -20.139663
resetting env. episode reward total was -20.000000. running mean: -20.138266
resetting env. episode reward total was -17.000000. running mean: -20.106883
resetting env. episode reward total was -21.000000. running mean: -20.115815
resetting env. episode reward total was -19.000000. running mean: -20.104656
resetting env. episode reward total was -20.000000. running mean: -20.103610

resetting env. episode reward total was -21.000000. running mean: -20.012609
resetting env. episode reward total was -19.000000. running mean: -20.002483
resetting env. episode reward total was -20.000000. running mean: -20.002458
resetting env. episode reward total was -20.000000. running mean: -20.002434
resetting env. episode reward total was -19.000000. running mean: -19.992409
resetting env. episode reward total was -20.000000. running mean: -19.992485
resetting env. episode reward total was -21.000000. running mean: -20.002560
resetting env. episode reward total was -21.000000. running mean: -20.012535
resetting env. episode reward total was -20.000000. running mean: -20.012410
resetting env. episode reward total was -21.000000. running mean: -20.022285
resetting env. episode reward total was -21.000000. running mean: -20.032063
resetting env. episode reward total was -17.000000. running mean: -20.001742
resetting env. episode reward total was -21.000000. running mean: -20.011725

resetting env. episode reward total was -21.000000. running mean: -20.021544
resetting env. episode reward total was -18.000000. running mean: -20.001328
resetting env. episode reward total was -21.000000. running mean: -20.011315
resetting env. episode reward total was -19.000000. running mean: -20.001202
resetting env. episode reward total was -20.000000. running mean: -20.001190
resetting env. episode reward total was -21.000000. running mean: -20.011178
resetting env. episode reward total was -20.000000. running mean: -20.011066
resetting env. episode reward total was -20.000000. running mean: -20.010956
resetting env. episode reward total was -21.000000. running mean: -20.020846
resetting env. episode reward total was -19.000000. running mean: -20.010638
resetting env. episode reward total was -20.000000. running mean: -20.010531
resetting env. episode reward total was -19.000000. running mean: -20.000426
resetting env. episode reward total was -21.000000. running mean: -20.010422

resetting env. episode reward total was -21.000000. running mean: -20.071230
resetting env. episode reward total was -19.000000. running mean: -20.060517
resetting env. episode reward total was -19.000000. running mean: -20.049912
resetting env. episode reward total was -21.000000. running mean: -20.059413
resetting env. episode reward total was -20.000000. running mean: -20.058819
resetting env. episode reward total was -20.000000. running mean: -20.058231
resetting env. episode reward total was -21.000000. running mean: -20.067648
resetting env. episode reward total was -21.000000. running mean: -20.076972
resetting env. episode reward total was -20.000000. running mean: -20.076202
resetting env. episode reward total was -19.000000. running mean: -20.065440
resetting env. episode reward total was -20.000000. running mean: -20.064786
resetting env. episode reward total was -21.000000. running mean: -20.074138
resetting env. episode reward total was -21.000000. running mean: -20.083396

resetting env. episode reward total was -21.000000. running mean: -19.976572
resetting env. episode reward total was -18.000000. running mean: -19.956806
resetting env. episode reward total was -20.000000. running mean: -19.957238
resetting env. episode reward total was -20.000000. running mean: -19.957666
resetting env. episode reward total was -18.000000. running mean: -19.938089
resetting env. episode reward total was -21.000000. running mean: -19.948708
resetting env. episode reward total was -19.000000. running mean: -19.939221
resetting env. episode reward total was -19.000000. running mean: -19.929829
resetting env. episode reward total was -20.000000. running mean: -19.930531
resetting env. episode reward total was -20.000000. running mean: -19.931225
resetting env. episode reward total was -21.000000. running mean: -19.941913
resetting env. episode reward total was -20.000000. running mean: -19.942494
resetting env. episode reward total was -20.000000. running mean: -19.943069

resetting env. episode reward total was -21.000000. running mean: -19.856212
resetting env. episode reward total was -20.000000. running mean: -19.857650
resetting env. episode reward total was -21.000000. running mean: -19.869073
resetting env. episode reward total was -21.000000. running mean: -19.880382
resetting env. episode reward total was -20.000000. running mean: -19.881578
resetting env. episode reward total was -21.000000. running mean: -19.892763
resetting env. episode reward total was -20.000000. running mean: -19.893835
resetting env. episode reward total was -17.000000. running mean: -19.864897
resetting env. episode reward total was -21.000000. running mean: -19.876248
resetting env. episode reward total was -20.000000. running mean: -19.877485
resetting env. episode reward total was -21.000000. running mean: -19.888710
resetting env. episode reward total was -20.000000. running mean: -19.889823
resetting env. episode reward total was -17.000000. running mean: -19.860925

resetting env. episode reward total was -19.000000. running mean: -19.769123
resetting env. episode reward total was -20.000000. running mean: -19.771431
resetting env. episode reward total was -20.000000. running mean: -19.773717
resetting env. episode reward total was -21.000000. running mean: -19.785980
resetting env. episode reward total was -20.000000. running mean: -19.788120
resetting env. episode reward total was -20.000000. running mean: -19.790239
resetting env. episode reward total was -21.000000. running mean: -19.802337
resetting env. episode reward total was -18.000000. running mean: -19.784313
resetting env. episode reward total was -21.000000. running mean: -19.796470
resetting env. episode reward total was -19.000000. running mean: -19.788505
resetting env. episode reward total was -21.000000. running mean: -19.800620
resetting env. episode reward total was -21.000000. running mean: -19.812614
resetting env. episode reward total was -18.000000. running mean: -19.794488

resetting env. episode reward total was -21.000000. running mean: -19.930429
resetting env. episode reward total was -19.000000. running mean: -19.921124
resetting env. episode reward total was -21.000000. running mean: -19.931913
resetting env. episode reward total was -21.000000. running mean: -19.942594
resetting env. episode reward total was -20.000000. running mean: -19.943168
resetting env. episode reward total was -17.000000. running mean: -19.913736
resetting env. episode reward total was -18.000000. running mean: -19.894599
resetting env. episode reward total was -20.000000. running mean: -19.895653
resetting env. episode reward total was -21.000000. running mean: -19.906696
resetting env. episode reward total was -20.000000. running mean: -19.907629
resetting env. episode reward total was -21.000000. running mean: -19.918553
resetting env. episode reward total was -20.000000. running mean: -19.919368
resetting env. episode reward total was -21.000000. running mean: -19.930174

resetting env. episode reward total was -18.000000. running mean: -19.812128
resetting env. episode reward total was -21.000000. running mean: -19.824007
resetting env. episode reward total was -21.000000. running mean: -19.835766
resetting env. episode reward total was -19.000000. running mean: -19.827409
resetting env. episode reward total was -21.000000. running mean: -19.839135
resetting env. episode reward total was -17.000000. running mean: -19.810743
resetting env. episode reward total was -19.000000. running mean: -19.802636
resetting env. episode reward total was -19.000000. running mean: -19.794610
resetting env. episode reward total was -19.000000. running mean: -19.786664
resetting env. episode reward total was -20.000000. running mean: -19.788797
resetting env. episode reward total was -20.000000. running mean: -19.790909
resetting env. episode reward total was -19.000000. running mean: -19.783000
resetting env. episode reward total was -18.000000. running mean: -19.765170

resetting env. episode reward total was -20.000000. running mean: -19.748024
resetting env. episode reward total was -21.000000. running mean: -19.760544
resetting env. episode reward total was -20.000000. running mean: -19.762939
resetting env. episode reward total was -21.000000. running mean: -19.775309
resetting env. episode reward total was -19.000000. running mean: -19.767556
resetting env. episode reward total was -19.000000. running mean: -19.759881
resetting env. episode reward total was -18.000000. running mean: -19.742282
resetting env. episode reward total was -19.000000. running mean: -19.734859
resetting env. episode reward total was -21.000000. running mean: -19.747510
resetting env. episode reward total was -19.000000. running mean: -19.740035
resetting env. episode reward total was -18.000000. running mean: -19.722635
resetting env. episode reward total was -20.000000. running mean: -19.725409
resetting env. episode reward total was -19.000000. running mean: -19.718155

resetting env. episode reward total was -21.000000. running mean: -19.763857
resetting env. episode reward total was -21.000000. running mean: -19.776218
resetting env. episode reward total was -19.000000. running mean: -19.768456
resetting env. episode reward total was -20.000000. running mean: -19.770772
resetting env. episode reward total was -21.000000. running mean: -19.783064
resetting env. episode reward total was -21.000000. running mean: -19.795233
resetting env. episode reward total was -21.000000. running mean: -19.807281
resetting env. episode reward total was -20.000000. running mean: -19.809208
resetting env. episode reward total was -19.000000. running mean: -19.801116
resetting env. episode reward total was -20.000000. running mean: -19.803105
resetting env. episode reward total was -21.000000. running mean: -19.815074
resetting env. episode reward total was -20.000000. running mean: -19.816923
resetting env. episode reward total was -20.000000. running mean: -19.818754

resetting env. episode reward total was -21.000000. running mean: -19.778934
resetting env. episode reward total was -20.000000. running mean: -19.781145
resetting env. episode reward total was -20.000000. running mean: -19.783333
resetting env. episode reward total was -21.000000. running mean: -19.795500
resetting env. episode reward total was -20.000000. running mean: -19.797545
resetting env. episode reward total was -21.000000. running mean: -19.809569
resetting env. episode reward total was -20.000000. running mean: -19.811474
resetting env. episode reward total was -19.000000. running mean: -19.803359
resetting env. episode reward total was -20.000000. running mean: -19.805325
resetting env. episode reward total was -20.000000. running mean: -19.807272
resetting env. episode reward total was -19.000000. running mean: -19.799199
resetting env. episode reward total was -18.000000. running mean: -19.781207
resetting env. episode reward total was -19.000000. running mean: -19.773395

resetting env. episode reward total was -20.000000. running mean: -19.752221
resetting env. episode reward total was -16.000000. running mean: -19.714699
resetting env. episode reward total was -19.000000. running mean: -19.707552
resetting env. episode reward total was -20.000000. running mean: -19.710477
resetting env. episode reward total was -19.000000. running mean: -19.703372
resetting env. episode reward total was -20.000000. running mean: -19.706338
resetting env. episode reward total was -20.000000. running mean: -19.709275
resetting env. episode reward total was -21.000000. running mean: -19.722182
resetting env. episode reward total was -21.000000. running mean: -19.734960
resetting env. episode reward total was -16.000000. running mean: -19.697610
resetting env. episode reward total was -20.000000. running mean: -19.700634
resetting env. episode reward total was -20.000000. running mean: -19.703628
resetting env. episode reward total was -19.000000. running mean: -19.696592

resetting env. episode reward total was -21.000000. running mean: -19.716238
resetting env. episode reward total was -20.000000. running mean: -19.719076
resetting env. episode reward total was -18.000000. running mean: -19.701885
resetting env. episode reward total was -19.000000. running mean: -19.694866
resetting env. episode reward total was -20.000000. running mean: -19.697917
resetting env. episode reward total was -20.000000. running mean: -19.700938
resetting env. episode reward total was -19.000000. running mean: -19.693929
resetting env. episode reward total was -19.000000. running mean: -19.686990
resetting env. episode reward total was -20.000000. running mean: -19.690120
resetting env. episode reward total was -21.000000. running mean: -19.703218
resetting env. episode reward total was -21.000000. running mean: -19.716186
resetting env. episode reward total was -21.000000. running mean: -19.729024
resetting env. episode reward total was -19.000000. running mean: -19.721734

resetting env. episode reward total was -19.000000. running mean: -19.757408
resetting env. episode reward total was -20.000000. running mean: -19.759834
resetting env. episode reward total was -20.000000. running mean: -19.762235
resetting env. episode reward total was -20.000000. running mean: -19.764613
resetting env. episode reward total was -21.000000. running mean: -19.776967
resetting env. episode reward total was -16.000000. running mean: -19.739197
resetting env. episode reward total was -20.000000. running mean: -19.741805
resetting env. episode reward total was -20.000000. running mean: -19.744387
resetting env. episode reward total was -19.000000. running mean: -19.736943
resetting env. episode reward total was -21.000000. running mean: -19.749574
resetting env. episode reward total was -21.000000. running mean: -19.762078
resetting env. episode reward total was -21.000000. running mean: -19.774457
resetting env. episode reward total was -21.000000. running mean: -19.786713

resetting env. episode reward total was -13.000000. running mean: -19.623199
resetting env. episode reward total was -21.000000. running mean: -19.636967
resetting env. episode reward total was -20.000000. running mean: -19.640597
resetting env. episode reward total was -19.000000. running mean: -19.634191
resetting env. episode reward total was -21.000000. running mean: -19.647849
resetting env. episode reward total was -19.000000. running mean: -19.641371
resetting env. episode reward total was -21.000000. running mean: -19.654957
resetting env. episode reward total was -19.000000. running mean: -19.648408
resetting env. episode reward total was -20.000000. running mean: -19.651924
resetting env. episode reward total was -19.000000. running mean: -19.645404
resetting env. episode reward total was -19.000000. running mean: -19.638950
resetting env. episode reward total was -17.000000. running mean: -19.612561
resetting env. episode reward total was -21.000000. running mean: -19.626435

resetting env. episode reward total was -21.000000. running mean: -19.559153
resetting env. episode reward total was -20.000000. running mean: -19.563562
resetting env. episode reward total was -19.000000. running mean: -19.557926
resetting env. episode reward total was -19.000000. running mean: -19.552347
resetting env. episode reward total was -20.000000. running mean: -19.556823
resetting env. episode reward total was -21.000000. running mean: -19.571255
resetting env. episode reward total was -18.000000. running mean: -19.555542
resetting env. episode reward total was -20.000000. running mean: -19.559987
resetting env. episode reward total was -21.000000. running mean: -19.574387
resetting env. episode reward total was -21.000000. running mean: -19.588643
resetting env. episode reward total was -18.000000. running mean: -19.572757
resetting env. episode reward total was -20.000000. running mean: -19.577029
resetting env. episode reward total was -19.000000. running mean: -19.571259

resetting env. episode reward total was -21.000000. running mean: -19.536247
resetting env. episode reward total was -17.000000. running mean: -19.510884
resetting env. episode reward total was -20.000000. running mean: -19.515776
resetting env. episode reward total was -18.000000. running mean: -19.500618
resetting env. episode reward total was -20.000000. running mean: -19.505612
resetting env. episode reward total was -19.000000. running mean: -19.500555
resetting env. episode reward total was -20.000000. running mean: -19.505550
resetting env. episode reward total was -19.000000. running mean: -19.500494
resetting env. episode reward total was -21.000000. running mean: -19.515489
resetting env. episode reward total was -20.000000. running mean: -19.520335
resetting env. episode reward total was -20.000000. running mean: -19.525131
resetting env. episode reward total was -20.000000. running mean: -19.529880
resetting env. episode reward total was -19.000000. running mean: -19.524581

resetting env. episode reward total was -21.000000. running mean: -19.495628
resetting env. episode reward total was -21.000000. running mean: -19.510671
resetting env. episode reward total was -19.000000. running mean: -19.505565
resetting env. episode reward total was -21.000000. running mean: -19.520509
resetting env. episode reward total was -21.000000. running mean: -19.535304
resetting env. episode reward total was -21.000000. running mean: -19.549951
resetting env. episode reward total was -20.000000. running mean: -19.554451
resetting env. episode reward total was -21.000000. running mean: -19.568907
resetting env. episode reward total was -19.000000. running mean: -19.563218
resetting env. episode reward total was -20.000000. running mean: -19.567586
resetting env. episode reward total was -17.000000. running mean: -19.541910
resetting env. episode reward total was -20.000000. running mean: -19.546491
resetting env. episode reward total was -21.000000. running mean: -19.561026

resetting env. episode reward total was -19.000000. running mean: -19.642343
resetting env. episode reward total was -21.000000. running mean: -19.655920
resetting env. episode reward total was -21.000000. running mean: -19.669361
resetting env. episode reward total was -20.000000. running mean: -19.672667
resetting env. episode reward total was -20.000000. running mean: -19.675941
resetting env. episode reward total was -21.000000. running mean: -19.689181
resetting env. episode reward total was -20.000000. running mean: -19.692289
resetting env. episode reward total was -18.000000. running mean: -19.675366
resetting env. episode reward total was -21.000000. running mean: -19.688613
resetting env. episode reward total was -20.000000. running mean: -19.691727
resetting env. episode reward total was -20.000000. running mean: -19.694809
resetting env. episode reward total was -20.000000. running mean: -19.697861
resetting env. episode reward total was -19.000000. running mean: -19.690883

resetting env. episode reward total was -21.000000. running mean: -19.539629
resetting env. episode reward total was -19.000000. running mean: -19.534232
resetting env. episode reward total was -21.000000. running mean: -19.548890
resetting env. episode reward total was -18.000000. running mean: -19.533401
resetting env. episode reward total was -18.000000. running mean: -19.518067
resetting env. episode reward total was -20.000000. running mean: -19.522887
resetting env. episode reward total was -16.000000. running mean: -19.487658
resetting env. episode reward total was -18.000000. running mean: -19.472781
resetting env. episode reward total was -20.000000. running mean: -19.478053
resetting env. episode reward total was -19.000000. running mean: -19.473273
resetting env. episode reward total was -20.000000. running mean: -19.478540
resetting env. episode reward total was -17.000000. running mean: -19.453755
resetting env. episode reward total was -20.000000. running mean: -19.459217

resetting env. episode reward total was -20.000000. running mean: -19.396764
resetting env. episode reward total was -20.000000. running mean: -19.402796
resetting env. episode reward total was -20.000000. running mean: -19.408768
resetting env. episode reward total was -18.000000. running mean: -19.394680
resetting env. episode reward total was -21.000000. running mean: -19.410733
resetting env. episode reward total was -19.000000. running mean: -19.406626
resetting env. episode reward total was -19.000000. running mean: -19.402560
resetting env. episode reward total was -19.000000. running mean: -19.398534
resetting env. episode reward total was -20.000000. running mean: -19.404549
resetting env. episode reward total was -21.000000. running mean: -19.420503
resetting env. episode reward total was -19.000000. running mean: -19.416298
resetting env. episode reward total was -20.000000. running mean: -19.422135
resetting env. episode reward total was -20.000000. running mean: -19.427914

resetting env. episode reward total was -21.000000. running mean: -19.565986
resetting env. episode reward total was -21.000000. running mean: -19.580326
resetting env. episode reward total was -19.000000. running mean: -19.574523
resetting env. episode reward total was -20.000000. running mean: -19.578778
resetting env. episode reward total was -21.000000. running mean: -19.592990
resetting env. episode reward total was -17.000000. running mean: -19.567060
resetting env. episode reward total was -21.000000. running mean: -19.581390
resetting env. episode reward total was -20.000000. running mean: -19.585576
resetting env. episode reward total was -21.000000. running mean: -19.599720
resetting env. episode reward total was -19.000000. running mean: -19.593723
resetting env. episode reward total was -21.000000. running mean: -19.607785
resetting env. episode reward total was -18.000000. running mean: -19.591708
resetting env. episode reward total was -21.000000. running mean: -19.605790

resetting env. episode reward total was -19.000000. running mean: -19.497882
resetting env. episode reward total was -20.000000. running mean: -19.502903
resetting env. episode reward total was -20.000000. running mean: -19.507874
resetting env. episode reward total was -19.000000. running mean: -19.502795
resetting env. episode reward total was -20.000000. running mean: -19.507767
resetting env. episode reward total was -18.000000. running mean: -19.492690
resetting env. episode reward total was -15.000000. running mean: -19.447763
resetting env. episode reward total was -20.000000. running mean: -19.453285
resetting env. episode reward total was -19.000000. running mean: -19.448752
resetting env. episode reward total was -19.000000. running mean: -19.444265
resetting env. episode reward total was -18.000000. running mean: -19.429822
resetting env. episode reward total was -21.000000. running mean: -19.445524
resetting env. episode reward total was -18.000000. running mean: -19.431069

resetting env. episode reward total was -19.000000. running mean: -19.337612
resetting env. episode reward total was -19.000000. running mean: -19.334236
resetting env. episode reward total was -17.000000. running mean: -19.310894
resetting env. episode reward total was -17.000000. running mean: -19.287785
resetting env. episode reward total was -17.000000. running mean: -19.264907
resetting env. episode reward total was -21.000000. running mean: -19.282258
resetting env. episode reward total was -19.000000. running mean: -19.279435
resetting env. episode reward total was -21.000000. running mean: -19.296641
resetting env. episode reward total was -19.000000. running mean: -19.293674
resetting env. episode reward total was -21.000000. running mean: -19.310738
resetting env. episode reward total was -20.000000. running mean: -19.317630
resetting env. episode reward total was -19.000000. running mean: -19.314454
resetting env. episode reward total was -20.000000. running mean: -19.321309

resetting env. episode reward total was -19.000000. running mean: -19.355508
resetting env. episode reward total was -21.000000. running mean: -19.371953
resetting env. episode reward total was -18.000000. running mean: -19.358234
resetting env. episode reward total was -19.000000. running mean: -19.354651
resetting env. episode reward total was -21.000000. running mean: -19.371105
resetting env. episode reward total was -17.000000. running mean: -19.347394
resetting env. episode reward total was -19.000000. running mean: -19.343920
resetting env. episode reward total was -20.000000. running mean: -19.350481
resetting env. episode reward total was -21.000000. running mean: -19.366976
resetting env. episode reward total was -21.000000. running mean: -19.383306
resetting env. episode reward total was -21.000000. running mean: -19.399473
resetting env. episode reward total was -20.000000. running mean: -19.405478
resetting env. episode reward total was -20.000000. running mean: -19.411424

resetting env. episode reward total was -20.000000. running mean: -19.509931
resetting env. episode reward total was -20.000000. running mean: -19.514832
resetting env. episode reward total was -20.000000. running mean: -19.519683
resetting env. episode reward total was -19.000000. running mean: -19.514487
resetting env. episode reward total was -20.000000. running mean: -19.519342
resetting env. episode reward total was -20.000000. running mean: -19.524148
resetting env. episode reward total was -20.000000. running mean: -19.528907
resetting env. episode reward total was -18.000000. running mean: -19.513618
resetting env. episode reward total was -18.000000. running mean: -19.498482
resetting env. episode reward total was -18.000000. running mean: -19.483497
resetting env. episode reward total was -21.000000. running mean: -19.498662
resetting env. episode reward total was -17.000000. running mean: -19.473675
resetting env. episode reward total was -18.000000. running mean: -19.458938