N = 5000, batch_size = 32 max_experiences = 32  min_experiences = 32

In [1]:
import sys
import os
import random
import numpy as np
from collections import deque
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam
from matplotlib import pyplot as plt
import tensorflow as tf
import datetime
from statistics import mean
from tqdm import tqdm

# use tf2
# https://github.com/VXU1230/Medium-Tutorials/blob/master/dqn/cart_pole.py
# https://towardsdatascience.com/deep-reinforcement-learning-build-a-deep-q-network-dqn-to-play-cartpole-with-tensorflow-2-and-gym-8e105744b998

sys.path.append(os.getcwd())

from wumpusworld.envs.WumpusGym import WumpusWorldEnv
from BeelineAgent import Agent, Action

WORLD_SIZE = 4


class MyModel(tf.keras.Model):
    def __init__(self, num_states, hidden_units, num_actions):
        super(MyModel, self).__init__()
        self.input_layer = tf.keras.layers.InputLayer(input_shape=(num_states,))
        self.hidden_layers = []
        for i in hidden_units:
            self.hidden_layers.append(tf.keras.layers.Dense(
                i, activation='tanh', kernel_initializer='RandomNormal'))
        self.output_layer = tf.keras.layers.Dense(
            num_actions, activation='linear', kernel_initializer='RandomNormal')

    @tf.function
    def call(self, inputs):
        z = self.input_layer(inputs)
        for layer in self.hidden_layers:
            z = layer(z)
        output = self.output_layer(z)
        return output


class DQN:
    def __init__(self, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.optimizer = Adam(lr)
        self.gamma = gamma
        self.model = MyModel(num_states, hidden_units, num_actions)
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

    def get_action(self, states, epsilon):
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.predict(np.atleast_2d(states))[0])

    def predict(self, inputs):
        return self.model(np.atleast_2d(inputs.astype('float32')))

    def add_experience(self, exp):
        if len(self.experience['s']) >= self.max_experiences:
            for key in self.experience.keys():
                self.experience[key].pop(0)
        for key, value in exp.items():
            self.experience[key].append(value)
        i = 1

    def copy_weights(self, TrainNet):
        variables1 = self.model.trainable_variables
        variables2 = TrainNet.model.trainable_variables
        for v1, v2 in zip(variables1, variables2):
            v1.assign(v2.numpy())

    def train(self, TargetNet, isdone):
        if isdone:
            pass
        elif len(self.experience['s']) < self.min_experiences:
            return 0

        # ids = np.random.randint(low=0, high=len(self.experience['s']), size=self.batch_size)
        ids = range(0, len(self.experience['s']))
        states = np.asarray([self.experience['s'][i] for i in ids])
        actions = np.asarray([self.experience['a'][i] for i in ids])
        rewards = np.asarray([self.experience['r'][i] for i in ids])
        states_next = np.asarray([self.experience['s2'][i] for i in ids])
        dones = np.asarray([self.experience['done'][i] for i in ids])
        value_next = np.max(TargetNet.predict(states_next), axis=-1)
        actual_values = np.where(dones, rewards, rewards + self.gamma * value_next.squeeze())

        with tf.GradientTape() as tape:
            selected_action_values = tf.math.reduce_sum(
                self.predict(states) * tf.one_hot(actions, self.num_actions), axis=-1)
            loss = tf.math.reduce_mean(tf.square(actual_values - selected_action_values))
        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        # if isinstance(loss, int):
        #     print(loss)
        # else:
        #     print(loss.numpy())

        return loss


l1 = 72
l2 = 150
l3 = 100
l4 = 6

loss_fn = MSE
learning_rate = 1e-3
optmizer = None


def action_to_string(action):
    """ action_to_string: return a string from the given action """
    if action == Action.WALK:
        return "WALK"
    if action == Action.TURNRIGHT:
        return "TURNRIGHT"
    if action == Action.TURNLEFT:
        return "TURNLEFT"
    if action == Action.SHOOT:
        return "SHOOT"
    if action == Action.GRAB:
        return "GRAB"
    if action == Action.CLIMB:
        return "CLIMB"
    return "UNKNOWN ACTION"


#
# def buildnetwork():
#     m = Sequential(
#         [
#             Input(shape=(l1,), name="layer1"),
#             Dense(l2, activation="relu", name="layer2"),
#             Dense(l3, activation="relu", name="layer3"),
#             Dense(l4, name="layer4"),
#         ]
#     )
#     m.compile(optimizer=Adam(learning_rate=learning_rate), loss=MSE)
#
#     return m


# def getoptimizer():
#     return Adam(learning_rate=learning_rate)
#
#
# def getQmodels():
#     qmodel = buildnetwork()
#     # targetmodel.set_weights(qmodel.get_weights())
#     targetmodel = clone_model(qmodel)
#     targetmodel.set_weights(qmodel.get_weights())
#     opt = getoptimizer()
#     return qmodel, targetmodel, opt


def play_game(agent, env, TrainNet, TargetNet, epsilon, copy_step):
    rewards = 0
    iter = 0
    done = False
    observations = env.reset()
    losses = list()
    gameswon = 0
    steps = 0
    while not done:
        state = agent.processPercepts(WORLD_SIZE, observations)
        # print(state[0:16])
        # print(state[16:32])
        # print(state[32:48])
        # print(state[48:64])
        # print(state[64:])
        action = TrainNet.get_action(state, epsilon)
        steps += 1
        # print(action_to_string(action))
        prev_observations = state
        observations, reward, done = env.step(action)

        state2 = agent.processPercepts(WORLD_SIZE, observations)

        rewards += int(reward)
        if done:
            env.reset()

        exp = {'s': prev_observations, 'a': action, 'r': int(reward), 's2': state2, 'done': done}
        TrainNet.add_experience(exp)

    loss = TrainNet.train(TargetNet, done)
    if isinstance(loss, int):
        losses.append(loss)
    else:
        losses.append(loss.numpy())
    iter += 1
    if iter % copy_step == 0:
        TargetNet.copy_weights(TrainNet)

    if int(reward) > 0:
        gameswon += 1

    # print("steps=", steps)
    return rewards, mean(losses), gameswon, steps


def test_model(model, epsilon):
    i = 0
    test_game = WumpusWorldEnv()
    agent = Agent()
    observations = test_game.reset()

    status = 1
    while status == 1:  # A
        state = agent.processPercepts(WORLD_SIZE, observations)
        action = model.get_action(state, epsilon)

        observations, reward, done = test_game.step(action)

        reward = int(reward)

        if reward != -1:
            if reward > 0:
                status = 2
            else:
                status = 0
        i += 1
        if i > 15:
            break

    win = True if status == 2 else False
    return win


def test(tnet, epsilon):
    max_games = 1000
    wins = 0
    for i in range(max_games):
        win = test_model(tnet, epsilon)
        if win:
            wins += 1
    win_perc = float(wins) / float(max_games)
    print("Games played: {0}, # of wins: {1}".format(max_games, wins))
    print("Win percentage: {}%".format(100.0 * win_perc))


def playgame():
    env = WumpusWorldEnv()
    # env.render()
    # agent = Agent()
    gamma = 0.99
    copy_step = 25
    num_states = l1
    num_actions = len(env.action_space)
    hidden_units = [200, 200]
    max_experiences = 32
    min_experiences = 32
    batch_size = 32
    lr = 1e-2
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

    TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    N = 5000
    n_envs = 500
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1
    sumgameswon = 0
    firstwin = {}

    for i in range(n_envs):
        log_dir = 'logs/dqn/' + str(i) + '/' + current_time
        summary_writer = tf.summary.create_file_writer(log_dir)
        firstwin[i] = 0
        print("running on env = ", i)
        
        for n in tqdm(range(N)):
            agent = Agent()
            epsilon = max(min_epsilon, epsilon * decay)
            total_reward, losses, gameswon, stepstaken = play_game(agent, env, TrainNet, TargetNet, epsilon, copy_step)
            sumgameswon += gameswon
            total_rewards[n] = total_reward
            avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
            with summary_writer.as_default():
                tf.summary.scalar('episode reward', total_reward, step=n)
                tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
                tf.summary.scalar('average loss)', losses, step=n)
                tf.summary.scalar('stepstaken)', stepstaken, step=n)
                tf.summary.scalar('gameswon)', gameswon, step=n)
            if n % 100 == 0:
                print("episode:", n, "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):",
                      avg_rewards,
                      "episode loss: ", losses)
                print("avg reward for last 100 episodes:", avg_rewards)

            if gameswon == 1:
                env = WumpusWorldEnv()
                firstwin[i] = n
                # env.render()
                print("game won. Run 500 episodes on new env")
                break
        if gameswon == 0:
            print("env couldnt win")
            env.render()
            env = WumpusWorldEnv()

    print("gameswon=", sumgameswon)
    print(firstwin)

In [None]:
playgame()

  0%|          | 0/5000 [00:00<?, ?it/s]

running on env =  0


  0%|          | 1/5000 [00:00<42:39,  1.95it/s]

episode: 0 episode reward: -1000 eps: 0.989901 avg reward (last 100): -1000.0 episode loss:  1000020.5
avg reward for last 100 episodes: -1000.0


  2%|▏         | 103/5000 [00:04<03:09, 25.83it/s]

episode: 100 episode reward: -1010 eps: 0.980050830419928 avg reward (last 100): -1036.2673267326732 episode loss:  130478.48
avg reward for last 100 episodes: -1036.2673267326732


  4%|▍         | 204/5000 [00:08<02:53, 27.65it/s]

episode: 200 episode reward: -1018 eps: 0.9702986765411791 avg reward (last 100): -1037.2079207920792 episode loss:  37452.133
avg reward for last 100 episodes: -1037.2079207920792


  6%|▌         | 303/5000 [00:13<05:13, 14.99it/s]

episode: 300 episode reward: -1074 eps: 0.960643563042708 avg reward (last 100): -1041.5841584158416 episode loss:  25080.29
avg reward for last 100 episodes: -1041.5841584158416


  8%|▊         | 403/5000 [00:17<02:06, 36.41it/s]

episode: 400 episode reward: -1043 eps: 0.9510845243085565 avg reward (last 100): -1031.5346534653465 episode loss:  21438.793
avg reward for last 100 episodes: -1031.5346534653465


 10%|█         | 502/5000 [00:22<04:32, 16.51it/s]

episode: 500 episode reward: -1017 eps: 0.9416206043312847 avg reward (last 100): -1037.1881188118812 episode loss:  15578.789
avg reward for last 100 episodes: -1037.1881188118812


 12%|█▏        | 603/5000 [00:27<04:27, 16.42it/s]

episode: 600 episode reward: -1001 eps: 0.9322508566163586 avg reward (last 100): -1042.1782178217823 episode loss:  22858.943
avg reward for last 100 episodes: -1042.1782178217823


 14%|█▍        | 705/5000 [00:33<03:46, 18.99it/s]

episode: 700 episode reward: -1030 eps: 0.9229743440874912 avg reward (last 100): -1040.7227722772277 episode loss:  30259.082
avg reward for last 100 episodes: -1040.7227722772277


 16%|█▌        | 783/5000 [00:36<03:17, 21.37it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  1


  0%|          | 2/5000 [00:00<14:32,  5.73it/s]

episode: 0 episode reward: -1177 eps: 0.9152534467159078 avg reward (last 100): -1177.0 episode loss:  33663.73
avg reward for last 100 episodes: -1177.0


  2%|▏         | 89/5000 [00:07<07:14, 11.29it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  2


  0%|          | 3/5000 [00:00<22:11,  3.75it/s]

episode: 0 episode reward: -1209 eps: 0.9070527143054943 avg reward (last 100): -1209.0 episode loss:  21350.2
avg reward for last 100 episodes: -1209.0


  2%|▏         | 102/5000 [00:10<10:20,  7.89it/s]

episode: 100 episode reward: -1031 eps: 0.8980269399563684 avg reward (last 100): -1087.6039603960396 episode loss:  41258.414
avg reward for last 100 episodes: -1087.6039603960396


  4%|▍         | 201/5000 [00:20<10:47,  7.41it/s]

episode: 200 episode reward: -1085 eps: 0.8890909780308385 avg reward (last 100): -1078.5148514851485 episode loss:  24368.674
avg reward for last 100 episodes: -1078.5148514851485


  6%|▌         | 302/5000 [00:33<13:17,  5.89it/s]

episode: 300 episode reward: -1022 eps: 0.8802439348358967 avg reward (last 100): -1085.6534653465346 episode loss:  37735.137
avg reward for last 100 episodes: -1085.6534653465346


  7%|▋         | 372/5000 [00:41<08:35,  8.97it/s]
  0%|          | 1/5000 [00:00<10:30,  7.93it/s]

game won. Run 500 episodes on new env
running on env =  3
episode: 0 episode reward: -1080 eps: 0.873841232270227 avg reward (last 100): -1080.0 episode loss:  23477.818
avg reward for last 100 episodes: -1080.0


  0%|          | 10/5000 [00:01<09:06,  9.14it/s]
  0%|          | 1/5000 [00:00<09:39,  8.63it/s]

game won. Run 500 episodes on new env
running on env =  4
episode: 0 episode reward: -1061 eps: 0.8728804873832527 avg reward (last 100): -1061.0 episode loss:  27625.605
avg reward for last 100 episodes: -1061.0


  2%|▏         | 104/5000 [00:08<04:49, 16.89it/s]

episode: 100 episode reward: -1026 eps: 0.8641947492903919 avg reward (last 100): -1051.4158415841584 episode loss:  27716.268
avg reward for last 100 episodes: -1051.4158415841584


  4%|▍         | 204/5000 [00:17<04:46, 16.72it/s]

episode: 200 episode reward: -1002 eps: 0.8555954400355081 avg reward (last 100): -1049.6138613861385 episode loss:  19788.71
avg reward for last 100 episodes: -1049.6138613861385


  6%|▌         | 303/5000 [00:27<05:53, 13.29it/s]

episode: 300 episode reward: -1070 eps: 0.8470816995945079 avg reward (last 100): -1060.6435643564357 episode loss:  26211.1
avg reward for last 100 episodes: -1060.6435643564357


  8%|▊         | 403/5000 [00:34<08:25,  9.09it/s]

episode: 400 episode reward: -1131 eps: 0.838652676501106 avg reward (last 100): -1049.4257425742574 episode loss:  5011.7964
avg reward for last 100 episodes: -1049.4257425742574


 10%|█         | 503/5000 [00:42<06:24, 11.69it/s]

episode: 500 episode reward: -1017 eps: 0.8303075277616689 avg reward (last 100): -1046.1584158415842 episode loss:  5020.874
avg reward for last 100 episodes: -1046.1584158415842


 12%|█▏        | 602/5000 [00:50<05:46, 12.68it/s]

episode: 600 episode reward: -1005 eps: 0.822045418770909 avg reward (last 100): -1050.8415841584158 episode loss:  8873.076
avg reward for last 100 episodes: -1050.8415841584158


 14%|█▍        | 705/5000 [00:58<03:42, 19.33it/s]

episode: 700 episode reward: -1024 eps: 0.8138655232284105 avg reward (last 100): -1050.7128712871288 episode loss:  58036.14
avg reward for last 100 episodes: -1050.7128712871288


 16%|█▌        | 802/5000 [01:07<04:19, 16.17it/s]

episode: 800 episode reward: -1009 eps: 0.8057670230559948 avg reward (last 100): -1055.0891089108911 episode loss:  4934.974
avg reward for last 100 episodes: -1055.0891089108911


 18%|█▊        | 902/5000 [01:15<06:41, 10.22it/s]

episode: 900 episode reward: -1011 eps: 0.797749108315903 avg reward (last 100): -1051.3663366336634 episode loss:  50341.527
avg reward for last 100 episodes: -1051.3663366336634


 20%|██        | 1001/5000 [01:25<07:17,  9.15it/s]

episode: 1000 episode reward: -1027 eps: 0.7898109771297914 avg reward (last 100): -1051.2178217821781 episode loss:  14081.131
avg reward for last 100 episodes: -1051.2178217821781


 22%|██▏       | 1102/5000 [01:34<06:24, 10.13it/s]

episode: 1100 episode reward: -1001 eps: 0.7819518355985356 avg reward (last 100): -1046.3366336633662 episode loss:  2337.2178
avg reward for last 100 episodes: -1046.3366336633662


 24%|██▍       | 1203/5000 [01:42<04:39, 13.57it/s]

episode: 1200 episode reward: -1053 eps: 0.7741708977228343 avg reward (last 100): -1042.9405940594058 episode loss:  13785.934
avg reward for last 100 episodes: -1042.9405940594058


 26%|██▌       | 1308/5000 [01:51<03:38, 16.91it/s]

episode: 1300 episode reward: -1003 eps: 0.7664673853245978 avg reward (last 100): -1039.1683168316831 episode loss:  11831.793
avg reward for last 100 episodes: -1039.1683168316831


 28%|██▊       | 1402/5000 [01:59<05:27, 10.97it/s]

episode: 1400 episode reward: -1158 eps: 0.758840527969123 avg reward (last 100): -1048.2772277227723 episode loss:  42259.1
avg reward for last 100 episodes: -1048.2772277227723


 30%|███       | 1502/5000 [02:10<06:53,  8.47it/s]

episode: 1500 episode reward: -1045 eps: 0.7512895628880419 avg reward (last 100): -1060.7128712871288 episode loss:  1397.7622
avg reward for last 100 episodes: -1060.7128712871288


 32%|███▏      | 1602/5000 [02:19<06:08,  9.22it/s]

episode: 1600 episode reward: -1036 eps: 0.7438137349030363 avg reward (last 100): -1044.09900990099 episode loss:  34580.594
avg reward for last 100 episodes: -1044.09900990099


 34%|███▍      | 1703/5000 [02:28<03:57, 13.86it/s]

episode: 1700 episode reward: -1024 eps: 0.7364122963503104 avg reward (last 100): -1048.3564356435643 episode loss:  38053.984
avg reward for last 100 episodes: -1048.3564356435643


 36%|███▌      | 1802/5000 [02:39<05:43,  9.30it/s]

episode: 1800 episode reward: -1014 eps: 0.7290845070058195 avg reward (last 100): -1054.3762376237623 episode loss:  3665.944
avg reward for last 100 episodes: -1054.3762376237623


 38%|███▊      | 1901/5000 [02:49<05:52,  8.80it/s]

episode: 1900 episode reward: -1007 eps: 0.7218296340112362 avg reward (last 100): -1052.3366336633662 episode loss:  2470.0876
avg reward for last 100 episodes: -1052.3366336633662


 40%|████      | 2003/5000 [03:02<04:41, 10.66it/s]

episode: 2000 episode reward: -1023 eps: 0.7146469518006594 avg reward (last 100): -1059.5841584158416 episode loss:  2721.4253
avg reward for last 100 episodes: -1059.5841584158416


 42%|████▏     | 2105/5000 [03:12<04:46, 10.12it/s]

episode: 2100 episode reward: -1032 eps: 0.7075357420280477 avg reward (last 100): -1050.6336633663366 episode loss:  13039.4
avg reward for last 100 episodes: -1050.6336633663366


 44%|████▍     | 2201/5000 [03:19<03:30, 13.27it/s]

episode: 2200 episode reward: -1053 eps: 0.7004952934953774 avg reward (last 100): -1050.9108910891089 episode loss:  2177.6367
avg reward for last 100 episodes: -1050.9108910891089


 46%|████▌     | 2302/5000 [03:29<06:36,  6.81it/s]

episode: 2300 episode reward: -1103 eps: 0.6935249020815175 avg reward (last 100): -1059.7128712871288 episode loss:  918.02026
avg reward for last 100 episodes: -1059.7128712871288


 48%|████▊     | 2404/5000 [03:41<03:58, 10.87it/s]

episode: 2400 episode reward: -1025 eps: 0.6866238706718049 avg reward (last 100): -1054.7623762376238 episode loss:  47801.312
avg reward for last 100 episodes: -1054.7623762376238


 50%|█████     | 2503/5000 [03:48<02:26, 17.05it/s]

episode: 2500 episode reward: -1035 eps: 0.6797915090883303 avg reward (last 100): -1050.6138613861385 episode loss:  20159.4
avg reward for last 100 episodes: -1050.6138613861385


 52%|█████▏    | 2603/5000 [03:57<03:21, 11.87it/s]

episode: 2600 episode reward: -1090 eps: 0.6730271340209116 avg reward (last 100): -1057.2970297029703 episode loss:  15167.73
avg reward for last 100 episodes: -1057.2970297029703


 54%|█████▍    | 2701/5000 [04:05<02:14, 17.08it/s]

episode: 2700 episode reward: -1035 eps: 0.6663300689587526 avg reward (last 100): -1052.4653465346535 episode loss:  34283.992
avg reward for last 100 episodes: -1052.4653465346535


 56%|█████▌    | 2802/5000 [04:12<02:37, 13.91it/s]

episode: 2800 episode reward: -1096 eps: 0.6596996441227895 avg reward (last 100): -1050.0 episode loss:  15988.233
avg reward for last 100 episodes: -1050.0


 58%|█████▊    | 2903/5000 [04:20<03:36,  9.70it/s]

episode: 2900 episode reward: -1021 eps: 0.6531351963987002 avg reward (last 100): -1050.6534653465346 episode loss:  3537.9883
avg reward for last 100 episodes: -1050.6534653465346


 60%|██████    | 3002/5000 [04:31<03:40,  9.06it/s]

episode: 3000 episode reward: -1131 eps: 0.6466360692705925 avg reward (last 100): -1065.8910891089108 episode loss:  20066.871
avg reward for last 100 episodes: -1065.8910891089108


 62%|██████▏   | 3102/5000 [04:41<02:06, 14.95it/s]

episode: 3100 episode reward: -1026 eps: 0.6402016127553383 avg reward (last 100): -1059.3465346534654 episode loss:  905.008
avg reward for last 100 episodes: -1059.3465346534654


 64%|██████▍   | 3204/5000 [04:51<02:44, 10.93it/s]

episode: 3200 episode reward: -1004 eps: 0.6338311833375723 avg reward (last 100): -1056.6831683168316 episode loss:  22566.72
avg reward for last 100 episodes: -1056.6831683168316


 66%|██████▌   | 3303/5000 [05:01<02:06, 13.38it/s]

episode: 3300 episode reward: -1024 eps: 0.6275241439053327 avg reward (last 100): -1055.7128712871288 episode loss:  686.0577
avg reward for last 100 episodes: -1055.7128712871288


 68%|██████▊   | 3403/5000 [05:12<02:28, 10.74it/s]

episode: 3400 episode reward: -1075 eps: 0.6212798636863439 avg reward (last 100): -1061.4554455445545 episode loss:  926.83746
avg reward for last 100 episodes: -1061.4554455445545


 70%|███████   | 3501/5000 [05:25<06:32,  3.82it/s]

episode: 3500 episode reward: -1121 eps: 0.6150977181849303 avg reward (last 100): -1068.2475247524753 episode loss:  18727.834
avg reward for last 100 episodes: -1068.2475247524753


 72%|███████▏  | 3601/5000 [05:40<04:34,  5.10it/s]

episode: 3600 episode reward: -1122 eps: 0.6089770891195618 avg reward (last 100): -1060.2277227722773 episode loss:  1707.9995
avg reward for last 100 episodes: -1060.2277227722773


 74%|███████▍  | 3702/5000 [05:50<01:27, 14.87it/s]

episode: 3700 episode reward: -1018 eps: 0.6029173643610188 avg reward (last 100): -1055.851485148515 episode loss:  6725.711
avg reward for last 100 episodes: -1055.851485148515


 76%|███████▌  | 3800/5000 [06:00<03:05,  6.47it/s]

episode: 3800 episode reward: -1033 eps: 0.5969179378711718 avg reward (last 100): -1056.029702970297 episode loss:  807.4867
avg reward for last 100 episodes: -1056.029702970297


 78%|███████▊  | 3903/5000 [06:10<01:43, 10.64it/s]

episode: 3900 episode reward: -1052 eps: 0.5909782096423711 avg reward (last 100): -1062.4851485148515 episode loss:  194.49382
avg reward for last 100 episodes: -1062.4851485148515


 80%|████████  | 4004/5000 [06:20<01:26, 11.55it/s]

episode: 4000 episode reward: -1055 eps: 0.5850975856374407 avg reward (last 100): -1058.4554455445545 episode loss:  3571.6475
avg reward for last 100 episodes: -1058.4554455445545


 82%|████████▏ | 4102/5000 [06:29<01:10, 12.79it/s]

episode: 4100 episode reward: -1012 eps: 0.5792754777302661 avg reward (last 100): -1053.029702970297 episode loss:  16910.81
avg reward for last 100 episodes: -1053.029702970297


 84%|████████▍ | 4202/5000 [06:37<00:48, 16.53it/s]

episode: 4200 episode reward: -1018 eps: 0.573511303646978 avg reward (last 100): -1051.4455445544554 episode loss:  2542.313
avg reward for last 100 episodes: -1051.4455445544554


 86%|████████▌ | 4303/5000 [06:46<01:05, 10.57it/s]

episode: 4300 episode reward: -1078 eps: 0.5678044869077173 avg reward (last 100): -1059.7326732673268 episode loss:  427.53955
avg reward for last 100 episodes: -1059.7326732673268


 88%|████████▊ | 4402/5000 [06:58<01:24,  7.09it/s]

episode: 4400 episode reward: -1147 eps: 0.5621544567689791 avg reward (last 100): -1072.7029702970297 episode loss:  9406.948
avg reward for last 100 episodes: -1072.7029702970297


 90%|█████████ | 4500/5000 [07:11<01:11,  7.04it/s]

episode: 4500 episode reward: -1027 eps: 0.5565606481665357 avg reward (last 100): -1083.4257425742574 episode loss:  3689.6365
avg reward for last 100 episodes: -1083.4257425742574


 92%|█████████▏| 4602/5000 [07:20<00:41,  9.66it/s]

episode: 4600 episode reward: -1047 eps: 0.5510225016589201 avg reward (last 100): -1063.148514851485 episode loss:  3717.05
avg reward for last 100 episodes: -1063.148514851485


 94%|█████████▍| 4700/5000 [07:32<00:41,  7.22it/s]

episode: 4700 episode reward: -1089 eps: 0.545539463371479 avg reward (last 100): -1075.5247524752476 episode loss:  3224.2954
avg reward for last 100 episodes: -1075.5247524752476


 96%|█████████▌| 4803/5000 [07:43<00:16, 11.69it/s]

episode: 4800 episode reward: -1086 eps: 0.5401109849409782 avg reward (last 100): -1064.4257425742574 episode loss:  8682.445
avg reward for last 100 episodes: -1064.4257425742574


 98%|█████████▊| 4901/5000 [07:54<00:08, 11.48it/s]

episode: 4900 episode reward: -1018 eps: 0.5347365234607607 avg reward (last 100): -1062.7623762376238 episode loss:  3411.1404
avg reward for last 100 episodes: -1062.7623762376238


100%|██████████| 5000/5000 [08:06<00:00, 10.28it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

env couldnt win
+---+---+---+---+
| P | P |WP |  G|
|   |   |   |   |
+---+---+---+---+
| P |   | P | P |
|   |   |   |   |
+---+---+---+---+
|   |   |   |   |
|   |   |   |   |
+---+---+---+---+
|   |   | P |   |
| A>|   |   |   |
+---+---+---+---+
Score : 0
running on env =  5


  0%|          | 1/5000 [00:00<1:09:52,  1.19it/s]

episode: 0 episode reward: -1381 eps: 0.5294155414264499 avg reward (last 100): -1381.0 episode loss:  37801.81
avg reward for last 100 episodes: -1381.0


  1%|          | 30/5000 [00:12<33:13,  2.49it/s] 
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  6


  0%|          | 2/5000 [00:00<30:57,  2.69it/s]

episode: 0 episode reward: -1342 eps: 0.5277768126522379 avg reward (last 100): -1342.0 episode loss:  22689.227
avg reward for last 100 episodes: -1342.0


  1%|          | 27/5000 [00:04<14:17,  5.80it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  7
episode: 0 episode reward: -1021 eps: 0.5263010308452468 avg reward (last 100): -1021.0 episode loss:  67920.81
avg reward for last 100 episodes: -1021.0


  2%|▏         | 80/5000 [00:14<14:50,  5.52it/s]
  0%|          | 2/5000 [00:00<06:10, 13.50it/s]

game won. Run 500 episodes on new env
running on env =  8
episode: 0 episode reward: -1025 eps: 0.5220549998322245 avg reward (last 100): -1025.0 episode loss:  48985.2
avg reward for last 100 episodes: -1025.0


  2%|▏         | 101/5000 [00:10<10:51,  7.51it/s]

episode: 100 episode reward: -1152 eps: 0.5168602073444185 avg reward (last 100): -1055.2475247524753 episode loss:  16939.316
avg reward for last 100 episodes: -1055.2475247524753


  3%|▎         | 148/5000 [00:15<08:35,  9.41it/s]
  0%|          | 1/5000 [00:00<09:11,  9.07it/s]

game won. Run 500 episodes on new env
running on env =  9
episode: 0 episode reward: -1042 eps: 0.5143336610927781 avg reward (last 100): -1042.0 episode loss:  18120.822
avg reward for last 100 episodes: -1042.0


  2%|▏         | 101/5000 [00:23<22:51,  3.57it/s]

episode: 100 episode reward: -1360 eps: 0.5092157010316166 avg reward (last 100): -1151.6336633663366 episode loss:  27513.152
avg reward for last 100 episodes: -1151.6336633663366


  3%|▎         | 134/5000 [00:30<18:24,  4.41it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  10
episode: 0 episode reward: -1056 eps: 0.5074364725812754 avg reward (last 100): -1056.0 episode loss:  11400.3545
avg reward for last 100 episodes: -1056.0


  2%|▏         | 101/5000 [00:15<15:00,  5.44it/s]

episode: 100 episode reward: -1099 eps: 0.5023871441069742 avg reward (last 100): -1098.7029702970297 episode loss:  24271.295
avg reward for last 100 episodes: -1098.7029702970297


  4%|▍         | 202/5000 [00:37<27:00,  2.96it/s]

episode: 200 episode reward: -1478 eps: 0.4973880597901572 avg reward (last 100): -1140.2475247524753 episode loss:  19703.107
avg reward for last 100 episodes: -1140.2475247524753


  6%|▌         | 302/5000 [00:59<21:24,  3.66it/s]  

episode: 300 episode reward: -1066 eps: 0.4924387196682295 avg reward (last 100): -1137.5643564356435 episode loss:  6427.438
avg reward for last 100 episodes: -1137.5643564356435


  8%|▊         | 400/5000 [01:25<21:18,  3.60it/s]  

episode: 400 episode reward: -1042 eps: 0.4875386287535564 avg reward (last 100): -1123.5247524752476 episode loss:  7484.565
avg reward for last 100 episodes: -1123.5247524752476


 10%|█         | 501/5000 [01:49<21:59,  3.41it/s]

episode: 500 episode reward: -1079 eps: 0.4826872969839567 avg reward (last 100): -1117.7029702970297 episode loss:  14254.945
avg reward for last 100 episodes: -1117.7029702970297


 12%|█▏        | 602/5000 [02:10<18:46,  3.90it/s]

episode: 600 episode reward: -1062 eps: 0.477884239173692 avg reward (last 100): -1110.9405940594058 episode loss:  18850.98
avg reward for last 100 episodes: -1110.9405940594058


 14%|█▍        | 701/5000 [02:31<15:15,  4.70it/s]

episode: 700 episode reward: -1037 eps: 0.47312897496494305 avg reward (last 100): -1122.7425742574258 episode loss:  5271.247
avg reward for last 100 episodes: -1122.7425742574258


 16%|█▌        | 800/5000 [02:53<29:36,  2.36it/s]

episode: 800 episode reward: -1037 eps: 0.4684210287797683 avg reward (last 100): -1119.7326732673268 episode loss:  5974.948
avg reward for last 100 episodes: -1119.7326732673268


 18%|█▊        | 903/5000 [03:15<11:21,  6.01it/s]

episode: 900 episode reward: -1071 eps: 0.4637599297725419 avg reward (last 100): -1108.118811881188 episode loss:  3453.2883
avg reward for last 100 episodes: -1108.118811881188


 20%|█▉        | 985/5000 [03:34<14:35,  4.59it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  11
episode: 0 episode reward: -1011 eps: 0.4597884974390611 avg reward (last 100): -1011.0 episode loss:  71428.77
avg reward for last 100 episodes: -1011.0


  0%|          | 18/5000 [00:08<38:27,  2.16it/s]
  0%|          | 1/5000 [00:00<14:17,  5.83it/s]

game won. Run 500 episodes on new env
running on env =  12
episode: 0 episode reward: -1054 eps: 0.45891568508690067 avg reward (last 100): -1054.0 episode loss:  28085.186
avg reward for last 100 episodes: -1054.0


  0%|          | 5/5000 [00:02<47:09,  1.77it/s]
  0%|          | 2/5000 [00:00<06:23, 13.03it/s]

game won. Run 500 episodes on new env
running on env =  13
episode: 0 episode reward: -1002 eps: 0.45864040450402366 avg reward (last 100): -1002.0 episode loss:  52044.633
avg reward for last 100 episodes: -1002.0


  1%|          | 52/5000 [00:09<15:28,  5.33it/s]
  0%|          | 3/5000 [00:00<04:24, 18.87it/s]

game won. Run 500 episodes on new env
running on env =  14
episode: 0 episode reward: -1018 eps: 0.45621591969423353 avg reward (last 100): -1018.0 episode loss:  61252.37
avg reward for last 100 episodes: -1018.0


  2%|▏         | 103/5000 [00:16<09:37,  8.48it/s]

episode: 100 episode reward: -1000 eps: 0.45167626959375207 avg reward (last 100): -1088.148514851485 episode loss:  55969.168
avg reward for last 100 episodes: -1088.148514851485


  3%|▎         | 150/5000 [00:23<12:52,  6.28it/s]
  0%|          | 2/5000 [00:00<07:27, 11.17it/s]

game won. Run 500 episodes on new env
running on env =  15
episode: 0 episode reward: -1060 eps: 0.44937847009637977 avg reward (last 100): -1060.0 episode loss:  15330.56
avg reward for last 100 episodes: -1060.0


  0%|          | 18/5000 [00:05<25:24,  3.27it/s]
  0%|          | 1/5000 [00:00<09:42,  8.58it/s]

game won. Run 500 episodes on new env
running on env =  16
episode: 0 episode reward: -1047 eps: 0.44852541900510706 avg reward (last 100): -1047.0 episode loss:  14318.073
avg reward for last 100 episodes: -1047.0


  2%|▏         | 103/5000 [00:17<10:58,  7.44it/s]

episode: 100 episode reward: -1108 eps: 0.4440622944722767 avg reward (last 100): -1102.3168316831684 episode loss:  20748.324
avg reward for last 100 episodes: -1102.3168316831684


  4%|▍         | 202/5000 [00:35<19:20,  4.14it/s]  

episode: 200 episode reward: -1088 eps: 0.4396435809800508 avg reward (last 100): -1082.3267326732673 episode loss:  15925.203
avg reward for last 100 episodes: -1082.3267326732673


  6%|▌         | 301/5000 [00:56<35:04,  2.23it/s]

episode: 300 episode reward: -1393 eps: 0.43526883660920584 avg reward (last 100): -1098.950495049505 episode loss:  3012.7095
avg reward for last 100 episodes: -1098.950495049505


  8%|▊         | 402/5000 [01:23<30:39,  2.50it/s]

episode: 400 episode reward: -1037 eps: 0.43093762383790685 avg reward (last 100): -1146.2475247524753 episode loss:  14305.797
avg reward for last 100 episodes: -1146.2475247524753


 10%|█         | 502/5000 [01:41<11:03,  6.78it/s]

episode: 500 episode reward: -1012 eps: 0.4266495094979509 avg reward (last 100): -1085.6039603960396 episode loss:  18761.33
avg reward for last 100 episodes: -1085.6039603960396


 12%|█▏        | 602/5000 [01:58<14:19,  5.12it/s]

episode: 600 episode reward: -1082 eps: 0.4224040647314447 avg reward (last 100): -1087.6930693069307 episode loss:  4671.1294
avg reward for last 100 episodes: -1087.6930693069307


 14%|█▍        | 703/5000 [02:18<07:33,  9.47it/s]

episode: 700 episode reward: -1042 eps: 0.41820086494791414 avg reward (last 100): -1100.0 episode loss:  2841.5425
avg reward for last 100 episodes: -1100.0


 16%|█▌        | 801/5000 [02:43<12:29,  5.61it/s]

episode: 800 episode reward: -1043 eps: 0.414039489781842 avg reward (last 100): -1095.4356435643565 episode loss:  3964.1672
avg reward for last 100 episodes: -1095.4356435643565


 18%|█▊        | 901/5000 [03:10<15:35,  4.38it/s]

episode: 900 episode reward: -1056 eps: 0.4099195230506257 avg reward (last 100): -1089.0 episode loss:  1239.9154
avg reward for last 100 episodes: -1089.0


 20%|██        | 1003/5000 [03:41<12:52,  5.17it/s]

episode: 1000 episode reward: -1072 eps: 0.4058405527129545 avg reward (last 100): -1101.7920792079208 episode loss:  811.80554
avg reward for last 100 episodes: -1101.7920792079208


 22%|██▏       | 1100/5000 [04:04<26:27,  2.46it/s]

episode: 1100 episode reward: -1007 eps: 0.40180217082760145 avg reward (last 100): -1084.1782178217823 episode loss:  18821.867
avg reward for last 100 episodes: -1084.1782178217823


 24%|██▍       | 1201/5000 [04:38<15:10,  4.17it/s]

episode: 1200 episode reward: -1078 eps: 0.39780397351262436 avg reward (last 100): -1139.039603960396 episode loss:  24169.422
avg reward for last 100 episodes: -1139.039603960396


 26%|██▌       | 1302/5000 [04:53<07:20,  8.40it/s]

episode: 1300 episode reward: -1036 eps: 0.3938455609049734 avg reward (last 100): -1089.2871287128712 episode loss:  45554.586
avg reward for last 100 episodes: -1089.2871287128712


 28%|██▊       | 1402/5000 [05:15<10:27,  5.73it/s]

episode: 1400 episode reward: -1058 eps: 0.3899265371205014 avg reward (last 100): -1101.1683168316831 episode loss:  34467.117
avg reward for last 100 episodes: -1101.1683168316831


 30%|███       | 1501/5000 [05:35<17:30,  3.33it/s]

episode: 1500 episode reward: -1357 eps: 0.386046510214369 avg reward (last 100): -1104.930693069307 episode loss:  490.09952
avg reward for last 100 episodes: -1104.930693069307


 32%|███▏      | 1601/5000 [05:58<22:51,  2.48it/s]

episode: 1600 episode reward: -1194 eps: 0.38220509214184767 avg reward (last 100): -1123.4752475247524 episode loss:  952.50146
avg reward for last 100 episodes: -1123.4752475247524


 34%|███▍      | 1705/5000 [06:16<05:05, 10.80it/s]

episode: 1700 episode reward: -1083 eps: 0.3784018987195114 avg reward (last 100): -1098.1980198019803 episode loss:  34147.07
avg reward for last 100 episodes: -1098.1980198019803


 36%|███▌      | 1803/5000 [06:33<13:13,  4.03it/s]

episode: 1800 episode reward: -1100 eps: 0.37463654958681164 avg reward (last 100): -1086.0594059405942 episode loss:  126.260025
avg reward for last 100 episodes: -1086.0594059405942


 38%|███▊      | 1900/5000 [06:57<20:54,  2.47it/s]

episode: 1900 episode reward: -1009 eps: 0.3709086681680402 avg reward (last 100): -1093.7326732673268 episode loss:  195.82724
avg reward for last 100 episodes: -1093.7326732673268


 40%|████      | 2001/5000 [07:23<10:00,  4.99it/s]

episode: 2000 episode reward: -1023 eps: 0.36721788163466584 avg reward (last 100): -1086.8019801980197 episode loss:  201.1159
avg reward for last 100 episodes: -1086.8019801980197


 42%|████▏     | 2100/5000 [07:45<08:56,  5.41it/s]

episode: 2100 episode reward: -1043 eps: 0.3635638208680476 avg reward (last 100): -1097.039603960396 episode loss:  31168.936
avg reward for last 100 episodes: -1097.039603960396


 44%|████▍     | 2201/5000 [08:14<13:30,  3.45it/s]

episode: 2200 episode reward: -1086 eps: 0.35994612042251883 avg reward (last 100): -1144.1980198019803 episode loss:  2624.6133
avg reward for last 100 episodes: -1144.1980198019803


 46%|████▌     | 2302/5000 [08:33<12:30,  3.60it/s]

episode: 2300 episode reward: -1276 eps: 0.35636441848883976 avg reward (last 100): -1098.2079207920792 episode loss:  30.392262
avg reward for last 100 episodes: -1098.2079207920792


 48%|████▊     | 2403/5000 [08:52<05:39,  7.66it/s]

episode: 2400 episode reward: -1033 eps: 0.3528183568580111 avg reward (last 100): -1089.2772277227723 episode loss:  7616.597
avg reward for last 100 episodes: -1089.2772277227723


 50%|█████     | 2504/5000 [09:13<04:01, 10.34it/s]

episode: 2500 episode reward: -1021 eps: 0.3493075808854503 avg reward (last 100): -1108.1584158415842 episode loss:  27707.572
avg reward for last 100 episodes: -1108.1584158415842


 52%|█████▏    | 2601/5000 [09:29<05:56,  6.72it/s]

episode: 2600 episode reward: -1045 eps: 0.34583173945552276 avg reward (last 100): -1092.2178217821781 episode loss:  45.73044
avg reward for last 100 episodes: -1092.2178217821781


 54%|█████▍    | 2700/5000 [10:03<07:50,  4.89it/s]

episode: 2700 episode reward: -1057 eps: 0.34239048494642726 avg reward (last 100): -1169.4455445544554 episode loss:  14997.634
avg reward for last 100 episodes: -1169.4455445544554


 56%|█████▌    | 2803/5000 [10:27<05:20,  6.86it/s]

episode: 2800 episode reward: -1121 eps: 0.33898347319542815 avg reward (last 100): -1120.3465346534654 episode loss:  209.33862
avg reward for last 100 episodes: -1120.3465346534654


 58%|█████▊    | 2901/5000 [10:49<08:56,  3.91it/s]

episode: 2900 episode reward: -1271 eps: 0.335610363464438 avg reward (last 100): -1116.4950495049504 episode loss:  15454.91
avg reward for last 100 episodes: -1116.4950495049504


 60%|█████▉    | 2999/5000 [11:09<07:30,  4.44it/s]

episode: 3000 episode reward: -1031 eps: 0.33227081840593786 avg reward (last 100): -1104.5445544554455 episode loss:  19682.139
avg reward for last 100 episodes: -1104.5445544554455


 62%|██████▏   | 3101/5000 [11:32<05:58,  5.30it/s]

episode: 3100 episode reward: -1037 eps: 0.32896450402923966 avg reward (last 100): -1115.6336633663366 episode loss:  14326.469
avg reward for last 100 episodes: -1115.6336633663366


 64%|██████▍   | 3201/5000 [11:56<08:43,  3.44it/s]

episode: 3200 episode reward: -1198 eps: 0.32569108966708415 avg reward (last 100): -1126.2178217821781 episode loss:  743.82117
avg reward for last 100 episodes: -1126.2178217821781


 66%|██████▌   | 3302/5000 [12:31<03:51,  7.33it/s]

episode: 3300 episode reward: -1077 eps: 0.32245024794256905 avg reward (last 100): -1164.1287128712872 episode loss:  442.4384
avg reward for last 100 episodes: -1164.1287128712872


 68%|██████▊   | 3402/5000 [13:00<14:31,  1.83it/s]

episode: 3400 episode reward: -1028 eps: 0.3192416547364095 avg reward (last 100): -1143.3069306930693 episode loss:  839.80615
avg reward for last 100 episodes: -1143.3069306930693


 70%|███████   | 3501/5000 [13:18<06:02,  4.13it/s]

episode: 3500 episode reward: -1052 eps: 0.3160649891545213 avg reward (last 100): -1085.0891089108911 episode loss:  12110.868
avg reward for last 100 episodes: -1085.0891089108911


 72%|███████▏  | 3602/5000 [13:46<06:26,  3.62it/s]

episode: 3600 episode reward: -1167 eps: 0.31291993349592906 avg reward (last 100): -1119.2475247524753 episode loss:  13556.211
avg reward for last 100 episodes: -1119.2475247524753


 74%|███████▍  | 3701/5000 [14:23<05:04,  4.27it/s]

episode: 3700 episode reward: -1080 eps: 0.30980617322099235 avg reward (last 100): -1131.3861386138615 episode loss:  3636.3076
avg reward for last 100 episodes: -1131.3861386138615


 76%|███████▌  | 3802/5000 [14:49<06:33,  3.05it/s]

episode: 3800 episode reward: -1108 eps: 0.30672339691994815 avg reward (last 100): -1115.4257425742574 episode loss:  142.91289
avg reward for last 100 episodes: -1115.4257425742574


 78%|███████▊  | 3905/5000 [15:15<02:58,  6.12it/s]

episode: 3900 episode reward: -1488 eps: 0.3036712962817658 avg reward (last 100): -1144.3861386138615 episode loss:  4533.8325
avg reward for last 100 episodes: -1144.3861386138615


 80%|████████  | 4001/5000 [15:37<02:57,  5.64it/s]

episode: 4000 episode reward: -1113 eps: 0.3006495660633138 avg reward (last 100): -1110.3366336633662 episode loss:  3719.0798
avg reward for last 100 episodes: -1110.3366336633662


 82%|████████▏ | 4101/5000 [16:04<04:31,  3.31it/s]

episode: 4100 episode reward: -1034 eps: 0.2976579040588317 avg reward (last 100): -1144.3960396039604 episode loss:  3098.2637
avg reward for last 100 episodes: -1144.3960396039604


 84%|████████▍ | 4201/5000 [16:35<04:12,  3.16it/s]

episode: 4200 episode reward: -1102 eps: 0.2946960110697062 avg reward (last 100): -1145.2772277227723 episode loss:  272.41187
avg reward for last 100 episodes: -1145.2772277227723


 86%|████████▌ | 4301/5000 [17:06<02:41,  4.34it/s]

episode: 4300 episode reward: -1043 eps: 0.2917635908745477 avg reward (last 100): -1141.7920792079208 episode loss:  2163.6118
avg reward for last 100 episodes: -1141.7920792079208


 88%|████████▊ | 4400/5000 [17:44<03:08,  3.19it/s]

episode: 4400 episode reward: -1021 eps: 0.2888603501995658 avg reward (last 100): -1126.6930693069307 episode loss:  80.224976
avg reward for last 100 episodes: -1126.6930693069307


 90%|█████████ | 4501/5000 [18:13<03:10,  2.62it/s]

episode: 4500 episode reward: -1039 eps: 0.2859859986892382 avg reward (last 100): -1124.0891089108911 episode loss:  562.72516
avg reward for last 100 episodes: -1124.0891089108911


 92%|█████████▏| 4603/5000 [18:51<01:39,  4.01it/s]

episode: 4600 episode reward: -1310 eps: 0.28314024887727196 avg reward (last 100): -1168.6039603960396 episode loss:  21046.275
avg reward for last 100 episodes: -1168.6039603960396


 94%|█████████▍| 4702/5000 [19:43<01:40,  2.97it/s]

episode: 4700 episode reward: -1015 eps: 0.2803228161578537 avg reward (last 100): -1228.1287128712872 episode loss:  10276.597
avg reward for last 100 episodes: -1228.1287128712872


 96%|█████████▌| 4801/5000 [20:12<00:42,  4.64it/s]

episode: 4800 episode reward: -1068 eps: 0.27753341875718607 avg reward (last 100): -1133.029702970297 episode loss:  10662.642
avg reward for last 100 episodes: -1133.029702970297


 98%|█████████▊| 4901/5000 [20:45<00:27,  3.61it/s]

episode: 4900 episode reward: -1126 eps: 0.27477177770530753 avg reward (last 100): -1152.881188118812 episode loss:  9.402145
avg reward for last 100 episodes: -1152.881188118812


100%|██████████| 5000/5000 [21:12<00:00,  3.93it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

env couldnt win
+---+---+---+---+
| P |   |   |  G|
|   |   |   |   |
+---+---+---+---+
|   | P |   |   |
|   |   |   |   |
+---+---+---+---+
|WP |   |   | P |
|   |   |   |   |
+---+---+---+---+
|   |   |   |   |
| A>|   |   |   |
+---+---+---+---+
Score : 0
running on env =  17


  0%|          | 1/5000 [00:00<27:43,  3.00it/s]

episode: 0 episode reward: -1112 eps: 0.2720376168081922 avg reward (last 100): -1112.0 episode loss:  13.6655245
avg reward for last 100 episodes: -1112.0


  0%|          | 5/5000 [00:02<42:17,  1.97it/s]
  0%|          | 1/5000 [00:00<13:21,  6.24it/s]

game won. Run 500 episodes on new env
running on env =  18
episode: 0 episode reward: -1077 eps: 0.27187443503830955 avg reward (last 100): -1077.0 episode loss:  30683.79
avg reward for last 100 episodes: -1077.0


  2%|▏         | 101/5000 [00:53<36:58,  2.21it/s] 

episode: 100 episode reward: -1053 eps: 0.2691691046167689 avg reward (last 100): -1201.2178217821781 episode loss:  11940.44
avg reward for last 100 episodes: -1201.2178217821781


  4%|▍         | 201/5000 [01:46<1:00:40,  1.32it/s]

episode: 200 episode reward: -1180 eps: 0.26649069402198045 avg reward (last 100): -1167.1386138613861 episode loss:  5050.698
avg reward for last 100 episodes: -1167.1386138613861


  6%|▌         | 301/5000 [02:39<1:51:04,  1.42s/it]

episode: 300 episode reward: -1378 eps: 0.2638389353838665 avg reward (last 100): -1168.8415841584158 episode loss:  2095.167
avg reward for last 100 episodes: -1168.8415841584158


  8%|▊         | 401/5000 [04:02<1:33:57,  1.23s/it]

episode: 400 episode reward: -1355 eps: 0.26121356349783253 avg reward (last 100): -1264.1287128712872 episode loss:  5687.723
avg reward for last 100 episodes: -1264.1287128712872


 10%|█         | 501/5000 [05:10<49:27,  1.52it/s]  

episode: 500 episode reward: -1089 eps: 0.25861431579824573 avg reward (last 100): -1216.9108910891089 episode loss:  1913.7374
avg reward for last 100 episodes: -1216.9108910891089


 12%|█▏        | 601/5000 [05:51<36:59,  1.98it/s]  

episode: 600 episode reward: -1157 eps: 0.2560409323321748 avg reward (last 100): -1172.3267326732673 episode loss:  23627.65
avg reward for last 100 episodes: -1172.3267326732673


 14%|█▍        | 701/5000 [06:32<26:52,  2.67it/s]  

episode: 700 episode reward: -1057 eps: 0.253493155733392 avg reward (last 100): -1194.2178217821781 episode loss:  1623.928
avg reward for last 100 episodes: -1194.2178217821781


 16%|█▌        | 801/5000 [07:13<24:22,  2.87it/s]  

episode: 800 episode reward: -1039 eps: 0.25097073119663377 avg reward (last 100): -1198.4950495049504 episode loss:  281.5959
avg reward for last 100 episodes: -1198.4950495049504


 18%|█▊        | 901/5000 [08:01<10:01,  6.81it/s]  

episode: 900 episode reward: -1108 eps: 0.24847340645211738 avg reward (last 100): -1169.90099009901 episode loss:  108.98403
avg reward for last 100 episodes: -1169.90099009901


 20%|██        | 1001/5000 [08:45<41:00,  1.63it/s] 

episode: 1000 episode reward: -2172 eps: 0.2460009317403113 avg reward (last 100): -1172.7920792079208 episode loss:  7465.993
avg reward for last 100 episodes: -1172.7920792079208


 22%|██▏       | 1101/5000 [09:40<35:52,  1.81it/s]  

episode: 1100 episode reward: -1108 eps: 0.24355305978695654 avg reward (last 100): -1204.3267326732673 episode loss:  14565.987
avg reward for last 100 episodes: -1204.3267326732673


 24%|██▍       | 1201/5000 [10:20<34:14,  1.85it/s]

episode: 1200 episode reward: -1341 eps: 0.24112954577833648 avg reward (last 100): -1171.1089108910892 episode loss:  6842.5312
avg reward for last 100 episodes: -1171.1089108910892


 26%|██▌       | 1301/5000 [11:22<16:51,  3.66it/s]  

episode: 1300 episode reward: -1114 eps: 0.2387301473367928 avg reward (last 100): -1232.5643564356435 episode loss:  1467.3838
avg reward for last 100 episodes: -1232.5643564356435


 28%|██▊       | 1401/5000 [12:09<1:06:55,  1.12s/it]

episode: 1400 episode reward: -1338 eps: 0.23635462449648545 avg reward (last 100): -1175.970297029703 episode loss:  14408.138
avg reward for last 100 episodes: -1175.970297029703


 30%|███       | 1500/5000 [13:12<47:36,  1.23it/s]  

episode: 1500 episode reward: -1002 eps: 0.23400273967939306 avg reward (last 100): -1213.7821782178219 episode loss:  9513.761
avg reward for last 100 episodes: -1213.7821782178219


 32%|███▏      | 1602/5000 [14:07<18:14,  3.10it/s]  

episode: 1600 episode reward: -1231 eps: 0.23167425767155236 avg reward (last 100): -1183.5049504950496 episode loss:  6582.215
avg reward for last 100 episodes: -1183.5049504950496


 34%|███▍      | 1702/5000 [15:35<26:53,  2.04it/s]  

episode: 1700 episode reward: -1333 eps: 0.22936894559953486 avg reward (last 100): -1245.7821782178219 episode loss:  1191.6407
avg reward for last 100 episodes: -1245.7821782178219


 36%|███▌      | 1801/5000 [16:50<38:21,  1.39it/s]  

episode: 1800 episode reward: -1120 eps: 0.22708657290715678 avg reward (last 100): -1202.4158415841584 episode loss:  0.99681395
avg reward for last 100 episodes: -1202.4158415841584


 38%|███▊      | 1902/5000 [17:36<29:16,  1.76it/s]  

episode: 1900 episode reward: -1393 eps: 0.2248269113324206 avg reward (last 100): -1165.039603960396 episode loss:  2850.4983
avg reward for last 100 episodes: -1165.039603960396


 40%|████      | 2001/5000 [18:31<51:39,  1.03s/it]  

episode: 2000 episode reward: -1115 eps: 0.22258973488468664 avg reward (last 100): -1212.5346534653465 episode loss:  1813.0073
avg reward for last 100 episodes: -1212.5346534653465


 42%|████▏     | 2101/5000 [19:46<1:10:22,  1.46s/it]

episode: 2100 episode reward: -1569 eps: 0.22037481982207185 avg reward (last 100): -1192.3465346534654 episode loss:  2649.3042
avg reward for last 100 episodes: -1192.3465346534654


 44%|████▍     | 2201/5000 [20:49<15:05,  3.09it/s]  

episode: 2200 episode reward: -1107 eps: 0.21818194462907245 avg reward (last 100): -1199.881188118812 episode loss:  174.57864
avg reward for last 100 episodes: -1199.881188118812


 46%|████▌     | 2301/5000 [22:06<25:10,  1.79it/s]  

episode: 2300 episode reward: -1142 eps: 0.21601088999441068 avg reward (last 100): -1179.4653465346535 episode loss:  7780.588
avg reward for last 100 episodes: -1179.4653465346535


 48%|████▊     | 2401/5000 [23:26<13:29,  3.21it/s]  

episode: 2400 episode reward: -1099 eps: 0.21386143878910097 avg reward (last 100): -1227.5643564356435 episode loss:  18.229101
avg reward for last 100 episodes: -1227.5643564356435


 50%|█████     | 2501/5000 [24:07<17:36,  2.37it/s]

episode: 2500 episode reward: -1123 eps: 0.21173337604473474 avg reward (last 100): -1169.1287128712872 episode loss:  103.94551
avg reward for last 100 episodes: -1169.1287128712872


 52%|█████▏    | 2601/5000 [25:30<24:40,  1.62it/s]  

episode: 2600 episode reward: -1088 eps: 0.20962648893198127 avg reward (last 100): -1246.7128712871288 episode loss:  15651.653
avg reward for last 100 episodes: -1246.7128712871288


 54%|█████▍    | 2701/5000 [27:14<41:38,  1.09s/it]  

episode: 2700 episode reward: -1123 eps: 0.2075405667393023 avg reward (last 100): -1263.1683168316831 episode loss:  6055.0366
avg reward for last 100 episodes: -1263.1683168316831


 54%|█████▍    | 2721/5000 [27:40<23:10,  1.64it/s]  
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  19


  0%|          | 1/5000 [00:00<52:36,  1.58it/s]

episode: 0 episode reward: -1160 eps: 0.20708445659172434 avg reward (last 100): -1160.0 episode loss:  19433.078
avg reward for last 100 episodes: -1160.0


  0%|          | 25/5000 [00:33<1:52:04,  1.35s/it]
  0%|          | 2/5000 [00:00<07:52, 10.58it/s]

game won. Run 500 episodes on new env
running on env =  20
episode: 0 episode reward: -1007 eps: 0.20654670949095974 avg reward (last 100): -1007.0 episode loss:  63640.91
avg reward for last 100 episodes: -1007.0


  2%|▏         | 102/5000 [00:43<20:33,  3.97it/s] 

episode: 100 episode reward: -1199 eps: 0.20449143314040358 avg reward (last 100): -1188.3861386138615 episode loss:  39772.96
avg reward for last 100 episodes: -1188.3861386138615


  4%|▍         | 200/5000 [01:05<15:06,  5.29it/s]  

episode: 200 episode reward: -1017 eps: 0.20245660814870742 avg reward (last 100): -1097.3168316831684 episode loss:  13409.586
avg reward for last 100 episodes: -1097.3168316831684


  6%|▌         | 301/5000 [01:35<36:29,  2.15it/s]

episode: 300 episode reward: -1418 eps: 0.20044203101132604 avg reward (last 100): -1117.029702970297 episode loss:  29931.906
avg reward for last 100 episodes: -1117.029702970297


  8%|▊         | 403/5000 [02:03<17:58,  4.26it/s]

episode: 400 episode reward: -1122 eps: 0.19844750024871888 avg reward (last 100): -1133.4158415841584 episode loss:  4521.915
avg reward for last 100 episodes: -1133.4158415841584


 10%|█         | 500/5000 [02:47<34:44,  2.16it/s]  

episode: 500 episode reward: -1005 eps: 0.19647281638620004 avg reward (last 100): -1157.5148514851485 episode loss:  22589.875
avg reward for last 100 episodes: -1157.5148514851485


 12%|█▏        | 601/5000 [03:30<53:15,  1.38it/s]  

episode: 600 episode reward: -1330 eps: 0.1945177819339887 avg reward (last 100): -1161.8217821782177 episode loss:  12152.335
avg reward for last 100 episodes: -1161.8217821782177


 14%|█▍        | 701/5000 [04:35<1:30:12,  1.26s/it]

episode: 700 episode reward: -1490 eps: 0.19258220136745804 avg reward (last 100): -1158.3861386138615 episode loss:  38566.453
avg reward for last 100 episodes: -1158.3861386138615


 16%|█▌        | 801/5000 [05:26<42:53,  1.63it/s]  

episode: 800 episode reward: -1074 eps: 0.1906658811075805 avg reward (last 100): -1129.7722772277227 episode loss:  2831.2573
avg reward for last 100 episodes: -1129.7722772277227


 18%|█▊        | 900/5000 [06:16<22:15,  3.07it/s]  

episode: 900 episode reward: -1019 eps: 0.18876862950156784 avg reward (last 100): -1120.8118811881188 episode loss:  6223.1826
avg reward for last 100 episodes: -1120.8118811881188


 20%|██        | 1001/5000 [06:59<48:44,  1.37it/s] 

episode: 1000 episode reward: -1206 eps: 0.18689025680370386 avg reward (last 100): -1191.8316831683169 episode loss:  21992.104
avg reward for last 100 episodes: -1191.8316831683169


 21%|██        | 1044/5000 [07:29<28:23,  2.32it/s]  
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  21


  0%|          | 1/5000 [00:00<1:20:38,  1.03it/s]

episode: 0 episode reward: -1195 eps: 0.1860510982124391 avg reward (last 100): -1195.0 episode loss:  2274.504
avg reward for last 100 episodes: -1195.0


  2%|▏         | 101/5000 [01:31<1:02:09,  1.31it/s]

episode: 100 episode reward: -1145 eps: 0.1841997667480288 avg reward (last 100): -1237.148514851485 episode loss:  35806.047
avg reward for last 100 episodes: -1237.148514851485


  4%|▍         | 201/5000 [02:51<1:10:53,  1.13it/s]

episode: 200 episode reward: -1046 eps: 0.18236685725599094 avg reward (last 100): -1200.4950495049504 episode loss:  14996.049
avg reward for last 100 episodes: -1200.4950495049504


  6%|▌         | 279/5000 [04:30<1:16:20,  1.03it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  22
episode: 0 episode reward: -1004 eps: 0.18091367023617044 avg reward (last 100): -1004.0 episode loss:  64875.39
avg reward for last 100 episodes: -1004.0


  2%|▏         | 102/5000 [01:14<45:03,  1.81it/s]  

episode: 100 episode reward: -1022 eps: 0.17911345957754954 avg reward (last 100): -1227.079207920792 episode loss:  17466.516
avg reward for last 100 episodes: -1227.079207920792


  4%|▍         | 202/5000 [02:17<34:38,  2.31it/s]  

episode: 200 episode reward: -1008 eps: 0.1773311622054766 avg reward (last 100): -1239.4653465346535 episode loss:  18906.879
avg reward for last 100 episodes: -1239.4653465346535


  6%|▌         | 302/5000 [03:42<41:37,  1.88it/s]  

episode: 300 episode reward: -1379 eps: 0.17556659987090442 avg reward (last 100): -1259.2574257425742 episode loss:  603.78375
avg reward for last 100 episodes: -1259.2574257425742


  8%|▊         | 400/5000 [05:16<1:14:44,  1.03it/s]

episode: 400 episode reward: -1007 eps: 0.17381959609848163 avg reward (last 100): -1256.5940594059407 episode loss:  10219.686
avg reward for last 100 episodes: -1256.5940594059407


 10%|█         | 501/5000 [07:05<2:32:10,  2.03s/it]

episode: 500 episode reward: -2031 eps: 0.17208997616890304 avg reward (last 100): -1306.990099009901 episode loss:  1905.0857
avg reward for last 100 episodes: -1306.990099009901


 12%|█▏        | 600/5000 [08:32<47:33,  1.54it/s]  

episode: 600 episode reward: -1011 eps: 0.1703775671014363 avg reward (last 100): -1327.980198019802 episode loss:  15406.086
avg reward for last 100 episodes: -1327.980198019802


 13%|█▎        | 658/5000 [09:22<1:01:54,  1.17it/s]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  23


  0%|          | 1/5000 [00:00<51:25,  1.62it/s]

episode: 0 episode reward: -1187 eps: 0.16937524908465262 avg reward (last 100): -1187.0 episode loss:  12698.468
avg reward for last 100 episodes: -1187.0


  1%|          | 31/5000 [00:56<2:30:03,  1.81s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  24


  0%|          | 1/5000 [00:01<1:59:53,  1.44s/it]

episode: 0 episode reward: -1687 eps: 0.1688340875493247 avg reward (last 100): -1687.0 episode loss:  8786.625
avg reward for last 100 episodes: -1687.0


  2%|▏         | 101/5000 [01:02<1:28:22,  1.08s/it]

episode: 100 episode reward: -1429 eps: 0.16715407672676993 avg reward (last 100): -1188.7326732673268 episode loss:  6460.2783
avg reward for last 100 episodes: -1188.7326732673268


  4%|▍         | 201/5000 [02:31<46:17,  1.73it/s]  

episode: 200 episode reward: -1138 eps: 0.16549078312290547 avg reward (last 100): -1249.8910891089108 episode loss:  4995.9272
avg reward for last 100 episodes: -1249.8910891089108


  6%|▌         | 301/5000 [03:28<32:37,  2.40it/s]  

episode: 300 episode reward: -1111 eps: 0.16384404039034975 avg reward (last 100): -1166.029702970297 episode loss:  3065.3015
avg reward for last 100 episodes: -1166.029702970297


  8%|▊         | 402/5000 [04:18<17:51,  4.29it/s]  

episode: 400 episode reward: -1125 eps: 0.1622136838369883 avg reward (last 100): -1142.4851485148515 episode loss:  2471.9336
avg reward for last 100 episodes: -1142.4851485148515


 10%|█         | 501/5000 [04:58<31:34,  2.37it/s]  

episode: 500 episode reward: -1266 eps: 0.16059955040950158 avg reward (last 100): -1169.3366336633662 episode loss:  18983.938
avg reward for last 100 episodes: -1169.3366336633662


 12%|█▏        | 601/5000 [05:44<30:48,  2.38it/s]  

episode: 600 episode reward: -1381 eps: 0.1590014786770587 avg reward (last 100): -1202.3366336633662 episode loss:  2572.6553
avg reward for last 100 episodes: -1202.3366336633662


 14%|█▍        | 701/5000 [06:18<43:27,  1.65it/s]

episode: 700 episode reward: -1408 eps: 0.1574193088151722 avg reward (last 100): -1152.9405940594058 episode loss:  13487.725
avg reward for last 100 episodes: -1152.9405940594058


 16%|█▌        | 801/5000 [06:52<23:11,  3.02it/s]

episode: 800 episode reward: -1264 eps: 0.15585288258971408 avg reward (last 100): -1115.7821782178219 episode loss:  917.4838
avg reward for last 100 episodes: -1115.7821782178219


 18%|█▊        | 902/5000 [07:39<55:38,  1.23it/s]  

episode: 900 episode reward: -1055 eps: 0.15430204334109035 avg reward (last 100): -1119.7128712871288 episode loss:  1056.2688
avg reward for last 100 episodes: -1119.7128712871288


 20%|██        | 1001/5000 [08:09<13:05,  5.09it/s] 

episode: 1000 episode reward: -1195 eps: 0.1527666359685739 avg reward (last 100): -1105.2277227722773 episode loss:  378.61612
avg reward for last 100 episodes: -1105.2277227722773


 22%|██▏       | 1101/5000 [08:42<23:07,  2.81it/s]  

episode: 1100 episode reward: -1158 eps: 0.15124650691479208 avg reward (last 100): -1142.930693069307 episode loss:  5295.4736
avg reward for last 100 episodes: -1142.930693069307


 24%|██▍       | 1204/5000 [09:11<14:21,  4.41it/s]

episode: 1200 episode reward: -1046 eps: 0.14974150415036994 avg reward (last 100): -1121.4752475247524 episode loss:  668.3746
avg reward for last 100 episodes: -1121.4752475247524


 26%|██▌       | 1301/5000 [09:49<40:26,  1.52it/s]  

episode: 1300 episode reward: -1157 eps: 0.14825147715872514 avg reward (last 100): -1168.2970297029703 episode loss:  1105.1195
avg reward for last 100 episodes: -1168.2970297029703


 28%|██▊       | 1401/5000 [10:44<56:19,  1.06it/s]  

episode: 1400 episode reward: -1287 eps: 0.14677627692101491 avg reward (last 100): -1169.6435643564357 episode loss:  1276.6833
avg reward for last 100 episodes: -1169.6435643564357


 30%|███       | 1501/5000 [11:44<13:43,  4.25it/s]  

episode: 1500 episode reward: -1133 eps: 0.14531575590123247 avg reward (last 100): -1167.6534653465346 episode loss:  6861.0024
avg reward for last 100 episodes: -1167.6534653465346


 31%|███       | 1534/5000 [12:13<27:36,  2.09it/s]  
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  25


  0%|          | 1/5000 [00:02<3:52:39,  2.79s/it]

episode: 0 episode reward: -1907 eps: 0.1448080144339946 avg reward (last 100): -1907.0 episode loss:  1628.5059
avg reward for last 100 episodes: -1907.0


  0%|          | 5/5000 [00:15<4:18:33,  3.11s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  26


  0%|          | 1/5000 [00:01<2:13:54,  1.61s/it]

episode: 0 episode reward: -1471 eps: 0.14472115134364041 avg reward (last 100): -1471.0 episode loss:  12926.991
avg reward for last 100 episodes: -1471.0


  2%|▏         | 101/5000 [01:24<13:34,  6.01it/s] 

episode: 100 episode reward: -1067 eps: 0.14328108018242516 avg reward (last 100): -1259.6534653465346 episode loss:  1035.4889
avg reward for last 100 episodes: -1259.6534653465346


  4%|▎         | 176/5000 [02:01<55:42,  1.44it/s]  
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  27


  0%|          | 1/5000 [00:00<23:04,  3.61it/s]

episode: 0 episode reward: -1058 eps: 0.14218199780777724 avg reward (last 100): -1058.0 episode loss:  3657.7302
avg reward for last 100 episodes: -1058.0


  0%|          | 22/5000 [00:46<2:54:36,  2.10s/it]
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  28
episode: 0 episode reward: -1025 eps: 0.14185533868159533 avg reward (last 100): -1025.0 episode loss:  39696.215
avg reward for last 100 episodes: -1025.0


  2%|▏         | 101/5000 [01:05<1:08:49,  1.19it/s]

episode: 100 episode reward: -1175 eps: 0.14044378425155413 avg reward (last 100): -1303.1287128712872 episode loss:  2501.3645
avg reward for last 100 episodes: -1303.1287128712872


  4%|▍         | 201/5000 [02:46<2:06:48,  1.59s/it]

episode: 200 episode reward: -1442 eps: 0.13904627572156494 avg reward (last 100): -1342.8316831683169 episode loss:  15620.58
avg reward for last 100 episodes: -1342.8316831683169


  6%|▌         | 301/5000 [03:51<38:51,  2.02it/s]  

episode: 300 episode reward: -1153 eps: 0.13766267332563367 avg reward (last 100): -1284.4356435643565 episode loss:  9605.363
avg reward for last 100 episodes: -1284.4356435643565


  8%|▊         | 401/5000 [05:22<1:33:32,  1.22s/it]

episode: 400 episode reward: -1061 eps: 0.13629283868853 avg reward (last 100): -1392.049504950495 episode loss:  8289.994
avg reward for last 100 episodes: -1392.049504950495


 10%|█         | 501/5000 [06:20<1:09:12,  1.08it/s]

episode: 500 episode reward: -1916 eps: 0.13493663481194906 avg reward (last 100): -1258.6435643564357 episode loss:  15614.61
avg reward for last 100 episodes: -1258.6435643564357


 12%|█▏        | 601/5000 [07:17<1:52:07,  1.53s/it]

episode: 600 episode reward: -1174 eps: 0.1335939260608092 avg reward (last 100): -1266.6534653465346 episode loss:  328.77414
avg reward for last 100 episodes: -1266.6534653465346


 13%|█▎        | 665/5000 [08:23<54:44,  1.32it/s]  
  0%|          | 0/5000 [00:00<?, ?it/s]

game won. Run 500 episodes on new env
running on env =  29


  0%|          | 2/5000 [00:00<24:45,  3.36it/s]

episode: 0 episode reward: -1191 eps: 0.13271506563488036 avg reward (last 100): -1191.0 episode loss:  2707.2007
avg reward for last 100 episodes: -1191.0


  2%|▏         | 101/5000 [01:22<1:30:51,  1.11s/it]

episode: 100 episode reward: -1755 eps: 0.13139446296619536 avg reward (last 100): -1351.049504950495 episode loss:  40697.26
avg reward for last 100 episodes: -1351.049504950495


  4%|▍         | 201/5000 [02:41<40:21,  1.98it/s]  

episode: 200 episode reward: -1048 eps: 0.1300870011673897 avg reward (last 100): -1357.1683168316831 episode loss:  9692.519
avg reward for last 100 episodes: -1357.1683168316831


  6%|▌         | 300/5000 [04:01<1:28:31,  1.13s/it]

episode: 300 episode reward: -1021 eps: 0.1287925494781179 avg reward (last 100): -1375.4455445544554 episode loss:  11571.025
avg reward for last 100 episodes: -1375.4455445544554


  8%|▊         | 401/5000 [06:19<2:23:18,  1.87s/it]

episode: 400 episode reward: -1302 eps: 0.12751097843918635 avg reward (last 100): -1472.2178217821781 episode loss:  14365.626
avg reward for last 100 episodes: -1472.2178217821781


 10%|█         | 501/5000 [08:42<2:18:59,  1.85s/it]

episode: 500 episode reward: -1714 eps: 0.12624215987960608 avg reward (last 100): -1470.4851485148515 episode loss:  274.78897
avg reward for last 100 episodes: -1470.4851485148515


 12%|█▏        | 601/5000 [10:46<1:23:29,  1.14s/it]

episode: 600 episode reward: -1490 eps: 0.12498596690377446 avg reward (last 100): -1352.1089108910892 episode loss:  8578.328
avg reward for last 100 episodes: -1352.1089108910892


 14%|█▍        | 701/5000 [12:35<1:19:28,  1.11s/it]

episode: 700 episode reward: -1735 eps: 0.12374227387878355 avg reward (last 100): -1396.920792079208 episode loss:  25899.146
avg reward for last 100 episodes: -1396.920792079208


 16%|█▌        | 801/5000 [14:25<2:54:51,  2.50s/it]

episode: 800 episode reward: -3338 eps: 0.12251095642185622 avg reward (last 100): -1525.8118811881188 episode loss:  21306.844
avg reward for last 100 episodes: -1525.8118811881188


 18%|█▊        | 901/5000 [16:30<1:30:39,  1.33s/it]

episode: 900 episode reward: -1888 eps: 0.12129189138790626 avg reward (last 100): -1576.7920792079208 episode loss:  503.9428
avg reward for last 100 episodes: -1576.7920792079208


 20%|██        | 1001/5000 [18:41<1:46:17,  1.59s/it]

episode: 1000 episode reward: -1239 eps: 0.12008495685722234 avg reward (last 100): -1462.881188118812 episode loss:  12117.436
avg reward for last 100 episodes: -1462.881188118812


 22%|██▏       | 1101/5000 [21:30<1:38:00,  1.51s/it]

episode: 1100 episode reward: -1206 eps: 0.11889003212327494 avg reward (last 100): -1460.3762376237623 episode loss:  2530.2327
avg reward for last 100 episodes: -1460.3762376237623


 24%|██▍       | 1201/5000 [25:02<50:07,  1.26it/s]  

episode: 1200 episode reward: -1095 eps: 0.11770699768064427 avg reward (last 100): -1566.1386138613861 episode loss:  11080.656
avg reward for last 100 episodes: -1566.1386138613861


 26%|██▌       | 1302/5000 [27:40<50:35,  1.22it/s]  

episode: 1300 episode reward: -1682 eps: 0.11653573521306874 avg reward (last 100): -1487.2871287128712 episode loss:  572.63074
avg reward for last 100 episodes: -1487.2871287128712


 28%|██▊       | 1401/5000 [29:19<47:50,  1.25it/s]  

episode: 1400 episode reward: -1216 eps: 0.11537612758161157 avg reward (last 100): -1415.2079207920792 episode loss:  10256.634
avg reward for last 100 episodes: -1415.2079207920792


 30%|███       | 1501/5000 [30:57<51:06,  1.14it/s]  

episode: 1500 episode reward: -1426 eps: 0.11422805881294594 avg reward (last 100): -1424.6831683168316 episode loss:  1667.7441
avg reward for last 100 episodes: -1424.6831683168316


 32%|███▏      | 1601/5000 [32:25<51:00,  1.11it/s]  

episode: 1600 episode reward: -1219 eps: 0.11309141408775629 avg reward (last 100): -1381.9108910891089 episode loss:  7741.1543
avg reward for last 100 episodes: -1381.9108910891089


 34%|███▍      | 1703/5000 [33:48<55:33,  1.01s/it]  

episode: 1700 episode reward: -1887 eps: 0.11196607972925528 avg reward (last 100): -1363.7920792079208 episode loss:  13469.767
avg reward for last 100 episodes: -1363.7920792079208


 36%|███▌      | 1801/5000 [36:03<2:58:47,  3.35s/it]

episode: 1800 episode reward: -3874 eps: 0.11085194319181457 avg reward (last 100): -1624.3168316831684 episode loss:  157.9953
avg reward for last 100 episodes: -1624.3168316831684


 38%|███▊      | 1902/5000 [37:18<12:11,  4.23it/s]  

episode: 1900 episode reward: -1021 eps: 0.10974889304970954 avg reward (last 100): -1356.0891089108911 episode loss:  272.45862
avg reward for last 100 episodes: -1356.0891089108911


 40%|████      | 2001/5000 [38:35<1:15:04,  1.50s/it]

episode: 2000 episode reward: -1950 eps: 0.10865681898597505 avg reward (last 100): -1340.4257425742574 episode loss:  10078.484
avg reward for last 100 episodes: -1340.4257425742574


 42%|████▏     | 2101/5000 [40:33<1:14:45,  1.55s/it]

episode: 2100 episode reward: -2512 eps: 0.1075756117813727 avg reward (last 100): -1515.5247524752476 episode loss:  53.568436
avg reward for last 100 episodes: -1515.5247524752476


 44%|████▍     | 2202/5000 [41:53<22:05,  2.11it/s]  

episode: 2200 episode reward: -1116 eps: 0.10650516330346776 avg reward (last 100): -1359.2079207920792 episode loss:  780.61957
avg reward for last 100 episodes: -1359.2079207920792


 46%|████▌     | 2302/5000 [43:53<24:40,  1.82it/s]  

episode: 2300 episode reward: -1389 eps: 0.10544536649581479 avg reward (last 100): -1544.5049504950496 episode loss:  7.725091
avg reward for last 100 episodes: -1544.5049504950496


 48%|████▊     | 2401/5000 [45:36<24:13,  1.79it/s]  

episode: 2400 episode reward: -1203 eps: 0.10439611536725081 avg reward (last 100): -1437.9108910891089 episode loss:  335.2896
avg reward for last 100 episodes: -1437.9108910891089


 50%|█████     | 2501/5000 [46:46<12:19,  3.38it/s]  

episode: 2500 episode reward: -1072 eps: 0.10335730498129483 avg reward (last 100): -1317.6732673267327 episode loss:  5.108186
avg reward for last 100 episodes: -1317.6732673267327


 52%|█████▏    | 2601/5000 [48:24<52:37,  1.32s/it]  

episode: 2600 episode reward: -1760 eps: 0.10232883144565334 avg reward (last 100): -1442.851485148515 episode loss:  15127.111
avg reward for last 100 episodes: -1442.851485148515


 54%|█████▍    | 2700/5000 [49:43<16:26,  2.33it/s]  

episode: 2700 episode reward: -1013 eps: 0.10131059190182996 avg reward (last 100): -1339.8613861386139 episode loss:  178.0207
avg reward for last 100 episodes: -1339.8613861386139


 56%|█████▌    | 2803/5000 [50:51<11:24,  3.21it/s]  

episode: 2800 episode reward: -1124 eps: 0.10030248451483827 avg reward (last 100): -1304.079207920792 episode loss:  2249.4917
avg reward for last 100 episodes: -1304.079207920792


 58%|█████▊    | 2901/5000 [52:03<1:00:35,  1.73s/it]

episode: 2900 episode reward: -1938 eps: 0.1 avg reward (last 100): -1303.2772277227723 episode loss:  12099.115
avg reward for last 100 episodes: -1303.2772277227723


 60%|██████    | 3001/5000 [53:18<25:46,  1.29it/s]  

episode: 3000 episode reward: -1500 eps: 0.1 avg reward (last 100): -1322.970297029703 episode loss:  1217.2554
avg reward for last 100 episodes: -1322.970297029703


 62%|██████▏   | 3101/5000 [54:23<15:05,  2.10it/s]

episode: 3100 episode reward: -1267 eps: 0.1 avg reward (last 100): -1289.2970297029703 episode loss:  1370.7524
avg reward for last 100 episodes: -1289.2970297029703


 64%|██████▍   | 3201/5000 [55:47<1:05:12,  2.17s/it]

episode: 3200 episode reward: -1534 eps: 0.1 avg reward (last 100): -1374.9405940594058 episode loss:  1995.3048
avg reward for last 100 episodes: -1374.9405940594058


 66%|██████▌   | 3301/5000 [58:37<32:12,  1.14s/it]  

episode: 3300 episode reward: -1196 eps: 0.1 avg reward (last 100): -1550.2079207920792 episode loss:  16422.941
avg reward for last 100 episodes: -1550.2079207920792


 68%|██████▊   | 3401/5000 [1:00:22<21:07,  1.26it/s]  

episode: 3400 episode reward: -1471 eps: 0.1 avg reward (last 100): -1317.5544554455446 episode loss:  2141.939
avg reward for last 100 episodes: -1317.5544554455446


 70%|███████   | 3501/5000 [1:01:28<14:26,  1.73it/s]

episode: 3500 episode reward: -1473 eps: 0.1 avg reward (last 100): -1264.5742574257426 episode loss:  13197.191
avg reward for last 100 episodes: -1264.5742574257426


 72%|███████▏  | 3601/5000 [1:02:56<32:41,  1.40s/it]

episode: 3600 episode reward: -1739 eps: 0.1 avg reward (last 100): -1394.1683168316831 episode loss:  12259.983
avg reward for last 100 episodes: -1394.1683168316831


 74%|███████▍  | 3701/5000 [1:04:18<15:44,  1.38it/s]

episode: 3700 episode reward: -1079 eps: 0.1 avg reward (last 100): -1345.950495049505 episode loss:  526.34656
avg reward for last 100 episodes: -1345.950495049505


 76%|███████▌  | 3802/5000 [1:05:49<12:31,  1.59it/s]

episode: 3800 episode reward: -1651 eps: 0.1 avg reward (last 100): -1382.4257425742574 episode loss:  15.59693
avg reward for last 100 episodes: -1382.4257425742574


 78%|███████▊  | 3901/5000 [1:06:52<21:54,  1.20s/it]

episode: 3900 episode reward: -1441 eps: 0.1 avg reward (last 100): -1288.2277227722773 episode loss:  125.62919
avg reward for last 100 episodes: -1288.2277227722773


 80%|████████  | 4001/5000 [1:07:46<09:28,  1.76it/s]

episode: 4000 episode reward: -1350 eps: 0.1 avg reward (last 100): -1229.2871287128712 episode loss:  15.423364
avg reward for last 100 episodes: -1229.2871287128712


 82%|████████▏ | 4101/5000 [1:08:54<05:34,  2.69it/s]

episode: 4100 episode reward: -1201 eps: 0.1 avg reward (last 100): -1284.5544554455446 episode loss:  2.3089178
avg reward for last 100 episodes: -1284.5544554455446


 84%|████████▍ | 4202/5000 [1:10:27<07:53,  1.69it/s]

episode: 4200 episode reward: -1148 eps: 0.1 avg reward (last 100): -1405.7128712871288 episode loss:  14855.125
avg reward for last 100 episodes: -1405.7128712871288


 86%|████████▌ | 4301/5000 [1:12:22<07:18,  1.59it/s]

episode: 4300 episode reward: -1075 eps: 0.1 avg reward (last 100): -1519.2178217821781 episode loss:  396.46356
avg reward for last 100 episodes: -1519.2178217821781


 88%|████████▊ | 4401/5000 [1:14:01<14:41,  1.47s/it]

episode: 4400 episode reward: -1094 eps: 0.1 avg reward (last 100): -1415.4653465346535 episode loss:  0.067428485
avg reward for last 100 episodes: -1415.4653465346535


 90%|█████████ | 4501/5000 [1:15:30<06:37,  1.26it/s]

episode: 4500 episode reward: -1130 eps: 0.1 avg reward (last 100): -1380.90099009901 episode loss:  383.3667
avg reward for last 100 episodes: -1380.90099009901


 92%|█████████▏| 4600/5000 [1:16:52<03:05,  2.16it/s]

episode: 4600 episode reward: -1043 eps: 0.1 avg reward (last 100): -1340.7029702970297 episode loss:  85.61503
avg reward for last 100 episodes: -1340.7029702970297


 94%|█████████▍| 4702/5000 [1:17:49<02:26,  2.04it/s]

episode: 4700 episode reward: -1320 eps: 0.1 avg reward (last 100): -1249.6039603960396 episode loss:  4131.443
avg reward for last 100 episodes: -1249.6039603960396


 96%|█████████▌| 4801/5000 [1:18:50<01:31,  2.18it/s]

episode: 4800 episode reward: -1051 eps: 0.1 avg reward (last 100): -1271.3168316831684 episode loss:  105.30304
avg reward for last 100 episodes: -1271.3168316831684


 98%|█████████▊| 4901/5000 [1:19:46<00:37,  2.62it/s]

episode: 4900 episode reward: -1248 eps: 0.1 avg reward (last 100): -1226.4455445544554 episode loss:  310.33444
avg reward for last 100 episodes: -1226.4455445544554


100%|██████████| 5000/5000 [1:20:51<00:00,  1.03it/s]
  0%|          | 1/5000 [00:00<13:09,  6.33it/s]

env couldnt win
+---+---+---+---+
|W  |  G|   |   |
|   |   |   |   |
+---+---+---+---+
| P | P |   | P |
|   |   |   |   |
+---+---+---+---+
|   |   |   |   |
|   |   |   |   |
+---+---+---+---+
|   |   | P |   |
| A>|   |   |   |
+---+---+---+---+
Score : 0
running on env =  30
episode: 0 episode reward: -1063 eps: 0.1 avg reward (last 100): -1063.0 episode loss:  27207.682
avg reward for last 100 episodes: -1063.0


  2%|▏         | 101/5000 [00:30<24:49,  3.29it/s] 

episode: 100 episode reward: -1095 eps: 0.1 avg reward (last 100): -1128.0891089108911 episode loss:  1415.1782
avg reward for last 100 episodes: -1128.0891089108911


  4%|▍         | 201/5000 [00:57<21:05,  3.79it/s]

episode: 200 episode reward: -1071 eps: 0.1 avg reward (last 100): -1114.5742574257426 episode loss:  14684.638
avg reward for last 100 episodes: -1114.5742574257426


  6%|▌         | 302/5000 [01:23<13:29,  5.80it/s]

episode: 300 episode reward: -1021 eps: 0.1 avg reward (last 100): -1107.5544554455446 episode loss:  235.61446
avg reward for last 100 episodes: -1107.5544554455446


  8%|▊         | 400/5000 [02:19<17:27,  4.39it/s]  

episode: 400 episode reward: -1006 eps: 0.1 avg reward (last 100): -1239.6237623762377 episode loss:  138.21964
avg reward for last 100 episodes: -1239.6237623762377


 10%|█         | 501/5000 [04:17<1:16:58,  1.03s/it]

episode: 500 episode reward: -1306 eps: 0.1 avg reward (last 100): -1520.7920792079208 episode loss:  110.657646
avg reward for last 100 episodes: -1520.7920792079208


 12%|█▏        | 601/5000 [06:01<1:43:35,  1.41s/it]

episode: 600 episode reward: -1951 eps: 0.1 avg reward (last 100): -1430.09900990099 episode loss:  4437.5527
avg reward for last 100 episodes: -1430.09900990099


 14%|█▍        | 701/5000 [07:35<39:48,  1.80it/s]  

episode: 700 episode reward: -1293 eps: 0.1 avg reward (last 100): -1383.4356435643565 episode loss:  3523.3926
avg reward for last 100 episodes: -1383.4356435643565


 16%|█▌        | 801/5000 [09:27<1:33:32,  1.34s/it]

episode: 800 episode reward: -1131 eps: 0.1 avg reward (last 100): -1513.990099009901 episode loss:  2840.4954
avg reward for last 100 episodes: -1513.990099009901


 18%|█▊        | 901/5000 [11:28<2:21:56,  2.08s/it]

episode: 900 episode reward: -3229 eps: 0.1 avg reward (last 100): -1503.3960396039604 episode loss:  3665.406
avg reward for last 100 episodes: -1503.3960396039604


 20%|██        | 1001/5000 [13:35<1:18:36,  1.18s/it]

episode: 1000 episode reward: -1467 eps: 0.1 avg reward (last 100): -1566.980198019802 episode loss:  3712.0085
avg reward for last 100 episodes: -1566.980198019802


 22%|██▏       | 1101/5000 [15:53<1:43:31,  1.59s/it]

episode: 1100 episode reward: -1209 eps: 0.1 avg reward (last 100): -1549.6732673267327 episode loss:  12414.313
avg reward for last 100 episodes: -1549.6732673267327


 24%|██▍       | 1201/5000 [17:54<54:12,  1.17it/s]  

episode: 1200 episode reward: -1077 eps: 0.1 avg reward (last 100): -1505.1584158415842 episode loss:  3945.813
avg reward for last 100 episodes: -1505.1584158415842


 26%|██▌       | 1301/5000 [19:48<1:07:17,  1.09s/it]

episode: 1300 episode reward: -1503 eps: 0.1 avg reward (last 100): -1511.2871287128712 episode loss:  6.056345
avg reward for last 100 episodes: -1511.2871287128712


 28%|██▊       | 1401/5000 [21:26<37:56,  1.58it/s]  

episode: 1400 episode reward: -1100 eps: 0.1 avg reward (last 100): -1424.930693069307 episode loss:  3593.0166
avg reward for last 100 episodes: -1424.930693069307


 30%|███       | 1501/5000 [23:55<37:57,  1.54it/s]  

episode: 1500 episode reward: -1073 eps: 0.1 avg reward (last 100): -1610.90099009901 episode loss:  17165.12
avg reward for last 100 episodes: -1610.90099009901


 32%|███▏      | 1601/5000 [25:40<49:06,  1.15it/s]  

episode: 1600 episode reward: -1049 eps: 0.1 avg reward (last 100): -1470.6633663366338 episode loss:  2481.971
avg reward for last 100 episodes: -1470.6633663366338


 32%|███▏      | 1619/5000 [26:02<1:27:22,  1.55s/it]