# Deep Q-Learning for Cart Pole (PyTorch)

In this notebook, we will use Deep Q-Learning to solve Cart Pole.


Environment: (https://gymnasium.farama.org/environments/classic_control/cart_pole/).

Paper: (https://arxiv.org/abs/1312.5602).

In [11]:
import gymnasium as gym
from aux_gym_functions import *
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
# Virtual display
from pyvirtualdisplay import Display


## Observing the environment

## Setting hyperparameters and initializing an agent

In [12]:
import os
import pandas as pd
import json

results_PATH = "results"
PATH = os.path.join(results_PATH)
if not os.path.exists(PATH):
    os.makedirs(PATH)

## Playing and Training

In [13]:
new_line = {
    "name": None,
    "last_5%_train_reward": None,
    "last_5%_test_reward": None,
    "last_5%_length": None,
    "last_5%_training_error":None,
    "max_episodes": None,
    "start_epsilon": None,
    "final_epsilon": None,
    "learning_rate": None,
    "discount_factor": None,
    "double_q": None,
    "results_path": None,
}
df = pd.DataFrame(columns=list(new_line.keys()))

In [14]:
def simulate(params):
    test_seq = 0
    last_test_freq = 0 
    last_rewards = 0
    new_line["name"] = params[0]
    env = gym.make(new_line["name"], render_mode="rgb_array")
    new_line["learning_rate"] = params[2]
    new_line["max_episodes"] = params[3]
    new_line["start_epsilon"] = params[4]
    new_line["final_epsilon"] = params[5]
    new_line["discount_factor"] = params[6]
    new_line["double_q"] = params[1]

    epsilon_decay = new_line["start_epsilon"] / new_line["max_episodes"]
    batch_size = 64
    max_memory = 10000
    total_steps = 0
    test_freq = round(new_line["max_episodes"] * 0.025)
    tau = 0.01

    agent = DeepQLearningAgent(
        learning_rate=new_line["learning_rate"],
        initial_epsilon=new_line["start_epsilon"],
        epsilon_decay=epsilon_decay,
        final_epsilon=new_line["final_epsilon"],
        discount_factor=new_line["discount_factor"],
        batch_size=batch_size,
        max_memory=max_memory,
        observation_space=env.observation_space,
        action_space=env.action_space,
        is_double_network=new_line["double_q"],
        tau=tau,
    )

    folder_name = "-".join(
        [
            new_line["name"],
            str(new_line["max_episodes"]),
            str(round(new_line["start_epsilon"] * 100)),
            str(round(new_line["final_epsilon"] * 100)),
            str(round(new_line["learning_rate"] * 10**6)),
            str(round(new_line["discount_factor"] * 100)),
            str(new_line["double_q"]),
        ]
    )

    PATH = os.path.join(results_PATH, folder_name)
    if not os.path.exists(PATH):
        os.makedirs(PATH)

    # Create lists to contain total rewards, steps and accurracy per episode
    accuracies = []
    mean_rewards = []
    rewards = []
    steps = []

    steps_to_save_video = [0.1, 0.5, 1]
    episode_stop = [
        round(i * (new_line["max_episodes"] - 1)) for i in steps_to_save_video
    ]
    for episode_count in tqdm(range(new_line["max_episodes"])):
        state, info = env.reset()
        is_terminal = False
        total_reward = 0
        episode_step = 0

        # The Q-Network training
        while not is_terminal:
            episode_step += 1

            # Choose an action by greedily (with epsilon chance of random action) from the Q-Network
            action = agent.choose_action(state)

            # Play best or random move and get new state and reward from environment
            next_state, reward, terminated, truncated, info = env.step(action)
            is_terminal = terminated or truncated

            # Store experience
            agent.remember(state, action, reward, next_state, is_terminal)

            # Train neural network based on ReplayBuffer
            agent.update()

            # update the current observation
            state = next_state
            total_steps += 1
            total_reward += reward

        agent.decay_epsilon()

        mean_rewards.append(total_reward)
        if (episode_count + 1) % test_freq == 0:
            accur = test_accurracy(env, agent, env.spec.max_episode_steps, 25)
            if (
                (last_test_freq - np.mean(steps[:test_freq])) <= (np.mean(steps[:test_freq])*0.025)
                and len(accuracies) > 0
                and abs(accur - accuracies[-1]) <= (0.025 * accuracies[-1])
                and last_rewards - np.mean(rewards[-test_freq:]) <= (0.025 * last_rewards)
                and episode_count > new_line["max_episodes"]*0.5
            ):
                test_seq += 1
            else:
                test_seq = 0
            last_test_freq = np.mean(steps[:test_freq])
            last_rewards = np.mean(rewards[-test_freq:])
            accuracies.append(accur)

            print('step: %s, episode: %s, training reward mean: %s, test reward mean: %s, random move probability: %s' % (total_steps, episode_count+1, sum(mean_rewards)/test_freq, accur, agent.epsilon))
            mean_rewards.clear()

        rewards.append(total_reward)
        steps.append(episode_step)
        if episode_count in episode_stop:
            percentage = round((episode_count / new_line["max_episodes"]) * 100)
            video_name = os.path.join(
                results_PATH, folder_name, f"{percentage}.mp4"
            )
            record_trained_video(env, agent, video_name, env.spec.max_episode_steps)

        if test_seq == 10000000:
            break

    for file in os.listdir(os.path.join(results_PATH, folder_name)):
        if file.endswith(".json"):
            os.remove(os.path.join(os.path.join(results_PATH, folder_name), file))

    new_line["last_5%_train_reward"] = get_mean_last_5_percent(rewards)
    new_line["last_5%_test_reward"] = get_mean_last_5_percent(accuracies)
    new_line["last_5%_length"] = get_mean_last_5_percent(steps)
    new_line["last_5%_training_error"] = get_mean_last_5_percent(agent.training_error)
    new_line["results_path"] = os.path.join(results_PATH, folder_name)

    output = {'train_reward':rewards,
            'test_reward': accuracies,
            'episode_length':steps,
            'training_error':agent.training_error
    }
    #save output dict as pickle
    with open(os.path.join(results_PATH, folder_name, 'output.json'), 'w') as fp:
        json.dump(output, fp)
    df.loc[len(df)] = new_line
    df.to_csv(os.path.join(results_PATH, "results.csv"), index=False)
    env.close()
    return

In [15]:

params_list = [
    ["CartPole-v1", True, 1e-2, 10000, 1, .1, .99], 
    ["CartPole-v1", False, 1e-2, 10000, 1, .1, .99], 
    ["MountainCar-v0", True, 1e-2, 10000, 1, .1, .99], 
    ["MountainCar-v0", False, 1e-2, 10000, 1, .1, .99], 
    ["CartPole-v1", True, 1e-4, 10000, 1, .05, .99], 
    ["CartPole-v1", False, 1e-4, 10000, 1, .05, .99], 
    ["MountainCar-v0", True, 1e-4, 10000, 1, .05, .99], 
    ["MountainCar-v0", False, 1e-4, 10000, 1, .05, .99],
    ["CartPole-v1", True, 1e-6, 10000, 1, .01, .99], 
    ["CartPole-v1", False, 1e-6, 10000, 1, .01, .99], 
    ["MountainCar-v0", True, 1e-6, 10000, 1, .01, .99], 
    ["MountainCar-v0", False, 1e-6, 10000, 1, .01, .99]
] 
for params in params_list:
    simulate(params)

Double Deep Q-learning agent started with PyTorch


  3%|▎         | 255/10000 [00:10<09:26, 17.19it/s]

step: 5500, episode: 250, training reward mean: 22.0, test reward mean: 161.92, random move probability: 0.9750000000000028


  5%|▌         | 501/10000 [00:23<11:58, 13.21it/s]

step: 11336, episode: 500, training reward mean: 23.344, test reward mean: 121.88, random move probability: 0.9500000000000055


  8%|▊         | 750/10000 [00:31<06:41, 23.03it/s]

step: 17139, episode: 750, training reward mean: 23.212, test reward mean: 98.68, random move probability: 0.9250000000000083


 10%|▉         | 998/10000 [00:40<05:29, 27.33it/s]

step: 22794, episode: 1000, training reward mean: 22.62, test reward mean: 90.8, random move probability: 0.900000000000011


 10%|▉         | 998/10000 [00:40<05:29, 27.33it/s]

step 90 : 0 , [-2.4348905  -3.2904606  -0.1809813   0.20787317] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-10-10000-99-True/10.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-10-10000-99-True/10.mp4



 10%|█         | 1001/10000 [00:40<16:14,  9.23it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-10-10000-99-True/10.mp4


 13%|█▎        | 1253/10000 [00:51<07:09, 20.36it/s]

step: 28531, episode: 1250, training reward mean: 22.948, test reward mean: 79.08, random move probability: 0.8750000000000138


 15%|█▌        | 1502/10000 [01:01<09:10, 15.45it/s]

step: 34179, episode: 1500, training reward mean: 22.592, test reward mean: 91.64, random move probability: 0.8500000000000165


 18%|█▊        | 1755/10000 [01:12<12:42, 10.82it/s]

step: 40067, episode: 1750, training reward mean: 23.552, test reward mean: 500.0, random move probability: 0.8250000000000193


 20%|██        | 2005/10000 [01:25<06:39, 19.99it/s]

step: 46237, episode: 2000, training reward mean: 24.68, test reward mean: 90.8, random move probability: 0.800000000000022


 23%|██▎       | 2253/10000 [01:35<09:34, 13.48it/s]

step: 51500, episode: 2250, training reward mean: 21.052, test reward mean: 92.64, random move probability: 0.7750000000000248


 25%|██▌       | 2506/10000 [01:43<04:44, 26.38it/s]

step: 57081, episode: 2500, training reward mean: 22.324, test reward mean: 94.72, random move probability: 0.7500000000000275


 28%|██▊       | 2756/10000 [01:51<04:24, 27.34it/s]

step: 62622, episode: 2750, training reward mean: 22.164, test reward mean: 69.36, random move probability: 0.7250000000000303


 30%|███       | 3005/10000 [01:58<04:42, 24.77it/s]

step: 68041, episode: 3000, training reward mean: 21.676, test reward mean: 76.8, random move probability: 0.700000000000033


 33%|███▎      | 3254/10000 [02:06<04:57, 22.71it/s]

step: 73374, episode: 3250, training reward mean: 21.332, test reward mean: 72.24, random move probability: 0.6750000000000358


 35%|███▌      | 3503/10000 [02:14<05:26, 19.91it/s]

step: 78683, episode: 3500, training reward mean: 21.236, test reward mean: 145.68, random move probability: 0.6500000000000385


 38%|███▊      | 3755/10000 [02:23<04:42, 22.07it/s]

step: 84475, episode: 3750, training reward mean: 23.168, test reward mean: 93.8, random move probability: 0.6250000000000413


 40%|████      | 4003/10000 [02:31<05:03, 19.73it/s]

step: 89921, episode: 4000, training reward mean: 21.784, test reward mean: 90.44, random move probability: 0.600000000000044


 43%|████▎     | 4255/10000 [02:40<04:23, 21.81it/s]

step: 96284, episode: 4250, training reward mean: 25.452, test reward mean: 100.68, random move probability: 0.5750000000000468


 45%|████▌     | 4505/10000 [02:51<04:23, 20.88it/s]

step: 103301, episode: 4500, training reward mean: 28.068, test reward mean: 92.64, random move probability: 0.5500000000000496


 48%|████▊     | 4752/10000 [03:08<13:16,  6.59it/s]

step: 114440, episode: 4750, training reward mean: 44.556, test reward mean: 500.0, random move probability: 0.5250000000000523


 50%|████▉     | 4999/10000 [03:17<03:14, 25.68it/s]

step: 120966, episode: 5000, training reward mean: 26.104, test reward mean: 361.92, random move probability: 0.5000000000000551


 50%|████▉     | 4999/10000 [03:20<03:14, 25.68it/s]

step 500 : 0 , [-1.1971039   0.00725859 -0.00243389  0.04031722] , 1.0 , False , True , {}
Moviepy - Building video results/CartPole-v1-10000-100-10-10000-99-True/50.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-10-10000-99-True/50.mp4



 50%|█████     | 5002/10000 [03:21<22:48,  3.65it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-10-10000-99-True/50.mp4


 53%|█████▎    | 5253/10000 [03:32<04:52, 16.25it/s]

step: 128138, episode: 5250, training reward mean: 28.688, test reward mean: 219.56, random move probability: 0.4750000000000578


 55%|█████▌    | 5504/10000 [03:42<04:22, 17.10it/s]

step: 134358, episode: 5500, training reward mean: 24.88, test reward mean: 91.28, random move probability: 0.4500000000000606


 58%|█████▊    | 5752/10000 [03:54<05:14, 13.52it/s]

step: 140144, episode: 5750, training reward mean: 23.144, test reward mean: 94.44, random move probability: 0.4250000000000633


 60%|██████    | 6004/10000 [04:07<05:56, 11.20it/s]

step: 147284, episode: 6000, training reward mean: 28.56, test reward mean: 117.04, random move probability: 0.4000000000000661


 63%|██████▎   | 6252/10000 [04:36<07:41,  8.13it/s]

step: 164479, episode: 6250, training reward mean: 68.78, test reward mean: 500.0, random move probability: 0.37500000000006883


 65%|██████▌   | 6501/10000 [04:52<07:34,  7.70it/s]

step: 173809, episode: 6500, training reward mean: 37.32, test reward mean: 500.0, random move probability: 0.3500000000000716


 68%|██████▊   | 6750/10000 [05:12<05:34,  9.70it/s]

step: 185626, episode: 6750, training reward mean: 47.268, test reward mean: 98.72, random move probability: 0.32500000000007434


 70%|███████   | 7000/10000 [05:28<03:35, 13.91it/s]

step: 193985, episode: 7000, training reward mean: 33.436, test reward mean: 167.24, random move probability: 0.3000000000000771


 72%|███████▎  | 7250/10000 [05:44<02:45, 16.66it/s]

step: 203709, episode: 7250, training reward mean: 38.896, test reward mean: 107.6, random move probability: 0.27500000000007985


 75%|███████▌  | 7503/10000 [06:14<07:18,  5.70it/s]

step: 220596, episode: 7500, training reward mean: 67.548, test reward mean: 480.92, random move probability: 0.2500000000000826


 78%|███████▊  | 7751/10000 [06:46<07:58,  4.70it/s]

step: 236615, episode: 7750, training reward mean: 64.076, test reward mean: 394.08, random move probability: 0.22500000000008535


 80%|████████  | 8001/10000 [07:05<03:20,  9.96it/s]

step: 248398, episode: 8000, training reward mean: 47.132, test reward mean: 146.52, random move probability: 0.2000000000000881


 82%|████████▎ | 8250/10000 [07:22<02:43, 10.69it/s]

step: 259703, episode: 8250, training reward mean: 45.22, test reward mean: 159.72, random move probability: 0.17500000000009086


 85%|████████▌ | 8500/10000 [07:58<14:21,  1.74it/s]

step: 282814, episode: 8500, training reward mean: 92.444, test reward mean: 149.24, random move probability: 0.15000000000009361


 88%|████████▊ | 8751/10000 [08:44<02:45,  7.53it/s]

step: 307802, episode: 8750, training reward mean: 99.952, test reward mean: 104.08, random move probability: 0.12500000000009637


 90%|█████████ | 9000/10000 [09:47<07:27,  2.23it/s]

step: 344561, episode: 9000, training reward mean: 147.036, test reward mean: 481.6, random move probability: 0.10000000000009565


 93%|█████████▎| 9251/10000 [10:27<02:32,  4.91it/s]

step: 368807, episode: 9250, training reward mean: 96.984, test reward mean: 56.08, random move probability: 0.1


 95%|█████████▌| 9500/10000 [11:20<02:51,  2.92it/s]

step: 400772, episode: 9500, training reward mean: 127.86, test reward mean: 139.76, random move probability: 0.1


 98%|█████████▊| 9750/10000 [12:18<02:46,  1.50it/s]

step: 434598, episode: 9750, training reward mean: 135.304, test reward mean: 472.8, random move probability: 0.1


100%|█████████▉| 9997/10000 [12:53<00:00,  7.38it/s]

step: 454376, episode: 10000, training reward mean: 79.112, test reward mean: 87.96, random move probability: 0.1


100%|█████████▉| 9997/10000 [12:53<00:00,  7.38it/s]

step 85 : 0 , [-2.4119022  -2.7660122   0.00843554  0.31193265] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-10-10000-99-True/100.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-10-10000-99-True/100.mp4



100%|█████████▉| 9997/10000 [12:53<00:00,  7.38it/s]

Moviepy - Done !


100%|██████████| 10000/10000 [12:53<00:00, 12.92it/s]


Moviepy - video ready results/CartPole-v1-10000-100-10-10000-99-True/100.mp4
Deep Q-learning agent started with PyTorch


  3%|▎         | 256/10000 [00:06<06:34, 24.69it/s]

step: 5433, episode: 250, training reward mean: 21.732, test reward mean: 143.88, random move probability: 0.9750000000000028


  5%|▌         | 501/10000 [00:14<08:27, 18.72it/s]

step: 11495, episode: 500, training reward mean: 24.248, test reward mean: 196.2, random move probability: 0.9500000000000055


  8%|▊         | 753/10000 [00:22<11:38, 13.24it/s]

step: 18183, episode: 750, training reward mean: 26.752, test reward mean: 246.36, random move probability: 0.9250000000000083


 10%|▉         | 997/10000 [00:30<05:05, 29.47it/s]

step: 24784, episode: 1000, training reward mean: 26.404, test reward mean: 271.92, random move probability: 0.900000000000011


 10%|▉         | 997/10000 [00:32<05:05, 29.47it/s]

step 211 : 0 , [-2.0906343  -1.9737749  -0.20963693 -0.19271484] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-10-10000-99-False/10.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-10-10000-99-False/10.mp4



 10%|█         | 1004/10000 [00:32<21:15,  7.06it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-10-10000-99-False/10.mp4


 13%|█▎        | 1253/10000 [00:42<11:31, 12.65it/s]

step: 32154, episode: 1250, training reward mean: 29.48, test reward mean: 277.88, random move probability: 0.8750000000000138


 15%|█▌        | 1505/10000 [00:52<10:50, 13.05it/s]

step: 39668, episode: 1500, training reward mean: 30.056, test reward mean: 303.08, random move probability: 0.8500000000000165


 18%|█▊        | 1751/10000 [01:03<10:32, 13.04it/s]

step: 48053, episode: 1750, training reward mean: 33.54, test reward mean: 309.52, random move probability: 0.8250000000000193


 20%|██        | 2002/10000 [01:15<13:39,  9.76it/s]

step: 57575, episode: 2000, training reward mean: 38.088, test reward mean: 327.0, random move probability: 0.800000000000022


 23%|██▎       | 2254/10000 [01:28<12:26, 10.37it/s]

step: 67645, episode: 2250, training reward mean: 40.28, test reward mean: 311.16, random move probability: 0.7750000000000248


 25%|██▌       | 2502/10000 [01:44<15:07,  8.26it/s]

step: 79519, episode: 2500, training reward mean: 47.496, test reward mean: 356.48, random move probability: 0.7500000000000275


 28%|██▊       | 2753/10000 [02:00<16:06,  7.50it/s]

step: 91368, episode: 2750, training reward mean: 47.396, test reward mean: 376.88, random move probability: 0.7250000000000303


 30%|███       | 3002/10000 [02:19<14:23,  8.10it/s]

step: 105233, episode: 3000, training reward mean: 55.46, test reward mean: 379.48, random move probability: 0.700000000000033


 33%|███▎      | 3253/10000 [02:39<15:33,  7.23it/s]

step: 120203, episode: 3250, training reward mean: 59.88, test reward mean: 320.24, random move probability: 0.6750000000000358


 35%|███▌      | 3501/10000 [03:04<21:03,  5.14it/s]

step: 138091, episode: 3500, training reward mean: 71.552, test reward mean: 394.16, random move probability: 0.6500000000000385


 38%|███▊      | 3750/10000 [03:29<22:48,  4.57it/s]

step: 156838, episode: 3750, training reward mean: 74.988, test reward mean: 440.88, random move probability: 0.6250000000000413


 40%|████      | 4001/10000 [03:57<18:13,  5.49it/s]

step: 178299, episode: 4000, training reward mean: 85.844, test reward mean: 353.6, random move probability: 0.600000000000044


 42%|████▎     | 4250/10000 [04:27<18:12,  5.26it/s]

step: 201023, episode: 4250, training reward mean: 90.896, test reward mean: 327.24, random move probability: 0.5750000000000468


 45%|████▌     | 4500/10000 [05:05<25:14,  3.63it/s]

step: 228234, episode: 4500, training reward mean: 108.844, test reward mean: 324.12, random move probability: 0.5500000000000496


 48%|████▊     | 4750/10000 [05:43<30:43,  2.85it/s]

step: 257296, episode: 4750, training reward mean: 116.248, test reward mean: 460.08, random move probability: 0.5250000000000523


 50%|█████     | 5000/10000 [06:28<27:47,  3.00it/s]

step: 291362, episode: 5000, training reward mean: 136.264, test reward mean: 383.48, random move probability: 0.5000000000000551


 50%|█████     | 5000/10000 [06:30<27:47,  3.00it/s]

step 409 : 0 , [2.4029837  0.92336845 0.04073556 0.265093  ] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-10-10000-99-False/50.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-10-10000-99-False/50.mp4



 50%|█████     | 5002/10000 [06:30<50:29,  1.65it/s]  

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-10-10000-99-False/50.mp4


 52%|█████▎    | 5250/10000 [07:17<29:12,  2.71it/s]

step: 325574, episode: 5250, training reward mean: 136.848, test reward mean: 413.64, random move probability: 0.4750000000000578


 55%|█████▌    | 5500/10000 [08:04<18:45,  4.00it/s]

step: 359888, episode: 5500, training reward mean: 137.256, test reward mean: 232.12, random move probability: 0.4500000000000606


 58%|█████▊    | 5754/10000 [08:25<08:56,  7.91it/s]

step: 375352, episode: 5750, training reward mean: 61.856, test reward mean: 500.0, random move probability: 0.4250000000000633


 60%|██████    | 6001/10000 [08:44<14:17,  4.66it/s]

step: 389734, episode: 6000, training reward mean: 57.528, test reward mean: 134.8, random move probability: 0.4000000000000661


 63%|██████▎   | 6255/10000 [09:01<02:58, 20.96it/s]

step: 401990, episode: 6250, training reward mean: 49.024, test reward mean: 100.8, random move probability: 0.37500000000006883


 65%|██████▌   | 6502/10000 [09:19<06:45,  8.62it/s]

step: 415568, episode: 6500, training reward mean: 54.312, test reward mean: 115.84, random move probability: 0.3500000000000716


 68%|██████▊   | 6753/10000 [09:36<03:22, 16.04it/s]

step: 428071, episode: 6750, training reward mean: 50.012, test reward mean: 117.08, random move probability: 0.32500000000007434


 70%|███████   | 7001/10000 [10:03<07:54,  6.32it/s]

step: 448248, episode: 7000, training reward mean: 80.708, test reward mean: 149.16, random move probability: 0.3000000000000771


 73%|███████▎  | 7251/10000 [11:06<09:29,  4.83it/s]

step: 495498, episode: 7250, training reward mean: 189.0, test reward mean: 138.12, random move probability: 0.27500000000007985


 75%|███████▌  | 7501/10000 [12:03<09:19,  4.47it/s]

step: 535648, episode: 7500, training reward mean: 160.6, test reward mean: 89.24, random move probability: 0.2500000000000826


 78%|███████▊  | 7750/10000 [12:58<05:18,  7.06it/s]

step: 576764, episode: 7750, training reward mean: 164.464, test reward mean: 9.32, random move probability: 0.22500000000008535


 80%|████████  | 8007/10000 [13:45<00:44, 45.24it/s]

step: 609036, episode: 8000, training reward mean: 129.088, test reward mean: 9.4, random move probability: 0.2000000000000881


 83%|████████▎ | 8260/10000 [13:49<00:27, 63.62it/s]

step: 611805, episode: 8250, training reward mean: 11.076, test reward mean: 9.24, random move probability: 0.17500000000009086


 85%|████████▌ | 8508/10000 [13:53<00:22, 66.77it/s]

step: 614398, episode: 8500, training reward mean: 10.372, test reward mean: 9.32, random move probability: 0.15000000000009361


 88%|████████▊ | 8758/10000 [13:57<00:18, 67.09it/s]

step: 616932, episode: 8750, training reward mean: 10.136, test reward mean: 9.8, random move probability: 0.12500000000009637


 90%|█████████ | 9001/10000 [14:36<02:54,  5.72it/s]

step: 645089, episode: 9000, training reward mean: 112.628, test reward mean: 106.08, random move probability: 0.10000000000009565


 93%|█████████▎| 9258/10000 [14:46<00:16, 43.91it/s]

step: 652064, episode: 9250, training reward mean: 27.9, test reward mean: 9.28, random move probability: 0.1


 95%|█████████▌| 9501/10000 [15:00<02:05,  3.99it/s]

step: 662650, episode: 9500, training reward mean: 42.344, test reward mean: 150.96, random move probability: 0.1


 98%|█████████▊| 9751/10000 [16:04<01:34,  2.63it/s]

step: 714231, episode: 9750, training reward mean: 206.324, test reward mean: 272.76, random move probability: 0.1


100%|█████████▉| 9999/10000 [16:57<00:00, 21.26it/s]

step: 758264, episode: 10000, training reward mean: 176.132, test reward mean: 9.32, random move probability: 0.1
step 10 : 1 , [-0.13340099 -1.5207468   0.2590622   2.5260925 ] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-10-10000-99-False/100.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-10-10000-99-False/100.mp4



100%|██████████| 10000/10000 [16:58<00:00,  9.82it/s]


Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-10-10000-99-False/100.mp4
Double Deep Q-learning agent started with PyTorch


  2%|▎         | 250/10000 [01:09<1:09:09,  2.35it/s]

step: 50000, episode: 250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9750000000000028


  5%|▌         | 500/10000 [02:20<1:10:29,  2.25it/s]

step: 100000, episode: 500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9500000000000055


  8%|▊         | 750/10000 [03:33<1:08:25,  2.25it/s]

step: 150000, episode: 750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9250000000000083


 10%|█         | 1000/10000 [04:46<1:05:48,  2.28it/s]

step: 200000, episode: 1000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.900000000000011


 10%|█         | 1000/10000 [04:47<1:05:48,  2.28it/s]

step 200 : 2 , [-0.3633355  -0.00227401] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-10-10000-99-True/10.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-10-10000-99-True/10.mp4



 10%|█         | 1001/10000 [04:47<1:56:12,  1.29it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-10-10000-99-True/10.mp4


 12%|█▎        | 1250/10000 [06:00<1:03:35,  2.29it/s]

step: 250000, episode: 1250, training reward mean: -200.0, test reward mean: -180.44, random move probability: 0.8750000000000138


 15%|█▌        | 1500/10000 [07:14<1:02:20,  2.27it/s]

step: 300000, episode: 1500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8500000000000165


 18%|█▊        | 1750/10000 [08:29<1:01:10,  2.25it/s]

step: 350000, episode: 1750, training reward mean: -200.0, test reward mean: -178.24, random move probability: 0.8250000000000193


 20%|██        | 2000/10000 [09:42<58:39,  2.27it/s]  

step: 400000, episode: 2000, training reward mean: -200.0, test reward mean: -196.96, random move probability: 0.800000000000022


 22%|██▎       | 2250/10000 [10:56<49:12,  2.62it/s]

step: 450000, episode: 2250, training reward mean: -200.0, test reward mean: -115.52, random move probability: 0.7750000000000248


 25%|██▌       | 2500/10000 [12:10<50:07,  2.49it/s]

step: 500000, episode: 2500, training reward mean: -200.0, test reward mean: -144.04, random move probability: 0.7500000000000275


 28%|██▊       | 2750/10000 [13:25<48:49,  2.47it/s]

step: 550000, episode: 2750, training reward mean: -200.0, test reward mean: -130.04, random move probability: 0.7250000000000303


 30%|███       | 3000/10000 [14:39<48:59,  2.38it/s]

step: 600000, episode: 3000, training reward mean: -200.0, test reward mean: -171.24, random move probability: 0.700000000000033


 32%|███▎      | 3250/10000 [15:54<46:13,  2.43it/s]

step: 650000, episode: 3250, training reward mean: -200.0, test reward mean: -140.12, random move probability: 0.6750000000000358


 35%|███▌      | 3500/10000 [17:08<46:15,  2.34it/s]

step: 700000, episode: 3500, training reward mean: -200.0, test reward mean: -165.2, random move probability: 0.6500000000000385


 38%|███▊      | 3750/10000 [18:22<44:12,  2.36it/s]

step: 750000, episode: 3750, training reward mean: -200.0, test reward mean: -168.52, random move probability: 0.6250000000000413


 40%|████      | 4000/10000 [19:36<39:56,  2.50it/s]

step: 799906, episode: 4000, training reward mean: -199.624, test reward mean: -139.96, random move probability: 0.600000000000044


 42%|████▎     | 4250/10000 [20:51<43:48,  2.19it/s]

step: 849877, episode: 4250, training reward mean: -199.884, test reward mean: -200.0, random move probability: 0.5750000000000468


 45%|████▌     | 4500/10000 [22:06<38:14,  2.40it/s]

step: 899827, episode: 4500, training reward mean: -199.8, test reward mean: -160.28, random move probability: 0.5500000000000496


 48%|████▊     | 4750/10000 [23:21<35:15,  2.48it/s]

step: 949659, episode: 4750, training reward mean: -199.328, test reward mean: -126.48, random move probability: 0.5250000000000523


 50%|█████     | 5000/10000 [24:36<36:29,  2.28it/s]

step: 999224, episode: 5000, training reward mean: -198.26, test reward mean: -185.76, random move probability: 0.5000000000000551


 50%|█████     | 5000/10000 [24:37<36:29,  2.28it/s]

step 160 : 1 , [0.5001243  0.02773802] , -1.0 , True , False , {}
Moviepy - Building video results/MountainCar-v0-10000-100-10-10000-99-True/50.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-10-10000-99-True/50.mp4



 50%|█████     | 5001/10000 [24:37<54:25,  1.53it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-10-10000-99-True/50.mp4


 52%|█████▎    | 5250/10000 [25:53<31:53,  2.48it/s]

step: 1048905, episode: 5250, training reward mean: -198.724, test reward mean: -128.8, random move probability: 0.4750000000000578


 55%|█████▌    | 5500/10000 [27:09<29:11,  2.57it/s]

step: 1098381, episode: 5500, training reward mean: -197.904, test reward mean: -116.32, random move probability: 0.4500000000000606


 57%|█████▊    | 5750/10000 [28:24<29:15,  2.42it/s]

step: 1147044, episode: 5750, training reward mean: -194.652, test reward mean: -146.28, random move probability: 0.4250000000000633


 60%|██████    | 6000/10000 [29:39<31:13,  2.14it/s]

step: 1195417, episode: 6000, training reward mean: -193.492, test reward mean: -200.0, random move probability: 0.4000000000000661


 62%|██████▎   | 6250/10000 [30:54<27:18,  2.29it/s]

step: 1244063, episode: 6250, training reward mean: -194.584, test reward mean: -176.92, random move probability: 0.37500000000006883


 65%|██████▌   | 6500/10000 [32:10<24:48,  2.35it/s]

step: 1293105, episode: 6500, training reward mean: -196.168, test reward mean: -165.28, random move probability: 0.3500000000000716


 68%|██████▊   | 6750/10000 [33:21<21:04,  2.57it/s]

step: 1339624, episode: 6750, training reward mean: -186.076, test reward mean: -133.56, random move probability: 0.32500000000007434


 70%|███████   | 7000/10000 [34:31<20:12,  2.47it/s]

step: 1385406, episode: 7000, training reward mean: -183.128, test reward mean: -130.72, random move probability: 0.3000000000000771


 72%|███████▎  | 7250/10000 [35:41<17:52,  2.56it/s]

step: 1430863, episode: 7250, training reward mean: -181.828, test reward mean: -174.32, random move probability: 0.27500000000007985


 75%|███████▌  | 7500/10000 [36:52<15:34,  2.67it/s]

step: 1477280, episode: 7500, training reward mean: -185.668, test reward mean: -131.76, random move probability: 0.2500000000000826


 78%|███████▊  | 7750/10000 [38:04<16:32,  2.27it/s]

step: 1523729, episode: 7750, training reward mean: -185.796, test reward mean: -183.12, random move probability: 0.22500000000008535


 80%|████████  | 8000/10000 [39:13<14:13,  2.34it/s]

step: 1568027, episode: 8000, training reward mean: -177.192, test reward mean: -170.76, random move probability: 0.2000000000000881


 82%|████████▎ | 8250/10000 [40:21<13:16,  2.20it/s]

step: 1611850, episode: 8250, training reward mean: -175.292, test reward mean: -189.24, random move probability: 0.17500000000009086


 85%|████████▌ | 8500/10000 [41:31<09:14,  2.71it/s]

step: 1657226, episode: 8500, training reward mean: -181.504, test reward mean: -147.56, random move probability: 0.15000000000009361


 88%|████████▊ | 8750/10000 [42:39<07:34,  2.75it/s]

step: 1701302, episode: 8750, training reward mean: -176.304, test reward mean: -147.24, random move probability: 0.12500000000009637


 90%|█████████ | 9000/10000 [43:45<07:21,  2.27it/s]

step: 1743831, episode: 9000, training reward mean: -170.116, test reward mean: -176.96, random move probability: 0.10000000000009565


 92%|█████████▎| 9250/10000 [44:52<04:59,  2.50it/s]

step: 1786189, episode: 9250, training reward mean: -169.432, test reward mean: -200.0, random move probability: 0.1


 95%|█████████▌| 9500/10000 [45:55<03:20,  2.50it/s]

step: 1827584, episode: 9500, training reward mean: -165.58, test reward mean: -140.32, random move probability: 0.1


 98%|█████████▊| 9750/10000 [46:58<01:37,  2.58it/s]

step: 1867393, episode: 9750, training reward mean: -159.236, test reward mean: -200.0, random move probability: 0.1


100%|█████████▉| 9999/10000 [48:03<00:00,  4.17it/s]

step: 1909566, episode: 10000, training reward mean: -168.692, test reward mean: -138.0, random move probability: 0.1


100%|█████████▉| 9999/10000 [48:04<00:00,  4.17it/s]

step 153 : 2 , [0.51563764 0.04206521] , -1.0 , True , False , {}
Moviepy - Building video results/MountainCar-v0-10000-100-10-10000-99-True/100.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-10-10000-99-True/100.mp4



100%|██████████| 10000/10000 [48:04<00:00,  3.47it/s]


Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-10-10000-99-True/100.mp4
Deep Q-learning agent started with PyTorch


  2%|▎         | 250/10000 [00:57<1:03:29,  2.56it/s]

step: 50000, episode: 250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9750000000000028


  5%|▌         | 500/10000 [01:57<1:03:49,  2.48it/s]

step: 100000, episode: 500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9500000000000055


  8%|▊         | 750/10000 [02:58<1:02:12,  2.48it/s]

step: 150000, episode: 750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9250000000000083


 10%|█         | 1000/10000 [04:01<59:03,  2.54it/s] 

step: 200000, episode: 1000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.900000000000011


 10%|█         | 1000/10000 [04:02<59:03,  2.54it/s]

step 200 : 2 , [-0.3170078  -0.00529002] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-10-10000-99-False/10.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-10-10000-99-False/10.mp4



 10%|█         | 1001/10000 [04:02<1:45:27,  1.42it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-10-10000-99-False/10.mp4


 12%|█▎        | 1250/10000 [05:04<59:12,  2.46it/s]  

step: 250000, episode: 1250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8750000000000138


 15%|█▌        | 1500/10000 [06:05<57:15,  2.47it/s]

step: 300000, episode: 1500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8500000000000165


 18%|█▊        | 1750/10000 [07:07<53:48,  2.56it/s]

step: 350000, episode: 1750, training reward mean: -200.0, test reward mean: -196.4, random move probability: 0.8250000000000193


 20%|██        | 2000/10000 [08:09<52:49,  2.52it/s]

step: 400000, episode: 2000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.800000000000022


 22%|██▎       | 2250/10000 [09:12<52:43,  2.45it/s]

step: 450000, episode: 2250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7750000000000248


 25%|██▌       | 2500/10000 [10:14<51:22,  2.43it/s]

step: 500000, episode: 2500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7500000000000275


 28%|██▊       | 2750/10000 [11:17<49:26,  2.44it/s]

step: 550000, episode: 2750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7250000000000303


 30%|███       | 3000/10000 [12:20<47:38,  2.45it/s]

step: 600000, episode: 3000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.700000000000033


 32%|███▎      | 3250/10000 [13:22<44:36,  2.52it/s]

step: 650000, episode: 3250, training reward mean: -200.0, test reward mean: -194.96, random move probability: 0.6750000000000358


 35%|███▌      | 3500/10000 [14:25<41:22,  2.62it/s]

step: 700000, episode: 3500, training reward mean: -200.0, test reward mean: -166.44, random move probability: 0.6500000000000385


 38%|███▊      | 3750/10000 [15:28<40:18,  2.58it/s]

step: 750000, episode: 3750, training reward mean: -200.0, test reward mean: -170.2, random move probability: 0.6250000000000413


 40%|████      | 4000/10000 [16:31<40:55,  2.44it/s]

step: 800000, episode: 4000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.600000000000044


 42%|████▎     | 4250/10000 [17:36<41:16,  2.32it/s]

step: 850000, episode: 4250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5750000000000468


 45%|████▌     | 4500/10000 [18:40<33:01,  2.78it/s]

step: 900000, episode: 4500, training reward mean: -200.0, test reward mean: -132.4, random move probability: 0.5500000000000496


 48%|████▊     | 4750/10000 [19:44<35:01,  2.50it/s]

step: 949957, episode: 4750, training reward mean: -199.828, test reward mean: -200.0, random move probability: 0.5250000000000523


 50%|█████     | 5000/10000 [20:47<30:13,  2.76it/s]

step: 999852, episode: 5000, training reward mean: -199.58, test reward mean: -135.0, random move probability: 0.5000000000000551


 50%|█████     | 5000/10000 [20:48<30:13,  2.76it/s]

step 200 : 2 , [-0.3044136  -0.00575759] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-10-10000-99-False/50.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-10-10000-99-False/50.mp4



 50%|█████     | 5001/10000 [20:49<54:33,  1.53it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-10-10000-99-False/50.mp4


 52%|█████▎    | 5250/10000 [21:53<31:48,  2.49it/s]

step: 1049797, episode: 5250, training reward mean: -199.78, test reward mean: -191.16, random move probability: 0.4750000000000578


 55%|█████▌    | 5500/10000 [22:56<26:34,  2.82it/s]

step: 1099724, episode: 5500, training reward mean: -199.708, test reward mean: -119.84, random move probability: 0.4500000000000606


 57%|█████▊    | 5750/10000 [24:00<28:21,  2.50it/s]

step: 1149673, episode: 5750, training reward mean: -199.796, test reward mean: -188.96, random move probability: 0.4250000000000633


 60%|██████    | 6000/10000 [25:04<23:43,  2.81it/s]

step: 1199598, episode: 6000, training reward mean: -199.7, test reward mean: -115.48, random move probability: 0.4000000000000661


 62%|██████▎   | 6250/10000 [26:08<21:01,  2.97it/s]

step: 1249545, episode: 6250, training reward mean: -199.788, test reward mean: -124.92, random move probability: 0.37500000000006883


 65%|██████▌   | 6500/10000 [27:13<23:47,  2.45it/s]

step: 1299012, episode: 6500, training reward mean: -197.868, test reward mean: -200.0, random move probability: 0.3500000000000716


 68%|██████▊   | 6750/10000 [28:18<22:01,  2.46it/s]

step: 1348963, episode: 6750, training reward mean: -199.804, test reward mean: -196.76, random move probability: 0.32500000000007434


 70%|███████   | 7000/10000 [29:23<20:25,  2.45it/s]

step: 1398289, episode: 7000, training reward mean: -197.304, test reward mean: -200.0, random move probability: 0.3000000000000771


 72%|███████▎  | 7250/10000 [30:27<19:12,  2.39it/s]

step: 1447731, episode: 7250, training reward mean: -197.768, test reward mean: -200.0, random move probability: 0.27500000000007985


 75%|███████▌  | 7500/10000 [31:32<17:06,  2.44it/s]

step: 1497538, episode: 7500, training reward mean: -199.228, test reward mean: -200.0, random move probability: 0.2500000000000826


 78%|███████▊  | 7750/10000 [32:37<15:16,  2.45it/s]

step: 1547448, episode: 7750, training reward mean: -199.64, test reward mean: -200.0, random move probability: 0.22500000000008535


 80%|████████  | 8000/10000 [33:42<13:56,  2.39it/s]

step: 1597448, episode: 8000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.2000000000000881


 82%|████████▎ | 8250/10000 [34:47<12:18,  2.37it/s]

step: 1647376, episode: 8250, training reward mean: -199.712, test reward mean: -200.0, random move probability: 0.17500000000009086


 85%|████████▌ | 8500/10000 [35:53<10:31,  2.38it/s]

step: 1697359, episode: 8500, training reward mean: -199.932, test reward mean: -200.0, random move probability: 0.15000000000009361


 88%|████████▊ | 8750/10000 [36:59<08:36,  2.42it/s]

step: 1746958, episode: 8750, training reward mean: -198.396, test reward mean: -200.0, random move probability: 0.12500000000009637


 90%|█████████ | 9000/10000 [38:05<07:04,  2.36it/s]

step: 1796772, episode: 9000, training reward mean: -199.256, test reward mean: -200.0, random move probability: 0.10000000000009565


 92%|█████████▎| 9250/10000 [39:10<05:08,  2.43it/s]

step: 1846026, episode: 9250, training reward mean: -197.016, test reward mean: -200.0, random move probability: 0.1


 95%|█████████▌| 9500/10000 [40:15<03:24,  2.45it/s]

step: 1895950, episode: 9500, training reward mean: -199.696, test reward mean: -200.0, random move probability: 0.1


 98%|█████████▊| 9750/10000 [41:20<01:45,  2.37it/s]

step: 1945768, episode: 9750, training reward mean: -199.272, test reward mean: -200.0, random move probability: 0.1


100%|█████████▉| 9999/10000 [42:25<00:00,  3.92it/s]

step: 1995233, episode: 10000, training reward mean: -197.86, test reward mean: -200.0, random move probability: 0.1


100%|█████████▉| 9999/10000 [42:26<00:00,  3.92it/s]

step 200 : 2 , [-0.34711215 -0.00358112] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-10-10000-99-False/100.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-10-10000-99-False/100.mp4



100%|██████████| 10000/10000 [42:26<00:00,  3.93it/s]


Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-10-10000-99-False/100.mp4
Double Deep Q-learning agent started with PyTorch


  3%|▎         | 255/10000 [00:07<09:36, 16.89it/s]

step: 5922, episode: 250, training reward mean: 23.688, test reward mean: 329.96, random move probability: 0.9750000000000028


  5%|▌         | 503/10000 [00:16<09:27, 16.72it/s]

step: 12209, episode: 500, training reward mean: 25.148, test reward mean: 209.24, random move probability: 0.9500000000000055


  8%|▊         | 755/10000 [00:25<08:06, 18.99it/s]

step: 18635, episode: 750, training reward mean: 25.704, test reward mean: 205.88, random move probability: 0.9250000000000083


 10%|█         | 1000/10000 [00:33<09:24, 15.94it/s]

step: 24841, episode: 1000, training reward mean: 24.824, test reward mean: 191.24, random move probability: 0.900000000000011


 10%|█         | 1000/10000 [00:34<09:24, 15.94it/s]

step 245 : 1 , [1.379808   1.5953273  0.2150542  0.24314983] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-5-100-99-True/10.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-5-100-99-True/10.mp4



 10%|█         | 1003/10000 [00:35<25:43,  5.83it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-5-100-99-True/10.mp4


 13%|█▎        | 1255/10000 [00:44<08:38, 16.86it/s]

step: 31569, episode: 1250, training reward mean: 26.912, test reward mean: 204.92, random move probability: 0.8750000000000138


 15%|█▌        | 1504/10000 [00:54<09:31, 14.86it/s]

step: 38823, episode: 1500, training reward mean: 29.016, test reward mean: 244.96, random move probability: 0.8500000000000165


 18%|█▊        | 1753/10000 [01:05<11:25, 12.04it/s]

step: 46645, episode: 1750, training reward mean: 31.288, test reward mean: 239.16, random move probability: 0.8250000000000193


 20%|██        | 2002/10000 [01:17<11:44, 11.36it/s]

step: 54991, episode: 2000, training reward mean: 33.384, test reward mean: 244.44, random move probability: 0.800000000000022


 23%|██▎       | 2252/10000 [01:30<14:28,  8.92it/s]

step: 64733, episode: 2250, training reward mean: 38.968, test reward mean: 230.36, random move probability: 0.7750000000000248


 25%|██▌       | 2504/10000 [01:45<13:19,  9.38it/s]

step: 75498, episode: 2500, training reward mean: 43.06, test reward mean: 280.8, random move probability: 0.7500000000000275


 28%|██▊       | 2754/10000 [02:00<11:39, 10.36it/s]

step: 86379, episode: 2750, training reward mean: 43.524, test reward mean: 298.44, random move probability: 0.7250000000000303


 30%|███       | 3003/10000 [02:16<13:17,  8.78it/s]

step: 97461, episode: 3000, training reward mean: 44.328, test reward mean: 293.4, random move probability: 0.700000000000033


 33%|███▎      | 3252/10000 [02:34<11:59,  9.38it/s]

step: 109839, episode: 3250, training reward mean: 49.512, test reward mean: 265.08, random move probability: 0.6750000000000358


 35%|███▌      | 3500/10000 [02:55<16:47,  6.45it/s]

step: 124464, episode: 3500, training reward mean: 58.5, test reward mean: 316.96, random move probability: 0.6500000000000385


 38%|███▊      | 3751/10000 [03:18<18:31,  5.62it/s]

step: 140726, episode: 3750, training reward mean: 65.048, test reward mean: 331.04, random move probability: 0.6250000000000413


 40%|████      | 4001/10000 [03:43<20:29,  4.88it/s]

step: 158808, episode: 4000, training reward mean: 72.328, test reward mean: 288.72, random move probability: 0.600000000000044


 42%|████▎     | 4250/10000 [04:11<11:41,  8.20it/s]

step: 178554, episode: 4250, training reward mean: 78.984, test reward mean: 184.64, random move probability: 0.5750000000000468


 45%|████▌     | 4504/10000 [04:31<07:16, 12.58it/s]

step: 192768, episode: 4500, training reward mean: 56.856, test reward mean: 148.04, random move probability: 0.5500000000000496


 48%|████▊     | 4754/10000 [04:49<06:15, 13.98it/s]

step: 205782, episode: 4750, training reward mean: 52.056, test reward mean: 201.12, random move probability: 0.5250000000000523


 50%|████▉     | 4998/10000 [05:06<05:30, 15.15it/s]

step: 217731, episode: 5000, training reward mean: 47.796, test reward mean: 200.36, random move probability: 0.5000000000000551


 50%|████▉     | 4998/10000 [05:07<05:30, 15.15it/s]

step 195 : 0 , [ 0.09660391  3.2189722  -0.2216095  -1.8859117 ] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-5-100-99-True/50.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-5-100-99-True/50.mp4



 50%|█████     | 5003/10000 [05:08<13:04,  6.37it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-5-100-99-True/50.mp4


 53%|█████▎    | 5251/10000 [05:30<13:55,  5.69it/s]

step: 233076, episode: 5250, training reward mean: 61.38, test reward mean: 250.04, random move probability: 0.4750000000000578


 55%|█████▌    | 5502/10000 [06:13<24:59,  3.00it/s]

step: 263032, episode: 5500, training reward mean: 119.824, test reward mean: 500.0, random move probability: 0.4500000000000606


 57%|█████▊    | 5750/10000 [06:49<17:47,  3.98it/s]

step: 288420, episode: 5750, training reward mean: 101.552, test reward mean: 500.0, random move probability: 0.4250000000000633


 60%|██████    | 6002/10000 [07:14<13:23,  4.98it/s]

step: 305485, episode: 6000, training reward mean: 68.26, test reward mean: 403.48, random move probability: 0.4000000000000661


 63%|██████▎   | 6255/10000 [07:36<05:14, 11.92it/s]

step: 321018, episode: 6250, training reward mean: 62.132, test reward mean: 236.68, random move probability: 0.37500000000006883


 65%|██████▌   | 6501/10000 [08:11<12:55,  4.51it/s]

step: 345751, episode: 6500, training reward mean: 98.932, test reward mean: 163.56, random move probability: 0.3500000000000716


 68%|██████▊   | 6750/10000 [08:46<11:39,  4.64it/s]

step: 369639, episode: 6750, training reward mean: 95.552, test reward mean: 422.0, random move probability: 0.32500000000007434


 70%|███████   | 7001/10000 [09:10<06:24,  7.80it/s]

step: 385962, episode: 7000, training reward mean: 65.292, test reward mean: 280.44, random move probability: 0.3000000000000771


 73%|███████▎  | 7252/10000 [09:34<08:21,  5.49it/s]

step: 402375, episode: 7250, training reward mean: 65.652, test reward mean: 500.0, random move probability: 0.27500000000007985


 75%|███████▌  | 7501/10000 [10:03<06:21,  6.56it/s]

step: 422272, episode: 7500, training reward mean: 79.588, test reward mean: 231.72, random move probability: 0.2500000000000826


 78%|███████▊  | 7750/10000 [10:42<16:01,  2.34it/s]

step: 449284, episode: 7750, training reward mean: 108.048, test reward mean: 433.68, random move probability: 0.22500000000008535


 80%|████████  | 8000/10000 [11:17<09:06,  3.66it/s]

step: 473549, episode: 8000, training reward mean: 97.06, test reward mean: 269.84, random move probability: 0.2000000000000881


 82%|████████▏ | 8249/10000 [12:04<04:02,  7.23it/s]

step: 506631, episode: 8250, training reward mean: 132.328, test reward mean: 454.76, random move probability: 0.17500000000009086


 85%|████████▌ | 8501/10000 [13:00<09:34,  2.61it/s]

step: 543723, episode: 8500, training reward mean: 148.368, test reward mean: 500.0, random move probability: 0.15000000000009361


 88%|████████▊ | 8750/10000 [14:04<11:26,  1.82it/s]

step: 587899, episode: 8750, training reward mean: 176.704, test reward mean: 500.0, random move probability: 0.12500000000009637


 90%|█████████ | 9000/10000 [15:21<11:29,  1.45it/s]

step: 641375, episode: 9000, training reward mean: 213.904, test reward mean: 500.0, random move probability: 0.10000000000009565


 93%|█████████▎| 9251/10000 [16:44<05:37,  2.22it/s]

step: 697940, episode: 9250, training reward mean: 226.26, test reward mean: 500.0, random move probability: 0.07500000000009494


 95%|█████████▌| 9500/10000 [18:26<05:30,  1.51it/s]

step: 767940, episode: 9500, training reward mean: 280.0, test reward mean: 500.0, random move probability: 0.05000000000009422


 98%|█████████▊| 9751/10000 [20:16<01:55,  2.16it/s]

step: 843415, episode: 9750, training reward mean: 301.9, test reward mean: 500.0, random move probability: 0.05


100%|█████████▉| 9999/10000 [21:58<00:00,  2.25it/s]

step: 915363, episode: 10000, training reward mean: 287.792, test reward mean: 500.0, random move probability: 0.05


100%|█████████▉| 9999/10000 [22:02<00:00,  2.25it/s]

step 500 : 1 , [ 0.37114012 -1.4545726   0.12555978  0.86795723] , 1.0 , False , True , {}
Moviepy - Building video results/CartPole-v1-10000-100-5-100-99-True/100.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-5-100-99-True/100.mp4



100%|██████████| 10000/10000 [22:02<00:00,  7.56it/s]


Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-5-100-99-True/100.mp4
Deep Q-learning agent started with PyTorch


  3%|▎         | 252/10000 [00:05<08:36, 18.88it/s]

step: 5521, episode: 250, training reward mean: 22.084, test reward mean: 235.76, random move probability: 0.9750000000000028


  5%|▌         | 504/10000 [00:13<07:31, 21.04it/s]

step: 11576, episode: 500, training reward mean: 24.22, test reward mean: 234.28, random move probability: 0.9500000000000055


  8%|▊         | 753/10000 [00:21<09:39, 15.97it/s]

step: 18268, episode: 750, training reward mean: 26.768, test reward mean: 301.48, random move probability: 0.9250000000000083


 10%|█         | 1000/10000 [00:28<10:16, 14.61it/s]

step: 24847, episode: 1000, training reward mean: 26.316, test reward mean: 253.48, random move probability: 0.900000000000011


 10%|█         | 1000/10000 [00:29<10:16, 14.61it/s]

step 162 : 1 , [1.6067016  1.8685538  0.21382046 0.00313929] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-5-100-99-False/10.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-5-100-99-False/10.mp4



 10%|█         | 1003/10000 [00:29<20:10,  7.43it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-5-100-99-False/10.mp4


 13%|█▎        | 1253/10000 [00:38<10:10, 14.33it/s]

step: 32102, episode: 1250, training reward mean: 29.02, test reward mean: 285.28, random move probability: 0.8750000000000138


 15%|█▌        | 1504/10000 [00:47<09:15, 15.30it/s]

step: 39683, episode: 1500, training reward mean: 30.324, test reward mean: 292.0, random move probability: 0.8500000000000165


 18%|█▊        | 1755/10000 [00:57<09:25, 14.58it/s]

step: 48676, episode: 1750, training reward mean: 35.972, test reward mean: 327.28, random move probability: 0.8250000000000193


 20%|██        | 2003/10000 [01:08<11:18, 11.78it/s]

step: 58052, episode: 2000, training reward mean: 37.504, test reward mean: 277.2, random move probability: 0.800000000000022


 23%|██▎       | 2254/10000 [01:20<10:30, 12.28it/s]

step: 67863, episode: 2250, training reward mean: 39.244, test reward mean: 304.64, random move probability: 0.7750000000000248


 25%|██▌       | 2504/10000 [01:32<10:07, 12.34it/s]

step: 77901, episode: 2500, training reward mean: 40.152, test reward mean: 326.08, random move probability: 0.7500000000000275


 28%|██▊       | 2754/10000 [01:46<10:40, 11.31it/s]

step: 89443, episode: 2750, training reward mean: 46.168, test reward mean: 362.36, random move probability: 0.7250000000000303


 30%|███       | 3001/10000 [02:00<13:05,  8.91it/s]

step: 101515, episode: 3000, training reward mean: 48.288, test reward mean: 296.56, random move probability: 0.700000000000033


 33%|███▎      | 3253/10000 [02:17<12:37,  8.91it/s]

step: 115421, episode: 3250, training reward mean: 55.624, test reward mean: 279.36, random move probability: 0.6750000000000358


 35%|███▌      | 3502/10000 [02:36<17:12,  6.29it/s]

step: 131494, episode: 3500, training reward mean: 64.292, test reward mean: 387.0, random move probability: 0.6500000000000385


 38%|███▊      | 3751/10000 [02:56<20:44,  5.02it/s]

step: 148201, episode: 3750, training reward mean: 66.828, test reward mean: 384.8, random move probability: 0.6250000000000413


 40%|████      | 4001/10000 [03:19<14:43,  6.79it/s]

step: 167319, episode: 4000, training reward mean: 76.472, test reward mean: 319.8, random move probability: 0.600000000000044


 43%|████▎     | 4252/10000 [03:44<20:53,  4.58it/s]

step: 188188, episode: 4250, training reward mean: 83.476, test reward mean: 318.04, random move probability: 0.5750000000000468


 45%|████▍     | 4499/10000 [04:08<08:29, 10.81it/s]

step: 209567, episode: 4500, training reward mean: 85.516, test reward mean: 348.8, random move probability: 0.5500000000000496


 48%|████▊     | 4750/10000 [04:40<17:15,  5.07it/s]

step: 235395, episode: 4750, training reward mean: 103.312, test reward mean: 258.52, random move probability: 0.5250000000000523


 50%|█████     | 5000/10000 [05:02<10:46,  7.74it/s]

step: 253501, episode: 5000, training reward mean: 72.424, test reward mean: 494.92, random move probability: 0.5000000000000551


 50%|█████     | 5000/10000 [05:03<10:46,  7.74it/s]

step 500 : 0 , [ 1.3296939  -0.38069075 -0.01858739  0.22478294] , 1.0 , False , True , {}
Moviepy - Building video results/CartPole-v1-10000-100-5-100-99-False/50.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-5-100-99-False/50.mp4



 50%|█████     | 5002/10000 [05:04<30:02,  2.77it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-5-100-99-False/50.mp4


 53%|█████▎    | 5252/10000 [05:17<04:46, 16.59it/s]

step: 263925, episode: 5250, training reward mean: 41.696, test reward mean: 148.44, random move probability: 0.4750000000000578


 55%|█████▌    | 5502/10000 [05:31<06:55, 10.82it/s]

step: 276235, episode: 5500, training reward mean: 49.24, test reward mean: 152.68, random move probability: 0.4500000000000606


 58%|█████▊    | 5751/10000 [05:56<12:38,  5.60it/s]

step: 297451, episode: 5750, training reward mean: 84.864, test reward mean: 500.0, random move probability: 0.4250000000000633


 60%|██████    | 6000/10000 [06:29<07:28,  8.91it/s]

step: 324996, episode: 6000, training reward mean: 110.18, test reward mean: 134.4, random move probability: 0.4000000000000661


 63%|██████▎   | 6253/10000 [06:52<05:53, 10.60it/s]

step: 343674, episode: 6250, training reward mean: 74.712, test reward mean: 191.48, random move probability: 0.37500000000006883


 65%|██████▌   | 6502/10000 [07:13<06:54,  8.44it/s]

step: 361537, episode: 6500, training reward mean: 71.452, test reward mean: 266.4, random move probability: 0.3500000000000716


 68%|██████▊   | 6754/10000 [07:39<07:17,  7.42it/s]

step: 382224, episode: 6750, training reward mean: 82.748, test reward mean: 343.48, random move probability: 0.32500000000007434


 70%|███████   | 7002/10000 [08:04<07:41,  6.49it/s]

step: 401797, episode: 7000, training reward mean: 78.292, test reward mean: 322.36, random move probability: 0.3000000000000771


 73%|███████▎  | 7253/10000 [08:31<05:08,  8.91it/s]

step: 424167, episode: 7250, training reward mean: 89.48, test reward mean: 142.76, random move probability: 0.27500000000007985


 75%|███████▌  | 7500/10000 [09:50<17:24,  2.39it/s]

step: 488455, episode: 7500, training reward mean: 257.152, test reward mean: 321.12, random move probability: 0.2500000000000826


 78%|███████▊  | 7750/10000 [10:58<30:00,  1.25it/s]

step: 543529, episode: 7750, training reward mean: 220.296, test reward mean: 485.08, random move probability: 0.22500000000008535


 80%|████████  | 8001/10000 [12:32<16:16,  2.05it/s]

step: 620592, episode: 8000, training reward mean: 308.252, test reward mean: 500.0, random move probability: 0.2000000000000881


 82%|████████▎ | 8250/10000 [13:57<13:49,  2.11it/s]

step: 688952, episode: 8250, training reward mean: 273.44, test reward mean: 363.96, random move probability: 0.17500000000009086


 85%|████████▌ | 8500/10000 [15:18<14:30,  1.72it/s]

step: 754214, episode: 8500, training reward mean: 261.048, test reward mean: 500.0, random move probability: 0.15000000000009361


 88%|████████▊ | 8750/10000 [16:51<13:19,  1.56it/s]

step: 827563, episode: 8750, training reward mean: 293.396, test reward mean: 350.76, random move probability: 0.12500000000009637


 90%|█████████ | 9000/10000 [18:03<07:10,  2.32it/s]

step: 885229, episode: 9000, training reward mean: 230.664, test reward mean: 449.2, random move probability: 0.10000000000009565


 92%|█████████▎| 9250/10000 [19:26<09:55,  1.26it/s]

step: 951514, episode: 9250, training reward mean: 265.14, test reward mean: 480.6, random move probability: 0.07500000000009494


 95%|█████████▌| 9500/10000 [21:08<07:28,  1.12it/s]

step: 1032226, episode: 9500, training reward mean: 322.848, test reward mean: 500.0, random move probability: 0.05000000000009422


 98%|█████████▊| 9751/10000 [21:49<00:42,  5.88it/s]

step: 1064822, episode: 9750, training reward mean: 130.384, test reward mean: 99.76, random move probability: 0.05


100%|█████████▉| 9999/10000 [22:26<00:00,  7.64it/s]

step: 1095803, episode: 10000, training reward mean: 123.924, test reward mean: 114.32, random move probability: 0.05


100%|█████████▉| 9999/10000 [22:27<00:00,  7.64it/s]

step 113 : 1 , [-2.4059448  -1.9885154  -0.05605858 -0.15594687] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-5-100-99-False/100.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-5-100-99-False/100.mp4



100%|██████████| 10000/10000 [22:27<00:00,  7.42it/s]


Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-5-100-99-False/100.mp4
Double Deep Q-learning agent started with PyTorch


  2%|▎         | 250/10000 [01:08<1:09:22,  2.34it/s]

step: 50000, episode: 250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9750000000000028


  5%|▌         | 500/10000 [02:19<1:10:00,  2.26it/s]

step: 100000, episode: 500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9500000000000055


  8%|▊         | 750/10000 [03:30<1:07:15,  2.29it/s]

step: 150000, episode: 750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9250000000000083


 10%|█         | 1000/10000 [04:42<1:04:53,  2.31it/s]

step: 200000, episode: 1000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.900000000000011


 10%|█         | 1000/10000 [04:43<1:04:53,  2.31it/s]

step 200 : 0 , [-0.70899665  0.00418106] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-5-100-99-True/10.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-5-100-99-True/10.mp4



 10%|█         | 1001/10000 [04:43<1:53:11,  1.33it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-5-100-99-True/10.mp4


 12%|█▎        | 1250/10000 [05:54<1:04:19,  2.27it/s]

step: 250000, episode: 1250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8750000000000138


 15%|█▌        | 1500/10000 [07:06<1:01:16,  2.31it/s]

step: 300000, episode: 1500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8500000000000165


 18%|█▊        | 1750/10000 [08:17<55:50,  2.46it/s]  

step: 350000, episode: 1750, training reward mean: -200.0, test reward mean: -162.12, random move probability: 0.8250000000000193


 20%|██        | 2000/10000 [09:29<57:24,  2.32it/s]

step: 400000, episode: 2000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.800000000000022


 22%|██▎       | 2250/10000 [10:41<48:38,  2.66it/s]

step: 450000, episode: 2250, training reward mean: -200.0, test reward mean: -119.0, random move probability: 0.7750000000000248


 25%|██▌       | 2500/10000 [11:54<49:48,  2.51it/s]

step: 500000, episode: 2500, training reward mean: -200.0, test reward mean: -130.72, random move probability: 0.7500000000000275


 28%|██▊       | 2750/10000 [13:08<48:35,  2.49it/s]

step: 550000, episode: 2750, training reward mean: -200.0, test reward mean: -139.04, random move probability: 0.7250000000000303


 30%|███       | 3000/10000 [14:20<47:33,  2.45it/s]

step: 600000, episode: 3000, training reward mean: -200.0, test reward mean: -160.08, random move probability: 0.700000000000033


 32%|███▎      | 3250/10000 [15:34<46:32,  2.42it/s]

step: 649983, episode: 3250, training reward mean: -199.932, test reward mean: -148.32, random move probability: 0.6750000000000358


 35%|███▌      | 3500/10000 [16:46<44:44,  2.42it/s]

step: 699983, episode: 3500, training reward mean: -200.0, test reward mean: -154.44, random move probability: 0.6500000000000385


 38%|███▊      | 3750/10000 [18:01<39:46,  2.62it/s]

step: 749958, episode: 3750, training reward mean: -199.9, test reward mean: -119.04, random move probability: 0.6250000000000413


 40%|████      | 4000/10000 [19:15<43:58,  2.27it/s]

step: 799769, episode: 4000, training reward mean: -199.244, test reward mean: -200.0, random move probability: 0.600000000000044


 42%|████▎     | 4250/10000 [20:27<36:40,  2.61it/s]

step: 849393, episode: 4250, training reward mean: -198.496, test reward mean: -128.12, random move probability: 0.5750000000000468


 45%|████▌     | 4500/10000 [21:41<36:15,  2.53it/s]

step: 898828, episode: 4500, training reward mean: -197.74, test reward mean: -131.96, random move probability: 0.5500000000000496


 48%|████▊     | 4750/10000 [22:53<31:52,  2.75it/s]

step: 947545, episode: 4750, training reward mean: -194.868, test reward mean: -113.48, random move probability: 0.5250000000000523


 50%|█████     | 5000/10000 [24:03<30:22,  2.74it/s]

step: 995200, episode: 5000, training reward mean: -190.62, test reward mean: -125.64, random move probability: 0.5000000000000551


 50%|█████     | 5000/10000 [24:04<30:22,  2.74it/s]

step 107 : 2 , [0.5003698  0.01174917] , -1.0 , True , False , {}
Moviepy - Building video results/MountainCar-v0-10000-100-5-100-99-True/50.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-5-100-99-True/50.mp4



 50%|█████     | 5000/10000 [24:04<30:22,  2.74it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-5-100-99-True/50.mp4


 52%|█████▎    | 5250/10000 [25:15<29:17,  2.70it/s]

step: 1042440, episode: 5250, training reward mean: -188.96, test reward mean: -114.76, random move probability: 0.4750000000000578


 55%|█████▌    | 5500/10000 [26:23<27:57,  2.68it/s]

step: 1088391, episode: 5500, training reward mean: -183.804, test reward mean: -109.92, random move probability: 0.4500000000000606


 57%|█████▊    | 5750/10000 [27:30<26:32,  2.67it/s]

step: 1133703, episode: 5750, training reward mean: -181.248, test reward mean: -124.84, random move probability: 0.4250000000000633


 60%|██████    | 6000/10000 [28:35<22:22,  2.98it/s]

step: 1177617, episode: 6000, training reward mean: -175.656, test reward mean: -128.28, random move probability: 0.4000000000000661


 62%|██████▎   | 6250/10000 [29:40<23:09,  2.70it/s]

step: 1220062, episode: 6250, training reward mean: -169.78, test reward mean: -132.32, random move probability: 0.37500000000006883


 65%|██████▌   | 6500/10000 [30:43<20:07,  2.90it/s]

step: 1261966, episode: 6500, training reward mean: -167.616, test reward mean: -109.2, random move probability: 0.3500000000000716


 68%|██████▊   | 6750/10000 [31:47<18:05,  2.99it/s]

step: 1304763, episode: 6750, training reward mean: -171.188, test reward mean: -101.24, random move probability: 0.32500000000007434


 70%|███████   | 7000/10000 [32:53<17:29,  2.86it/s]

step: 1348341, episode: 7000, training reward mean: -174.312, test reward mean: -100.32, random move probability: 0.3000000000000771


 72%|███████▎  | 7250/10000 [33:57<14:52,  3.08it/s]

step: 1391168, episode: 7250, training reward mean: -171.308, test reward mean: -99.56, random move probability: 0.27500000000007985


 75%|███████▌  | 7500/10000 [34:59<13:36,  3.06it/s]

step: 1432441, episode: 7500, training reward mean: -165.092, test reward mean: -101.4, random move probability: 0.2500000000000826


 78%|███████▊  | 7751/10000 [36:00<10:45,  3.48it/s]

step: 1472789, episode: 7750, training reward mean: -161.392, test reward mean: -106.6, random move probability: 0.22500000000008535


 80%|████████  | 8001/10000 [37:00<09:46,  3.41it/s]

step: 1512063, episode: 8000, training reward mean: -157.096, test reward mean: -107.4, random move probability: 0.2000000000000881


 82%|████████▎ | 8250/10000 [37:57<08:52,  3.29it/s]

step: 1550575, episode: 8250, training reward mean: -154.048, test reward mean: -107.96, random move probability: 0.17500000000009086


 85%|████████▌ | 8501/10000 [38:54<06:24,  3.90it/s]

step: 1587915, episode: 8500, training reward mean: -149.36, test reward mean: -106.36, random move probability: 0.15000000000009361


 88%|████████▊ | 8751/10000 [39:47<04:56,  4.22it/s]

step: 1623027, episode: 8750, training reward mean: -140.448, test reward mean: -102.8, random move probability: 0.12500000000009637


 90%|█████████ | 9001/10000 [40:35<03:53,  4.28it/s]

step: 1654602, episode: 9000, training reward mean: -126.3, test reward mean: -108.0, random move probability: 0.10000000000009565


 92%|█████████▎| 9250/10000 [41:23<02:58,  4.20it/s]

step: 1686062, episode: 9250, training reward mean: -125.84, test reward mean: -102.8, random move probability: 0.07500000000009494


 95%|█████████▌| 9501/10000 [42:07<02:09,  3.85it/s]

step: 1714760, episode: 9500, training reward mean: -114.792, test reward mean: -100.64, random move probability: 0.05000000000009422


 98%|█████████▊| 9751/10000 [42:51<00:59,  4.18it/s]

step: 1743423, episode: 9750, training reward mean: -114.652, test reward mean: -105.52, random move probability: 0.05


100%|█████████▉| 9999/10000 [43:32<00:00,  6.26it/s]

step: 1770731, episode: 10000, training reward mean: -109.232, test reward mean: -103.08, random move probability: 0.05


100%|█████████▉| 9999/10000 [43:33<00:00,  6.26it/s]

step 112 : 2 , [0.52078015 0.02140611] , -1.0 , True , False , {}
Moviepy - Building video results/MountainCar-v0-10000-100-5-100-99-True/100.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-5-100-99-True/100.mp4



100%|██████████| 10000/10000 [43:33<00:00,  3.83it/s]


Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-5-100-99-True/100.mp4
Deep Q-learning agent started with PyTorch


  2%|▎         | 250/10000 [00:57<1:02:31,  2.60it/s]

step: 50000, episode: 250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9750000000000028


  5%|▌         | 500/10000 [01:54<1:01:05,  2.59it/s]

step: 100000, episode: 500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9500000000000055


  8%|▊         | 750/10000 [02:54<1:01:01,  2.53it/s]

step: 150000, episode: 750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9250000000000083


 10%|█         | 1000/10000 [03:54<57:40,  2.60it/s] 

step: 200000, episode: 1000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.900000000000011


 10%|█         | 1000/10000 [03:55<57:40,  2.60it/s]

step 200 : 0 , [-0.7181171   0.00471106] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-5-100-99-False/10.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-5-100-99-False/10.mp4



 10%|█         | 1001/10000 [03:56<1:44:19,  1.44it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-5-100-99-False/10.mp4


 12%|█▎        | 1250/10000 [04:56<57:36,  2.53it/s]  

step: 250000, episode: 1250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8750000000000138


 15%|█▌        | 1500/10000 [05:57<55:13,  2.57it/s]

step: 300000, episode: 1500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8500000000000165


 18%|█▊        | 1750/10000 [06:58<55:00,  2.50it/s]

step: 350000, episode: 1750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8250000000000193


 20%|██        | 2000/10000 [07:59<53:23,  2.50it/s]

step: 400000, episode: 2000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.800000000000022


 22%|██▎       | 2250/10000 [09:00<51:57,  2.49it/s]

step: 450000, episode: 2250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7750000000000248


 25%|██▌       | 2500/10000 [10:02<48:56,  2.55it/s]

step: 500000, episode: 2500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7500000000000275


 28%|██▊       | 2750/10000 [11:02<48:10,  2.51it/s]

step: 550000, episode: 2750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7250000000000303


 30%|███       | 3000/10000 [12:03<46:47,  2.49it/s]

step: 600000, episode: 3000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.700000000000033


 32%|███▎      | 3250/10000 [13:05<45:10,  2.49it/s]

step: 650000, episode: 3250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6750000000000358


 35%|███▌      | 3500/10000 [14:07<43:40,  2.48it/s]

step: 700000, episode: 3500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6500000000000385


 38%|███▊      | 3750/10000 [15:08<41:18,  2.52it/s]

step: 750000, episode: 3750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6250000000000413


 40%|████      | 4000/10000 [16:10<40:05,  2.49it/s]

step: 800000, episode: 4000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.600000000000044


 42%|████▎     | 4250/10000 [17:11<31:58,  3.00it/s]

step: 849920, episode: 4250, training reward mean: -199.68, test reward mean: -131.48, random move probability: 0.5750000000000468


 45%|████▌     | 4500/10000 [18:11<32:39,  2.81it/s]

step: 899102, episode: 4500, training reward mean: -196.728, test reward mean: -131.16, random move probability: 0.5500000000000496


 48%|████▊     | 4750/10000 [19:12<31:36,  2.77it/s]

step: 948483, episode: 4750, training reward mean: -197.524, test reward mean: -125.84, random move probability: 0.5250000000000523


 50%|█████     | 5000/10000 [20:12<26:24,  3.16it/s]

step: 996687, episode: 5000, training reward mean: -192.816, test reward mean: -115.56, random move probability: 0.5000000000000551


 50%|█████     | 5000/10000 [20:13<26:24,  3.16it/s]

step 89 : 2 , [0.5084752  0.01813406] , -1.0 , True , False , {}
Moviepy - Building video results/MountainCar-v0-10000-100-5-100-99-False/50.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-5-100-99-False/50.mp4



 50%|█████     | 5001/10000 [20:13<37:41,  2.21it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-5-100-99-False/50.mp4


 52%|█████▎    | 5250/10000 [21:11<26:13,  3.02it/s]

step: 1043633, episode: 5250, training reward mean: -187.784, test reward mean: -117.4, random move probability: 0.4750000000000578


 55%|█████▌    | 5500/10000 [22:08<22:27,  3.34it/s]

step: 1089810, episode: 5500, training reward mean: -184.708, test reward mean: -104.2, random move probability: 0.4500000000000606


 58%|█████▊    | 5751/10000 [23:04<18:41,  3.79it/s]

step: 1134746, episode: 5750, training reward mean: -179.744, test reward mean: -97.92, random move probability: 0.4250000000000633


 60%|██████    | 6000/10000 [23:59<20:09,  3.31it/s]

step: 1179003, episode: 6000, training reward mean: -177.028, test reward mean: -110.84, random move probability: 0.4000000000000661


 63%|██████▎   | 6251/10000 [24:54<16:49,  3.71it/s]

step: 1222820, episode: 6250, training reward mean: -175.268, test reward mean: -104.2, random move probability: 0.37500000000006883


 65%|██████▌   | 6500/10000 [25:48<17:05,  3.41it/s]

step: 1265439, episode: 6500, training reward mean: -170.476, test reward mean: -111.04, random move probability: 0.3500000000000716


 68%|██████▊   | 6750/10000 [26:41<16:19,  3.32it/s]

step: 1307764, episode: 6750, training reward mean: -169.3, test reward mean: -100.4, random move probability: 0.32500000000007434


 70%|███████   | 7000/10000 [27:34<13:49,  3.62it/s]

step: 1349779, episode: 7000, training reward mean: -168.06, test reward mean: -106.04, random move probability: 0.3000000000000771


 72%|███████▎  | 7250/10000 [28:27<14:53,  3.08it/s]

step: 1391426, episode: 7250, training reward mean: -166.588, test reward mean: -144.88, random move probability: 0.27500000000007985


 75%|███████▌  | 7500/10000 [29:18<11:41,  3.56it/s]

step: 1432464, episode: 7500, training reward mean: -164.152, test reward mean: -99.2, random move probability: 0.2500000000000826


 78%|███████▊  | 7750/10000 [30:11<10:27,  3.59it/s]

step: 1473963, episode: 7750, training reward mean: -165.996, test reward mean: -106.6, random move probability: 0.22500000000008535


 80%|████████  | 8001/10000 [31:01<07:54,  4.21it/s]

step: 1513183, episode: 8000, training reward mean: -156.88, test reward mean: -105.48, random move probability: 0.2000000000000881


 82%|████████▎ | 8250/10000 [31:50<08:36,  3.39it/s]

step: 1551938, episode: 8250, training reward mean: -155.02, test reward mean: -101.72, random move probability: 0.17500000000009086


 85%|████████▌ | 8501/10000 [32:36<05:29,  4.55it/s]

step: 1587411, episode: 8500, training reward mean: -141.892, test reward mean: -101.16, random move probability: 0.15000000000009361


 88%|████████▊ | 8751/10000 [33:19<05:02,  4.13it/s]

step: 1620767, episode: 8750, training reward mean: -133.424, test reward mean: -105.72, random move probability: 0.12500000000009637


 90%|█████████ | 9001/10000 [34:00<03:51,  4.32it/s]

step: 1653320, episode: 9000, training reward mean: -130.212, test reward mean: -121.72, random move probability: 0.10000000000009565


 93%|█████████▎| 9251/10000 [34:41<02:26,  5.12it/s]

step: 1684803, episode: 9250, training reward mean: -125.932, test reward mean: -120.92, random move probability: 0.07500000000009494


 95%|█████████▌| 9501/10000 [35:20<01:51,  4.46it/s]

step: 1715335, episode: 9500, training reward mean: -122.128, test reward mean: -139.12, random move probability: 0.05000000000009422


 98%|█████████▊| 9751/10000 [35:59<00:51,  4.80it/s]

step: 1744535, episode: 9750, training reward mean: -116.8, test reward mean: -119.24, random move probability: 0.05


100%|█████████▉| 9999/10000 [36:36<00:00,  7.05it/s]

step: 1773215, episode: 10000, training reward mean: -114.72, test reward mean: -115.56, random move probability: 0.05


100%|█████████▉| 9999/10000 [36:36<00:00,  7.05it/s]

step 106 : 2 , [0.50870836 0.02354641] , -1.0 , True , False , {}
Moviepy - Building video results/MountainCar-v0-10000-100-5-100-99-False/100.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-5-100-99-False/100.mp4



100%|██████████| 10000/10000 [36:37<00:00,  4.55it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-5-100-99-False/100.mp4





Double Deep Q-learning agent started with PyTorch


  3%|▎         | 255/10000 [00:06<04:19, 37.54it/s]

step: 5281, episode: 250, training reward mean: 21.124, test reward mean: 9.6, random move probability: 0.9750000000000028


  5%|▌         | 503/10000 [00:13<04:39, 34.01it/s]

step: 10813, episode: 500, training reward mean: 22.128, test reward mean: 9.12, random move probability: 0.9500000000000055


  8%|▊         | 754/10000 [00:21<04:11, 36.74it/s]

step: 16258, episode: 750, training reward mean: 21.78, test reward mean: 9.2, random move probability: 0.9250000000000083


 10%|▉         | 999/10000 [00:28<04:34, 32.74it/s]

step: 21851, episode: 1000, training reward mean: 22.372, test reward mean: 9.52, random move probability: 0.900000000000011
step 10 : 1 , [ 0.17718823  1.991976   -0.25194725 -3.0541263 ] , 1.0 , True , False , {}


 10%|▉         | 999/10000 [00:28<04:34, 32.74it/s]

Moviepy - Building video results/CartPole-v1-10000-100-1-1-99-True/10.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-1-1-99-True/10.mp4



 10%|█         | 1003/10000 [00:29<07:34, 19.81it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-1-1-99-True/10.mp4


 13%|█▎        | 1253/10000 [00:36<04:32, 32.05it/s]

step: 27381, episode: 1250, training reward mean: 22.12, test reward mean: 9.28, random move probability: 0.8750000000000138


 15%|█▌        | 1506/10000 [00:44<04:10, 33.86it/s]

step: 32777, episode: 1500, training reward mean: 21.584, test reward mean: 9.28, random move probability: 0.8500000000000165


 18%|█▊        | 1757/10000 [00:50<03:39, 37.57it/s]

step: 37840, episode: 1750, training reward mean: 20.252, test reward mean: 9.24, random move probability: 0.8250000000000193


 20%|██        | 2007/10000 [00:57<03:44, 35.57it/s]

step: 42708, episode: 2000, training reward mean: 19.472, test reward mean: 9.44, random move probability: 0.800000000000022


 23%|██▎       | 2258/10000 [01:04<03:10, 40.60it/s]

step: 47515, episode: 2250, training reward mean: 19.228, test reward mean: 9.44, random move probability: 0.7750000000000248


 25%|██▌       | 2504/10000 [01:10<03:50, 32.46it/s]

step: 52199, episode: 2500, training reward mean: 18.736, test reward mean: 9.2, random move probability: 0.7500000000000275


 28%|██▊       | 2758/10000 [01:17<02:58, 40.65it/s]

step: 56978, episode: 2750, training reward mean: 19.116, test reward mean: 9.36, random move probability: 0.7250000000000303


 30%|███       | 3004/10000 [01:22<03:12, 36.31it/s]

step: 61244, episode: 3000, training reward mean: 17.064, test reward mean: 9.4, random move probability: 0.700000000000033


 33%|███▎      | 3259/10000 [01:29<02:34, 43.66it/s]

step: 65721, episode: 3250, training reward mean: 17.908, test reward mean: 9.44, random move probability: 0.6750000000000358


 35%|███▌      | 3509/10000 [01:34<02:21, 45.98it/s]

step: 69851, episode: 3500, training reward mean: 16.52, test reward mean: 9.28, random move probability: 0.6500000000000385


 38%|███▊      | 3755/10000 [01:40<02:43, 38.21it/s]

step: 73819, episode: 3750, training reward mean: 15.872, test reward mean: 9.4, random move probability: 0.6250000000000413


 40%|████      | 4004/10000 [01:45<02:20, 42.59it/s]

step: 77888, episode: 4000, training reward mean: 16.276, test reward mean: 9.4, random move probability: 0.600000000000044


 43%|████▎     | 4254/10000 [01:51<02:28, 38.78it/s]

step: 81763, episode: 4250, training reward mean: 15.5, test reward mean: 9.52, random move probability: 0.5750000000000468


 45%|████▌     | 4508/10000 [01:56<01:55, 47.43it/s]

step: 85508, episode: 4500, training reward mean: 14.98, test reward mean: 9.28, random move probability: 0.5500000000000496


 48%|████▊     | 4758/10000 [02:01<01:34, 55.22it/s]

step: 89104, episode: 4750, training reward mean: 14.384, test reward mean: 9.04, random move probability: 0.5250000000000523


 50%|████▉     | 4996/10000 [02:06<01:39, 50.20it/s]

step: 92623, episode: 5000, training reward mean: 14.076, test reward mean: 9.28, random move probability: 0.5000000000000551
step 10 : 1 , [ 0.12389745  1.9088172  -0.23065734 -3.0414512 ] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-1-1-99-True/50.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-1-1-99-True/50.mp4



 50%|█████     | 5002/10000 [02:06<02:03, 40.54it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-1-1-99-True/50.mp4


 53%|█████▎    | 5255/10000 [02:11<01:40, 47.26it/s]

step: 96154, episode: 5250, training reward mean: 14.124, test reward mean: 9.68, random move probability: 0.4750000000000578


 55%|█████▌    | 5506/10000 [02:16<01:25, 52.70it/s]

step: 99458, episode: 5500, training reward mean: 13.216, test reward mean: 9.24, random move probability: 0.4500000000000606


 58%|█████▊    | 5759/10000 [02:21<01:14, 57.11it/s]

step: 102774, episode: 5750, training reward mean: 13.264, test reward mean: 9.36, random move probability: 0.4250000000000633


 60%|██████    | 6009/10000 [02:25<01:19, 50.27it/s]

step: 105918, episode: 6000, training reward mean: 12.576, test reward mean: 9.44, random move probability: 0.4000000000000661


 63%|██████▎   | 6262/10000 [02:30<01:00, 61.87it/s]

step: 108904, episode: 6250, training reward mean: 11.944, test reward mean: 9.16, random move probability: 0.37500000000006883


 65%|██████▌   | 6511/10000 [02:34<00:59, 58.97it/s]

step: 111706, episode: 6500, training reward mean: 11.208, test reward mean: 9.28, random move probability: 0.3500000000000716


 68%|██████▊   | 6762/10000 [02:38<00:52, 61.54it/s]

step: 114442, episode: 6750, training reward mean: 10.944, test reward mean: 9.16, random move probability: 0.32500000000007434


 70%|███████   | 7007/10000 [02:42<00:51, 58.68it/s]

step: 117292, episode: 7000, training reward mean: 11.4, test reward mean: 9.44, random move probability: 0.3000000000000771


 73%|███████▎  | 7261/10000 [02:46<00:48, 55.98it/s]

step: 120143, episode: 7250, training reward mean: 11.404, test reward mean: 9.4, random move probability: 0.27500000000007985


 75%|███████▌  | 7510/10000 [02:50<00:40, 62.12it/s]

step: 122912, episode: 7500, training reward mean: 11.076, test reward mean: 9.48, random move probability: 0.2500000000000826


 78%|███████▊  | 7759/10000 [02:54<00:37, 60.21it/s]

step: 125621, episode: 7750, training reward mean: 10.836, test reward mean: 9.12, random move probability: 0.22500000000008535


 80%|████████  | 8013/10000 [02:58<00:31, 62.81it/s]

step: 128270, episode: 8000, training reward mean: 10.596, test reward mean: 9.32, random move probability: 0.2000000000000881


 83%|████████▎ | 8257/10000 [03:02<00:27, 64.02it/s]

step: 130866, episode: 8250, training reward mean: 10.384, test reward mean: 9.28, random move probability: 0.17500000000009086


 85%|████████▌ | 8513/10000 [03:06<00:22, 66.10it/s]

step: 133446, episode: 8500, training reward mean: 10.32, test reward mean: 9.44, random move probability: 0.15000000000009361


 88%|████████▊ | 8759/10000 [03:09<00:18, 65.94it/s]

step: 135955, episode: 8750, training reward mean: 10.036, test reward mean: 9.2, random move probability: 0.12500000000009637


 90%|█████████ | 9013/10000 [03:13<00:15, 65.13it/s]

step: 138425, episode: 9000, training reward mean: 9.88, test reward mean: 9.24, random move probability: 0.10000000000009565


 93%|█████████▎| 9256/10000 [03:17<00:11, 62.53it/s]

step: 140888, episode: 9250, training reward mean: 9.852, test reward mean: 11.44, random move probability: 0.07500000000009494


 95%|█████████▌| 9510/10000 [03:22<00:07, 65.18it/s]

step: 144397, episode: 9500, training reward mean: 14.036, test reward mean: 9.32, random move probability: 0.05000000000009422


 98%|█████████▊| 9761/10000 [03:26<00:03, 69.55it/s]

step: 146775, episode: 9750, training reward mean: 9.512, test reward mean: 9.28, random move probability: 0.02500000000009372


100%|█████████▉| 9993/10000 [03:29<00:00, 73.24it/s]

step: 149157, episode: 10000, training reward mean: 9.528, test reward mean: 9.36, random move probability: 0.01
step 10 : 1 , [ 0.12789959  1.9195154  -0.25675145 -3.1111119 ] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-1-1-99-True/100.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-1-1-99-True/100.mp4



100%|██████████| 10000/10000 [03:29<00:00, 47.70it/s]


Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-1-1-99-True/100.mp4
Deep Q-learning agent started with PyTorch


  3%|▎         | 252/10000 [00:05<04:17, 37.84it/s]

step: 5044, episode: 250, training reward mean: 20.176, test reward mean: 10.96, random move probability: 0.9750000000000028


  5%|▌         | 505/10000 [00:11<04:14, 37.24it/s]

step: 10312, episode: 500, training reward mean: 21.072, test reward mean: 9.32, random move probability: 0.9500000000000055


  8%|▊         | 760/10000 [00:16<03:06, 49.51it/s]

step: 15446, episode: 750, training reward mean: 20.536, test reward mean: 9.32, random move probability: 0.9250000000000083


                                                   

step: 20612, episode: 1000, training reward mean: 20.664, test reward mean: 9.6, random move probability: 0.900000000000011
step 10 : 0 , [-0.11864131 -1.9045552   0.2103536   2.990226  ] , 1.0 , True , False , {}


 10%|▉         | 996/10000 [00:22<03:02, 49.24it/s]

Moviepy - Building video results/CartPole-v1-10000-100-1-1-99-False/10.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-1-1-99-False/10.mp4



 10%|█         | 1005/10000 [00:22<04:56, 30.30it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-1-1-99-False/10.mp4


 13%|█▎        | 1258/10000 [00:28<03:29, 41.82it/s]

step: 26132, episode: 1250, training reward mean: 22.08, test reward mean: 9.6, random move probability: 0.8750000000000138


 15%|█▌        | 1510/10000 [00:34<02:52, 49.31it/s]

step: 31505, episode: 1500, training reward mean: 21.492, test reward mean: 9.32, random move probability: 0.8500000000000165


 18%|█▊        | 1754/10000 [00:40<03:33, 38.65it/s]

step: 36692, episode: 1750, training reward mean: 20.748, test reward mean: 9.44, random move probability: 0.8250000000000193


 20%|██        | 2005/10000 [00:46<02:56, 45.27it/s]

step: 41719, episode: 2000, training reward mean: 20.108, test reward mean: 9.24, random move probability: 0.800000000000022


 23%|██▎       | 2256/10000 [00:51<02:57, 43.61it/s]

step: 46851, episode: 2250, training reward mean: 20.528, test reward mean: 9.44, random move probability: 0.7750000000000248


 25%|██▌       | 2507/10000 [00:56<02:35, 48.15it/s]

step: 51276, episode: 2500, training reward mean: 17.7, test reward mean: 9.64, random move probability: 0.7500000000000275


 28%|██▊       | 2756/10000 [01:02<02:44, 44.10it/s]

step: 55988, episode: 2750, training reward mean: 18.848, test reward mean: 9.36, random move probability: 0.7250000000000303


 30%|███       | 3008/10000 [01:07<02:30, 46.32it/s]

step: 60619, episode: 3000, training reward mean: 18.524, test reward mean: 9.32, random move probability: 0.700000000000033


 33%|███▎      | 3259/10000 [01:12<02:00, 55.81it/s]

step: 64620, episode: 3250, training reward mean: 16.004, test reward mean: 9.52, random move probability: 0.6750000000000358


 35%|███▌      | 3510/10000 [01:16<01:51, 58.01it/s]

step: 68724, episode: 3500, training reward mean: 16.416, test reward mean: 9.4, random move probability: 0.6500000000000385


 38%|███▊      | 3758/10000 [01:21<01:55, 53.88it/s]

step: 72853, episode: 3750, training reward mean: 16.516, test reward mean: 9.12, random move probability: 0.6250000000000413


 40%|████      | 4009/10000 [01:26<01:54, 52.34it/s]

step: 76941, episode: 4000, training reward mean: 16.352, test reward mean: 9.6, random move probability: 0.600000000000044


 43%|████▎     | 4255/10000 [01:30<01:38, 58.18it/s]

step: 80635, episode: 4250, training reward mean: 14.776, test reward mean: 9.16, random move probability: 0.5750000000000468


 45%|████▌     | 4509/10000 [01:34<01:40, 54.45it/s]

step: 84334, episode: 4500, training reward mean: 14.796, test reward mean: 9.32, random move probability: 0.5500000000000496


 48%|████▊     | 4759/10000 [01:38<01:31, 57.51it/s]

step: 87885, episode: 4750, training reward mean: 14.204, test reward mean: 9.32, random move probability: 0.5250000000000523


 50%|████▉     | 4995/10000 [01:43<01:20, 62.20it/s]

step: 91414, episode: 5000, training reward mean: 14.116, test reward mean: 9.44, random move probability: 0.5000000000000551
step 9 : 0 , [-0.18661657 -1.750619    0.24548279  2.832317  ] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-1-1-99-False/50.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-1-1-99-False/50.mp4



 50%|█████     | 5002/10000 [01:43<01:44, 47.75it/s]

Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-1-1-99-False/50.mp4


 53%|█████▎    | 5261/10000 [01:47<01:11, 65.94it/s]

step: 94778, episode: 5250, training reward mean: 13.456, test reward mean: 9.4, random move probability: 0.4750000000000578


 55%|█████▌    | 5511/10000 [01:51<01:15, 59.30it/s]

step: 98208, episode: 5500, training reward mean: 13.72, test reward mean: 9.32, random move probability: 0.4500000000000606


 58%|█████▊    | 5756/10000 [01:55<01:15, 55.86it/s]

step: 101458, episode: 5750, training reward mean: 13.0, test reward mean: 9.6, random move probability: 0.4250000000000633


 60%|██████    | 6012/10000 [01:58<01:01, 64.51it/s]

step: 104554, episode: 6000, training reward mean: 12.384, test reward mean: 9.4, random move probability: 0.4000000000000661


 63%|██████▎   | 6262/10000 [02:02<00:53, 69.65it/s]

step: 107560, episode: 6250, training reward mean: 12.024, test reward mean: 9.96, random move probability: 0.37500000000006883


 65%|██████▌   | 6508/10000 [02:06<00:49, 70.18it/s]

step: 110569, episode: 6500, training reward mean: 12.036, test reward mean: 9.48, random move probability: 0.3500000000000716


 68%|██████▊   | 6765/10000 [02:09<00:43, 74.87it/s]

step: 113453, episode: 6750, training reward mean: 11.536, test reward mean: 9.4, random move probability: 0.32500000000007434


 70%|███████   | 7008/10000 [02:13<00:43, 69.19it/s]

step: 116321, episode: 7000, training reward mean: 11.472, test reward mean: 9.36, random move probability: 0.3000000000000771


 73%|███████▎  | 7262/10000 [02:16<00:37, 73.16it/s]

step: 119108, episode: 7250, training reward mean: 11.148, test reward mean: 9.4, random move probability: 0.27500000000007985


 75%|███████▌  | 7510/10000 [02:19<00:40, 61.49it/s]

step: 121901, episode: 7500, training reward mean: 11.172, test reward mean: 9.88, random move probability: 0.2500000000000826


 78%|███████▊  | 7758/10000 [02:23<00:34, 65.74it/s]

step: 124832, episode: 7750, training reward mean: 11.724, test reward mean: 11.2, random move probability: 0.22500000000008535


 80%|████████  | 8005/10000 [02:27<00:38, 51.63it/s]

step: 128153, episode: 8000, training reward mean: 13.284, test reward mean: 13.76, random move probability: 0.2000000000000881


 83%|████████▎ | 8264/10000 [02:31<00:22, 76.18it/s]

step: 131506, episode: 8250, training reward mean: 13.412, test reward mean: 9.44, random move probability: 0.17500000000009086


 85%|████████▌ | 8511/10000 [02:34<00:19, 77.45it/s]

step: 134058, episode: 8500, training reward mean: 10.208, test reward mean: 9.56, random move probability: 0.15000000000009361


 88%|████████▊ | 8763/10000 [02:38<00:16, 75.31it/s]

step: 136581, episode: 8750, training reward mean: 10.092, test reward mean: 9.2, random move probability: 0.12500000000009637


 90%|█████████ | 9008/10000 [02:41<00:12, 77.78it/s]

step: 139084, episode: 9000, training reward mean: 10.012, test reward mean: 9.6, random move probability: 0.10000000000009565


 93%|█████████▎| 9265/10000 [02:44<00:09, 80.78it/s]

step: 141531, episode: 9250, training reward mean: 9.788, test reward mean: 9.28, random move probability: 0.07500000000009494


 95%|█████████▌| 9517/10000 [02:47<00:05, 82.66it/s]

step: 143930, episode: 9500, training reward mean: 9.596, test reward mean: 9.12, random move probability: 0.05000000000009422


 98%|█████████▊| 9754/10000 [02:51<00:07, 31.87it/s]

step: 146785, episode: 9750, training reward mean: 11.42, test reward mean: 28.44, random move probability: 0.02500000000009372


100%|█████████▉| 9996/10000 [02:56<00:00, 59.95it/s]

step: 150852, episode: 10000, training reward mean: 16.268, test reward mean: 12.56, random move probability: 0.01
step 12 : 1 , [-0.137697   -0.40019003  0.22532645  0.9571071 ] , 1.0 , True , False , {}
Moviepy - Building video results/CartPole-v1-10000-100-1-1-99-False/100.mp4.
Moviepy - Writing video results/CartPole-v1-10000-100-1-1-99-False/100.mp4



100%|██████████| 10000/10000 [02:56<00:00, 56.74it/s]


Moviepy - Done !
Moviepy - video ready results/CartPole-v1-10000-100-1-1-99-False/100.mp4
Double Deep Q-learning agent started with PyTorch


  2%|▎         | 250/10000 [01:09<1:11:36,  2.27it/s]

step: 50000, episode: 250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9750000000000028


  5%|▌         | 500/10000 [02:20<1:08:26,  2.31it/s]

step: 100000, episode: 500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9500000000000055


  8%|▊         | 750/10000 [03:32<1:07:14,  2.29it/s]

step: 150000, episode: 750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9250000000000083


 10%|█         | 1000/10000 [04:44<1:06:01,  2.27it/s]

step: 200000, episode: 1000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.900000000000011


 10%|█         | 1000/10000 [04:45<1:06:01,  2.27it/s]

step 200 : 0 , [-0.8105807   0.00635756] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-1-1-99-True/10.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-1-1-99-True/10.mp4



 10%|█         | 1001/10000 [04:46<1:53:20,  1.32it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-1-1-99-True/10.mp4


 12%|█▎        | 1250/10000 [05:57<1:02:59,  2.32it/s]

step: 250000, episode: 1250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8750000000000138


 15%|█▌        | 1500/10000 [07:08<1:02:12,  2.28it/s]

step: 300000, episode: 1500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8500000000000165


 18%|█▊        | 1750/10000 [08:19<59:48,  2.30it/s]  

step: 350000, episode: 1750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8250000000000193


 20%|██        | 2000/10000 [09:32<59:18,  2.25it/s]

step: 400000, episode: 2000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.800000000000022


 22%|██▎       | 2250/10000 [10:44<56:21,  2.29it/s]

step: 450000, episode: 2250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7750000000000248


 25%|██▌       | 2500/10000 [11:55<54:19,  2.30it/s]

step: 500000, episode: 2500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7500000000000275


 28%|██▊       | 2750/10000 [13:08<53:38,  2.25it/s]

step: 550000, episode: 2750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7250000000000303


 30%|███       | 3000/10000 [14:21<53:06,  2.20it/s]

step: 600000, episode: 3000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.700000000000033


 32%|███▎      | 3250/10000 [15:33<49:58,  2.25it/s]

step: 650000, episode: 3250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6750000000000358


 35%|███▌      | 3500/10000 [16:45<48:27,  2.24it/s]

step: 700000, episode: 3500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6500000000000385


 38%|███▊      | 3750/10000 [17:59<45:42,  2.28it/s]

step: 750000, episode: 3750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6250000000000413


 40%|████      | 4000/10000 [19:12<45:01,  2.22it/s]

step: 800000, episode: 4000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.600000000000044


 42%|████▎     | 4250/10000 [20:26<43:32,  2.20it/s]

step: 850000, episode: 4250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5750000000000468


 45%|████▌     | 4500/10000 [21:40<40:25,  2.27it/s]

step: 900000, episode: 4500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5500000000000496


 48%|████▊     | 4750/10000 [22:53<38:47,  2.26it/s]

step: 950000, episode: 4750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5250000000000523


 50%|█████     | 5000/10000 [24:08<38:30,  2.16it/s]

step: 1000000, episode: 5000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5000000000000551


 50%|█████     | 5000/10000 [24:09<38:30,  2.16it/s]

step 200 : 1 , [-0.52648866 -0.00619298] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-1-1-99-True/50.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-1-1-99-True/50.mp4



 50%|█████     | 5001/10000 [24:09<1:01:45,  1.35it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-1-1-99-True/50.mp4


 52%|█████▎    | 5250/10000 [25:21<35:24,  2.24it/s]  

step: 1050000, episode: 5250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.4750000000000578


 55%|█████▌    | 5500/10000 [26:34<33:46,  2.22it/s]

step: 1100000, episode: 5500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.4500000000000606


 57%|█████▊    | 5750/10000 [27:48<31:28,  2.25it/s]

step: 1150000, episode: 5750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.4250000000000633


 60%|██████    | 6000/10000 [29:02<29:18,  2.27it/s]

step: 1200000, episode: 6000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.4000000000000661


 62%|██████▎   | 6250/10000 [30:15<28:11,  2.22it/s]

step: 1250000, episode: 6250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.37500000000006883


 65%|██████▌   | 6500/10000 [31:29<26:29,  2.20it/s]

step: 1300000, episode: 6500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.3500000000000716


 68%|██████▊   | 6750/10000 [32:43<24:39,  2.20it/s]

step: 1350000, episode: 6750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.32500000000007434


 70%|███████   | 7000/10000 [33:58<22:12,  2.25it/s]

step: 1400000, episode: 7000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.3000000000000771


 72%|███████▎  | 7250/10000 [35:12<20:15,  2.26it/s]

step: 1450000, episode: 7250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.27500000000007985


 75%|███████▌  | 7500/10000 [36:26<18:56,  2.20it/s]

step: 1500000, episode: 7500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.2500000000000826


 78%|███████▊  | 7750/10000 [37:40<16:41,  2.25it/s]

step: 1550000, episode: 7750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.22500000000008535


 80%|████████  | 8000/10000 [38:55<14:59,  2.22it/s]

step: 1600000, episode: 8000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.2000000000000881


 82%|████████▎ | 8250/10000 [40:10<13:07,  2.22it/s]

step: 1650000, episode: 8250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.17500000000009086


 85%|████████▌ | 8500/10000 [41:25<11:28,  2.18it/s]

step: 1700000, episode: 8500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.15000000000009361


 88%|████████▊ | 8750/10000 [42:40<09:31,  2.19it/s]

step: 1750000, episode: 8750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.12500000000009637


 90%|█████████ | 9000/10000 [43:55<07:34,  2.20it/s]

step: 1800000, episode: 9000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.10000000000009565


 92%|█████████▎| 9250/10000 [45:11<05:43,  2.18it/s]

step: 1850000, episode: 9250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.07500000000009494


 95%|█████████▌| 9500/10000 [46:26<03:41,  2.25it/s]

step: 1900000, episode: 9500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.05000000000009422


 98%|█████████▊| 9750/10000 [47:41<01:51,  2.24it/s]

step: 1950000, episode: 9750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.02500000000009372


100%|█████████▉| 9999/10000 [48:55<00:00,  3.37it/s]

step: 2000000, episode: 10000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.01


100%|█████████▉| 9999/10000 [48:57<00:00,  3.37it/s]

step 200 : 0 , [-0.7024872  0.003751 ] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-1-1-99-True/100.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-1-1-99-True/100.mp4



100%|██████████| 10000/10000 [48:57<00:00,  3.40it/s]


Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-1-1-99-True/100.mp4
Deep Q-learning agent started with PyTorch


  2%|▎         | 250/10000 [00:56<1:01:47,  2.63it/s]

step: 50000, episode: 250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9750000000000028


  5%|▌         | 500/10000 [01:55<1:00:57,  2.60it/s]

step: 100000, episode: 500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9500000000000055


  8%|▊         | 750/10000 [02:54<1:00:26,  2.55it/s]

step: 150000, episode: 750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.9250000000000083


 10%|█         | 1000/10000 [03:53<58:31,  2.56it/s] 

step: 200000, episode: 1000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.900000000000011


 10%|█         | 1000/10000 [03:54<58:31,  2.56it/s]

step 200 : 1 , [-0.5265592  -0.00608717] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-1-1-99-False/10.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-1-1-99-False/10.mp4



 10%|█         | 1001/10000 [03:55<1:46:34,  1.41it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-1-1-99-False/10.mp4


 12%|█▎        | 1250/10000 [04:54<57:50,  2.52it/s]  

step: 250000, episode: 1250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8750000000000138


 15%|█▌        | 1500/10000 [05:55<56:49,  2.49it/s]

step: 300000, episode: 1500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8500000000000165


 18%|█▊        | 1750/10000 [06:55<54:59,  2.50it/s]

step: 350000, episode: 1750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.8250000000000193


 20%|██        | 2000/10000 [07:55<53:10,  2.51it/s]

step: 400000, episode: 2000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.800000000000022


 22%|██▎       | 2250/10000 [08:55<50:05,  2.58it/s]

step: 450000, episode: 2250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7750000000000248


 25%|██▌       | 2500/10000 [09:56<50:08,  2.49it/s]

step: 500000, episode: 2500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7500000000000275


 28%|██▊       | 2750/10000 [10:56<47:11,  2.56it/s]

step: 550000, episode: 2750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.7250000000000303


 30%|███       | 3000/10000 [11:57<46:57,  2.48it/s]

step: 600000, episode: 3000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.700000000000033


 32%|███▎      | 3250/10000 [12:58<43:45,  2.57it/s]

step: 650000, episode: 3250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6750000000000358


 35%|███▌      | 3500/10000 [13:59<42:27,  2.55it/s]

step: 700000, episode: 3500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6500000000000385


 38%|███▊      | 3750/10000 [14:59<40:42,  2.56it/s]

step: 750000, episode: 3750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.6250000000000413


 40%|████      | 4000/10000 [16:01<40:34,  2.46it/s]

step: 800000, episode: 4000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.600000000000044


 42%|████▎     | 4250/10000 [17:03<38:44,  2.47it/s]

step: 850000, episode: 4250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5750000000000468


 45%|████▌     | 4500/10000 [18:04<35:53,  2.55it/s]

step: 900000, episode: 4500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5500000000000496


 48%|████▊     | 4750/10000 [19:06<34:54,  2.51it/s]

step: 950000, episode: 4750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5250000000000523


 50%|█████     | 5000/10000 [20:07<32:56,  2.53it/s]

step: 1000000, episode: 5000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.5000000000000551


 50%|█████     | 5000/10000 [20:08<32:56,  2.53it/s]

step 200 : 2 , [-0.26502421 -0.00645698] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-1-1-99-False/50.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-1-1-99-False/50.mp4



 50%|█████     | 5001/10000 [20:08<57:08,  1.46it/s]

Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-1-1-99-False/50.mp4


 52%|█████▎    | 5250/10000 [21:11<32:05,  2.47it/s]

step: 1050000, episode: 5250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.4750000000000578


 55%|█████▌    | 5500/10000 [22:13<29:45,  2.52it/s]

step: 1100000, episode: 5500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.4500000000000606


 57%|█████▊    | 5750/10000 [23:14<28:53,  2.45it/s]

step: 1150000, episode: 5750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.4250000000000633


 60%|██████    | 6000/10000 [24:16<26:09,  2.55it/s]

step: 1200000, episode: 6000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.4000000000000661


 62%|██████▎   | 6250/10000 [25:19<25:21,  2.46it/s]

step: 1250000, episode: 6250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.37500000000006883


 65%|██████▌   | 6500/10000 [26:22<24:11,  2.41it/s]

step: 1300000, episode: 6500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.3500000000000716


 68%|██████▊   | 6750/10000 [27:24<21:58,  2.46it/s]

step: 1350000, episode: 6750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.32500000000007434


 70%|███████   | 7000/10000 [28:27<20:18,  2.46it/s]

step: 1400000, episode: 7000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.3000000000000771


 72%|███████▎  | 7250/10000 [29:30<18:46,  2.44it/s]

step: 1450000, episode: 7250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.27500000000007985


 75%|███████▌  | 7500/10000 [30:33<16:57,  2.46it/s]

step: 1500000, episode: 7500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.2500000000000826


 78%|███████▊  | 7750/10000 [31:35<15:17,  2.45it/s]

step: 1550000, episode: 7750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.22500000000008535


 80%|████████  | 8000/10000 [32:39<13:37,  2.45it/s]

step: 1600000, episode: 8000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.2000000000000881


 82%|████████▎ | 8250/10000 [33:41<11:50,  2.46it/s]

step: 1650000, episode: 8250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.17500000000009086


 85%|████████▌ | 8500/10000 [34:45<09:53,  2.53it/s]

step: 1699984, episode: 8500, training reward mean: -199.936, test reward mean: -200.0, random move probability: 0.15000000000009361


 88%|████████▊ | 8750/10000 [35:48<08:15,  2.52it/s]

step: 1749984, episode: 8750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.12500000000009637


 90%|█████████ | 9000/10000 [36:51<06:46,  2.46it/s]

step: 1799984, episode: 9000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.10000000000009565


 92%|█████████▎| 9250/10000 [37:54<05:10,  2.42it/s]

step: 1849984, episode: 9250, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.07500000000009494


 95%|█████████▌| 9500/10000 [38:59<03:26,  2.42it/s]

step: 1899984, episode: 9500, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.05000000000009422


 98%|█████████▊| 9750/10000 [40:03<01:42,  2.44it/s]

step: 1949984, episode: 9750, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.02500000000009372


100%|█████████▉| 9999/10000 [41:06<00:00,  3.89it/s]

step: 1999984, episode: 10000, training reward mean: -200.0, test reward mean: -200.0, random move probability: 0.01


100%|█████████▉| 9999/10000 [41:07<00:00,  3.89it/s]

step 200 : 0 , [-0.70648825  0.00402056] , -1.0 , False , True , {}
Moviepy - Building video results/MountainCar-v0-10000-100-1-1-99-False/100.mp4.
Moviepy - Writing video results/MountainCar-v0-10000-100-1-1-99-False/100.mp4



100%|██████████| 10000/10000 [41:08<00:00,  4.05it/s]


Moviepy - Done !
Moviepy - video ready results/MountainCar-v0-10000-100-1-1-99-False/100.mp4
