In [None]:
# !pip install swifter
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension

In [None]:
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
from tqdm import tqdm
from itertools import product
import pickle
import random
import sqlite3
import itertools
import os
import pandas as pd
from collections import defaultdict

np.set_printoptions(precision=3)

In [None]:
class Agent:
    """Baseline agent whose policies ignore everything they are given.

    Both policies return a constant action index; the meaning of each index
    is documented on ``Track.transition``.
    """

    _RULE_ACTION = 1   # action index returned by actRules
    _NAIVE_ACTION = 4  # action index returned by actNaively

    def __init__(self):
        """Nothing to initialise; the agent keeps no state."""

    def actRules(self, state):
        """Return the fixed rule-based action (1), regardless of ``state``."""
        return self._RULE_ACTION

    def actNaively(self):
        """Return the fixed naive action (4)."""
        return self._NAIVE_ACTION


In [None]:
#KEY PARAMETERS
# Run-wide settings read by the agent and grid-search cells further down.

# Trace-decay value; used as the default `td_lambda` of TDLambdaAgent.
lambda_value = 0.6
# Results table name, one per lambda value; the "." is stripped so that
# 0.6 becomes "gs_results_td_lambda_06".
table_name = f"gs_results_td_lambda_{lambda_value}".replace(".", "")
directory = "e-greedy/agents" # directory to create; only referenced by commented-out code in the visible cells
gs_db_name = 'e-greedy/grid_search_greedy_r900.db'  # SQLite file that stores the grid-search results

### RADIUS SET TO 900

In [None]:
class Car:
    """A car characterised by its tyre compound and the tyre's condition.

    ``condition`` is 1.00 for a freshly fitted tyre and is multiplied by a
    wear factor on every call to ``degrade``.
    """

    def __init__(self, tyre="Intermediate"):
        """Remember the starting compound and fit a fresh set of it."""
        self.default_tyre = tyre
        self.possible_tyres = ["Ultrasoft", "Soft", "Intermediate", "Fullwet"]
        self.pitstop_time = 23
        self.reset()

    def reset(self):
        """Go back to a fresh set of the default compound."""
        self.change_tyre(self.default_tyre)

    def degrade(self, w, r):
        """Wear the current tyre for wetness ``w`` and track radius ``r``.

        Each compound has its own wetness coefficient, its own wetness term
        and its own radius divisor. An unrecognised compound is left untouched.
        """
        # compound -> (wetness coefficient, wetness term, radius divisor)
        wear_profiles = {
            "Ultrasoft": (0.0050, w, 90000),
            "Soft": (0.0051, w, 93000),
            "Intermediate": (0.0052, abs(0.5-w), 95000),
            "Fullwet": (0.0053, 1-w, 97000),
        }
        profile = wear_profiles.get(self.tyre)
        if profile is None:
            return
        coefficient, wet_term, divisor = profile
        self.condition *= (1 - coefficient*wet_term - (2500-r)/divisor)

    def change_tyre(self, new_tyre):
        """Fit a brand-new ``new_tyre``; it must be one of ``possible_tyres``."""
        assert new_tyre in self.possible_tyres
        self.tyre = new_tyre
        self.condition = 1.00

    def get_velocity(self):
        """Return the current velocity from the compound and tyre condition."""
        # The condition-dependent factor is shared by every compound; only
        # the leading constant differs.
        grip = 0.2 + 0.8*self.condition**1.5
        if self.tyre == "Ultrasoft":
            vel = 80.7*grip
        elif self.tyre == "Soft":
            vel = 80.1*grip
        elif self.tyre == "Intermediate":
            vel = 79.5*grip
        elif self.tyre == "Fullwet":
            vel = 79.0*grip
        return vel

    
class Track:
    """Race-track environment advanced one eighth of a lap per transition.

    The state is ``[tyre, condition, weather, radius, laps_cleared]``. Weather
    follows a Markov chain over ``possible_weather``; the reward of a
    transition is the negative time it took.

    Args:
        car: The car to race. When omitted, a new ``Car()`` is built for this
            track.
    """

    def __init__(self, car=None):
        # self.radius and self.cur_weather are defined in self.reset()
        self.total_laps = 162
        # A `Car()` default in the signature is evaluated once, so every
        # Track built without an argument would share (and mutate) the same
        # car. Build the default per instance instead.
        self.car = Car() if car is None else car
        self.possible_weather = ["Dry", "20% Wet", "40% Wet", "60% Wet", "80% Wet", "100% Wet"]
        self.wetness = {
            "Dry": 0.00, "20% Wet": 0.20, "40% Wet": 0.40, "60% Wet": 0.60, "80% Wet": 0.80, "100% Wet": 1.00
        }
        # p_transition[current][next]: probability of the weather moving from
        # `current` to `next` during one transition.
        self.p_transition = {
            "Dry": {
                "Dry": 0.987, "20% Wet": 0.013, "40% Wet": 0.000, "60% Wet": 0.000, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "20% Wet": {
                "Dry": 0.012, "20% Wet": 0.975, "40% Wet": 0.013, "60% Wet": 0.000, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "40% Wet": {
                "Dry": 0.000, "20% Wet": 0.012, "40% Wet": 0.975, "60% Wet": 0.013, "80% Wet": 0.000, "100% Wet": 0.000
            },
            "60% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.012, "60% Wet": 0.975, "80% Wet": 0.013, "100% Wet": 0.000
            },
            "80% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.000, "60% Wet": 0.012, "80% Wet": 0.975, "100% Wet": 0.013
            },
            "100% Wet": {
                "Dry": 0.000, "20% Wet": 0.000, "40% Wet": 0.000, "60% Wet": 0.000, "80% Wet": 0.012, "100% Wet": 0.988
            }
        }
        self.reset()

    def reset(self):
        """Start a new race and return the initial state.

        The radius is fixed at 900, the starting weather is drawn uniformly
        from ``possible_weather`` and the car is reset.
        """
        # self.radius = np.random.randint(600,1201)
        self.radius = 900
        self.cur_weather = np.random.choice(self.possible_weather)
        self.is_done = False
        self.pitstop = False
        # Tyre chosen for a pending pitstop; only meaningful while
        # self.pitstop is True.
        self.committed_tyre = None
        self.laps_cleared = 0
        self.car.reset()
        return self._get_state()

    def _get_state(self):
        """Return the state as ``[tyre, condition, weather, radius, laps_cleared]``."""
        return [self.car.tyre, self.car.condition, self.cur_weather, self.radius, self.laps_cleared]

    def transition(self, action=0):
        """Advance the race by one eighth of a lap.

        Args:
            action (int):
                0. Make a pitstop and fit new 'Ultrasoft' tyres
                1. Make a pitstop and fit new 'Soft' tyres
                2. Make a pitstop and fit new 'Intermediate' tyres
                3. Make a pitstop and fit new 'Fullwet' tyres
                4. Continue the next lap without changing tyres

        Returns:
            tuple: ``(reward, next_state, is_done, velocity)`` where ``reward``
            is the negative time taken for this eighth (including any pitstop
            time) and ``velocity`` is the car's velocity before degradation.
        """
        ## Pitstop time will be added on the first eighth of the subsequent lap
        time_taken = 0
        if self.laps_cleared == int(self.laps_cleared):
            if self.pitstop:
                self.car.change_tyre(self.committed_tyre)
                time_taken += self.car.pitstop_time
                self.pitstop = False

        ## The environment is coded such that only an action taken at the start of the three-quarters mark of each lap matters
        # Eighths (0.125) are exactly representable in binary floating point,
        # so these equality checks on laps_cleared are exact.
        if self.laps_cleared - int(self.laps_cleared) == 0.75:
            if action < 4:
                self.pitstop = True
                self.committed_tyre = self.car.possible_tyres[action]
            else:
                self.pitstop = False

        # Build the probability vector in the order of possible_weather
        # explicitly instead of relying on the dict's insertion order.
        next_weather_probs = self.p_transition[self.cur_weather]
        self.cur_weather = np.random.choice(
            self.possible_weather,
            p=[next_weather_probs[weather] for weather in self.possible_weather]
        )
        # we assume that degradation happens only after a car has travelled the one-eighth lap
        velocity = self.car.get_velocity()
        time_taken += (2*np.pi*self.radius/8) / velocity
        reward = 0 - time_taken
        self.car.degrade(
            w=self.wetness[self.cur_weather], r=self.radius
        )
        self.laps_cleared += 0.125

        if self.laps_cleared >= self.total_laps:
            self.is_done = True

        next_state = self._get_state()
        return reward, next_state, self.is_done, velocity

    def step(self, action):
        """Alias of ``transition`` for callers that expect a ``step`` method."""
        return self.transition(action)

In [None]:
# Shared environment for the cells below: one car on one track.
new_car = Car()
env = Track(new_car)

# Fixed-action baseline agent used by the sanity check.
agent = Agent()

In [None]:
#Sanity check..
# Run one full race with the baseline agent's naive action and print the
# accumulated reward G (the negative total race time).

state = env.reset()    
done = False
G = 0
while not done:
    action = agent.actNaively()
    reward, next_state, done, velocity = env.transition(action)
    # added velocity for sanity check
    # state = deepcopy(next_state)
    state = next_state
    G += reward

# "%d" truncates G to an integer for display.
print("G: %d" % G)

## TDLambda Agent

### Discretizing States

Both the current `state` and the `next_state` are discretized using the `_discretize_state` method. This keeps the Q-values manageable in table form when states are continuous or too granular.

### Initialization of Q-values and Eligibility Traces

If the current `state` or `next_state` is not in the Q-table (`Q`), it is added with an initial Q-value of zero for all actions.
Similarly, if the current `state` is not in the eligibility-trace table (`E`), it is added with an initial eligibility trace of zero for all actions.

### Temporal Difference (TD) Error Calculation

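The agent calculates the best action for `next_state` based on the current Q-values.
The TD error is then computed as the difference between the expected Q-value (the received reward plus the discounted estimate of future reward from `next_state`) and the current Q-value of the action taken in `state`.
Note that because the target uses the best (greedy) action of `next_state` rather than the action the agent actually takes next, the TD target is the Q-learning one, not the strict on-policy SARSA target.

### Eligibility Trace Update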

The eligibility trace for the current state-action pair `(state, action)` is incremented by 1.
This assigns "credit" or "blame" to that pair for any future reward or punishment: SARSA(λ) updates not just the current state-action pair but also recently visited ones, in proportion to their eligibility traces.

### Ensuring All States in Q Have an Eligibility Trace

The agent makes sure that every state in the Q-table has a corresponding entry in the eligibility-trace table. If an entry is missing, it is added with a value of zero.

### Updating Q-values Using the TD Error and Eligibility Traces

The Q-values of all state-action pairs are updated from the TD error and their respective eligibility traces. The greater the eligibility trace of a state-action pair, the larger its update.
This is the heart of the SARSA(λ) algorithm: Q-value updates are spread not just to the current state-action pair but also to earlier ones, according to their eligibility.

### Decaying All Eligibility Traces

After updating the Q-values, the eligibility traces of all state-action pairs are decayed by a factor of `gamma * td_lambda`. Traces of older state-action pairs therefore shrink over time, so recent state-action pairs carry more weight in future Q-value updates.

In essence, the `update` method blends the strengths of both SARSA and eligibility traces, providing a more sophisticated update mechanism that takes both immediate and more distant state-action pairs into account when adjusting Q-values.

In [None]:
class TDLambdaAgent:
    """Tabular epsilon-greedy agent learning Q-values with eligibility traces.

    The TD target bootstraps from the greedy (max-Q) action of the next
    state, and credit is spread over previously visited state-action pairs
    through accumulating eligibility traces that decay by
    ``gamma * td_lambda`` per update.

    Args:
        epsilon: Probability of taking a random action in ``act``.
        epsilon_decay: Stored for callers to multiply into ``epsilon`` after
            each episode; the agent does not apply it itself.
        alpha: Learning rate.
        gamma: Discount factor.
        td_lambda: Trace-decay parameter.
        n_actions: Number of actions (length of each Q/E row).
        no_change_after_lap: From this lap count on, ``act`` always returns
            action 4 (no tyre change).
        state_space_discretization: Number of buckets for the tyre condition.
    """

    def __init__(self, epsilon=0.1, epsilon_decay=0.995, alpha=0.1,
                  gamma=0.99, td_lambda=lambda_value, n_actions=5,
                    no_change_after_lap=150, state_space_discretization=100):
        # Set n_actions before the tables whose default factories read it.
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.Q = defaultdict(lambda: np.zeros(self.n_actions))
        self.E = defaultdict(lambda: np.zeros(self.n_actions))  # Eligibility traces
        self.state_space_discretization = state_space_discretization
        self.epsilon_decay = epsilon_decay
        self.no_change_after_lap = no_change_after_lap
        self.td_lambda = td_lambda

    def _discretize_state(self, state):
        """Map a raw environment state to a hashable Q-table key.

        The tyre condition is bucketed into ``state_space_discretization``
        integer levels and the radius is rounded to the nearest hundred; the
        other components are kept as they are.
        """
        tyre, condition, weather, radius, laps_cleared = state
        condition = int(condition * self.state_space_discretization)

        # Discretizing the radius as well by rounding to nearest hundred
        radius = round(radius, -2)
        return (tyre, condition, weather, radius, laps_cleared)

    def reset_traces(self):
        """Forget all eligibility traces (call at the start of an episode)."""
        self.E.clear()

    def act(self, state):
        """Choose an action epsilon-greedily for the raw ``state``.

        Once ``laps_cleared`` reaches ``no_change_after_lap`` the result is
        always 4 (no tyre change), whether exploring or exploiting.
        """
        state = self._discretize_state(state)
        _, _, _, _, laps_cleared = state

        if np.random.rand() < self.epsilon:
            if laps_cleared >= self.no_change_after_lap:
                return 4  # Don't change tires
            return np.random.choice(self.n_actions)
        else:
            # .get() avoids inserting unseen states into the Q-table.
            action = np.argmax(self.Q.get(state, np.zeros(self.n_actions)))
            if laps_cleared >= self.no_change_after_lap and action < 4:
                return 4  # Don't change tires
            return action

    def update(self, state, action, reward, next_state):
        """Apply one TD(lambda) update for the observed transition.

        Args:
            state: Raw state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Raw state reached after the transition.
        """
        state = self._discretize_state(state)
        next_state = self._discretize_state(next_state)

        # The environment restarts its lap counter at 0 on reset, so a zero
        # lap count marks the first step of a new episode. Traces left over
        # from the previous episode must not receive credit for this one.
        if state[-1] == 0:
            self.reset_traces()

        best_next_action = np.argmax(self.Q[next_state])
        td_error = reward + self.gamma * self.Q[next_state][best_next_action] - self.Q[state][action]

        # Increment the eligibility trace for the current state-action pair
        self.E[state][action] += 1

        # Only states holding a trace can change, so walk the trace table
        # rather than the whole (ever-growing) Q-table, one row at a time.
        scaled_error = self.alpha * td_error
        trace_decay = self.gamma * self.td_lambda
        for traced_state, trace in self.E.items():
            self.Q[traced_state] += scaled_error * trace
            # Decay the eligibility traces of this state in place
            trace *= trace_decay

# Training the agent
# Short smoke run: a fresh agent with default hyperparameters, trained on the
# shared `env` for a handful of episodes.
agent = TDLambdaAgent()

num_episodes = 5
eval_episodes = 1

episode_rewards = []  # total reward of each training episode
for episode in tqdm(range(num_episodes)):
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action = agent.act(state)
        reward, next_state, done, _ = env.step(action)
        agent.update(state, action, reward, next_state)
        # state = deepcopy(next_state)
        state = next_state
        episode_reward += reward
    episode_rewards.append(episode_reward)
    # Reduce exploration after every episode.
    agent.epsilon *= agent.epsilon_decay
    
# Evaluating the agent
# No learning here (agent.update is not called); actions still come from
# agent.act, so the remaining epsilon still applies.
total_rewards = []
for _ in tqdm(range(eval_episodes)):
    state = env.reset()
    done = False
    episode_reward = 0
    while not done:
        action = agent.act(state)
        reward, next_state, done, _ = env.step(action)
        # state = deepcopy(next_state)
        state = next_state
        episode_reward += reward
    total_rewards.append(episode_reward)

print(f"Average reward over {eval_episodes} episodes: {np.mean(total_rewards)}")


In [None]:
# Episodes per hyperparameter combination, passed to grid_search() below.
num_episodes = 5   # training episodes
eval_episodes = 1  # evaluation episodes

# Grid Search
"""
param_grid: dict
    A dictionary containing hyperparameters and their possible values to be explored during grid search. Each key represents a hyperparameter, and its corresponding value is a list of values that will be tested for that hyperparameter.

    Parameters:
    - epsilon: list of floats
        The initial exploration rate for the epsilon-greedy policy. Values represent the probability at which the agent will explore, i.e., take random actions. Higher values mean more exploration and less exploitation.

    - epsilon_decay: list of floats
        The rate at which epsilon will be reduced after each episode. Values are in the range (0, 1) with values closer to 1 meaning a slower decay of epsilon over time.

    - alpha: list of floats
        The learning rate for the Q-learning algorithm. Determines to what extent newly acquired information overrides old information. A value of 0 would make the agent not learn anything, while a value of 1 would make the agent consider only the most recent information.

    - gamma: list of floats
        The discount factor for the Q-learning algorithm. Represents the agent's consideration for future rewards. A value of 0 makes the agent short-sighted by only considering current rewards, while a value close to 1 will make it aim for a long-term high reward.

    - no_change_after_lap: list of integers
        Specifies the lap number after which the agent will not change tires. If the current lap exceeds this number, actions to change tires will not be taken.
"""
# Full grid: 5 * 3 * 7 * 7 * 13 = 9555 combinations.
# The keys double as the column names of the results table.
param_grid = {
    'epsilon': [0.0, 0.025, 0.05, 0.1, 0.2],
    'epsilon_decay': [0.990, 0.995, 0.999],
    'alpha': [0.0, 0.2, 0.4, 0.6, 0.8, 0.9, 1.0],
    'gamma': [0.0, 0.2, 0.4, 0.6, 0.8, 0.9, 1.0],
    # 100, 105, ..., 160
    'no_change_after_lap': [x for x in range(100, 161, 5)]
}

In [None]:
def setup_database():
    """Create the grid-search results table if it does not exist yet.

    Opens (creating if necessary) the SQLite file ``gs_db_name`` and makes
    sure the table ``table_name`` exists, with one column per hyperparameter
    followed by the reward metrics. Calling it again is a no-op.
    """
    # sqlite3 creates the file but not its parent directory, so a fresh
    # checkout would fail with "unable to open database file".
    db_dir = os.path.dirname(gs_db_name)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    conn = sqlite3.connect(gs_db_name)
    try:
        c = conn.cursor()
        # Table names cannot be bound as SQL parameters; table_name is a
        # notebook constant, not user input.
        c.execute(f'''CREATE TABLE IF NOT EXISTS {table_name}
                     (epsilon real, epsilon_decay real, alpha real, gamma real, no_change_after_lap integer, 
                      avg_last_50 real, min_last_50 real, max_last_50 real, 
                      eval_avg real, overall_avg real)''')
        conn.commit()
    finally:
        # Release the connection even if the statement fails.
        conn.close()

setup_database()

In [22]:
# if not os.path.exists(directory):
#     os.makedirs(directory)

def parameter_combinations(param_grid):
    """
    Yield, in random order, every parameter combination not yet in the database.

    The combinations already stored in ``table_name`` (inside ``gs_db_name``)
    are read once, on the first iteration; results written afterwards are not
    noticed by a generator that is already running.

    Args:
    - param_grid (dict): Dictionary containing hyperparameters and their possible values.
      Its keys must be column names of the results table.

    Yields:
    - dict: Combination of hyperparameters that has no row in the results table.
    """
    keys, values = zip(*param_grid.items())

    # Fetch the combinations that already have results.
    conn = sqlite3.connect(gs_db_name)
    try:
        c = conn.cursor()
        c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,))
        if c.fetchone():
            # Select the columns in param_grid's key order so each row
            # compares directly against a product() tuple.
            c.execute(f"SELECT {', '.join(keys)} FROM {table_name}")
            already_run = set(c.fetchall())
        else:
            # No results table yet: nothing has been run.
            already_run = set()
    finally:
        conn.close()

    # Generate and check combinations
    all_combinations = list(itertools.product(*values))
    random.shuffle(all_combinations)

    for combination in all_combinations:
        # Constant-time membership test instead of scanning every stored row.
        if combination in already_run:
            print('SKIPPING')
            continue  # Skip this combination if it's already in the database

        yield dict(zip(keys, combination))

# 1. Plot Episode Rewards
def plot_rewards(rewards):
    """Draw the total reward of every episode as a line plot and show it."""
    plt.plot(rewards)
    # Label before showing so the figure is readable on its own.
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Episode Rewards Over Time')
    plt.show()

# 2. Plot Average Episode Rewards
def plot_avg_rewards(rewards, window=5):
    """Plot the trailing mean of the episode rewards and show it.

    Point ``i`` is the mean of up to ``window`` rewards ending at episode
    ``i``; the first points average over however many episodes exist so far.
    """
    trailing_means = []
    for end in range(1, len(rewards) + 1):
        start = max(0, end - window)
        trailing_means.append(np.mean(rewards[start:end]))

    plt.plot(trailing_means)
    plt.xlabel('Episode')
    plt.ylabel('Average Reward')
    plt.title(f'Average Episode Rewards Over Last {window} Episodes')
    plt.show()

# 3. Implement Grid Search
def _run_episode(environment, agent, learn):
    """Play one episode on ``environment`` and return its total reward; ``agent.update`` is called only when ``learn`` is true."""
    state = environment.reset()
    done = False
    episode_reward = 0
    while not done:
        action = agent.act(state)
        reward, next_state, done, _ = environment.step(action)
        episode_reward += reward
        if learn:
            agent.update(state, action, reward, next_state)
        state = next_state
    return episode_reward


def grid_search(param_grid, num_episodes, eval_episodes):
    """Train and evaluate one agent per untested hyperparameter combination.

    For each combination produced by ``parameter_combinations`` a new
    ``TDLambdaAgent`` is trained on the notebook-level ``env`` for
    ``num_episodes`` episodes (epsilon is multiplied by ``epsilon_decay``
    after each one), then run for ``eval_episodes`` episodes without
    learning. The metrics are inserted into ``table_name`` and committed
    after every combination, so an interrupted search keeps its results.

    Args:
        param_grid (dict): Hyperparameter name -> list of candidate values.
        num_episodes (int): Training episodes per combination.
        eval_episodes (int): Evaluation episodes per combination.
    """
    total_combinations = np.prod([len(v) for v in param_grid.values()])
    print(f"Total combinations: {total_combinations}")
    count = 0

    conn = sqlite3.connect(gs_db_name)
    try:
        c = conn.cursor()

        for params in tqdm(parameter_combinations(param_grid)):
            count += 1
            print(f"\nRunning combination {count}/{total_combinations} with parameters: {params}")

            agent = TDLambdaAgent(epsilon=params['epsilon'], epsilon_decay=params['epsilon_decay'], alpha=params['alpha'], gamma=params['gamma'], no_change_after_lap=params['no_change_after_lap'])

            # Training phase
            training_rewards = []  # total reward of each training episode
            for episode in range(num_episodes):
                if episode % 100 == 0:  # Print update every 100 episodes
                    print(f"    Training Episode {episode}/{num_episodes}")
                training_rewards.append(_run_episode(env, agent, learn=True))
                # Without this the searched epsilon_decay value would have no
                # effect on training at all.
                agent.epsilon *= agent.epsilon_decay

            # Evaluation phase
            eval_rewards = []
            for episode in range(eval_episodes):
                if episode % 5 == 0:  # Print update every 5 episodes
                    print(f"    Evaluation Episode {episode}/{eval_episodes}")
                eval_rewards.append(_run_episode(env, agent, learn=False))

            # Compute metrics based on training rewards. Cast to plain floats
            # so sqlite3 can bind them whatever numpy scalar type comes back.
            recent_rewards = training_rewards[-50:]
            avg_last_50 = float(np.mean(recent_rewards))
            min_last_50 = float(np.min(recent_rewards))
            max_last_50 = float(np.max(recent_rewards))
            eval_avg = float(np.mean(eval_rewards))
            overall_avg = float(np.mean(training_rewards))

            # Print evaluation metrics
            print(f"    Average Reward over last 50 episodes: {avg_last_50:.2f}")
            print(f"    Min Reward over last 50 episodes: {min_last_50:.2f}")
            print(f"    Max Reward over last 50 episodes: {max_last_50:.2f}")
            print(f"    Evaluation Average Reward: {eval_avg:.2f}")

            # Save metrics to database
            c.execute(f"""INSERT INTO {table_name} (epsilon, epsilon_decay, alpha, gamma, no_change_after_lap, 
                                                   avg_last_50, min_last_50, max_last_50, 
                                                   eval_avg, overall_avg) 
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""", 
                      (params['epsilon'], params['epsilon_decay'], params['alpha'], params['gamma'], 
                       params['no_change_after_lap'], avg_last_50, min_last_50, max_last_50, 
                       eval_avg, overall_avg))

            conn.commit()
    finally:
        # Close the connection even when a run fails or is interrupted.
        conn.close()

grid_search(param_grid, num_episodes, eval_episodes)

46it [11:36, 15.96s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18216.49
    Min Reward over last 50 episodes: -18248.62
    Max Reward over last 50 episodes: -18184.36
    Evaluation Average Reward: -17908.78

Running combination 47/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 155}
    Training Episode 0/2


47it [11:51, 15.76s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16341.76
    Min Reward over last 50 episodes: -16432.36
    Max Reward over last 50 episodes: -16251.17
    Evaluation Average Reward: -16414.86

Running combination 48/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 1.0, 'no_change_after_lap': 160}
    Training Episode 0/2


48it [12:06, 15.67s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16102.28
    Min Reward over last 50 episodes: -16105.15
    Max Reward over last 50 episodes: -16099.42
    Evaluation Average Reward: -16088.13

Running combination 49/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 1.0, 'no_change_after_lap': 150}
    Training Episode 0/2


49it [12:21, 15.51s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17019.77
    Min Reward over last 50 episodes: -17027.81
    Max Reward over last 50 episodes: -17011.72
    Evaluation Average Reward: -16897.15

Running combination 50/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 145}
    Training Episode 0/2


50it [12:36, 15.26s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17852.16
    Min Reward over last 50 episodes: -17854.75
    Max Reward over last 50 episodes: -17849.57
    Evaluation Average Reward: -17973.63

Running combination 51/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


51it [12:51, 15.27s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18104.14
    Min Reward over last 50 episodes: -18326.98
    Max Reward over last 50 episodes: -17881.30
    Evaluation Average Reward: -17852.36

Running combination 52/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 110}
    Training Episode 0/2


52it [13:09, 15.88s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26876.54
    Min Reward over last 50 episodes: -27122.52
    Max Reward over last 50 episodes: -26630.57
    Evaluation Average Reward: -26847.82

Running combination 53/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.2, 'no_change_after_lap': 100}
    Training Episode 0/2


53it [13:26, 16.34s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29198.66
    Min Reward over last 50 episodes: -29409.94
    Max Reward over last 50 episodes: -28987.38
    Evaluation Average Reward: -29339.09

Running combination 54/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 0.8, 'no_change_after_lap': 150}
    Training Episode 0/2


54it [13:43, 16.52s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17250.79
    Min Reward over last 50 episodes: -17271.64
    Max Reward over last 50 episodes: -17229.95
    Evaluation Average Reward: -16997.30

Running combination 55/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 125}
    Training Episode 0/2


55it [14:00, 16.67s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23087.51
    Min Reward over last 50 episodes: -23330.25
    Max Reward over last 50 episodes: -22844.76
    Evaluation Average Reward: -22779.80

Running combination 56/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.8, 'no_change_after_lap': 100}
    Training Episode 0/2


56it [14:16, 16.35s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29309.89
    Min Reward over last 50 episodes: -29414.93
    Max Reward over last 50 episodes: -29204.85
    Evaluation Average Reward: -29224.59

Running combination 57/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.4, 'no_change_after_lap': 160}
    Training Episode 0/2


57it [14:31, 16.10s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16073.60
    Min Reward over last 50 episodes: -16120.72
    Max Reward over last 50 episodes: -16026.49
    Evaluation Average Reward: -16082.88

Running combination 58/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.4, 'no_change_after_lap': 140}
    Training Episode 0/2


58it [14:46, 15.56s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19329.46
    Min Reward over last 50 episodes: -19513.65
    Max Reward over last 50 episodes: -19145.28
    Evaluation Average Reward: -19034.58

Running combination 59/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 1.0, 'no_change_after_lap': 105}
    Training Episode 0/2


59it [15:01, 15.55s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28006.51
    Min Reward over last 50 episodes: -28112.54
    Max Reward over last 50 episodes: -27900.48
    Evaluation Average Reward: -28112.33

Running combination 60/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 155}
    Training Episode 0/2


60it [15:17, 15.61s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16313.40
    Min Reward over last 50 episodes: -16344.67
    Max Reward over last 50 episodes: -16282.14
    Evaluation Average Reward: -16188.56

Running combination 61/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.2, 'no_change_after_lap': 155}
    Training Episode 0/2


61it [15:32, 15.57s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16373.34
    Min Reward over last 50 episodes: -16420.07
    Max Reward over last 50 episodes: -16326.61
    Evaluation Average Reward: -16443.72

Running combination 62/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.9, 'no_change_after_lap': 140}
    Training Episode 0/2


62it [15:49, 15.93s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19372.08
    Min Reward over last 50 episodes: -19429.90
    Max Reward over last 50 episodes: -19314.27
    Evaluation Average Reward: -19015.41

Running combination 63/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 150}
    Training Episode 0/2


63it [16:06, 16.22s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16999.43
    Min Reward over last 50 episodes: -17044.46
    Max Reward over last 50 episodes: -16954.40
    Evaluation Average Reward: -17091.85

Running combination 64/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 1.0, 'no_change_after_lap': 150}
    Training Episode 0/2


64it [16:22, 16.19s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17179.90
    Min Reward over last 50 episodes: -17235.33
    Max Reward over last 50 episodes: -17124.48
    Evaluation Average Reward: -17183.61

Running combination 65/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 155}
    Training Episode 0/2


65it [16:37, 15.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16258.09
    Min Reward over last 50 episodes: -16304.49
    Max Reward over last 50 episodes: -16211.69
    Evaluation Average Reward: -16283.58

Running combination 66/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.2, 'no_change_after_lap': 130}
    Training Episode 0/2


66it [16:53, 15.74s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21867.59
    Min Reward over last 50 episodes: -21984.36
    Max Reward over last 50 episodes: -21750.83
    Evaluation Average Reward: -21868.72

Running combination 67/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 100}
    Training Episode 0/2


67it [17:08, 15.74s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29358.76
    Min Reward over last 50 episodes: -29415.44
    Max Reward over last 50 episodes: -29302.08
    Evaluation Average Reward: -29635.22

Running combination 68/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.2, 'no_change_after_lap': 120}
    Training Episode 0/2


68it [17:23, 15.48s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24379.52
    Min Reward over last 50 episodes: -24431.08
    Max Reward over last 50 episodes: -24327.96
    Evaluation Average Reward: -24628.90

Running combination 69/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.0, 'no_change_after_lap': 130}
    Training Episode 0/2


69it [17:39, 15.39s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21742.49
    Min Reward over last 50 episodes: -21980.23
    Max Reward over last 50 episodes: -21504.75
    Evaluation Average Reward: -21436.86

Running combination 70/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 1.0, 'no_change_after_lap': 130}
    Training Episode 0/2


70it [17:53, 15.24s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21852.32
    Min Reward over last 50 episodes: -21950.38
    Max Reward over last 50 episodes: -21754.25
    Evaluation Average Reward: -21572.78

Running combination 71/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 160}
    Training Episode 0/2


71it [18:09, 15.23s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16007.03
    Min Reward over last 50 episodes: -16024.08
    Max Reward over last 50 episodes: -15989.97
    Evaluation Average Reward: -16071.58

Running combination 72/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 135}
    Training Episode 0/2


72it [18:23, 15.10s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20497.52
    Min Reward over last 50 episodes: -20737.51
    Max Reward over last 50 episodes: -20257.53
    Evaluation Average Reward: -20544.08

Running combination 73/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.4, 'no_change_after_lap': 155}
    Training Episode 0/2


73it [18:38, 15.10s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16338.75
    Min Reward over last 50 episodes: -16428.01
    Max Reward over last 50 episodes: -16249.50
    Evaluation Average Reward: -16398.81

Running combination 74/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 120}
    Training Episode 0/2


74it [18:53, 14.94s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24605.12
    Min Reward over last 50 episodes: -24650.92
    Max Reward over last 50 episodes: -24559.32
    Evaluation Average Reward: -24309.61

Running combination 75/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.4, 'no_change_after_lap': 130}
    Training Episode 0/2


75it [19:08, 15.08s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21752.35
    Min Reward over last 50 episodes: -21849.37
    Max Reward over last 50 episodes: -21655.33
    Evaluation Average Reward: -21608.94

Running combination 76/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.2, 'no_change_after_lap': 125}
    Training Episode 0/2


76it [19:23, 15.04s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23055.12
    Min Reward over last 50 episodes: -23143.07
    Max Reward over last 50 episodes: -22967.18
    Evaluation Average Reward: -23370.43

Running combination 77/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 130}
    Training Episode 0/2


77it [19:39, 15.19s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22065.52
    Min Reward over last 50 episodes: -22151.99
    Max Reward over last 50 episodes: -21979.05
    Evaluation Average Reward: -21939.74

Running combination 78/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.6, 'no_change_after_lap': 100}
    Training Episode 0/2


78it [19:54, 15.13s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29482.45
    Min Reward over last 50 episodes: -29604.93
    Max Reward over last 50 episodes: -29359.97
    Evaluation Average Reward: -29553.93

Running combination 79/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 150}
    Training Episode 0/2


79it [20:09, 15.16s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16989.07
    Min Reward over last 50 episodes: -17039.47
    Max Reward over last 50 episodes: -16938.68
    Evaluation Average Reward: -17243.29

Running combination 80/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.6, 'no_change_after_lap': 110}
    Training Episode 0/2


80it [20:24, 15.03s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26692.62
    Min Reward over last 50 episodes: -26695.77
    Max Reward over last 50 episodes: -26689.48
    Evaluation Average Reward: -26688.74

Running combination 81/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 120}
    Training Episode 0/2


81it [20:39, 15.02s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24287.42
    Min Reward over last 50 episodes: -24486.78
    Max Reward over last 50 episodes: -24088.06
    Evaluation Average Reward: -24335.42

Running combination 82/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.0, 'no_change_after_lap': 130}
    Training Episode 0/2


82it [20:54, 14.97s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21940.41
    Min Reward over last 50 episodes: -21973.07
    Max Reward over last 50 episodes: -21907.75
    Evaluation Average Reward: -21566.11

Running combination 83/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.0, 'no_change_after_lap': 105}
    Training Episode 0/2


83it [21:09, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28093.57
    Min Reward over last 50 episodes: -28160.99
    Max Reward over last 50 episodes: -28026.14
    Evaluation Average Reward: -27956.77

Running combination 84/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 130}
    Training Episode 0/2


84it [21:23, 14.72s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22102.50
    Min Reward over last 50 episodes: -22136.88
    Max Reward over last 50 episodes: -22068.11
    Evaluation Average Reward: -21996.84

Running combination 85/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 120}
    Training Episode 0/2


85it [21:37, 14.64s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24563.95
    Min Reward over last 50 episodes: -24637.85
    Max Reward over last 50 episodes: -24490.06
    Evaluation Average Reward: -24173.19

Running combination 86/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.4, 'no_change_after_lap': 160}
    Training Episode 0/2


86it [21:52, 14.75s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16080.35
    Min Reward over last 50 episodes: -16131.39
    Max Reward over last 50 episodes: -16029.32
    Evaluation Average Reward: -16101.13

Running combination 87/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 105}
    Training Episode 0/2


87it [22:07, 14.78s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28159.17
    Min Reward over last 50 episodes: -28238.59
    Max Reward over last 50 episodes: -28079.74
    Evaluation Average Reward: -28310.28

Running combination 88/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 160}
    Training Episode 0/2


88it [22:22, 14.85s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16056.67
    Min Reward over last 50 episodes: -16083.75
    Max Reward over last 50 episodes: -16029.60
    Evaluation Average Reward: -16099.41

Running combination 89/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 1.0, 'no_change_after_lap': 105}
    Training Episode 0/2


89it [22:37, 14.97s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28209.09
    Min Reward over last 50 episodes: -28464.39
    Max Reward over last 50 episodes: -27953.79
    Evaluation Average Reward: -28373.08

Running combination 90/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.6, 'no_change_after_lap': 145}
    Training Episode 0/2


90it [22:51, 14.70s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18412.79
    Min Reward over last 50 episodes: -18449.44
    Max Reward over last 50 episodes: -18376.14
    Evaluation Average Reward: -18398.00

Running combination 91/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 160}
    Training Episode 0/2


91it [23:06, 14.77s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16068.54
    Min Reward over last 50 episodes: -16127.47
    Max Reward over last 50 episodes: -16009.62
    Evaluation Average Reward: -16103.70

Running combination 92/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.0, 'no_change_after_lap': 120}
    Training Episode 0/2


92it [23:21, 14.62s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24251.00
    Min Reward over last 50 episodes: -24472.24
    Max Reward over last 50 episodes: -24029.76
    Evaluation Average Reward: -24088.60

Running combination 93/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 110}
    Training Episode 0/2


93it [23:36, 14.84s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26889.70
    Min Reward over last 50 episodes: -27133.66
    Max Reward over last 50 episodes: -26645.74
    Evaluation Average Reward: -26991.32

Running combination 94/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.2, 'no_change_after_lap': 125}
    Training Episode 0/2


94it [23:51, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22991.30
    Min Reward over last 50 episodes: -23152.72
    Max Reward over last 50 episodes: -22829.88
    Evaluation Average Reward: -22880.96

Running combination 95/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 1.0, 'no_change_after_lap': 130}
    Training Episode 0/2


95it [24:06, 15.00s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21927.73
    Min Reward over last 50 episodes: -22075.33
    Max Reward over last 50 episodes: -21780.13
    Evaluation Average Reward: -22077.22

Running combination 96/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 105}
    Training Episode 0/2


96it [24:21, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28034.09
    Min Reward over last 50 episodes: -28050.98
    Max Reward over last 50 episodes: -28017.20
    Evaluation Average Reward: -27830.82

Running combination 97/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.9, 'no_change_after_lap': 125}
    Training Episode 0/2


97it [24:36, 14.94s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22925.29
    Min Reward over last 50 episodes: -23071.41
    Max Reward over last 50 episodes: -22779.17
    Evaluation Average Reward: -23369.49

Running combination 98/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.4, 'no_change_after_lap': 155}
    Training Episode 0/2


98it [24:51, 15.04s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16329.42
    Min Reward over last 50 episodes: -16390.88
    Max Reward over last 50 episodes: -16267.95
    Evaluation Average Reward: -16403.12

Running combination 99/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 115}
    Training Episode 0/2


99it [25:06, 14.96s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25819.07
    Min Reward over last 50 episodes: -25829.01
    Max Reward over last 50 episodes: -25809.12
    Evaluation Average Reward: -25760.87

Running combination 100/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 115}
    Training Episode 0/2


100it [25:21, 14.97s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25491.90
    Min Reward over last 50 episodes: -25757.40
    Max Reward over last 50 episodes: -25226.40
    Evaluation Average Reward: -25460.20

Running combination 101/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 120}
    Training Episode 0/2


101it [25:36, 14.85s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24390.48
    Min Reward over last 50 episodes: -24463.81
    Max Reward over last 50 episodes: -24317.14
    Evaluation Average Reward: -24645.05

Running combination 102/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.4, 'no_change_after_lap': 140}
    Training Episode 0/2


102it [25:51, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19303.28
    Min Reward over last 50 episodes: -19328.41
    Max Reward over last 50 episodes: -19278.15
    Evaluation Average Reward: -19377.38

Running combination 103/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 115}
    Training Episode 0/2


103it [26:05, 14.83s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25915.59
    Min Reward over last 50 episodes: -25961.73
    Max Reward over last 50 episodes: -25869.46
    Evaluation Average Reward: -25718.24

Running combination 104/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 160}
    Training Episode 0/2


104it [26:20, 14.91s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16097.96
    Min Reward over last 50 episodes: -16118.64
    Max Reward over last 50 episodes: -16077.28
    Evaluation Average Reward: -16006.02

Running combination 105/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.0, 'no_change_after_lap': 160}
    Training Episode 0/2


105it [26:35, 14.79s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16097.52
    Min Reward over last 50 episodes: -16123.56
    Max Reward over last 50 episodes: -16071.48
    Evaluation Average Reward: -16066.32

Running combination 106/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.2, 'no_change_after_lap': 120}
    Training Episode 0/2


106it [26:50, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24385.37
    Min Reward over last 50 episodes: -24496.13
    Max Reward over last 50 episodes: -24274.62
    Evaluation Average Reward: -24268.96

Running combination 107/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.8, 'no_change_after_lap': 140}
    Training Episode 0/2


107it [27:05, 14.78s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19477.03
    Min Reward over last 50 episodes: -19631.57
    Max Reward over last 50 episodes: -19322.50
    Evaluation Average Reward: -19528.77

Running combination 108/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.4, 'no_change_after_lap': 145}
    Training Episode 0/2


108it [27:19, 14.72s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18122.37
    Min Reward over last 50 episodes: -18162.91
    Max Reward over last 50 episodes: -18081.84
    Evaluation Average Reward: -18370.57

Running combination 109/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.6, 'no_change_after_lap': 125}
    Training Episode 0/2


109it [27:34, 14.68s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23214.72
    Min Reward over last 50 episodes: -23263.91
    Max Reward over last 50 episodes: -23165.53
    Evaluation Average Reward: -23226.64

Running combination 110/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 1.0, 'no_change_after_lap': 155}
    Training Episode 0/2


110it [27:48, 14.67s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16316.26
    Min Reward over last 50 episodes: -16383.46
    Max Reward over last 50 episodes: -16249.06
    Evaluation Average Reward: -16366.84

Running combination 111/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.8, 'no_change_after_lap': 125}
    Training Episode 0/2


111it [28:03, 14.75s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23278.28
    Min Reward over last 50 episodes: -23279.38
    Max Reward over last 50 episodes: -23277.18
    Evaluation Average Reward: -23016.24

Running combination 112/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.9, 'no_change_after_lap': 105}
    Training Episode 0/2


112it [28:18, 14.84s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28254.57
    Min Reward over last 50 episodes: -28269.62
    Max Reward over last 50 episodes: -28239.53
    Evaluation Average Reward: -28009.53

Running combination 113/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 125}
    Training Episode 0/2


113it [28:34, 14.94s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23123.34
    Min Reward over last 50 episodes: -23313.20
    Max Reward over last 50 episodes: -22933.49
    Evaluation Average Reward: -22796.15

Running combination 114/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.6, 'no_change_after_lap': 115}
    Training Episode 0/2


114it [28:48, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25768.70
    Min Reward over last 50 episodes: -25812.75
    Max Reward over last 50 episodes: -25724.66
    Evaluation Average Reward: -25486.45

Running combination 115/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.8, 'no_change_after_lap': 130}
    Training Episode 0/2


115it [29:03, 14.82s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21759.16
    Min Reward over last 50 episodes: -22013.93
    Max Reward over last 50 episodes: -21504.40
    Evaluation Average Reward: -21885.82

Running combination 116/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


116it [29:17, 14.65s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18192.30
    Min Reward over last 50 episodes: -18413.62
    Max Reward over last 50 episodes: -17970.98
    Evaluation Average Reward: -17902.23

Running combination 117/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.6, 'no_change_after_lap': 155}
    Training Episode 0/2


117it [29:32, 14.71s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16284.86
    Min Reward over last 50 episodes: -16313.80
    Max Reward over last 50 episodes: -16255.93
    Evaluation Average Reward: -16446.67

Running combination 118/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


118it [29:47, 14.67s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18056.16
    Min Reward over last 50 episodes: -18233.78
    Max Reward over last 50 episodes: -17878.55
    Evaluation Average Reward: -17822.76

Running combination 119/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 155}
    Training Episode 0/2


119it [30:01, 14.72s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16392.61
    Min Reward over last 50 episodes: -16427.66
    Max Reward over last 50 episodes: -16357.56
    Evaluation Average Reward: -16327.99

Running combination 120/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.2, 'no_change_after_lap': 140}
    Training Episode 0/2


120it [30:16, 14.75s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19353.49
    Min Reward over last 50 episodes: -19583.36
    Max Reward over last 50 episodes: -19123.63
    Evaluation Average Reward: -19069.95

Running combination 121/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 150}
    Training Episode 0/2


121it [30:31, 14.79s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17104.68
    Min Reward over last 50 episodes: -17296.14
    Max Reward over last 50 episodes: -16913.22
    Evaluation Average Reward: -17251.25

Running combination 122/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.0, 'no_change_after_lap': 115}
    Training Episode 0/2


122it [30:46, 14.80s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25671.25
    Min Reward over last 50 episodes: -25724.65
    Max Reward over last 50 episodes: -25617.85
    Evaluation Average Reward: -25726.49

Running combination 123/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 115}
    Training Episode 0/2


123it [31:00, 14.70s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25745.11
    Min Reward over last 50 episodes: -25829.12
    Max Reward over last 50 episodes: -25661.11
    Evaluation Average Reward: -25272.31

Running combination 124/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


124it [31:15, 14.74s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18054.12
    Min Reward over last 50 episodes: -18210.28
    Max Reward over last 50 episodes: -17897.97
    Evaluation Average Reward: -18001.50

Running combination 125/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 130}
    Training Episode 0/2


125it [31:30, 14.76s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22105.44
    Min Reward over last 50 episodes: -22141.41
    Max Reward over last 50 episodes: -22069.46
    Evaluation Average Reward: -21869.80

Running combination 126/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 155}
    Training Episode 0/2


126it [31:45, 14.88s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16301.05
    Min Reward over last 50 episodes: -16305.44
    Max Reward over last 50 episodes: -16296.67
    Evaluation Average Reward: -16368.43

Running combination 127/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 1.0, 'no_change_after_lap': 105}
    Training Episode 0/2


127it [32:00, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27814.95
    Min Reward over last 50 episodes: -27927.24
    Max Reward over last 50 episodes: -27702.65
    Evaluation Average Reward: -27912.92

Running combination 128/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.4, 'no_change_after_lap': 155}
    Training Episode 0/2


128it [32:15, 14.87s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16377.91
    Min Reward over last 50 episodes: -16406.24
    Max Reward over last 50 episodes: -16349.58
    Evaluation Average Reward: -16297.61

Running combination 129/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 140}
    Training Episode 0/2


129it [32:30, 14.83s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19193.15
    Min Reward over last 50 episodes: -19205.52
    Max Reward over last 50 episodes: -19180.77
    Evaluation Average Reward: -19589.44

Running combination 130/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.2, 'no_change_after_lap': 155}
    Training Episode 0/2


130it [32:45, 14.80s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16309.74
    Min Reward over last 50 episodes: -16396.07
    Max Reward over last 50 episodes: -16223.40
    Evaluation Average Reward: -16419.44

Running combination 131/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 100}
    Training Episode 0/2


131it [33:00, 15.05s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29291.31
    Min Reward over last 50 episodes: -29459.80
    Max Reward over last 50 episodes: -29122.82
    Evaluation Average Reward: -29251.56

Running combination 132/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.4, 'no_change_after_lap': 130}
    Training Episode 0/2


132it [33:15, 15.02s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21779.12
    Min Reward over last 50 episodes: -22077.50
    Max Reward over last 50 episodes: -21480.75
    Evaluation Average Reward: -21559.75

Running combination 133/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 1.0, 'no_change_after_lap': 140}
    Training Episode 0/2


133it [33:30, 15.02s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19353.66
    Min Reward over last 50 episodes: -19385.63
    Max Reward over last 50 episodes: -19321.70
    Evaluation Average Reward: -19587.19

Running combination 134/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 100}
    Training Episode 0/2


134it [33:45, 14.86s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29529.66
    Min Reward over last 50 episodes: -29647.27
    Max Reward over last 50 episodes: -29412.06
    Evaluation Average Reward: -29379.45

Running combination 135/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.0, 'no_change_after_lap': 110}
    Training Episode 0/2


135it [34:00, 14.95s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26983.33
    Min Reward over last 50 episodes: -27114.19
    Max Reward over last 50 episodes: -26852.47
    Evaluation Average Reward: -26520.72

Running combination 136/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.6, 'no_change_after_lap': 115}
    Training Episode 0/2


136it [34:15, 14.93s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25641.94
    Min Reward over last 50 episodes: -25779.32
    Max Reward over last 50 episodes: -25504.55
    Evaluation Average Reward: -25449.92

Running combination 137/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.0, 'no_change_after_lap': 150}
    Training Episode 0/2


137it [34:30, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17090.01
    Min Reward over last 50 episodes: -17283.57
    Max Reward over last 50 episodes: -16896.44
    Evaluation Average Reward: -17000.80

Running combination 138/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 160}
    Training Episode 0/2


138it [34:44, 14.93s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16065.92
    Min Reward over last 50 episodes: -16113.00
    Max Reward over last 50 episodes: -16018.83
    Evaluation Average Reward: -16047.32

Running combination 139/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


139it [34:59, 14.79s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17925.55
    Min Reward over last 50 episodes: -17995.30
    Max Reward over last 50 episodes: -17855.79
    Evaluation Average Reward: -18146.85

Running combination 140/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.6, 'no_change_after_lap': 140}
    Training Episode 0/2


140it [35:14, 14.85s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19417.12
    Min Reward over last 50 episodes: -19591.94
    Max Reward over last 50 episodes: -19242.31
    Evaluation Average Reward: -19319.24

Running combination 141/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 1.0, 'no_change_after_lap': 155}
    Training Episode 0/2


141it [35:28, 14.65s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16283.90
    Min Reward over last 50 episodes: -16341.97
    Max Reward over last 50 episodes: -16225.82
    Evaluation Average Reward: -16256.94

Running combination 142/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.0, 'no_change_after_lap': 145}
    Training Episode 0/2


142it [35:43, 14.67s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18255.75
    Min Reward over last 50 episodes: -18306.56
    Max Reward over last 50 episodes: -18204.94
    Evaluation Average Reward: -18323.05

Running combination 143/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.9, 'no_change_after_lap': 110}
    Training Episode 0/2


143it [35:58, 14.91s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26801.13
    Min Reward over last 50 episodes: -27154.91
    Max Reward over last 50 episodes: -26447.35
    Evaluation Average Reward: -26612.05
SKIPPING

Running combination 144/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.4, 'no_change_after_lap': 100}
    Training Episode 0/2


144it [36:13, 14.98s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29221.86
    Min Reward over last 50 episodes: -29400.24
    Max Reward over last 50 episodes: -29043.48
    Evaluation Average Reward: -29182.63
SKIPPING

Running combination 145/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.6, 'no_change_after_lap': 130}
    Training Episode 0/2


145it [36:28, 14.82s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22077.49
    Min Reward over last 50 episodes: -22090.43
    Max Reward over last 50 episodes: -22064.56
    Evaluation Average Reward: -21962.22

Running combination 146/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.4, 'no_change_after_lap': 115}
    Training Episode 0/2


146it [36:43, 14.78s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25658.71
    Min Reward over last 50 episodes: -25820.06
    Max Reward over last 50 episodes: -25497.36
    Evaluation Average Reward: -25815.01

Running combination 147/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 1.0, 'no_change_after_lap': 115}
    Training Episode 0/2


147it [36:58, 14.84s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25602.64
    Min Reward over last 50 episodes: -25834.15
    Max Reward over last 50 episodes: -25371.12
    Evaluation Average Reward: -25814.09

Running combination 148/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.4, 'no_change_after_lap': 130}
    Training Episode 0/2


148it [37:12, 14.87s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22045.03
    Min Reward over last 50 episodes: -22116.18
    Max Reward over last 50 episodes: -21973.88
    Evaluation Average Reward: -21655.43

Running combination 149/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 130}
    Training Episode 0/2


149it [37:27, 14.83s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22027.45
    Min Reward over last 50 episodes: -22115.43
    Max Reward over last 50 episodes: -21939.48
    Evaluation Average Reward: -21817.28

Running combination 150/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 110}
    Training Episode 0/2


150it [37:42, 14.85s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26812.99
    Min Reward over last 50 episodes: -27067.95
    Max Reward over last 50 episodes: -26558.02
    Evaluation Average Reward: -27022.27

Running combination 151/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 150}
    Training Episode 0/2


151it [37:57, 14.96s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17194.43
    Min Reward over last 50 episodes: -17260.83
    Max Reward over last 50 episodes: -17128.03
    Evaluation Average Reward: -17105.94

Running combination 152/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.6, 'no_change_after_lap': 145}
    Training Episode 0/2


152it [38:12, 14.89s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18341.24
    Min Reward over last 50 episodes: -18354.17
    Max Reward over last 50 episodes: -18328.31
    Evaluation Average Reward: -18214.87

Running combination 153/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 100}
    Training Episode 0/2


153it [38:26, 14.62s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29444.25
    Min Reward over last 50 episodes: -29615.80
    Max Reward over last 50 episodes: -29272.70
    Evaluation Average Reward: -28997.29

Running combination 154/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 120}
    Training Episode 0/2


154it [38:41, 14.71s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24245.34
    Min Reward over last 50 episodes: -24359.28
    Max Reward over last 50 episodes: -24131.41
    Evaluation Average Reward: -24228.21

Running combination 155/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 120}
    Training Episode 0/2


155it [38:55, 14.62s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24371.83
    Min Reward over last 50 episodes: -24415.01
    Max Reward over last 50 episodes: -24328.66
    Evaluation Average Reward: -24464.49

Running combination 156/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 0.4, 'no_change_after_lap': 115}
    Training Episode 0/2


156it [39:10, 14.58s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25711.40
    Min Reward over last 50 episodes: -25755.15
    Max Reward over last 50 episodes: -25667.65
    Evaluation Average Reward: -25416.68

Running combination 157/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 135}
    Training Episode 0/2


157it [39:25, 14.77s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20596.47
    Min Reward over last 50 episodes: -20725.66
    Max Reward over last 50 episodes: -20467.28
    Evaluation Average Reward: -20858.77

Running combination 158/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 105}
    Training Episode 0/2


158it [39:40, 14.87s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28166.40
    Min Reward over last 50 episodes: -28189.52
    Max Reward over last 50 episodes: -28143.29
    Evaluation Average Reward: -27761.09

Running combination 159/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.4, 'no_change_after_lap': 125}
    Training Episode 0/2


159it [39:55, 14.86s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23226.26
    Min Reward over last 50 episodes: -23368.90
    Max Reward over last 50 episodes: -23083.62
    Evaluation Average Reward: -22780.42

Running combination 160/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 1.0, 'no_change_after_lap': 125}
    Training Episode 0/2


160it [40:09, 14.72s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23140.31
    Min Reward over last 50 episodes: -23334.39
    Max Reward over last 50 episodes: -22946.23
    Evaluation Average Reward: -23294.79

Running combination 161/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.4, 'no_change_after_lap': 105}
    Training Episode 0/2


161it [40:24, 14.78s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27906.97
    Min Reward over last 50 episodes: -28022.34
    Max Reward over last 50 episodes: -27791.61
    Evaluation Average Reward: -28432.36

Running combination 162/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.0, 'no_change_after_lap': 100}
    Training Episode 0/2


162it [40:39, 14.74s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29166.06
    Min Reward over last 50 episodes: -29234.29
    Max Reward over last 50 episodes: -29097.84
    Evaluation Average Reward: -29543.74

Running combination 163/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.0, 'no_change_after_lap': 115}
    Training Episode 0/2


163it [40:54, 14.72s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25716.88
    Min Reward over last 50 episodes: -25739.05
    Max Reward over last 50 episodes: -25694.71
    Evaluation Average Reward: -25481.75

Running combination 164/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 160}
    Training Episode 0/2


164it [41:08, 14.69s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16094.68
    Min Reward over last 50 episodes: -16155.36
    Max Reward over last 50 episodes: -16034.01
    Evaluation Average Reward: -16069.86

Running combination 165/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 1.0, 'no_change_after_lap': 125}
    Training Episode 0/2


165it [41:23, 14.84s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23132.97
    Min Reward over last 50 episodes: -23347.89
    Max Reward over last 50 episodes: -22918.06
    Evaluation Average Reward: -23382.80

Running combination 166/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.9, 'no_change_after_lap': 155}
    Training Episode 0/2


166it [41:39, 15.07s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16370.09
    Min Reward over last 50 episodes: -16444.14
    Max Reward over last 50 episodes: -16296.05
    Evaluation Average Reward: -16307.09

Running combination 167/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.2, 'no_change_after_lap': 145}
    Training Episode 0/2


167it [41:53, 14.84s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18262.78
    Min Reward over last 50 episodes: -18425.68
    Max Reward over last 50 episodes: -18099.89
    Evaluation Average Reward: -18221.07

Running combination 168/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 110}
    Training Episode 0/2


168it [42:08, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27030.01
    Min Reward over last 50 episodes: -27068.72
    Max Reward over last 50 episodes: -26991.30
    Evaluation Average Reward: -27123.80

Running combination 169/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.9, 'no_change_after_lap': 135}
    Training Episode 0/2


169it [42:24, 14.95s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20422.94
    Min Reward over last 50 episodes: -20444.38
    Max Reward over last 50 episodes: -20401.51
    Evaluation Average Reward: -20653.25

Running combination 170/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 140}
    Training Episode 0/2


170it [42:38, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19492.40
    Min Reward over last 50 episodes: -19644.81
    Max Reward over last 50 episodes: -19339.98
    Evaluation Average Reward: -19383.85

Running combination 171/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 120}
    Training Episode 0/2


171it [42:54, 15.08s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24271.20
    Min Reward over last 50 episodes: -24432.82
    Max Reward over last 50 episodes: -24109.59
    Evaluation Average Reward: -24420.29

Running combination 172/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 1.0, 'no_change_after_lap': 160}
    Training Episode 0/2


172it [43:08, 14.91s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -15985.65
    Min Reward over last 50 episodes: -16010.50
    Max Reward over last 50 episodes: -15960.80
    Evaluation Average Reward: -16074.80

Running combination 173/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 1.0, 'no_change_after_lap': 125}
    Training Episode 0/2


173it [43:24, 15.07s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23094.79
    Min Reward over last 50 episodes: -23275.66
    Max Reward over last 50 episodes: -22913.92
    Evaluation Average Reward: -22948.66

Running combination 174/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.2, 'no_change_after_lap': 100}
    Training Episode 0/2


174it [43:38, 14.93s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29417.11
    Min Reward over last 50 episodes: -29421.74
    Max Reward over last 50 episodes: -29412.49
    Evaluation Average Reward: -29431.85

Running combination 175/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 1.0, 'no_change_after_lap': 120}
    Training Episode 0/2


175it [43:53, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24312.08
    Min Reward over last 50 episodes: -24320.10
    Max Reward over last 50 episodes: -24304.06
    Evaluation Average Reward: -24471.39

Running combination 176/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.0, 'no_change_after_lap': 140}
    Training Episode 0/2


176it [44:08, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19310.27
    Min Reward over last 50 episodes: -19508.72
    Max Reward over last 50 episodes: -19111.83
    Evaluation Average Reward: -19021.51

Running combination 177/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 105}
    Training Episode 0/2


177it [44:23, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28114.72
    Min Reward over last 50 episodes: -28350.44
    Max Reward over last 50 episodes: -27879.00
    Evaluation Average Reward: -28281.96

Running combination 178/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.2, 'no_change_after_lap': 160}
    Training Episode 0/2


178it [44:38, 14.89s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16101.45
    Min Reward over last 50 episodes: -16164.53
    Max Reward over last 50 episodes: -16038.37
    Evaluation Average Reward: -16025.06

Running combination 179/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 1.0, 'no_change_after_lap': 130}
    Training Episode 0/2


179it [44:53, 15.00s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21885.23
    Min Reward over last 50 episodes: -22034.98
    Max Reward over last 50 episodes: -21735.48
    Evaluation Average Reward: -21906.36

Running combination 180/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.0, 'no_change_after_lap': 135}
    Training Episode 0/2


180it [45:08, 14.98s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20765.91
    Min Reward over last 50 episodes: -20797.07
    Max Reward over last 50 episodes: -20734.76
    Evaluation Average Reward: -20687.18

Running combination 181/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 125}
    Training Episode 0/2


181it [45:23, 15.01s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23059.95
    Min Reward over last 50 episodes: -23295.57
    Max Reward over last 50 episodes: -22824.34
    Evaluation Average Reward: -23125.39

Running combination 182/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 145}
    Training Episode 0/2


182it [45:38, 15.00s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18204.75
    Min Reward over last 50 episodes: -18330.19
    Max Reward over last 50 episodes: -18079.30
    Evaluation Average Reward: -18306.88

Running combination 183/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.4, 'no_change_after_lap': 130}
    Training Episode 0/2


183it [45:53, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21743.00
    Min Reward over last 50 episodes: -21890.14
    Max Reward over last 50 episodes: -21595.86
    Evaluation Average Reward: -21521.92

Running combination 184/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.6, 'no_change_after_lap': 160}
    Training Episode 0/2


184it [46:08, 14.97s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16038.23
    Min Reward over last 50 episodes: -16090.63
    Max Reward over last 50 episodes: -15985.84
    Evaluation Average Reward: -16154.14

Running combination 185/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 160}
    Training Episode 0/2


185it [46:23, 14.99s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16128.75
    Min Reward over last 50 episodes: -16156.69
    Max Reward over last 50 episodes: -16100.81
    Evaluation Average Reward: -15953.53

Running combination 186/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 110}
    Training Episode 0/2


186it [46:38, 14.99s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26717.62
    Min Reward over last 50 episodes: -26847.38
    Max Reward over last 50 episodes: -26587.86
    Evaluation Average Reward: -26904.10

Running combination 187/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.2, 'no_change_after_lap': 140}
    Training Episode 0/2


187it [46:53, 14.97s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19425.46
    Min Reward over last 50 episodes: -19572.19
    Max Reward over last 50 episodes: -19278.74
    Evaluation Average Reward: -19418.19

Running combination 188/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.9, 'no_change_after_lap': 100}
    Training Episode 0/2


188it [47:08, 15.06s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29542.07
    Min Reward over last 50 episodes: -29636.76
    Max Reward over last 50 episodes: -29447.39
    Evaluation Average Reward: -29661.78

Running combination 189/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 105}
    Training Episode 0/2


189it [47:24, 15.24s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28106.31
    Min Reward over last 50 episodes: -28221.06
    Max Reward over last 50 episodes: -27991.56
    Evaluation Average Reward: -28269.82

Running combination 190/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 140}
    Training Episode 0/2


190it [47:39, 15.15s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19171.22
    Min Reward over last 50 episodes: -19243.78
    Max Reward over last 50 episodes: -19098.65
    Evaluation Average Reward: -19121.64

Running combination 191/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 160}
    Training Episode 0/2


191it [47:53, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16033.77
    Min Reward over last 50 episodes: -16086.33
    Max Reward over last 50 episodes: -15981.21
    Evaluation Average Reward: -16143.30

Running combination 192/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 110}
    Training Episode 0/2


192it [48:08, 14.89s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26791.94
    Min Reward over last 50 episodes: -26800.88
    Max Reward over last 50 episodes: -26783.01
    Evaluation Average Reward: -27083.50

Running combination 193/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.6, 'no_change_after_lap': 145}
    Training Episode 0/2


193it [48:23, 14.98s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17972.29
    Min Reward over last 50 episodes: -18020.39
    Max Reward over last 50 episodes: -17924.20
    Evaluation Average Reward: -18214.92

Running combination 194/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 140}
    Training Episode 0/2


194it [48:38, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19486.24
    Min Reward over last 50 episodes: -19525.22
    Max Reward over last 50 episodes: -19447.25
    Evaluation Average Reward: -19169.74

Running combination 195/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.4, 'no_change_after_lap': 140}
    Training Episode 0/2


195it [48:52, 14.78s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19184.78
    Min Reward over last 50 episodes: -19237.89
    Max Reward over last 50 episodes: -19131.67
    Evaluation Average Reward: -19278.04

Running combination 196/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 135}
    Training Episode 0/2


196it [49:07, 14.81s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20776.68
    Min Reward over last 50 episodes: -20813.34
    Max Reward over last 50 episodes: -20740.01
    Evaluation Average Reward: -20676.22

Running combination 197/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 130}
    Training Episode 0/2


197it [49:22, 14.78s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21673.20
    Min Reward over last 50 episodes: -21828.86
    Max Reward over last 50 episodes: -21517.53
    Evaluation Average Reward: -21603.73

Running combination 198/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 120}
    Training Episode 0/2


198it [49:37, 14.79s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24624.96
    Min Reward over last 50 episodes: -24737.41
    Max Reward over last 50 episodes: -24512.52
    Evaluation Average Reward: -24478.79

Running combination 199/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.4, 'no_change_after_lap': 105}
    Training Episode 0/2


199it [49:52, 14.89s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28314.92
    Min Reward over last 50 episodes: -28350.72
    Max Reward over last 50 episodes: -28279.13
    Evaluation Average Reward: -28178.84

Running combination 200/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 105}
    Training Episode 0/2


200it [50:07, 14.84s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28189.41
    Min Reward over last 50 episodes: -28306.84
    Max Reward over last 50 episodes: -28071.98
    Evaluation Average Reward: -28417.16

Running combination 201/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 145}
    Training Episode 0/2


201it [50:21, 14.85s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18266.68
    Min Reward over last 50 episodes: -18298.24
    Max Reward over last 50 episodes: -18235.12
    Evaluation Average Reward: -18104.85

Running combination 202/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 1.0, 'no_change_after_lap': 160}
    Training Episode 0/2


202it [50:37, 14.91s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16088.21
    Min Reward over last 50 episodes: -16138.15
    Max Reward over last 50 episodes: -16038.27
    Evaluation Average Reward: -16136.75

Running combination 203/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.8, 'no_change_after_lap': 130}
    Training Episode 0/2


203it [50:51, 14.83s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21847.38
    Min Reward over last 50 episodes: -22083.89
    Max Reward over last 50 episodes: -21610.86
    Evaluation Average Reward: -21539.31

Running combination 204/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.6, 'no_change_after_lap': 150}
    Training Episode 0/2


204it [51:06, 14.84s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17212.76
    Min Reward over last 50 episodes: -17227.96
    Max Reward over last 50 episodes: -17197.56
    Evaluation Average Reward: -17131.82

Running combination 205/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.4, 'no_change_after_lap': 140}
    Training Episode 0/2


205it [51:21, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19408.16
    Min Reward over last 50 episodes: -19585.86
    Max Reward over last 50 episodes: -19230.46
    Evaluation Average Reward: -19470.11

Running combination 206/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 100}
    Training Episode 0/2


206it [51:36, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29600.07
    Min Reward over last 50 episodes: -29630.51
    Max Reward over last 50 episodes: -29569.62
    Evaluation Average Reward: -29442.96

Running combination 207/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 1.0, 'no_change_after_lap': 145}
    Training Episode 0/2


207it [51:51, 14.93s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18348.97
    Min Reward over last 50 episodes: -18354.49
    Max Reward over last 50 episodes: -18343.45
    Evaluation Average Reward: -18340.36

Running combination 208/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 115}
    Training Episode 0/2


208it [52:06, 15.00s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25523.80
    Min Reward over last 50 episodes: -25793.63
    Max Reward over last 50 episodes: -25253.97
    Evaluation Average Reward: -25843.18

Running combination 209/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 155}
    Training Episode 0/2


209it [52:21, 15.09s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16282.90
    Min Reward over last 50 episodes: -16333.00
    Max Reward over last 50 episodes: -16232.79
    Evaluation Average Reward: -16332.31

Running combination 210/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.4, 'no_change_after_lap': 140}
    Training Episode 0/2


210it [52:37, 15.26s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19465.84
    Min Reward over last 50 episodes: -19605.88
    Max Reward over last 50 episodes: -19325.80
    Evaluation Average Reward: -19005.15

Running combination 211/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 135}
    Training Episode 0/2


211it [52:53, 15.49s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20748.64
    Min Reward over last 50 episodes: -20814.63
    Max Reward over last 50 episodes: -20682.66
    Evaluation Average Reward: -20804.22

Running combination 212/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.8, 'no_change_after_lap': 120}
    Training Episode 0/2


212it [53:09, 15.53s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24382.92
    Min Reward over last 50 episodes: -24588.91
    Max Reward over last 50 episodes: -24176.94
    Evaluation Average Reward: -24456.13

Running combination 213/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 150}
    Training Episode 0/2


213it [53:23, 15.19s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17001.99
    Min Reward over last 50 episodes: -17048.42
    Max Reward over last 50 episodes: -16955.57
    Evaluation Average Reward: -17304.72

Running combination 214/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.9, 'no_change_after_lap': 160}
    Training Episode 0/2


214it [53:39, 15.25s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16038.30
    Min Reward over last 50 episodes: -16066.55
    Max Reward over last 50 episodes: -16010.04
    Evaluation Average Reward: -16039.21

Running combination 215/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 155}
    Training Episode 0/2


215it [53:54, 15.33s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16323.72
    Min Reward over last 50 episodes: -16371.95
    Max Reward over last 50 episodes: -16275.49
    Evaluation Average Reward: -16243.38

Running combination 216/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 125}
    Training Episode 0/2


216it [54:09, 15.20s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23246.51
    Min Reward over last 50 episodes: -23296.64
    Max Reward over last 50 episodes: -23196.37
    Evaluation Average Reward: -23084.31

Running combination 217/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 135}
    Training Episode 0/2


217it [54:23, 14.96s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20321.90
    Min Reward over last 50 episodes: -20391.89
    Max Reward over last 50 episodes: -20251.90
    Evaluation Average Reward: -20412.13

Running combination 218/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.4, 'no_change_after_lap': 160}
    Training Episode 0/2


218it [54:38, 14.97s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16063.94
    Min Reward over last 50 episodes: -16076.58
    Max Reward over last 50 episodes: -16051.29
    Evaluation Average Reward: -16113.19

Running combination 219/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 125}
    Training Episode 0/2


219it [54:53, 14.95s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23012.62
    Min Reward over last 50 episodes: -23324.52
    Max Reward over last 50 episodes: -22700.73
    Evaluation Average Reward: -23031.07

Running combination 220/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.4, 'no_change_after_lap': 135}
    Training Episode 0/2


220it [55:08, 14.93s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20533.99
    Min Reward over last 50 episodes: -20567.32
    Max Reward over last 50 episodes: -20500.67
    Evaluation Average Reward: -20733.56

Running combination 221/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.4, 'no_change_after_lap': 140}
    Training Episode 0/2


221it [55:23, 14.91s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19218.73
    Min Reward over last 50 episodes: -19320.69
    Max Reward over last 50 episodes: -19116.76
    Evaluation Average Reward: -19560.09

Running combination 222/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 145}
    Training Episode 0/2


222it [55:38, 14.98s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17901.74
    Min Reward over last 50 episodes: -17931.67
    Max Reward over last 50 episodes: -17871.82
    Evaluation Average Reward: -18098.94

Running combination 223/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


223it [55:53, 14.91s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18336.91
    Min Reward over last 50 episodes: -18354.98
    Max Reward over last 50 episodes: -18318.85
    Evaluation Average Reward: -18301.54

Running combination 224/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 135}
    Training Episode 0/2


224it [56:08, 14.94s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20635.26
    Min Reward over last 50 episodes: -20715.84
    Max Reward over last 50 episodes: -20554.68
    Evaluation Average Reward: -20266.40

Running combination 225/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 145}
    Training Episode 0/2


225it [56:23, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18317.28
    Min Reward over last 50 episodes: -18359.65
    Max Reward over last 50 episodes: -18274.91
    Evaluation Average Reward: -18048.62

Running combination 226/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 120}
    Training Episode 0/2


226it [56:38, 15.02s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24184.92
    Min Reward over last 50 episodes: -24218.48
    Max Reward over last 50 episodes: -24151.35
    Evaluation Average Reward: -24358.49

Running combination 227/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 1.0, 'no_change_after_lap': 135}
    Training Episode 0/2


227it [56:53, 14.86s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20702.30
    Min Reward over last 50 episodes: -20763.72
    Max Reward over last 50 episodes: -20640.88
    Evaluation Average Reward: -20644.00

Running combination 228/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 155}
    Training Episode 0/2


228it [57:07, 14.75s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16258.35
    Min Reward over last 50 episodes: -16258.75
    Max Reward over last 50 episodes: -16257.95
    Evaluation Average Reward: -16387.26

Running combination 229/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 115}
    Training Episode 0/2


229it [57:22, 14.68s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25482.08
    Min Reward over last 50 episodes: -25646.48
    Max Reward over last 50 episodes: -25317.68
    Evaluation Average Reward: -25687.93

Running combination 230/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.0, 'no_change_after_lap': 130}
    Training Episode 0/2


230it [57:36, 14.76s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21829.79
    Min Reward over last 50 episodes: -22121.95
    Max Reward over last 50 episodes: -21537.62
    Evaluation Average Reward: -22119.87

Running combination 231/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.8, 'no_change_after_lap': 140}
    Training Episode 0/2


231it [57:51, 14.65s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19245.11
    Min Reward over last 50 episodes: -19493.44
    Max Reward over last 50 episodes: -18996.78
    Evaluation Average Reward: -19306.31

Running combination 232/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.0, 'no_change_after_lap': 130}
    Training Episode 0/2


232it [58:06, 14.72s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21837.37
    Min Reward over last 50 episodes: -21983.70
    Max Reward over last 50 episodes: -21691.04
    Evaluation Average Reward: -21627.44

Running combination 233/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 110}
    Training Episode 0/2


233it [58:21, 14.74s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26812.83
    Min Reward over last 50 episodes: -26924.86
    Max Reward over last 50 episodes: -26700.80
    Evaluation Average Reward: -26530.31

Running combination 234/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 130}
    Training Episode 0/2


234it [58:35, 14.71s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21926.99
    Min Reward over last 50 episodes: -21982.63
    Max Reward over last 50 episodes: -21871.35
    Evaluation Average Reward: -21767.29

Running combination 235/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 150}
    Training Episode 0/2


235it [58:50, 14.61s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17129.53
    Min Reward over last 50 episodes: -17178.17
    Max Reward over last 50 episodes: -17080.89
    Evaluation Average Reward: -17249.77

Running combination 236/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 135}
    Training Episode 0/2


236it [59:05, 14.72s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20437.64
    Min Reward over last 50 episodes: -20526.68
    Max Reward over last 50 episodes: -20348.60
    Evaluation Average Reward: -20822.49

Running combination 237/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 100}
    Training Episode 0/2


237it [59:19, 14.70s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29450.15
    Min Reward over last 50 episodes: -29577.53
    Max Reward over last 50 episodes: -29322.77
    Evaluation Average Reward: -29092.49

Running combination 238/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.6, 'no_change_after_lap': 100}
    Training Episode 0/2


238it [59:34, 14.77s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29191.69
    Min Reward over last 50 episodes: -29196.43
    Max Reward over last 50 episodes: -29186.96
    Evaluation Average Reward: -29352.81

Running combination 239/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.2, 'no_change_after_lap': 140}
    Training Episode 0/2


239it [59:48, 14.62s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19394.69
    Min Reward over last 50 episodes: -19396.04
    Max Reward over last 50 episodes: -19393.34
    Evaluation Average Reward: -19284.13

Running combination 240/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.8, 'no_change_after_lap': 160}
    Training Episode 0/2


240it [1:00:03, 14.64s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16048.52
    Min Reward over last 50 episodes: -16117.27
    Max Reward over last 50 episodes: -15979.78
    Evaluation Average Reward: -16084.45

Running combination 241/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.4, 'no_change_after_lap': 120}
    Training Episode 0/2


241it [1:00:18, 14.74s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24324.29
    Min Reward over last 50 episodes: -24504.01
    Max Reward over last 50 episodes: -24144.57
    Evaluation Average Reward: -24413.54

Running combination 242/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.4, 'no_change_after_lap': 115}
    Training Episode 0/2


242it [1:00:33, 14.78s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25358.38
    Min Reward over last 50 episodes: -25448.72
    Max Reward over last 50 episodes: -25268.04
    Evaluation Average Reward: -25801.53

Running combination 243/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.2, 'no_change_after_lap': 145}
    Training Episode 0/2


243it [1:00:47, 14.71s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17908.03
    Min Reward over last 50 episodes: -17950.19
    Max Reward over last 50 episodes: -17865.86
    Evaluation Average Reward: -18227.51

Running combination 244/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 135}
    Training Episode 0/2


244it [1:01:02, 14.66s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20561.37
    Min Reward over last 50 episodes: -20829.92
    Max Reward over last 50 episodes: -20292.83
    Evaluation Average Reward: -20665.83

Running combination 245/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.2, 'no_change_after_lap': 105}
    Training Episode 0/2


245it [1:01:17, 14.79s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28097.78
    Min Reward over last 50 episodes: -28383.94
    Max Reward over last 50 episodes: -27811.61
    Evaluation Average Reward: -27923.31

Running combination 246/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 105}
    Training Episode 0/2


246it [1:01:32, 14.91s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28250.99
    Min Reward over last 50 episodes: -28291.28
    Max Reward over last 50 episodes: -28210.70
    Evaluation Average Reward: -27776.24

Running combination 247/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.8, 'no_change_after_lap': 110}
    Training Episode 0/2


247it [1:01:47, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27049.77
    Min Reward over last 50 episodes: -27135.19
    Max Reward over last 50 episodes: -26964.34
    Evaluation Average Reward: -27069.09

Running combination 248/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 105}
    Training Episode 0/2


248it [1:02:02, 14.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27878.77
    Min Reward over last 50 episodes: -27983.01
    Max Reward over last 50 episodes: -27774.53
    Evaluation Average Reward: -28342.85

Running combination 249/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.8, 'no_change_after_lap': 110}
    Training Episode 0/2


249it [1:02:17, 14.99s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26706.07
    Min Reward over last 50 episodes: -26900.44
    Max Reward over last 50 episodes: -26511.69
    Evaluation Average Reward: -26689.25

Running combination 250/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.4, 'no_change_after_lap': 110}
    Training Episode 0/2


250it [1:02:33, 15.10s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26987.92
    Min Reward over last 50 episodes: -27065.32
    Max Reward over last 50 episodes: -26910.53
    Evaluation Average Reward: -26597.98

Running combination 251/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.2, 'no_change_after_lap': 100}
    Training Episode 0/2


251it [1:02:48, 15.19s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29410.17
    Min Reward over last 50 episodes: -29539.40
    Max Reward over last 50 episodes: -29280.93
    Evaluation Average Reward: -29470.72

Running combination 252/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.9, 'no_change_after_lap': 100}
    Training Episode 0/2


252it [1:03:04, 15.41s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29415.72
    Min Reward over last 50 episodes: -29623.36
    Max Reward over last 50 episodes: -29208.07
    Evaluation Average Reward: -29082.72

Running combination 253/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.9, 'no_change_after_lap': 120}
    Training Episode 0/2


253it [1:03:19, 15.40s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24376.54
    Min Reward over last 50 episodes: -24611.57
    Max Reward over last 50 episodes: -24141.51
    Evaluation Average Reward: -24294.06

Running combination 254/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.4, 'no_change_after_lap': 135}
    Training Episode 0/2


254it [1:03:35, 15.58s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20441.48
    Min Reward over last 50 episodes: -20626.79
    Max Reward over last 50 episodes: -20256.17
    Evaluation Average Reward: -20640.56

Running combination 255/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.6, 'no_change_after_lap': 145}
    Training Episode 0/2


255it [1:03:51, 15.62s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18238.66
    Min Reward over last 50 episodes: -18290.48
    Max Reward over last 50 episodes: -18186.84
    Evaluation Average Reward: -18158.13

Running combination 256/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.2, 'no_change_after_lap': 145}
    Training Episode 0/2


256it [1:04:06, 15.50s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17973.34
    Min Reward over last 50 episodes: -18053.35
    Max Reward over last 50 episodes: -17893.32
    Evaluation Average Reward: -18414.84

Running combination 257/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 125}
    Training Episode 0/2


257it [1:04:22, 15.43s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23052.01
    Min Reward over last 50 episodes: -23286.61
    Max Reward over last 50 episodes: -22817.41
    Evaluation Average Reward: -23378.18

Running combination 258/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.4, 'no_change_after_lap': 115}
    Training Episode 0/2


258it [1:04:37, 15.36s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25559.41
    Min Reward over last 50 episodes: -25726.16
    Max Reward over last 50 episodes: -25392.66
    Evaluation Average Reward: -25488.05

Running combination 259/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 130}
    Training Episode 0/2


259it [1:04:52, 15.43s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21718.44
    Min Reward over last 50 episodes: -21869.37
    Max Reward over last 50 episodes: -21567.50
    Evaluation Average Reward: -21534.43

Running combination 260/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 130}
    Training Episode 0/2


260it [1:05:07, 15.27s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21779.23
    Min Reward over last 50 episodes: -21919.01
    Max Reward over last 50 episodes: -21639.45
    Evaluation Average Reward: -21828.73

Running combination 261/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.4, 'no_change_after_lap': 150}
    Training Episode 0/2


261it [1:05:22, 15.18s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16893.62
    Min Reward over last 50 episodes: -16955.59
    Max Reward over last 50 episodes: -16831.64
    Evaluation Average Reward: -17136.24
SKIPPING

Running combination 262/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.6, 'no_change_after_lap': 125}
    Training Episode 0/2


262it [1:05:37, 15.16s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22939.25
    Min Reward over last 50 episodes: -22942.02
    Max Reward over last 50 episodes: -22936.48
    Evaluation Average Reward: -23393.26

Running combination 263/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 155}
    Training Episode 0/2


263it [1:05:53, 15.24s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16320.19
    Min Reward over last 50 episodes: -16349.62
    Max Reward over last 50 episodes: -16290.76
    Evaluation Average Reward: -16192.04

Running combination 264/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 115}
    Training Episode 0/2


264it [1:06:08, 15.37s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25801.17
    Min Reward over last 50 episodes: -25866.76
    Max Reward over last 50 episodes: -25735.58
    Evaluation Average Reward: -25312.66

Running combination 265/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.2, 'no_change_after_lap': 105}
    Training Episode 0/2


265it [1:06:24, 15.38s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28137.45
    Min Reward over last 50 episodes: -28325.83
    Max Reward over last 50 episodes: -27949.07
    Evaluation Average Reward: -28250.13

Running combination 266/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 120}
    Training Episode 0/2


266it [1:06:39, 15.37s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24137.66
    Min Reward over last 50 episodes: -24146.14
    Max Reward over last 50 episodes: -24129.18
    Evaluation Average Reward: -24564.16

Running combination 267/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 105}
    Training Episode 0/2


267it [1:06:54, 15.18s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28016.07
    Min Reward over last 50 episodes: -28205.29
    Max Reward over last 50 episodes: -27826.84
    Evaluation Average Reward: -28481.46

Running combination 268/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.0, 'no_change_after_lap': 130}
    Training Episode 0/2


268it [1:07:09, 15.07s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21582.53
    Min Reward over last 50 episodes: -21604.36
    Max Reward over last 50 episodes: -21560.70
    Evaluation Average Reward: -21819.96

Running combination 269/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 1.0, 'no_change_after_lap': 160}
    Training Episode 0/2


269it [1:07:24, 15.10s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16074.67
    Min Reward over last 50 episodes: -16127.49
    Max Reward over last 50 episodes: -16021.84
    Evaluation Average Reward: -16125.04

Running combination 270/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 110}
    Training Episode 0/2


270it [1:07:39, 15.12s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26728.86
    Min Reward over last 50 episodes: -26776.13
    Max Reward over last 50 episodes: -26681.58
    Evaluation Average Reward: -26614.76

Running combination 271/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 160}
    Training Episode 0/2


271it [1:07:53, 14.90s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16071.31
    Min Reward over last 50 episodes: -16076.60
    Max Reward over last 50 episodes: -16066.03
    Evaluation Average Reward: -16016.91

Running combination 272/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 150}
    Training Episode 0/2


272it [1:08:08, 14.82s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17023.67
    Min Reward over last 50 episodes: -17027.60
    Max Reward over last 50 episodes: -17019.73
    Evaluation Average Reward: -17041.99

Running combination 273/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.9, 'no_change_after_lap': 160}
    Training Episode 0/2


273it [1:08:24, 15.13s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16057.59
    Min Reward over last 50 episodes: -16127.29
    Max Reward over last 50 episodes: -15987.89
    Evaluation Average Reward: -16006.36

Running combination 274/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.6, 'no_change_after_lap': 125}
    Training Episode 0/2


274it [1:08:39, 15.07s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22908.95
    Min Reward over last 50 episodes: -23070.13
    Max Reward over last 50 episodes: -22747.77
    Evaluation Average Reward: -23335.22

Running combination 275/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 135}
    Training Episode 0/2


275it [1:08:54, 15.19s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20437.65
    Min Reward over last 50 episodes: -20497.09
    Max Reward over last 50 episodes: -20378.22
    Evaluation Average Reward: -20711.08

Running combination 276/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 1.0, 'no_change_after_lap': 100}
    Training Episode 0/2


276it [1:09:10, 15.33s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29363.78
    Min Reward over last 50 episodes: -29615.79
    Max Reward over last 50 episodes: -29111.77
    Evaluation Average Reward: -29609.41

Running combination 277/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.4, 'no_change_after_lap': 155}
    Training Episode 0/2


277it [1:09:25, 15.29s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16328.74
    Min Reward over last 50 episodes: -16383.21
    Max Reward over last 50 episodes: -16274.26
    Evaluation Average Reward: -16324.53

Running combination 278/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 1.0, 'no_change_after_lap': 110}
    Training Episode 0/2


278it [1:09:41, 15.41s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26714.51
    Min Reward over last 50 episodes: -26893.40
    Max Reward over last 50 episodes: -26535.63
    Evaluation Average Reward: -27083.90

Running combination 279/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.2, 'no_change_after_lap': 155}
    Training Episode 0/2


279it [1:09:56, 15.18s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16245.49
    Min Reward over last 50 episodes: -16296.18
    Max Reward over last 50 episodes: -16194.81
    Evaluation Average Reward: -16312.97

Running combination 280/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.4, 'no_change_after_lap': 150}
    Training Episode 0/2


280it [1:10:15, 16.33s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17176.11
    Min Reward over last 50 episodes: -17190.44
    Max Reward over last 50 episodes: -17161.77
    Evaluation Average Reward: -17134.48

Running combination 281/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 105}
    Training Episode 0/2


281it [1:10:36, 17.75s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28049.91
    Min Reward over last 50 episodes: -28270.27
    Max Reward over last 50 episodes: -27829.55
    Evaluation Average Reward: -27804.42

Running combination 282/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.8, 'no_change_after_lap': 105}
    Training Episode 0/2


282it [1:10:52, 17.43s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28194.12
    Min Reward over last 50 episodes: -28263.14
    Max Reward over last 50 episodes: -28125.10
    Evaluation Average Reward: -28075.11

Running combination 283/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.0, 'no_change_after_lap': 160}
    Training Episode 0/2


283it [1:11:09, 17.17s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16117.14
    Min Reward over last 50 episodes: -16159.57
    Max Reward over last 50 episodes: -16074.70
    Evaluation Average Reward: -16092.53

Running combination 284/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.2, 'no_change_after_lap': 105}
    Training Episode 0/2


284it [1:11:26, 17.08s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28043.30
    Min Reward over last 50 episodes: -28233.28
    Max Reward over last 50 episodes: -27853.32
    Evaluation Average Reward: -28111.24

Running combination 285/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.0, 'no_change_after_lap': 120}
    Training Episode 0/2


285it [1:11:43, 17.04s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24523.42
    Min Reward over last 50 episodes: -24648.46
    Max Reward over last 50 episodes: -24398.38
    Evaluation Average Reward: -24492.71

Running combination 286/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 1.0, 'gamma': 0.6, 'no_change_after_lap': 115}
    Training Episode 0/2


286it [1:12:00, 17.04s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25721.65
    Min Reward over last 50 episodes: -25753.02
    Max Reward over last 50 episodes: -25690.28
    Evaluation Average Reward: -25370.04

Running combination 287/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 140}
    Training Episode 0/2


287it [1:12:17, 17.03s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19513.49
    Min Reward over last 50 episodes: -19668.15
    Max Reward over last 50 episodes: -19358.83
    Evaluation Average Reward: -19537.26

Running combination 288/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 150}
    Training Episode 0/2


288it [1:12:33, 16.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16986.28
    Min Reward over last 50 episodes: -17079.22
    Max Reward over last 50 episodes: -16893.34
    Evaluation Average Reward: -16960.82

Running combination 289/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 140}
    Training Episode 0/2


289it [1:12:50, 16.81s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19568.74
    Min Reward over last 50 episodes: -19583.71
    Max Reward over last 50 episodes: -19553.76
    Evaluation Average Reward: -19173.64

Running combination 290/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.9, 'no_change_after_lap': 160}
    Training Episode 0/2


290it [1:13:07, 16.86s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16025.43
    Min Reward over last 50 episodes: -16025.98
    Max Reward over last 50 episodes: -16024.89
    Evaluation Average Reward: -16052.62

Running combination 291/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 115}
    Training Episode 0/2


291it [1:13:24, 16.85s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25805.84
    Min Reward over last 50 episodes: -25815.41
    Max Reward over last 50 episodes: -25796.26
    Evaluation Average Reward: -25796.08

Running combination 292/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


292it [1:13:40, 16.68s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18147.99
    Min Reward over last 50 episodes: -18244.53
    Max Reward over last 50 episodes: -18051.46
    Evaluation Average Reward: -18379.31

Running combination 293/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.0, 'no_change_after_lap': 105}
    Training Episode 0/2


293it [1:13:56, 16.60s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27921.58
    Min Reward over last 50 episodes: -27950.87
    Max Reward over last 50 episodes: -27892.28
    Evaluation Average Reward: -27833.31

Running combination 294/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 105}
    Training Episode 0/2


294it [1:14:14, 16.76s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28262.85
    Min Reward over last 50 episodes: -28294.76
    Max Reward over last 50 episodes: -28230.94
    Evaluation Average Reward: -28403.32

Running combination 295/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.6, 'no_change_after_lap': 135}
    Training Episode 0/2


295it [1:14:30, 16.71s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20623.82
    Min Reward over last 50 episodes: -20737.66
    Max Reward over last 50 episodes: -20509.98
    Evaluation Average Reward: -20871.10

Running combination 296/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.6, 'no_change_after_lap': 135}
    Training Episode 0/2


296it [1:14:46, 16.56s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20270.72
    Min Reward over last 50 episodes: -20323.78
    Max Reward over last 50 episodes: -20217.66
    Evaluation Average Reward: -20337.40

Running combination 297/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 110}
    Training Episode 0/2


297it [1:15:03, 16.53s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26756.14
    Min Reward over last 50 episodes: -26920.22
    Max Reward over last 50 episodes: -26592.05
    Evaluation Average Reward: -26776.74

Running combination 298/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 115}
    Training Episode 0/2


298it [1:15:20, 16.73s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25568.66
    Min Reward over last 50 episodes: -25830.92
    Max Reward over last 50 episodes: -25306.40
    Evaluation Average Reward: -25924.71

Running combination 299/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 155}
    Training Episode 0/2


299it [1:15:37, 16.67s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16169.40
    Min Reward over last 50 episodes: -16193.78
    Max Reward over last 50 episodes: -16145.02
    Evaluation Average Reward: -16282.34

Running combination 300/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.8, 'no_change_after_lap': 105}
    Training Episode 0/2


300it [1:15:53, 16.50s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27871.59
    Min Reward over last 50 episodes: -28010.85
    Max Reward over last 50 episodes: -27732.33
    Evaluation Average Reward: -28386.07

Running combination 301/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 110}
    Training Episode 0/2


301it [1:16:09, 16.40s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26598.79
    Min Reward over last 50 episodes: -26679.26
    Max Reward over last 50 episodes: -26518.32
    Evaluation Average Reward: -26770.92

Running combination 302/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 100}
    Training Episode 0/2


302it [1:16:26, 16.52s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29163.29
    Min Reward over last 50 episodes: -29177.51
    Max Reward over last 50 episodes: -29149.06
    Evaluation Average Reward: -29659.95

Running combination 303/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 120}
    Training Episode 0/2


303it [1:16:43, 16.70s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24336.42
    Min Reward over last 50 episodes: -24393.40
    Max Reward over last 50 episodes: -24279.44
    Evaluation Average Reward: -24147.51

Running combination 304/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 130}
    Training Episode 0/2


304it [1:16:59, 16.67s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21828.91
    Min Reward over last 50 episodes: -21922.63
    Max Reward over last 50 episodes: -21735.19
    Evaluation Average Reward: -21776.85

Running combination 305/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.9, 'no_change_after_lap': 115}
    Training Episode 0/2


305it [1:17:16, 16.53s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25631.06
    Min Reward over last 50 episodes: -25670.04
    Max Reward over last 50 episodes: -25592.07
    Evaluation Average Reward: -25609.98

Running combination 306/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.4, 'no_change_after_lap': 105}
    Training Episode 0/2


306it [1:17:31, 16.35s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28105.60
    Min Reward over last 50 episodes: -28178.83
    Max Reward over last 50 episodes: -28032.37
    Evaluation Average Reward: -27967.21

Running combination 307/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 100}
    Training Episode 0/2


307it [1:17:48, 16.28s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29347.69
    Min Reward over last 50 episodes: -29633.91
    Max Reward over last 50 episodes: -29061.48
    Evaluation Average Reward: -29626.01

Running combination 308/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.9, 'gamma': 0.2, 'no_change_after_lap': 150}
    Training Episode 0/2


308it [1:18:05, 16.47s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17197.82
    Min Reward over last 50 episodes: -17203.37
    Max Reward over last 50 episodes: -17192.28
    Evaluation Average Reward: -17034.59

Running combination 309/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.2, 'no_change_after_lap': 115}
    Training Episode 0/2


309it [1:18:21, 16.35s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25405.28
    Min Reward over last 50 episodes: -25454.99
    Max Reward over last 50 episodes: -25355.58
    Evaluation Average Reward: -25330.57

Running combination 310/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 100}
    Training Episode 0/2


310it [1:18:37, 16.29s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29165.69
    Min Reward over last 50 episodes: -29308.65
    Max Reward over last 50 episodes: -29022.73
    Evaluation Average Reward: -29051.40

Running combination 311/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.9, 'no_change_after_lap': 135}
    Training Episode 0/2


311it [1:18:53, 16.41s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20553.23
    Min Reward over last 50 episodes: -20709.68
    Max Reward over last 50 episodes: -20396.78
    Evaluation Average Reward: -20407.27

Running combination 312/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.2, 'no_change_after_lap': 140}
    Training Episode 0/2


312it [1:19:10, 16.38s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19415.74
    Min Reward over last 50 episodes: -19446.28
    Max Reward over last 50 episodes: -19385.21
    Evaluation Average Reward: -18988.07

Running combination 313/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 130}
    Training Episode 0/2


313it [1:19:26, 16.46s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21549.42
    Min Reward over last 50 episodes: -21570.51
    Max Reward over last 50 episodes: -21528.33
    Evaluation Average Reward: -21511.51

Running combination 314/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 0.0, 'no_change_after_lap': 115}
    Training Episode 0/2


314it [1:19:42, 16.33s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25721.96
    Min Reward over last 50 episodes: -25825.11
    Max Reward over last 50 episodes: -25618.81
    Evaluation Average Reward: -25262.60

Running combination 315/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 105}
    Training Episode 0/2


315it [1:19:59, 16.33s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27807.76
    Min Reward over last 50 episodes: -27820.89
    Max Reward over last 50 episodes: -27794.63
    Evaluation Average Reward: -28434.37

Running combination 316/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 120}
    Training Episode 0/2


316it [1:20:16, 16.51s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24039.28
    Min Reward over last 50 episodes: -24134.20
    Max Reward over last 50 episodes: -23944.36
    Evaluation Average Reward: -24632.57

Running combination 317/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.9, 'no_change_after_lap': 150}
    Training Episode 0/2


317it [1:20:32, 16.36s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17044.64
    Min Reward over last 50 episodes: -17097.03
    Max Reward over last 50 episodes: -16992.25
    Evaluation Average Reward: -17008.37

Running combination 318/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 135}
    Training Episode 0/2


318it [1:20:48, 16.42s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20639.92
    Min Reward over last 50 episodes: -20709.05
    Max Reward over last 50 episodes: -20570.78
    Evaluation Average Reward: -20292.82

Running combination 319/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 130}
    Training Episode 0/2


319it [1:21:05, 16.46s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21651.19
    Min Reward over last 50 episodes: -21766.76
    Max Reward over last 50 episodes: -21535.63
    Evaluation Average Reward: -21969.31

Running combination 320/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 160}
    Training Episode 0/2


320it [1:21:21, 16.48s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16093.93
    Min Reward over last 50 episodes: -16110.53
    Max Reward over last 50 episodes: -16077.33
    Evaluation Average Reward: -16150.93

Running combination 321/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 135}
    Training Episode 0/2


321it [1:21:39, 16.87s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20701.73
    Min Reward over last 50 episodes: -20767.48
    Max Reward over last 50 episodes: -20635.97
    Evaluation Average Reward: -20826.07

Running combination 322/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.8, 'no_change_after_lap': 125}
    Training Episode 0/2


322it [1:21:57, 17.21s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23292.79
    Min Reward over last 50 episodes: -23324.61
    Max Reward over last 50 episodes: -23260.96
    Evaluation Average Reward: -22865.12

Running combination 323/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.6, 'no_change_after_lap': 150}
    Training Episode 0/2


323it [1:22:14, 17.25s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17114.82
    Min Reward over last 50 episodes: -17159.80
    Max Reward over last 50 episodes: -17069.83
    Evaluation Average Reward: -16944.76

Running combination 324/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.6, 'no_change_after_lap': 115}
    Training Episode 0/2


324it [1:22:31, 17.11s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25814.36
    Min Reward over last 50 episodes: -25855.83
    Max Reward over last 50 episodes: -25772.88
    Evaluation Average Reward: -25341.67

Running combination 325/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.4, 'no_change_after_lap': 150}
    Training Episode 0/2


325it [1:22:48, 17.13s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17120.67
    Min Reward over last 50 episodes: -17131.26
    Max Reward over last 50 episodes: -17110.09
    Evaluation Average Reward: -17241.20

Running combination 326/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 140}
    Training Episode 0/2


326it [1:23:05, 16.91s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19094.30
    Min Reward over last 50 episodes: -19134.00
    Max Reward over last 50 episodes: -19054.61
    Evaluation Average Reward: -19091.05
SKIPPING

Running combination 327/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.6, 'no_change_after_lap': 130}
    Training Episode 0/2


327it [1:23:21, 16.73s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -21868.79
    Min Reward over last 50 episodes: -21961.27
    Max Reward over last 50 episodes: -21776.31
    Evaluation Average Reward: -21669.12

Running combination 328/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.2, 'no_change_after_lap': 160}
    Training Episode 0/2


328it [1:23:37, 16.55s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16058.78
    Min Reward over last 50 episodes: -16082.39
    Max Reward over last 50 episodes: -16035.17
    Evaluation Average Reward: -16083.33

Running combination 329/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 140}
    Training Episode 0/2


329it [1:23:54, 16.69s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19418.42
    Min Reward over last 50 episodes: -19542.25
    Max Reward over last 50 episodes: -19294.60
    Evaluation Average Reward: -19009.16

Running combination 330/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 0.4, 'no_change_after_lap': 105}
    Training Episode 0/2


330it [1:24:11, 16.70s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28289.46
    Min Reward over last 50 episodes: -28391.59
    Max Reward over last 50 episodes: -28187.33
    Evaluation Average Reward: -28177.56

Running combination 331/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.4, 'no_change_after_lap': 115}
    Training Episode 0/2


331it [1:24:27, 16.50s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25529.26
    Min Reward over last 50 episodes: -25581.31
    Max Reward over last 50 episodes: -25477.21
    Evaluation Average Reward: -25778.46

Running combination 332/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.9, 'no_change_after_lap': 140}
    Training Episode 0/2


332it [1:24:44, 16.56s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19404.08
    Min Reward over last 50 episodes: -19478.11
    Max Reward over last 50 episodes: -19330.06
    Evaluation Average Reward: -19322.76

Running combination 333/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 1.0, 'no_change_after_lap': 155}
    Training Episode 0/2


333it [1:25:00, 16.54s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16288.20
    Min Reward over last 50 episodes: -16363.24
    Max Reward over last 50 episodes: -16213.17
    Evaluation Average Reward: -16437.38

Running combination 334/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.9, 'no_change_after_lap': 110}
    Training Episode 0/2


334it [1:25:17, 16.67s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26863.85
    Min Reward over last 50 episodes: -26997.96
    Max Reward over last 50 episodes: -26729.73
    Evaluation Average Reward: -27038.00

Running combination 335/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 1.0, 'no_change_after_lap': 115}
    Training Episode 0/2


335it [1:25:33, 16.48s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25678.32
    Min Reward over last 50 episodes: -25679.75
    Max Reward over last 50 episodes: -25676.90
    Evaluation Average Reward: -25345.28

Running combination 336/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 1.0, 'no_change_after_lap': 145}
    Training Episode 0/2


336it [1:25:50, 16.50s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18207.25
    Min Reward over last 50 episodes: -18358.75
    Max Reward over last 50 episodes: -18055.76
    Evaluation Average Reward: -18442.04

Running combination 337/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 125}
    Training Episode 0/2


337it [1:26:06, 16.33s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23207.71
    Min Reward over last 50 episodes: -23310.74
    Max Reward over last 50 episodes: -23104.67
    Evaluation Average Reward: -22938.67

Running combination 338/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 1.0, 'no_change_after_lap': 140}
    Training Episode 0/2


338it [1:26:22, 16.46s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19263.28
    Min Reward over last 50 episodes: -19531.02
    Max Reward over last 50 episodes: -18995.55
    Evaluation Average Reward: -19557.03

Running combination 339/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.4, 'no_change_after_lap': 160}
    Training Episode 0/2


339it [1:26:39, 16.34s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -15989.96
    Min Reward over last 50 episodes: -15990.07
    Max Reward over last 50 episodes: -15989.85
    Evaluation Average Reward: -16061.13

Running combination 340/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 100}
    Training Episode 0/2


340it [1:26:55, 16.25s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29582.23
    Min Reward over last 50 episodes: -29621.41
    Max Reward over last 50 episodes: -29543.05
    Evaluation Average Reward: -29465.97

Running combination 341/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 120}
    Training Episode 0/2


341it [1:27:11, 16.19s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24130.03
    Min Reward over last 50 episodes: -24249.40
    Max Reward over last 50 episodes: -24010.65
    Evaluation Average Reward: -24391.88

Running combination 342/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.4, 'no_change_after_lap': 110}
    Training Episode 0/2


342it [1:27:27, 16.37s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -27016.76
    Min Reward over last 50 episodes: -27052.99
    Max Reward over last 50 episodes: -26980.53
    Evaluation Average Reward: -26888.50

Running combination 343/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.6, 'no_change_after_lap': 155}
    Training Episode 0/2


343it [1:27:44, 16.40s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16232.32
    Min Reward over last 50 episodes: -16265.37
    Max Reward over last 50 episodes: -16199.26
    Evaluation Average Reward: -16322.03

Running combination 344/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.9, 'no_change_after_lap': 100}
    Training Episode 0/2


344it [1:28:01, 16.51s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29288.39
    Min Reward over last 50 episodes: -29463.19
    Max Reward over last 50 episodes: -29113.58
    Evaluation Average Reward: -29435.60

Running combination 345/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.0, 'no_change_after_lap': 120}
    Training Episode 0/2


345it [1:28:16, 16.25s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24545.40
    Min Reward over last 50 episodes: -24599.17
    Max Reward over last 50 episodes: -24491.63
    Evaluation Average Reward: -24670.36

Running combination 346/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.4, 'no_change_after_lap': 160}
    Training Episode 0/2


346it [1:28:33, 16.38s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16029.25
    Min Reward over last 50 episodes: -16058.82
    Max Reward over last 50 episodes: -15999.67
    Evaluation Average Reward: -16133.13

Running combination 347/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.8, 'no_change_after_lap': 135}
    Training Episode 0/2


347it [1:28:49, 16.34s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20575.12
    Min Reward over last 50 episodes: -20850.35
    Max Reward over last 50 episodes: -20299.90
    Evaluation Average Reward: -20823.06

Running combination 348/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 0.0, 'no_change_after_lap': 120}
    Training Episode 0/2


348it [1:29:06, 16.34s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24369.38
    Min Reward over last 50 episodes: -24491.29
    Max Reward over last 50 episodes: -24247.47
    Evaluation Average Reward: -24528.25

Running combination 349/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.999, 'alpha': 0.6, 'gamma': 1.0, 'no_change_after_lap': 135}
    Training Episode 0/2


349it [1:29:22, 16.36s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20476.60
    Min Reward over last 50 episodes: -20760.46
    Max Reward over last 50 episodes: -20192.75
    Evaluation Average Reward: -20614.08

Running combination 350/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 160}
    Training Episode 0/2


350it [1:29:38, 16.34s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16119.28
    Min Reward over last 50 episodes: -16137.51
    Max Reward over last 50 episodes: -16101.04
    Evaluation Average Reward: -15968.54

Running combination 351/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 120}
    Training Episode 0/2


351it [1:29:55, 16.37s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24461.29
    Min Reward over last 50 episodes: -24521.82
    Max Reward over last 50 episodes: -24400.76
    Evaluation Average Reward: -24141.21

Running combination 352/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.9, 'no_change_after_lap': 140}
    Training Episode 0/2


352it [1:30:12, 16.52s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19301.06
    Min Reward over last 50 episodes: -19605.21
    Max Reward over last 50 episodes: -18996.92
    Evaluation Average Reward: -19554.30

Running combination 353/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 140}
    Training Episode 0/2


353it [1:30:29, 16.69s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -19312.13
    Min Reward over last 50 episodes: -19314.02
    Max Reward over last 50 episodes: -19310.24
    Evaluation Average Reward: -19114.70

Running combination 354/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.4, 'gamma': 1.0, 'no_change_after_lap': 150}
    Training Episode 0/2


354it [1:30:45, 16.65s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17120.60
    Min Reward over last 50 episodes: -17160.69
    Max Reward over last 50 episodes: -17080.52
    Evaluation Average Reward: -17094.54

Running combination 355/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.4, 'no_change_after_lap': 160}
    Training Episode 0/2


355it [1:31:01, 16.52s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16033.58
    Min Reward over last 50 episodes: -16063.35
    Max Reward over last 50 episodes: -16003.82
    Evaluation Average Reward: -16033.78

Running combination 356/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.0, 'no_change_after_lap': 160}
    Training Episode 0/2


356it [1:31:17, 16.39s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16067.96
    Min Reward over last 50 episodes: -16145.61
    Max Reward over last 50 episodes: -15990.31
    Evaluation Average Reward: -16124.63

Running combination 357/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 1.0, 'no_change_after_lap': 125}
    Training Episode 0/2


357it [1:31:34, 16.54s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23107.29
    Min Reward over last 50 episodes: -23227.01
    Max Reward over last 50 episodes: -22987.57
    Evaluation Average Reward: -23257.80

Running combination 358/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.2, 'gamma': 0.0, 'no_change_after_lap': 110}
    Training Episode 0/2


358it [1:31:52, 16.78s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26904.29
    Min Reward over last 50 episodes: -26963.63
    Max Reward over last 50 episodes: -26844.96
    Evaluation Average Reward: -27032.27

Running combination 359/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.9, 'gamma': 0.6, 'no_change_after_lap': 110}
    Training Episode 0/2


359it [1:32:09, 16.92s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -26989.71
    Min Reward over last 50 episodes: -26990.55
    Max Reward over last 50 episodes: -26988.87
    Evaluation Average Reward: -27011.78

Running combination 360/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 160}
    Training Episode 0/2


360it [1:32:27, 17.15s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16104.19
    Min Reward over last 50 episodes: -16136.21
    Max Reward over last 50 episodes: -16072.16
    Evaluation Average Reward: -15988.88

Running combination 361/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.0, 'no_change_after_lap': 105}
    Training Episode 0/2


361it [1:32:44, 17.20s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -28281.24
    Min Reward over last 50 episodes: -28325.75
    Max Reward over last 50 episodes: -28236.73
    Evaluation Average Reward: -28050.82

Running combination 362/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.99, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 115}
    Training Episode 0/2


362it [1:33:03, 17.73s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25567.74
    Min Reward over last 50 episodes: -25926.20
    Max Reward over last 50 episodes: -25209.29
    Evaluation Average Reward: -25265.89

Running combination 363/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 150}
    Training Episode 0/2


363it [1:33:22, 18.07s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16898.90
    Min Reward over last 50 episodes: -16943.77
    Max Reward over last 50 episodes: -16854.03
    Evaluation Average Reward: -17033.51

Running combination 364/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 0.8, 'gamma': 1.0, 'no_change_after_lap': 100}
    Training Episode 0/2


364it [1:33:42, 18.60s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29303.91
    Min Reward over last 50 episodes: -29371.57
    Max Reward over last 50 episodes: -29236.24
    Evaluation Average Reward: -29366.36

Running combination 365/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.8, 'no_change_after_lap': 160}
    Training Episode 0/2


365it [1:34:01, 18.73s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16028.44
    Min Reward over last 50 episodes: -16041.71
    Max Reward over last 50 episodes: -16015.16
    Evaluation Average Reward: -16033.37

Running combination 366/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.9, 'no_change_after_lap': 125}
    Training Episode 0/2


366it [1:34:20, 18.79s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23207.38
    Min Reward over last 50 episodes: -23279.93
    Max Reward over last 50 episodes: -23134.82
    Evaluation Average Reward: -23259.31

Running combination 367/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 135}
    Training Episode 0/2


367it [1:34:37, 18.39s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20492.17
    Min Reward over last 50 episodes: -20663.37
    Max Reward over last 50 episodes: -20320.96
    Evaluation Average Reward: -20498.42

Running combination 368/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 0.0, 'no_change_after_lap': 160}
    Training Episode 0/2


368it [1:34:54, 17.99s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16109.72
    Min Reward over last 50 episodes: -16146.10
    Max Reward over last 50 episodes: -16073.34
    Evaluation Average Reward: -16115.19

Running combination 369/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.8, 'no_change_after_lap': 120}
    Training Episode 0/2


369it [1:35:13, 18.17s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24212.03
    Min Reward over last 50 episodes: -24390.61
    Max Reward over last 50 episodes: -24033.45
    Evaluation Average Reward: -24068.74

Running combination 370/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.0, 'gamma': 0.0, 'no_change_after_lap': 115}
    Training Episode 0/2


370it [1:35:30, 18.02s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25496.07
    Min Reward over last 50 episodes: -25794.43
    Max Reward over last 50 episodes: -25197.72
    Evaluation Average Reward: -25592.65

Running combination 371/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 120}
    Training Episode 0/2


371it [1:35:46, 17.27s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24002.34
    Min Reward over last 50 episodes: -24027.66
    Max Reward over last 50 episodes: -23977.01
    Evaluation Average Reward: -24006.65

Running combination 372/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.999, 'alpha': 1.0, 'gamma': 0.9, 'no_change_after_lap': 115}
    Training Episode 0/2


372it [1:36:04, 17.59s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25550.71
    Min Reward over last 50 episodes: -25787.79
    Max Reward over last 50 episodes: -25313.63
    Evaluation Average Reward: -26162.89

Running combination 373/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.6, 'no_change_after_lap': 150}
    Training Episode 0/2


373it [1:36:21, 17.41s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17129.43
    Min Reward over last 50 episodes: -17302.22
    Max Reward over last 50 episodes: -16956.64
    Evaluation Average Reward: -17014.86

Running combination 374/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.0, 'gamma': 0.9, 'no_change_after_lap': 115}
    Training Episode 0/2


374it [1:36:38, 17.17s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -25882.08
    Min Reward over last 50 episodes: -25912.45
    Max Reward over last 50 episodes: -25851.71
    Evaluation Average Reward: -25637.91

Running combination 375/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.2, 'no_change_after_lap': 145}
    Training Episode 0/2


375it [1:36:54, 16.81s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18015.70
    Min Reward over last 50 episodes: -18105.15
    Max Reward over last 50 episodes: -17926.24
    Evaluation Average Reward: -18418.79

Running combination 376/9555 with parameters: {'epsilon': 0.025, 'epsilon_decay': 0.995, 'alpha': 1.0, 'gamma': 0.2, 'no_change_after_lap': 135}
    Training Episode 0/2


376it [1:37:10, 16.71s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20797.00
    Min Reward over last 50 episodes: -20839.79
    Max Reward over last 50 episodes: -20754.21
    Evaluation Average Reward: -20581.05

Running combination 377/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


377it [1:37:26, 16.55s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -18350.39
    Min Reward over last 50 episodes: -18427.88
    Max Reward over last 50 episodes: -18272.90
    Evaluation Average Reward: -17973.29

Running combination 378/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.4, 'no_change_after_lap': 125}
    Training Episode 0/2


378it [1:37:43, 16.68s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -22985.38
    Min Reward over last 50 episodes: -23151.46
    Max Reward over last 50 episodes: -22819.29
    Evaluation Average Reward: -23278.02

Running combination 379/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.0, 'no_change_after_lap': 125}
    Training Episode 0/2


379it [1:38:00, 16.79s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -23126.68
    Min Reward over last 50 episodes: -23197.01
    Max Reward over last 50 episodes: -23056.35
    Evaluation Average Reward: -23140.94

Running combination 380/9555 with parameters: {'epsilon': 0.05, 'epsilon_decay': 0.995, 'alpha': 0.8, 'gamma': 0.2, 'no_change_after_lap': 100}
    Training Episode 0/2


380it [1:38:17, 16.70s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -29397.59
    Min Reward over last 50 episodes: -29406.18
    Max Reward over last 50 episodes: -29389.00
    Evaluation Average Reward: -29565.15

Running combination 381/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.999, 'alpha': 0.9, 'gamma': 0.2, 'no_change_after_lap': 135}
    Training Episode 0/2


381it [1:38:33, 16.53s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -20239.42
    Min Reward over last 50 episodes: -20281.69
    Max Reward over last 50 episodes: -20197.16
    Evaluation Average Reward: -20389.51

Running combination 382/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.2, 'gamma': 1.0, 'no_change_after_lap': 155}
    Training Episode 0/2


382it [1:38:50, 16.72s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -16203.59
    Min Reward over last 50 episodes: -16210.53
    Max Reward over last 50 episodes: -16196.64
    Evaluation Average Reward: -16307.04

Running combination 383/9555 with parameters: {'epsilon': 0.2, 'epsilon_decay': 0.995, 'alpha': 0.0, 'gamma': 0.8, 'no_change_after_lap': 145}
    Training Episode 0/2


383it [1:39:09, 17.33s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -17935.33
    Min Reward over last 50 episodes: -17981.93
    Max Reward over last 50 episodes: -17888.73
    Evaluation Average Reward: -17880.34

Running combination 384/9555 with parameters: {'epsilon': 0.0, 'epsilon_decay': 0.99, 'alpha': 0.4, 'gamma': 0.6, 'no_change_after_lap': 120}
    Training Episode 0/2


384it [1:39:27, 17.40s/it]

    Evaluation Episode 0/1
    Average Reward over last 50 episodes: -24457.95
    Min Reward over last 50 episodes: -24488.01
    Max Reward over last 50 episodes: -24427.88
    Evaluation Average Reward: -24159.22

Running combination 385/9555 with parameters: {'epsilon': 0.1, 'epsilon_decay': 0.99, 'alpha': 0.6, 'gamma': 0.0, 'no_change_after_lap': 140}
    Training Episode 0/2


In [None]:
# Sanity check: report how many grid-search result rows are stored so far.
# `gs_db_name` and `table_name` are set in the KEY PARAMETERS cell.
conn = sqlite3.connect(gs_db_name)
try:
    # COUNT(*) without GROUP BY yields a single row with a single column.
    df = pd.read_sql_query(f"SELECT COUNT(*) from {table_name}", conn)
finally:
    # Always release the handle, even when the query fails (e.g. the results
    # table does not exist yet); otherwise the connection stays open for the
    # lifetime of the kernel.
    conn.close()
print(df.shape)
print(df)

In [None]:
# !pip install nbformat --upgrade

In [None]:
def radar_plot_top_combinations(db_name=gs_db_name, top_n=5):
    """Plot the best grid-search parameter combinations on a radar chart.

    Reads every row of the module-level ``table_name`` table from the SQLite
    database ``db_name``, min-max normalises the plotted columns over all rows,
    and draws one polygon per row for the rows with the largest ``eval_avg``.
    Each vertex is annotated with the un-normalised value, and the selected
    rows are printed after the figure is shown.

    Args:
        db_name: Path of the SQLite database file to read.
        top_n: Maximum number of rows to plot. Fewer polygons are drawn when
            the table holds fewer rows.

    Returns:
        None.
    """
    from math import pi

    # Fetch the data; close the connection even if the query raises.
    conn = sqlite3.connect(db_name)
    try:
        df = pd.read_sql_query(f"SELECT * from {table_name}", conn)
    finally:
        conn.close()

    print(df.columns)
    # Columns shown as radar axes (the hyperparameters plus the evaluation score).
    hyperparameter_columns = ['epsilon', 'epsilon_decay', 'alpha', 'gamma', 'no_change_after_lap', 'eval_avg']

    if df.empty:
        # Nothing to normalise or rank; avoid failing further down.
        print(f"No rows found in {table_name}; nothing to plot.")
        return

    # Min-max normalise each column based on all rows.
    column_min = df[hyperparameter_columns].min()
    column_range = df[hyperparameter_columns].max() - column_min
    # A constant column has zero range and would yield 0/0 = NaN vertices;
    # dividing by 1 instead maps every value of such a column to 0.
    column_range = column_range.replace(0, 1)
    normalized_df = (df[hyperparameter_columns] - column_min) / column_range

    # Rank by evaluation score and keep the best rows (at most top_n).
    top_rows_df = df.nlargest(top_n, 'eval_avg')
    top_normalized_df = normalized_df.loc[top_rows_df.index]
    # The table may hold fewer than top_n rows, so loop over what was returned.
    n_plotted = len(top_rows_df)

    # Radar plot
    labels = hyperparameter_columns
    num_vars = len(labels)

    angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
    angles += angles[:1]  # Repeat the first angle at the end to close the polygon

    plt.figure(figsize=(10, 10), dpi=80)
    ax = plt.subplot(111, polar=True)
    ax.set_theta_offset(pi / 2)
    ax.set_theta_direction(-1)
    ax.set_rlabel_position(0)
    ax.set_xticks(angles[:-1])  # One tick per axis; skip the duplicated closing angle
    ax.set_xticklabels(labels, fontsize=12)
    ax.set_yticklabels([])

    for i in range(n_plotted):
        norm_values = top_normalized_df.iloc[i].values.flatten().tolist()
        norm_values += norm_values[:1]  # Repeat the first value to close the polygon

        true_values_list = top_rows_df.iloc[i][hyperparameter_columns].values.flatten().tolist()

        param_string = ', '.join([f"{col}={val:.2f}" for col, val in zip(labels, true_values_list)])
        line = ax.plot(angles, norm_values, linewidth=2, linestyle='solid', label=param_string)
        ax.fill(angles, norm_values, alpha=0.1)

        # Annotate each vertex with the true (un-normalised) value. zip stops at
        # the shortest input, so the duplicated closing point is not annotated twice.
        for angle, value, true_value in zip(angles, norm_values, true_values_list):
            ax.annotate(f"{true_value:.2f}",
                        xy=(angle, value),
                        color=line[0].get_color(),
                        ha='center', va='bottom')

    plt.title(f'Top {n_plotted} Parameter Combinations', size=20, color='blue', y=1.1)
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
    plt.show()

    print(top_rows_df)

# Draw the radar chart for the five rows with the highest eval_avg in the default results database.
radar_plot_top_combinations(top_n=5)
