In [1]:
import time
import random
import numpy as np
from collections import deque
import tensorflow as tf
from matplotlib import pyplot as plt


class DQNAgent:
    """ DQN agent """
    def __init__(self, states, actions, max_memory, double_q):
        self.states = states
        self.actions = actions
        self.session = tf.Session()
        self.build_model()
        self.saver = tf.train.Saver(max_to_keep=10)
        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        self.memory = deque(maxlen=max_memory)
        self.eps = 1
        self.eps_decay = 0.99999975
        self.eps_min = 0.1
        self.gamma = 0.90
        self.batch_size = 32
        self.burnin = 100000
        self.copy = 10000
        self.step = 0
        self.learn_each = 3
        self.learn_step = 0
        self.save_each = 500000
        self.double_q = double_q

    def build_model(self):
        """ Model builder function """
        self.input = tf.placeholder(dtype=tf.float32, shape=(None, ) + self.states, name='input')
        self.q_true = tf.placeholder(dtype=tf.float32, shape=[None], name='labels')
        self.a_true = tf.placeholder(dtype=tf.int32, shape=[None], name='actions')
        self.reward = tf.placeholder(dtype=tf.float32, shape=[], name='reward')
        self.input_float = tf.to_float(self.input) / 255.
        # Online network
        with tf.variable_scope('online'):
            self.conv_1 = tf.layers.conv2d(inputs=self.input_float, filters=32, kernel_size=8, strides=4, activation=tf.nn.relu)
            self.conv_2 = tf.layers.conv2d(inputs=self.conv_1, filters=64, kernel_size=4, strides=2, activation=tf.nn.relu)
            self.conv_3 = tf.layers.conv2d(inputs=self.conv_2, filters=64, kernel_size=3, strides=1, activation=tf.nn.relu)
            self.flatten = tf.layers.flatten(inputs=self.conv_3)
            self.dense = tf.layers.dense(inputs=self.flatten, units=512, activation=tf.nn.relu)
            self.output = tf.layers.dense(inputs=self.dense, units=self.actions, name='output')
        # Target network
        with tf.variable_scope('target'):
            self.conv_1_target = tf.layers.conv2d(inputs=self.input_float, filters=32, kernel_size=8, strides=4, activation=tf.nn.relu)
            self.conv_2_target = tf.layers.conv2d(inputs=self.conv_1_target, filters=64, kernel_size=4, strides=2, activation=tf.nn.relu)
            self.conv_3_target = tf.layers.conv2d(inputs=self.conv_2_target, filters=64, kernel_size=3, strides=1, activation=tf.nn.relu)
            self.flatten_target = tf.layers.flatten(inputs=self.conv_3_target)
            self.dense_target = tf.layers.dense(inputs=self.flatten_target, units=512, activation=tf.nn.relu)
            self.output_target = tf.stop_gradient(tf.layers.dense(inputs=self.dense_target, units=self.actions, name='output_target'))
        # Optimizer
        self.action = tf.argmax(input=self.output, axis=1)
        self.q_pred = tf.gather_nd(params=self.output, indices=tf.stack([tf.range(tf.shape(self.a_true)[0]), self.a_true], axis=1))
        self.loss = tf.losses.huber_loss(labels=self.q_true, predictions=self.q_pred)
        self.train = tf.train.AdamOptimizer(learning_rate=0.00025).minimize(self.loss)
        # Summaries
        self.summaries = tf.summary.merge([
            tf.summary.scalar('reward', self.reward),
            tf.summary.scalar('loss', self.loss),
            tf.summary.scalar('max_q', tf.reduce_max(self.output))
        ])
        self.writer = tf.summary.FileWriter(logdir='./logs', graph=self.session.graph)

    def copy_model(self):
        """ Copy weights to target network """
        self.session.run([tf.assign(new, old) for (new, old) in zip(tf.trainable_variables('target'), tf.trainable_variables('online'))])

    def save_model(self):
        """ Saves current model to disk """
        self.saver.save(sess=self.session, save_path='./models/model', global_step=self.step)

    def add(self, experience):
        """ Add observation to experience """
        self.memory.append(experience)

    def predict(self, model, state):
        """ Prediction """
        if model == 'online':
            return self.session.run(fetches=self.output, feed_dict={self.input: np.array(state)})
        if model == 'target':
            return self.session.run(fetches=self.output_target, feed_dict={self.input: np.array(state)})

    def run(self, state):
        """ Perform action """
        if np.random.rand() < self.eps:
            # Random action
            action = np.random.randint(low=0, high=self.actions)
        else:
            # Policy action
            q = self.predict('online', np.expand_dims(state, 0))
            action = np.argmax(q)
        # Decrease eps
        self.eps *= self.eps_decay
        self.eps = max(self.eps_min, self.eps)
        # Increment step
        self.step += 1
        return action

    def learn(self):
        """ Gradient descent """
        # Sync target network
        if self.step % self.copy == 0:
            self.copy_model()
        # Checkpoint model
        if self.step % self.save_each == 0:
            self.save_model()
        # Break if burn-in
        if self.step < self.burnin:
            return
        # Break if no training
        if self.learn_step < self.learn_each:
            self.learn_step += 1
            return
        # Sample batch
        batch = random.sample(self.memory, self.batch_size)
        state, next_state, action, reward, done = map(np.array, zip(*batch))
        # Get next q values from target network
        next_q = self.predict('target', next_state)
        # Calculate discounted future reward
        if self.double_q:
            q = self.predict('online', next_state)
            a = np.argmax(q, axis=1)
            target_q = reward + (1. - done) * self.gamma * next_q[np.arange(0, self.batch_size), a]
        else:
            target_q = reward + (1. - done) * self.gamma * np.amax(next_q, axis=1)
        # Update model
        summary, _ = self.session.run(fetches=[self.summaries, self.train],
                                      feed_dict={self.input: state,
                                                 self.q_true: np.array(target_q),
                                                 self.a_true: np.array(action),
                                                 self.reward: np.mean(reward)})
        # Reset learn step
        self.learn_step = 0
        # Write
        self.writer.add_summary(summary, self.step)

    def replay(self, env, model_path, n_replay, plot):
        """ Model replay """
        ckpt = tf.train.latest_checkpoint(model_path)
        saver = tf.train.import_meta_graph(ckpt + '.meta')
        graph = tf.get_default_graph()
        input = graph.get_tensor_by_name('input:0')
        output = graph.get_tensor_by_name('online/output/BiasAdd:0')
        # Replay RL agent
        state = env.reset()
        total_reward = 0
        with tf.Session() as sess:
            saver.restore(sess, ckpt)
            for _ in range(n_replay):
                step = 0
                while True:
                    time.sleep(0.05)
                    env.render()
                    # Plot
                    if plot:
                        if step % 100 == 0:
                            self.visualize_layer(session=sess, layer=self.conv_2, state=state, step=step)
                    # Action
                    if np.random.rand() < 0.0:
                        action = np.random.randint(low=0, high=self.actions, size=1)[0]
                    else:
                        q = sess.run(fetches=output, feed_dict={input: np.expand_dims(state, 0)})
                        action = np.argmax(q)
                    next_state, reward, done, info = env.step(action)
                    total_reward += reward
                    state = next_state
                    step += 1
                    if info['flag_get']:
                        break
                    if done:
                        break
        env.close()

    def visualize_layer(self, session, layer, state, step):
        """ Visualization auf Conv Layers"""
        units = session.run(layer, feed_dict={self.input: np.expand_dims(state, 0)})
        filters = units.shape[3]
        plt.figure(1, figsize=(40, 40))
        n_columns = 8
        n_rows = np.ceil(filters / n_columns)
        for i in range(filters):
            plt.subplot(n_rows, n_columns, i+1)
            plt.title('Filter ' + str(i))
            plt.imshow(units[0, :, :, i], interpolation="nearest", cmap='YlGnBu')
        plt.savefig(fname='./img/img-' + str(step) + '.png')

In [3]:
# Mostly copy-pasted from https://github.com/openai/baselines/blob/master/baselines/common/atari_wrappers.py
import numpy as np
import os
os.environ.setdefault('PATH', '')
from collections import deque
import gym
from gym import spaces
import cv2
cv2.ocl.setUseOpenCL(False)


class NoopResetEnv(gym.Wrapper):
    def __init__(self, env, noop_max=30):
        """Sample initial states by taking random number of no-ops on reset.
        No-op is assumed to be action 0.
        """
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        self.override_num_noops = None
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

    def reset(self, **kwargs):
        """ Do no-op action for a number of steps in [1, noop_max]."""
        self.env.reset(**kwargs)
        if self.override_num_noops is not None:
            noops = self.override_num_noops
        else:
            noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101
        assert noops > 0
        obs = None
        for _ in range(noops):
            obs, _, done, _ = self.env.step(self.noop_action)
            if done:
                obs = self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        return self.env.step(ac)


class FireResetEnv(gym.Wrapper):
    def __init__(self, env):
        """Take action on reset for environments that are fixed until firing."""
        gym.Wrapper.__init__(self, env)
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(1)
        if done:
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        return self.env.step(ac)


class EpisodicLifeEnv(gym.Wrapper):
    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.
        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        self.was_real_done  = True

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # check current lives, make loss of life terminal,
        # then update lives to handle bonus lives
        lives = self.env.unwrapped.ale.lives()
        if lives < self.lives and lives > 0:
            # for Qbert sometimes we stay in lives == 0 condition for a few frames
            # so it's important to keep lives > 0, so that we only reset once
            # the environment advertises done.
            done = True
        self.lives = lives
        return obs, reward, done, info

    def reset(self, **kwargs):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
        """
        if self.was_real_done:
            obs = self.env.reset(**kwargs)
        else:
            # no-op step to advance from terminal/lost life state
            obs, _, _, _ = self.env.step(0)
        self.lives = self.env.unwrapped.ale.lives()
        return obs


class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = np.zeros((2,)+env.observation_space.shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        total_reward = 0.0
        done = None
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            if i == self._skip - 2: self._obs_buffer[0] = obs
            if i == self._skip - 1: self._obs_buffer[1] = obs
            total_reward += reward
            if done:
                break
        # Note that the observation on the done=True frame
        # doesn't matter
        max_frame = self._obs_buffer.max(axis=0)

        return max_frame, total_reward, done, info

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)


class ClipRewardEnv(gym.RewardWrapper):
    def __init__(self, env):
        gym.RewardWrapper.__init__(self, env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        return np.sign(reward)


class WarpFrame(gym.ObservationWrapper):
    def __init__(self, env, width=84, height=84, grayscale=True):
        """Warp frames to 84x84 as done in the Nature paper and later work."""
        gym.ObservationWrapper.__init__(self, env)
        self.width = width
        self.height = height
        self.grayscale = grayscale
        if self.grayscale:
            self.observation_space = spaces.Box(low=0, high=255,
                shape=(self.height, self.width, 1), dtype=np.uint8)
        else:
            self.observation_space = spaces.Box(low=0, high=255,
                shape=(self.height, self.width, 3), dtype=np.uint8)

    def observation(self, frame):
        if self.grayscale:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
        if self.grayscale:
            frame = np.expand_dims(frame, -1)
        return frame


class LazyFrames(object):
    def __init__(self, frames):
        """This object ensures that common frames between the observations are only stored once.
        It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay
        buffers.
        This object should only be converted to numpy array before being passed to the model.
        You'd not believe how complex the previous solution was."""
        self._frames = frames
        self._out = None

    def _force(self):
        if self._out is None:
            self._out = np.concatenate(self._frames, axis=-1)
            self._frames = None
        return self._out

    def __array__(self, dtype=None):
        out = self._force()
        if dtype is not None:
            out = out.astype(dtype)
        return out

    def __len__(self):
        return len(self._force())

    def __getitem__(self, i):
        return self._force()[i]


class FrameStack(gym.Wrapper):
    def __init__(self, env, k):
        """Stack k last frames.
        Returns lazy array, which is much more memory efficient.
        See Also
        --------
        baselines.common.atari_wrappers.LazyFrames
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(low=0, high=255, shape=(shp[:-1] + (shp[-1] * k,)), dtype=env.observation_space.dtype)

    def reset(self):
        ob = self.env.reset()
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        assert len(self.frames) == self.k
        return LazyFrames(list(self.frames))


def wrapper(env):
    """Apply a common set of wrappers for Atari games."""
    #env = EpisodicLifeEnv(env)
    #env = NoopResetEnv(env, noop_max=10)
    env = MaxAndSkipEnv(env, skip=4)
    if 'FIRE' in env.unwrapped.get_action_meanings():
       env = FireResetEnv(env)
    env = WarpFrame(env)
    env = FrameStack(env, 4)
    env = ClipRewardEnv(env)
    return env

In [None]:
# Configuration paramaters for the whole setup
import time
import numpy as np
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

seed = 42
gamma = 0.99  # Discount factor for past rewards
epsilon = 1.0  # Epsilon greedy parameter
epsilon_min = 0.05  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = (
    epsilon_max - epsilon_min
)  # Rate at which to reduce chance of random action being taken
batch_size = 32  # Size of batch taken from replay buffer
max_steps_per_episode = 1000

# Use the Baseline Atari environment because of Deepmind helper functions
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env=JoypadSpace(env,SIMPLE_MOVEMENT)
# Warp the frames, grey scale, stake four frame and scale to smaller ratio
env = wrapper(env)
env.seed(seed)

num_actions = 4


def create_q_model():
    # Network defined by the Deepmind paper
    inputs = layers.Input(shape=(84, 84, 4,))

    # Convolutions on the frames on the screen
    layer1 = layers.Conv2D(32, 8, strides=4, activation="relu")(inputs)
    layer2 = layers.Conv2D(64, 4, strides=2, activation="relu")(layer1)
    layer3 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer2)
    layer4 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer3)
    layer5 = layers.Conv2D(64, 3, strides=1, activation="relu")(layer4)

    layer6 = layers.Flatten()(layer5)

    layer7 = layers.Dense(256, activation="relu")(layer6)
    #layer8 = layers.Dense(64, activation="relu")(layer7)
    action = layers.Dense(num_actions, activation="linear")(layer7)

    return keras.Model(inputs=inputs, outputs=action)


# The first model makes the predictions for Q-values which are used to
# make a action.
model = create_q_model()
model.summary()
# Build a target model for the prediction of future rewards.
# The weights of a target model get updated every 10000 steps thus when the
# loss between the Q-values is calculated the target Q-value is stable.
model_target = create_q_model()

# In the Deepmind paper they use RMSProp however then Adam optimizer
# improves training time
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

# Experience replay buffers
action_history = []
state_history = []
state_next_history = []
rewards_history = []
done_history = []
episode_reward_history = []
running_reward = 0
episode_count = 0
frame_count = 0
# Number of frames to take random action and observe output
epsilon_random_frames = 5000
# Number of frames for exploration
epsilon_greedy_frames = 100000.0
# Maximum replay length
# Note: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 10000
# Train the model after 4 actions
update_after_actions = 4
# How often to update the target network
update_target_network = 1000
# Using huber loss for stability
loss_function = keras.losses.Huber()
solved = False

while True:  # Run until solved
    state = np.array(env.reset())
    episode_reward = 0

    for timestep in range(1, max_steps_per_episode):
        if solved: env.render() # Adding this line would show the attempts
        # of the agent in a pop up window.
        frame_count += 1

        # Use epsilon-greedy for exploration
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            # Take random action
            action = np.random.choice(num_actions)
        else:
            # Predict action Q-values
            # From environment state
            state_tensor = tf.convert_to_tensor(state)
            state_tensor = tf.expand_dims(state_tensor, 0)
            action_probs = model(state_tensor, training=False)
            # Take best action
            action = tf.argmax(action_probs[0]).numpy()

        # Decay probability of taking random action
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # Apply the sampled action in our environment
        state_next, reward, done, _ = env.step(action)
        state_next = np.array(state_next)

        episode_reward += reward

        # Save actions and states in replay buffer
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(done)
        rewards_history.append(reward)
        state = state_next

        # Update every fourth frame and once batch size is over 32
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:

            # Get indices of samples for replay buffers
            indices = np.random.choice(range(len(done_history)), size=batch_size)

            # Using list comprehension to sample from replay buffer
            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = [rewards_history[i] for i in indices]
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor(
                [float(done_history[i]) for i in indices]
            )

            # Build the updated Q-values for the sampled future states
            # Use the target model for stability
            future_rewards = model_target.predict(state_next_sample)
            # Q value = reward + discount factor * expected future reward
            updated_q_values = rewards_sample + gamma * tf.reduce_max(
                future_rewards, axis=1
            )

            # If final frame set the last value to -1
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # Create a mask so we only calculate loss on the updated Q-values
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                # Train the model on the states and updated Q-values
                q_values = model(state_sample)

                # Apply the masks to the Q-values to get the Q-value for action taken
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                # Calculate loss between new Q-value and old Q-value
                loss = loss_function(updated_q_values, q_action)

            # Backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        if frame_count % update_target_network == 0:
            # update the the target network with new weights
            model_target.set_weights(model.get_weights())
            # Log details
            template = "running reward: {:.2f} at episode {}, frame count {}"
            print(template.format(running_reward, episode_count, frame_count))

        # Limit the state and reward history
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break

    print("frame_count="+str(frame_count)+" epsilon="+str(epsilon)+" episode_reward="+str(episode_reward)+" running_reward="+str(running_reward))

    # Update running reward to check condition for solving
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 100:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1

    if solved == False or running_reward > 40:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        #break
        solved = True

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 84, 84, 4)]       0         
                                                                 
 conv2d (Conv2D)             (None, 20, 20, 32)        8224      
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 64)          36928     
                                                                 
 conv2d_3 (Conv2D)           (None, 5, 5, 64)          36928     
                                                                 
 conv2d_4 (Conv2D)           (None, 3, 3, 64)          36928     
                                                                 
 flatten (Flatten)           (None, 576)               0     

frame_count=30519 epsilon=0.7100695000001336 episode_reward=199.0 running_reward=63.86046511627907
Solved at episode 44!
running reward: 66.93 at episode 44, frame count 31000
frame_count=31518 epsilon=0.700579000000138 episode_reward=-63.0 running_reward=66.93181818181819
Solved at episode 45!
running reward: 64.04 at episode 45, frame count 32000
frame_count=32086 epsilon=0.6951830000001404 episode_reward=135.0 running_reward=64.04444444444445
Solved at episode 46!
frame_count=32189 epsilon=0.6942045000001409 episode_reward=96.0 running_reward=65.58695652173913
Solved at episode 47!
frame_count=32673 epsilon=0.689606500000143 episode_reward=143.0 running_reward=66.23404255319149
Solved at episode 48!
running reward: 67.83 at episode 48, frame count 33000
frame_count=33255 epsilon=0.6840775000001456 episode_reward=227.0 running_reward=67.83333333333333
Solved at episode 49!
frame_count=33666 epsilon=0.6801730000001474 episode_reward=186.0 running_reward=71.08163265306122
Solved at epi

running reward: 90.01 at episode 98, frame count 62000
frame_count=62288 epsilon=0.40826400000027263 episode_reward=94.0 running_reward=90.01020408163265
Solved at episode 99!
frame_count=62429 epsilon=0.40692450000027325 episode_reward=132.0 running_reward=90.05050505050505
Solved at episode 100!
running reward: 90.47 at episode 100, frame count 63000
frame_count=63266 epsilon=0.3989730000002769 episode_reward=192.0 running_reward=90.47
Solved at episode 101!
running reward: 91.06 at episode 101, frame count 64000
frame_count=64086 epsilon=0.3911830000002805 episode_reward=166.0 running_reward=91.06
Solved at episode 102!
running reward: 92.21 at episode 102, frame count 65000
frame_count=65085 epsilon=0.3816925000002849 episode_reward=-105.0 running_reward=92.21
Solved at episode 103!
frame_count=65344 epsilon=0.379232000000286 episode_reward=210.0 running_reward=91.39
Solved at episode 104!
frame_count=65472 epsilon=0.37801600000028657 episode_reward=117.0 running_reward=94.39
Solve

frame_count=87149 epsilon=0.17208450000038145 episode_reward=102.0 running_reward=136.55
Solved at episode 160!
frame_count=87263 epsilon=0.17100150000038195 episode_reward=105.0 running_reward=135.96
Solved at episode 161!
running reward: 137.67 at episode 161, frame count 88000
frame_count=88080 epsilon=0.16324000000038552 episode_reward=170.0 running_reward=137.67
Solved at episode 162!
frame_count=88333 epsilon=0.16083650000038663 episode_reward=163.0 running_reward=137.85
Solved at episode 163!
frame_count=88915 epsilon=0.15530750000038918 episode_reward=150.0 running_reward=137.73
Solved at episode 164!
running reward: 138.17 at episode 164, frame count 89000
frame_count=89252 epsilon=0.15210600000039065 episode_reward=219.0 running_reward=138.17
Solved at episode 165!
running reward: 138.00 at episode 165, frame count 90000
frame_count=90251 epsilon=0.14261550000039502 episode_reward=158.0 running_reward=138.0
Solved at episode 166!
running reward: 138.06 at episode 166, frame c

frame_count=109829 epsilon=0.05 episode_reward=171.0 running_reward=153.79
Solved at episode 227!
running reward: 153.92 at episode 227, frame count 110000
frame_count=110012 epsilon=0.05 episode_reward=172.0 running_reward=153.92
Solved at episode 228!
frame_count=110367 epsilon=0.05 episode_reward=179.0 running_reward=155.82
Solved at episode 229!
frame_count=110805 epsilon=0.05 episode_reward=205.0 running_reward=155.92
Solved at episode 230!
running reward: 156.96 at episode 230, frame count 111000
frame_count=111219 epsilon=0.05 episode_reward=176.0 running_reward=156.96
Solved at episode 231!
frame_count=111379 epsilon=0.05 episode_reward=116.0 running_reward=156.13
Solved at episode 232!
frame_count=111487 epsilon=0.05 episode_reward=92.0 running_reward=156.42
Solved at episode 233!
frame_count=111904 epsilon=0.05 episode_reward=143.0 running_reward=155.21
Solved at episode 234!
running reward: 155.48 at episode 234, frame count 112000
frame_count=112091 epsilon=0.05 episode_rew

running reward: 149.85 at episode 298, frame count 130000
frame_count=130243 epsilon=0.05 episode_reward=201.0 running_reward=149.85
Solved at episode 299!
frame_count=130465 epsilon=0.05 episode_reward=170.0 running_reward=150.2
Solved at episode 300!
frame_count=130925 epsilon=0.05 episode_reward=258.0 running_reward=149.74
Solved at episode 301!
running reward: 151.20 at episode 301, frame count 131000
frame_count=131073 epsilon=0.05 episode_reward=132.0 running_reward=151.2
Solved at episode 302!
frame_count=131180 epsilon=0.05 episode_reward=99.0 running_reward=151.61
Solved at episode 303!
frame_count=131316 epsilon=0.05 episode_reward=126.0 running_reward=151.8
Solved at episode 304!
frame_count=131636 epsilon=0.05 episode_reward=218.0 running_reward=151.31
Solved at episode 305!
running reward: 152.65 at episode 305, frame count 132000
frame_count=132635 epsilon=0.05 episode_reward=8.0 running_reward=152.65
Solved at episode 306!
running reward: 150.04 at episode 306, frame cou

frame_count=153850 epsilon=0.05 episode_reward=121.0 running_reward=140.28
Solved at episode 369!
running reward: 141.71 at episode 369, frame count 154000
frame_count=154849 epsilon=0.05 episode_reward=-18.0 running_reward=141.71
Solved at episode 370!
running reward: 140.26 at episode 370, frame count 155000
frame_count=155109 epsilon=0.05 episode_reward=198.0 running_reward=140.26
Solved at episode 371!
frame_count=155232 epsilon=0.05 episode_reward=115.0 running_reward=140.96
Solved at episode 372!
frame_count=155685 epsilon=0.05 episode_reward=247.0 running_reward=140.83
Solved at episode 373!
frame_count=155789 epsilon=0.05 episode_reward=94.0 running_reward=142.48
Solved at episode 374!
frame_count=155924 epsilon=0.05 episode_reward=121.0 running_reward=143.43
Solved at episode 375!
running reward: 143.51 at episode 375, frame count 156000
frame_count=156062 epsilon=0.05 episode_reward=127.0 running_reward=143.51
Solved at episode 376!
frame_count=156208 epsilon=0.05 episode_rew

frame_count=176683 epsilon=0.05 episode_reward=192.0 running_reward=149.89
Solved at episode 440!
frame_count=176934 epsilon=0.05 episode_reward=159.0 running_reward=150.58
Solved at episode 441!
running reward: 149.70 at episode 441, frame count 177000
frame_count=177174 epsilon=0.05 episode_reward=159.0 running_reward=149.7
Solved at episode 442!
frame_count=177634 epsilon=0.05 episode_reward=350.0 running_reward=149.7
Solved at episode 443!
frame_count=177735 epsilon=0.05 episode_reward=91.0 running_reward=151.32
Solved at episode 444!
frame_count=177855 epsilon=0.05 episode_reward=109.0 running_reward=151.09
Solved at episode 445!
running reward: 150.96 at episode 445, frame count 178000
frame_count=178854 epsilon=0.05 episode_reward=134.0 running_reward=150.96
Solved at episode 446!
running reward: 150.18 at episode 446, frame count 179000
frame_count=179673 epsilon=0.05 episode_reward=188.0 running_reward=150.18
Solved at episode 447!
running reward: 150.93 at episode 447, frame 

frame_count=200193 epsilon=0.05 episode_reward=120.0 running_reward=180.83
Solved at episode 510!
frame_count=200613 epsilon=0.05 episode_reward=200.0 running_reward=179.9
Solved at episode 511!
frame_count=200773 epsilon=0.05 episode_reward=142.0 running_reward=180.82
Solved at episode 512!
running reward: 181.04 at episode 512, frame count 201000
frame_count=201065 epsilon=0.05 episode_reward=255.0 running_reward=181.04
Solved at episode 513!
running reward: 182.48 at episode 513, frame count 202000
frame_count=202064 epsilon=0.05 episode_reward=269.0 running_reward=182.48
Solved at episode 514!
frame_count=202218 epsilon=0.05 episode_reward=143.0 running_reward=184.4
Solved at episode 515!
frame_count=202346 epsilon=0.05 episode_reward=115.0 running_reward=184.06
Solved at episode 516!
frame_count=202578 epsilon=0.05 episode_reward=178.0 running_reward=183.87
Solved at episode 517!
frame_count=202727 epsilon=0.05 episode_reward=132.0 running_reward=184.52
Solved at episode 518!
runn

frame_count=223696 epsilon=0.05 episode_reward=296.0 running_reward=192.79
Solved at episode 581!
frame_count=223979 epsilon=0.05 episode_reward=204.0 running_reward=193.63
Solved at episode 582!
running reward: 194.25 at episode 582, frame count 224000
frame_count=224624 epsilon=0.05 episode_reward=273.0 running_reward=194.25
Solved at episode 583!
frame_count=224782 epsilon=0.05 episode_reward=137.0 running_reward=193.97
Solved at episode 584!
frame_count=224930 epsilon=0.05 episode_reward=133.0 running_reward=193.21
Solved at episode 585!
running reward: 192.40 at episode 585, frame count 225000
frame_count=225098 epsilon=0.05 episode_reward=138.0 running_reward=192.4
Solved at episode 586!
frame_count=225390 epsilon=0.05 episode_reward=260.0 running_reward=192.31
Solved at episode 587!
frame_count=225549 epsilon=0.05 episode_reward=145.0 running_reward=193.27
Solved at episode 588!
frame_count=225670 epsilon=0.05 episode_reward=113.0 running_reward=192.39
Solved at episode 589!
fra