# Module Five Assignment: Cartpole Problem
Review the code in this notebook and in the score_logger.py file in the *scores* folder (directory). Once you have reviewed the code, return to this notebook and select **Cell** and then **Run All** from the menu bar to run this code. The code takes several minutes to run.

In [4]:

import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  

# No changes made - original version
# Discount factor applied to the best predicted next-state Q-value.
GAMMA = 0.95  
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001  
  
# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000  
# Transitions sampled per experience_replay() call; replay is skipped until
# the memory holds at least this many.
BATCH_SIZE = 20  


# Initial exploration rate (probability of choosing a random action).
EXPLORATION_MAX = 1.0  
# Floor the exploration rate is clamped to.
EXPLORATION_MIN = 0.01  
# Factor multiplied into the exploration rate after each replay pass.
EXPLORATION_DECAY = 0.995 

class DQNSolver:
    """Deep Q-learning agent with epsilon-greedy actions and experience replay.

    Attributes:
        exploration_rate: current probability of picking a random action;
            starts at EXPLORATION_MAX and is decayed by experience_replay().
        action_space: number of discrete actions (size of the network output).
        memory: bounded deque of (state, action, reward, next_state, done)
            tuples, capped at MEMORY_SIZE entries.
        model: Keras network producing one Q-value per action for a state.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of the observation vector (network input size).
            action_space: number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        # Linear output: the network regresses unbounded Q-values.
        self.model.add(Dense(self.action_space, activation="linear"))
        # The learning rate is Adam's first positional parameter. Passing it
        # positionally avoids the `lr` keyword, which newer Keras releases
        # renamed to `learning_rate`.
        self.model.compile(loss="mse", optimizer=Adam(LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index chosen epsilon-greedily.

        With probability `exploration_rate` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions have no future value; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            # Only the taken action's target changes; the other outputs keep
            # their current predictions so they contribute no error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step asks the agent for an action, stores the transition, and runs
    one experience-replay pass. The per-episode step count is printed and
    reported to the ScoreLogger.

    The outer loop has no exit condition of its own: it ends only when an
    exception propagates (for example a keyboard interrupt, or presumably
    whatever ScoreLogger.add_score raises once the task counts as solved).
    The environment is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while True:
            run += 1
            # gym >= 0.26 returns (observation, info) from reset(); older
            # releases return the bare observation.
            state = env.reset()
            if isinstance(state, tuple):
                state = state[0]
            # Add a leading batch axis so the state can be fed to the model.
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                # gym >= 0.26 returns a 5-tuple with separate terminated and
                # truncated flags; older releases return a 4-tuple with `done`.
                outcome = env.step(action)
                if len(outcome) == 5:
                    state_next, reward, terminated, truncated, info = outcome
                    terminal = terminated or truncated
                else:
                    state_next, reward, terminal, info = outcome
                # The step that ends the episode is stored with a negated reward.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop is left.
        env.close()

In [5]:

import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  

# Discount factor applied to the best predicted next-state Q-value.
GAMMA = 0.95  
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001  
  
# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000  
# Transitions sampled per experience_replay() call; replay is skipped until
# the memory holds at least this many.
BATCH_SIZE = 20  

# Changed Exploration max to 0.8
# Initial exploration rate (probability of choosing a random action).
EXPLORATION_MAX = 0.8
# Floor the exploration rate is clamped to.
EXPLORATION_MIN = 0.01  
# Factor multiplied into the exploration rate after each replay pass.
EXPLORATION_DECAY = 0.995 

class DQNSolver:
    """Deep Q-learning agent with epsilon-greedy actions and experience replay.

    Attributes:
        exploration_rate: current probability of picking a random action;
            starts at EXPLORATION_MAX and is decayed by experience_replay().
        action_space: number of discrete actions (size of the network output).
        memory: bounded deque of (state, action, reward, next_state, done)
            tuples, capped at MEMORY_SIZE entries.
        model: Keras network producing one Q-value per action for a state.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of the observation vector (network input size).
            action_space: number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        # Linear output: the network regresses unbounded Q-values.
        self.model.add(Dense(self.action_space, activation="linear"))
        # The learning rate is Adam's first positional parameter. Passing it
        # positionally avoids the `lr` keyword, which newer Keras releases
        # renamed to `learning_rate`.
        self.model.compile(loss="mse", optimizer=Adam(LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index chosen epsilon-greedily.

        With probability `exploration_rate` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions have no future value; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            # Only the taken action's target changes; the other outputs keep
            # their current predictions so they contribute no error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step asks the agent for an action, stores the transition, and runs
    one experience-replay pass. The per-episode step count is printed and
    reported to the ScoreLogger.

    The outer loop has no exit condition of its own: it ends only when an
    exception propagates (for example a keyboard interrupt, or presumably
    whatever ScoreLogger.add_score raises once the task counts as solved).
    The environment is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while True:
            run += 1
            # gym >= 0.26 returns (observation, info) from reset(); older
            # releases return the bare observation.
            state = env.reset()
            if isinstance(state, tuple):
                state = state[0]
            # Add a leading batch axis so the state can be fed to the model.
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                # gym >= 0.26 returns a 5-tuple with separate terminated and
                # truncated flags; older releases return a 4-tuple with `done`.
                outcome = env.step(action)
                if len(outcome) == 5:
                    state_next, reward, terminated, truncated, info = outcome
                    terminal = terminated or truncated
                else:
                    state_next, reward, terminal, info = outcome
                # The step that ends the episode is stored with a negated reward.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop is left.
        env.close()

In [9]:

import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  

# Discount factor applied to the best predicted next-state Q-value.
GAMMA = 0.95  
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001  
  
# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000  
# Transitions sampled per experience_replay() call; replay is skipped until
# the memory holds at least this many.
BATCH_SIZE = 20  

# Changed Exploration min to 0.013
# Initial exploration rate (probability of choosing a random action).
EXPLORATION_MAX = 1.0
# Floor the exploration rate is clamped to.
EXPLORATION_MIN = 0.013  
# Factor multiplied into the exploration rate after each replay pass.
EXPLORATION_DECAY = 0.995 

class DQNSolver:
    """Deep Q-learning agent with epsilon-greedy actions and experience replay.

    Attributes:
        exploration_rate: current probability of picking a random action;
            starts at EXPLORATION_MAX and is decayed by experience_replay().
        action_space: number of discrete actions (size of the network output).
        memory: bounded deque of (state, action, reward, next_state, done)
            tuples, capped at MEMORY_SIZE entries.
        model: Keras network producing one Q-value per action for a state.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of the observation vector (network input size).
            action_space: number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        # Linear output: the network regresses unbounded Q-values.
        self.model.add(Dense(self.action_space, activation="linear"))
        # The learning rate is Adam's first positional parameter. Passing it
        # positionally avoids the `lr` keyword, which newer Keras releases
        # renamed to `learning_rate`.
        self.model.compile(loss="mse", optimizer=Adam(LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index chosen epsilon-greedily.

        With probability `exploration_rate` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions have no future value; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            # Only the taken action's target changes; the other outputs keep
            # their current predictions so they contribute no error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step asks the agent for an action, stores the transition, and runs
    one experience-replay pass. The per-episode step count is printed and
    reported to the ScoreLogger.

    The outer loop has no exit condition of its own: it ends only when an
    exception propagates (for example a keyboard interrupt, or presumably
    whatever ScoreLogger.add_score raises once the task counts as solved).
    The environment is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while True:
            run += 1
            # gym >= 0.26 returns (observation, info) from reset(); older
            # releases return the bare observation.
            state = env.reset()
            if isinstance(state, tuple):
                state = state[0]
            # Add a leading batch axis so the state can be fed to the model.
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                # gym >= 0.26 returns a 5-tuple with separate terminated and
                # truncated flags; older releases return a 4-tuple with `done`.
                outcome = env.step(action)
                if len(outcome) == 5:
                    state_next, reward, terminated, truncated, info = outcome
                    terminal = terminated or truncated
                else:
                    state_next, reward, terminal, info = outcome
                # The step that ends the episode is stored with a negated reward.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop is left.
        env.close()

In [11]:

import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  

# Discount factor applied to the best predicted next-state Q-value.
GAMMA = 0.95  
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001  
  
# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000  
# Transitions sampled per experience_replay() call; replay is skipped until
# the memory holds at least this many.
BATCH_SIZE = 20  

# Changed exploration decay to 0.905
# Initial exploration rate (probability of choosing a random action).
EXPLORATION_MAX = 1.0
# Floor the exploration rate is clamped to.
EXPLORATION_MIN = 0.01 
# Factor multiplied into the exploration rate after each replay pass.
EXPLORATION_DECAY = 0.905

class DQNSolver:
    """Deep Q-learning agent with epsilon-greedy actions and experience replay.

    Attributes:
        exploration_rate: current probability of picking a random action;
            starts at EXPLORATION_MAX and is decayed by experience_replay().
        action_space: number of discrete actions (size of the network output).
        memory: bounded deque of (state, action, reward, next_state, done)
            tuples, capped at MEMORY_SIZE entries.
        model: Keras network producing one Q-value per action for a state.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of the observation vector (network input size).
            action_space: number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        # Linear output: the network regresses unbounded Q-values.
        self.model.add(Dense(self.action_space, activation="linear"))
        # The learning rate is Adam's first positional parameter. Passing it
        # positionally avoids the `lr` keyword, which newer Keras releases
        # renamed to `learning_rate`.
        self.model.compile(loss="mse", optimizer=Adam(LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index chosen epsilon-greedily.

        With probability `exploration_rate` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions have no future value; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            # Only the taken action's target changes; the other outputs keep
            # their current predictions so they contribute no error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step asks the agent for an action, stores the transition, and runs
    one experience-replay pass. The per-episode step count is printed and
    reported to the ScoreLogger.

    The outer loop has no exit condition of its own: it ends only when an
    exception propagates (for example a keyboard interrupt, or presumably
    whatever ScoreLogger.add_score raises once the task counts as solved).
    The environment is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while True:
            run += 1
            # gym >= 0.26 returns (observation, info) from reset(); older
            # releases return the bare observation.
            state = env.reset()
            if isinstance(state, tuple):
                state = state[0]
            # Add a leading batch axis so the state can be fed to the model.
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                # gym >= 0.26 returns a 5-tuple with separate terminated and
                # truncated flags; older releases return a 4-tuple with `done`.
                outcome = env.step(action)
                if len(outcome) == 5:
                    state_next, reward, terminated, truncated, info = outcome
                    terminal = terminated or truncated
                else:
                    state_next, reward, terminal, info = outcome
                # The step that ends the episode is stored with a negated reward.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop is left.
        env.close()

In [13]:

import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  

# Discount factor applied to the best predicted next-state Q-value.
GAMMA = 0.95  
# Changed learning rate to 0.004
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.004  
  
# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000  
# Transitions sampled per experience_replay() call; replay is skipped until
# the memory holds at least this many.
BATCH_SIZE = 20  

# Changed Exploration decay to 0.905
# Initial exploration rate (probability of choosing a random action).
EXPLORATION_MAX = 1.0
# Floor the exploration rate is clamped to.
EXPLORATION_MIN = 0.01 
# Factor multiplied into the exploration rate after each replay pass.
EXPLORATION_DECAY = 0.905

class DQNSolver:
    """Deep Q-learning agent with epsilon-greedy actions and experience replay.

    Attributes:
        exploration_rate: current probability of picking a random action;
            starts at EXPLORATION_MAX and is decayed by experience_replay().
        action_space: number of discrete actions (size of the network output).
        memory: bounded deque of (state, action, reward, next_state, done)
            tuples, capped at MEMORY_SIZE entries.
        model: Keras network producing one Q-value per action for a state.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of the observation vector (network input size).
            action_space: number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        # Linear output: the network regresses unbounded Q-values.
        self.model.add(Dense(self.action_space, activation="linear"))
        # The learning rate is Adam's first positional parameter. Passing it
        # positionally avoids the `lr` keyword, which newer Keras releases
        # renamed to `learning_rate`.
        self.model.compile(loss="mse", optimizer=Adam(LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index chosen epsilon-greedily.

        With probability `exploration_rate` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions have no future value; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            # Only the taken action's target changes; the other outputs keep
            # their current predictions so they contribute no error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step asks the agent for an action, stores the transition, and runs
    one experience-replay pass. The per-episode step count is printed and
    reported to the ScoreLogger.

    The outer loop has no exit condition of its own: it ends only when an
    exception propagates (for example a keyboard interrupt, or presumably
    whatever ScoreLogger.add_score raises once the task counts as solved).
    The environment is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while True:
            run += 1
            # gym >= 0.26 returns (observation, info) from reset(); older
            # releases return the bare observation.
            state = env.reset()
            if isinstance(state, tuple):
                state = state[0]
            # Add a leading batch axis so the state can be fed to the model.
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                # gym >= 0.26 returns a 5-tuple with separate terminated and
                # truncated flags; older releases return a 4-tuple with `done`.
                outcome = env.step(action)
                if len(outcome) == 5:
                    state_next, reward, terminated, truncated, info = outcome
                    terminal = terminated or truncated
                else:
                    state_next, reward, terminal, info = outcome
                # The step that ends the episode is stored with a negated reward.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop is left.
        env.close()

In [16]:

import random  
import gym  
import numpy as np  
from collections import deque  
from keras.models import Sequential  
from keras.layers import Dense  
from keras.optimizers import Adam  
  
  
from scores.score_logger import ScoreLogger  
  
# Gym environment id; passed to gym.make() and to ScoreLogger in cartpole().
ENV_NAME = "CartPole-v1"  

# Changed gamma to 0.90
# Discount factor applied to the best predicted next-state Q-value.
GAMMA = 0.90  
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001  
  
# Maximum number of transitions kept in the replay deque.
MEMORY_SIZE = 1000000  
# Transitions sampled per experience_replay() call; replay is skipped until
# the memory holds at least this many.
BATCH_SIZE = 20  

# Initial exploration rate (probability of choosing a random action).
EXPLORATION_MAX = 1.0
# Floor the exploration rate is clamped to.
EXPLORATION_MIN = 0.01 
# Factor multiplied into the exploration rate after each replay pass.
EXPLORATION_DECAY = 0.995

class DQNSolver:
    """Deep Q-learning agent with epsilon-greedy actions and experience replay.

    Attributes:
        exploration_rate: current probability of picking a random action;
            starts at EXPLORATION_MAX and is decayed by experience_replay().
        action_space: number of discrete actions (size of the network output).
        memory: bounded deque of (state, action, reward, next_state, done)
            tuples, capped at MEMORY_SIZE entries.
        model: Keras network producing one Q-value per action for a state.
    """

    def __init__(self, observation_space, action_space):
        """Build the Q-network and an empty replay memory.

        Args:
            observation_space: length of the observation vector (network input size).
            action_space: number of discrete actions (network output size).
        """
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu"))
        self.model.add(Dense(24, activation="relu"))
        # Linear output: the network regresses unbounded Q-values.
        self.model.add(Dense(self.action_space, activation="linear"))
        # The learning rate is Adam's first positional parameter. Passing it
        # positionally avoids the `lr` keyword, which newer Keras releases
        # renamed to `learning_rate`.
        self.model.compile(loss="mse", optimizer=Adam(LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Return an action index chosen epsilon-greedily.

        With probability `exploration_rate` a uniformly random action is
        returned; otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        """Fit the network on a random batch of transitions, then decay exploration.

        Does nothing until the memory holds at least BATCH_SIZE transitions.
        Each sampled transition is fitted individually.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            # Terminal transitions have no future value; otherwise bootstrap
            # from the best predicted Q-value of the next state.
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            # Only the taken action's target changes; the other outputs keep
            # their current predictions so they contribute no error.
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
  
  
def cartpole():
    """Train a DQNSolver on the ENV_NAME environment, one episode per run.

    Each step asks the agent for an action, stores the transition, and runs
    one experience-replay pass. The per-episode step count is printed and
    reported to the ScoreLogger.

    The outer loop has no exit condition of its own: it ends only when an
    exception propagates (for example a keyboard interrupt, or presumably
    whatever ScoreLogger.add_score raises once the task counts as solved).
    The environment is closed on the way out.
    """
    env = gym.make(ENV_NAME)
    score_logger = ScoreLogger(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    try:
        while True:
            run += 1
            # gym >= 0.26 returns (observation, info) from reset(); older
            # releases return the bare observation.
            state = env.reset()
            if isinstance(state, tuple):
                state = state[0]
            # Add a leading batch axis so the state can be fed to the model.
            state = np.reshape(state, [1, observation_space])
            step = 0
            while True:
                step += 1
                #env.render()
                action = dqn_solver.act(state)
                # gym >= 0.26 returns a 5-tuple with separate terminated and
                # truncated flags; older releases return a 4-tuple with `done`.
                outcome = env.step(action)
                if len(outcome) == 5:
                    state_next, reward, terminated, truncated, info = outcome
                    terminal = terminated or truncated
                else:
                    state_next, reward, terminal, info = outcome
                # The step that ends the episode is stored with a negated reward.
                reward = reward if not terminal else -reward
                state_next = np.reshape(state_next, [1, observation_space])
                dqn_solver.remember(state, action, reward, state_next, terminal)
                state = state_next
                if terminal:
                    print ("Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    break
                dqn_solver.experience_replay()
    finally:
        # Release the environment's resources however the loop is left.
        env.close()

In [17]:
cartpole() # Changed gamma to 0.9
           # Solved in 76 runs, 176 runs total.

Run: 1, exploration: 1.0, score: 11
Scores: (min: 11, avg: 11, max: 11)

Run: 2, exploration: 0.8603841919146962, score: 39
Scores: (min: 11, avg: 25, max: 39)

Run: 3, exploration: 0.8020760579717637, score: 15
Scores: (min: 11, avg: 21.666666666666668, max: 39)

Run: 4, exploration: 0.7147372386831305, score: 24
Scores: (min: 11, avg: 22.25, max: 39)

Run: 5, exploration: 0.6242658676435396, score: 28
Scores: (min: 11, avg: 23.4, max: 39)

Run: 6, exploration: 0.5848838636585911, score: 14
Scores: (min: 11, avg: 21.833333333333332, max: 39)

Run: 7, exploration: 0.5371084840724134, score: 18
Scores: (min: 11, avg: 21.285714285714285, max: 39)

Run: 8, exploration: 0.500708706245853, score: 15
Scores: (min: 11, avg: 20.5, max: 39)

Run: 9, exploration: 0.47862223409330756, score: 10
Scores: (min: 10, avg: 19.333333333333332, max: 39)

Run: 10, exploration: 0.40565285250151817, score: 34
Scores: (min: 10, avg: 20.8, max: 39)

Run: 11, exploration: 0.322118930542046, score: 47
Scores: (

Run: 90, exploration: 0.01, score: 117
Scores: (min: 9, avg: 120.04444444444445, max: 418)

Run: 91, exploration: 0.01, score: 139
Scores: (min: 9, avg: 120.25274725274726, max: 418)

Run: 92, exploration: 0.01, score: 291
Scores: (min: 9, avg: 122.1086956521739, max: 418)

Run: 93, exploration: 0.01, score: 165
Scores: (min: 9, avg: 122.56989247311827, max: 418)

Run: 94, exploration: 0.01, score: 148
Scores: (min: 9, avg: 122.84042553191489, max: 418)

Run: 95, exploration: 0.01, score: 155
Scores: (min: 9, avg: 123.17894736842105, max: 418)

Run: 96, exploration: 0.01, score: 171
Scores: (min: 9, avg: 123.67708333333333, max: 418)

Run: 97, exploration: 0.01, score: 190
Scores: (min: 9, avg: 124.36082474226804, max: 418)

Run: 98, exploration: 0.01, score: 207
Scores: (min: 9, avg: 125.20408163265306, max: 418)

Run: 99, exploration: 0.01, score: 139
Scores: (min: 9, avg: 125.34343434343434, max: 418)

Run: 100, exploration: 0.01, score: 149
Scores: (min: 9, avg: 125.58, max: 418)



NameError: name 'exit' is not defined

In [14]:
cartpole() # Changed learning rate to 0.004 and decay to 0.905
           # Force-stopped after 1195 runs over a one-hour period

Run: 1, exploration: 1.0, score: 10
Scores: (min: 10, avg: 10, max: 10)

Run: 2, exploration: 0.5494035676106407, score: 16
Scores: (min: 10, avg: 13, max: 16)

Run: 3, exploration: 0.15008006353821474, score: 14
Scores: (min: 10, avg: 13.333333333333334, max: 16)

Run: 4, exploration: 0.0553106544202557, score: 11
Scores: (min: 10, avg: 12.75, max: 16)

Run: 5, exploration: 0.02252402547163457, score: 10
Scores: (min: 10, avg: 12.2, max: 16)

Run: 6, exploration: 0.01, score: 11
Scores: (min: 10, avg: 12, max: 16)

Run: 7, exploration: 0.01, score: 11
Scores: (min: 10, avg: 11.857142857142858, max: 16)

Run: 8, exploration: 0.01, score: 10
Scores: (min: 10, avg: 11.625, max: 16)

Run: 9, exploration: 0.01, score: 13
Scores: (min: 10, avg: 11.777777777777779, max: 16)

Run: 10, exploration: 0.01, score: 14
Scores: (min: 10, avg: 12, max: 16)

Run: 11, exploration: 0.01, score: 12
Scores: (min: 10, avg: 12, max: 16)

Run: 12, exploration: 0.01, score: 20
Scores: (min: 10, avg: 12.666666

Run: 94, exploration: 0.01, score: 10
Scores: (min: 9, avg: 125.80851063829788, max: 500)

Run: 95, exploration: 0.01, score: 9
Scores: (min: 9, avg: 124.57894736842105, max: 500)

Run: 96, exploration: 0.01, score: 10
Scores: (min: 9, avg: 123.38541666666667, max: 500)

Run: 97, exploration: 0.01, score: 9
Scores: (min: 9, avg: 122.20618556701031, max: 500)

Run: 98, exploration: 0.01, score: 10
Scores: (min: 9, avg: 121.06122448979592, max: 500)

Run: 99, exploration: 0.01, score: 9
Scores: (min: 9, avg: 119.92929292929293, max: 500)

Run: 100, exploration: 0.01, score: 10
Scores: (min: 9, avg: 118.83, max: 500)

Run: 101, exploration: 0.01, score: 10
Scores: (min: 9, avg: 118.83, max: 500)

Run: 102, exploration: 0.01, score: 10
Scores: (min: 9, avg: 118.77, max: 500)

Run: 103, exploration: 0.01, score: 8
Scores: (min: 8, avg: 118.71, max: 500)

Run: 104, exploration: 0.01, score: 11
Scores: (min: 8, avg: 118.71, max: 500)

Run: 105, exploration: 0.01, score: 10
Scores: (min: 8, av

Run: 198, exploration: 0.01, score: 10
Scores: (min: 8, avg: 9.35, max: 13)

Run: 199, exploration: 0.01, score: 9
Scores: (min: 8, avg: 9.35, max: 13)

Run: 200, exploration: 0.01, score: 10
Scores: (min: 8, avg: 9.35, max: 13)

Run: 201, exploration: 0.01, score: 10
Scores: (min: 8, avg: 9.35, max: 13)

Run: 202, exploration: 0.01, score: 10
Scores: (min: 8, avg: 9.35, max: 13)

Run: 203, exploration: 0.01, score: 9
Scores: (min: 8, avg: 9.36, max: 13)

Run: 204, exploration: 0.01, score: 11
Scores: (min: 8, avg: 9.36, max: 13)

Run: 205, exploration: 0.01, score: 10
Scores: (min: 8, avg: 9.36, max: 13)

Run: 206, exploration: 0.01, score: 9
Scores: (min: 8, avg: 9.35, max: 13)

Run: 207, exploration: 0.01, score: 9
Scores: (min: 8, avg: 9.35, max: 13)

Run: 208, exploration: 0.01, score: 10
Scores: (min: 8, avg: 9.36, max: 13)

Run: 209, exploration: 0.01, score: 9
Scores: (min: 8, avg: 9.34, max: 13)

Run: 210, exploration: 0.01, score: 8
Scores: (min: 8, avg: 9.32, max: 13)

Run: 

Run: 305, exploration: 0.01, score: 18
Scores: (min: 8, avg: 17.7, max: 48)

Run: 306, exploration: 0.01, score: 10
Scores: (min: 8, avg: 17.71, max: 48)

Run: 307, exploration: 0.01, score: 15
Scores: (min: 8, avg: 17.77, max: 48)

Run: 308, exploration: 0.01, score: 38
Scores: (min: 8, avg: 18.05, max: 48)

Run: 309, exploration: 0.01, score: 11
Scores: (min: 8, avg: 18.07, max: 48)

Run: 310, exploration: 0.01, score: 50
Scores: (min: 8, avg: 18.49, max: 50)

Run: 311, exploration: 0.01, score: 54
Scores: (min: 8, avg: 18.93, max: 54)

Run: 312, exploration: 0.01, score: 15
Scores: (min: 8, avg: 19, max: 54)

Run: 313, exploration: 0.01, score: 31
Scores: (min: 8, avg: 19.21, max: 54)

Run: 314, exploration: 0.01, score: 54
Scores: (min: 8, avg: 19.66, max: 54)

Run: 315, exploration: 0.01, score: 23
Scores: (min: 8, avg: 19.79, max: 54)

Run: 316, exploration: 0.01, score: 19
Scores: (min: 8, avg: 19.83, max: 54)

Run: 317, exploration: 0.01, score: 24
Scores: (min: 8, avg: 19.98, 

Run: 410, exploration: 0.01, score: 46
Scores: (min: 8, avg: 30.63, max: 129)

Run: 411, exploration: 0.01, score: 15
Scores: (min: 8, avg: 30.24, max: 129)

Run: 412, exploration: 0.01, score: 32
Scores: (min: 8, avg: 30.41, max: 129)

Run: 413, exploration: 0.01, score: 17
Scores: (min: 8, avg: 30.27, max: 129)

Run: 414, exploration: 0.01, score: 14
Scores: (min: 8, avg: 29.87, max: 129)

Run: 415, exploration: 0.01, score: 31
Scores: (min: 8, avg: 29.95, max: 129)

Run: 416, exploration: 0.01, score: 9
Scores: (min: 8, avg: 29.85, max: 129)

Run: 417, exploration: 0.01, score: 119
Scores: (min: 8, avg: 30.8, max: 129)

Run: 418, exploration: 0.01, score: 24
Scores: (min: 8, avg: 30.95, max: 129)

Run: 419, exploration: 0.01, score: 12
Scores: (min: 8, avg: 30.97, max: 129)

Run: 420, exploration: 0.01, score: 11
Scores: (min: 8, avg: 30.91, max: 129)

Run: 421, exploration: 0.01, score: 9
Scores: (min: 8, avg: 30.39, max: 129)

Run: 422, exploration: 0.01, score: 55
Scores: (min: 8

Run: 514, exploration: 0.01, score: 86
Scores: (min: 8, avg: 37.86, max: 145)

Run: 515, exploration: 0.01, score: 15
Scores: (min: 8, avg: 37.7, max: 145)

Run: 516, exploration: 0.01, score: 67
Scores: (min: 8, avg: 38.28, max: 145)

Run: 517, exploration: 0.01, score: 31
Scores: (min: 8, avg: 37.4, max: 145)

Run: 518, exploration: 0.01, score: 68
Scores: (min: 8, avg: 37.84, max: 145)

Run: 519, exploration: 0.01, score: 85
Scores: (min: 8, avg: 38.57, max: 145)

Run: 520, exploration: 0.01, score: 32
Scores: (min: 8, avg: 38.78, max: 145)

Run: 521, exploration: 0.01, score: 31
Scores: (min: 8, avg: 39, max: 145)

Run: 522, exploration: 0.01, score: 28
Scores: (min: 8, avg: 38.73, max: 145)

Run: 523, exploration: 0.01, score: 51
Scores: (min: 8, avg: 39.05, max: 145)

Run: 524, exploration: 0.01, score: 44
Scores: (min: 8, avg: 39.25, max: 145)

Run: 525, exploration: 0.01, score: 19
Scores: (min: 8, avg: 39.09, max: 145)

Run: 526, exploration: 0.01, score: 15
Scores: (min: 8, a

Run: 618, exploration: 0.01, score: 17
Scores: (min: 8, avg: 53.06, max: 277)

Run: 619, exploration: 0.01, score: 12
Scores: (min: 8, avg: 52.33, max: 277)

Run: 620, exploration: 0.01, score: 11
Scores: (min: 8, avg: 52.12, max: 277)

Run: 621, exploration: 0.01, score: 9
Scores: (min: 8, avg: 51.9, max: 277)

Run: 622, exploration: 0.01, score: 9
Scores: (min: 8, avg: 51.71, max: 277)

Run: 623, exploration: 0.01, score: 11
Scores: (min: 8, avg: 51.31, max: 277)

Run: 624, exploration: 0.01, score: 11
Scores: (min: 8, avg: 50.98, max: 277)

Run: 625, exploration: 0.01, score: 98
Scores: (min: 8, avg: 51.77, max: 277)

Run: 626, exploration: 0.01, score: 224
Scores: (min: 8, avg: 53.86, max: 277)

Run: 627, exploration: 0.01, score: 138
Scores: (min: 8, avg: 55.07, max: 277)

Run: 628, exploration: 0.01, score: 84
Scores: (min: 8, avg: 55.7, max: 277)

Run: 629, exploration: 0.01, score: 18
Scores: (min: 8, avg: 55.43, max: 277)

Run: 630, exploration: 0.01, score: 20
Scores: (min: 8

Run: 722, exploration: 0.01, score: 13
Scores: (min: 8, avg: 65.63, max: 279)

Run: 723, exploration: 0.01, score: 164
Scores: (min: 8, avg: 67.16, max: 279)

Run: 724, exploration: 0.01, score: 91
Scores: (min: 8, avg: 67.96, max: 279)

Run: 725, exploration: 0.01, score: 111
Scores: (min: 8, avg: 68.09, max: 279)

Run: 726, exploration: 0.01, score: 163
Scores: (min: 8, avg: 67.48, max: 279)

Run: 727, exploration: 0.01, score: 53
Scores: (min: 8, avg: 66.63, max: 279)

Run: 728, exploration: 0.01, score: 158
Scores: (min: 8, avg: 67.37, max: 279)

Run: 729, exploration: 0.01, score: 60
Scores: (min: 8, avg: 67.79, max: 279)

Run: 730, exploration: 0.01, score: 320
Scores: (min: 8, avg: 70.79, max: 320)

Run: 731, exploration: 0.01, score: 178
Scores: (min: 8, avg: 72.31, max: 320)

Run: 732, exploration: 0.01, score: 17
Scores: (min: 8, avg: 71.98, max: 320)

Run: 733, exploration: 0.01, score: 164
Scores: (min: 8, avg: 73.39, max: 320)

Run: 734, exploration: 0.01, score: 56
Scores

Run: 825, exploration: 0.01, score: 97
Scores: (min: 9, avg: 141.29, max: 500)

Run: 826, exploration: 0.01, score: 147
Scores: (min: 9, avg: 141.13, max: 500)

Run: 827, exploration: 0.01, score: 264
Scores: (min: 9, avg: 143.24, max: 500)

Run: 828, exploration: 0.01, score: 274
Scores: (min: 9, avg: 144.4, max: 500)

Run: 829, exploration: 0.01, score: 261
Scores: (min: 9, avg: 146.41, max: 500)

Run: 830, exploration: 0.01, score: 193
Scores: (min: 9, avg: 145.14, max: 500)

Run: 831, exploration: 0.01, score: 345
Scores: (min: 9, avg: 146.81, max: 500)

Run: 832, exploration: 0.01, score: 206
Scores: (min: 9, avg: 148.7, max: 500)

Run: 833, exploration: 0.01, score: 144
Scores: (min: 9, avg: 148.5, max: 500)

Run: 834, exploration: 0.01, score: 270
Scores: (min: 9, avg: 150.64, max: 500)

Run: 835, exploration: 0.01, score: 197
Scores: (min: 9, avg: 152.43, max: 500)

Run: 836, exploration: 0.01, score: 500
Scores: (min: 9, avg: 156.41, max: 500)

Run: 837, exploration: 0.01, sco

Run: 927, exploration: 0.01, score: 233
Scores: (min: 8, avg: 128.5, max: 500)

Run: 928, exploration: 0.01, score: 200
Scores: (min: 8, avg: 127.76, max: 500)

Run: 929, exploration: 0.01, score: 296
Scores: (min: 8, avg: 128.11, max: 500)

Run: 930, exploration: 0.01, score: 170
Scores: (min: 8, avg: 127.88, max: 500)

Run: 931, exploration: 0.01, score: 251
Scores: (min: 8, avg: 126.94, max: 500)

Run: 932, exploration: 0.01, score: 92
Scores: (min: 8, avg: 125.8, max: 500)

Run: 933, exploration: 0.01, score: 193
Scores: (min: 8, avg: 126.29, max: 500)

Run: 934, exploration: 0.01, score: 286
Scores: (min: 8, avg: 126.45, max: 500)

Run: 935, exploration: 0.01, score: 107
Scores: (min: 8, avg: 125.55, max: 500)

Run: 936, exploration: 0.01, score: 261
Scores: (min: 8, avg: 123.16, max: 500)

Run: 937, exploration: 0.01, score: 482
Scores: (min: 8, avg: 126.1, max: 500)

Run: 938, exploration: 0.01, score: 239
Scores: (min: 8, avg: 126.86, max: 500)

Run: 939, exploration: 0.01, sco

Run: 1029, exploration: 0.01, score: 21
Scores: (min: 9, avg: 153.14, max: 500)

Run: 1030, exploration: 0.01, score: 90
Scores: (min: 9, avg: 152.34, max: 500)

Run: 1031, exploration: 0.01, score: 83
Scores: (min: 9, avg: 150.66, max: 500)

Run: 1032, exploration: 0.01, score: 166
Scores: (min: 9, avg: 151.4, max: 500)

Run: 1033, exploration: 0.01, score: 436
Scores: (min: 9, avg: 153.83, max: 500)

Run: 1034, exploration: 0.01, score: 238
Scores: (min: 9, avg: 153.35, max: 500)

Run: 1035, exploration: 0.01, score: 75
Scores: (min: 9, avg: 153.03, max: 500)

Run: 1036, exploration: 0.01, score: 117
Scores: (min: 9, avg: 151.59, max: 500)

Run: 1037, exploration: 0.01, score: 292
Scores: (min: 9, avg: 149.69, max: 500)

Run: 1038, exploration: 0.01, score: 182
Scores: (min: 9, avg: 149.12, max: 500)

Run: 1039, exploration: 0.01, score: 178
Scores: (min: 9, avg: 150.26, max: 500)

Run: 1040, exploration: 0.01, score: 105
Scores: (min: 9, avg: 150.64, max: 500)

Run: 1041, exploratio

Run: 1130, exploration: 0.01, score: 74
Scores: (min: 9, avg: 121.45, max: 500)

Run: 1131, exploration: 0.01, score: 39
Scores: (min: 9, avg: 121.01, max: 500)

Run: 1132, exploration: 0.01, score: 25
Scores: (min: 9, avg: 119.6, max: 500)

Run: 1133, exploration: 0.01, score: 10
Scores: (min: 9, avg: 115.34, max: 500)

Run: 1134, exploration: 0.01, score: 14
Scores: (min: 9, avg: 113.1, max: 500)

Run: 1135, exploration: 0.01, score: 184
Scores: (min: 9, avg: 114.19, max: 500)

Run: 1136, exploration: 0.01, score: 122
Scores: (min: 9, avg: 114.24, max: 500)

Run: 1137, exploration: 0.01, score: 152
Scores: (min: 9, avg: 112.84, max: 500)

Run: 1138, exploration: 0.01, score: 49
Scores: (min: 9, avg: 111.51, max: 500)

Run: 1139, exploration: 0.01, score: 40
Scores: (min: 9, avg: 110.13, max: 500)

Run: 1140, exploration: 0.01, score: 44
Scores: (min: 9, avg: 109.52, max: 500)

Run: 1141, exploration: 0.01, score: 55
Scores: (min: 9, avg: 109.13, max: 500)

Run: 1142, exploration: 0.0

KeyboardInterrupt: 

In [12]:
cartpole() # Changed exploration decay to 0.905
           # Solved in 243 runs, 343 runs total

Run: 1, exploration: 0.08245452233512682, score: 45
Scores: (min: 45, avg: 45, max: 45)

Run: 2, exploration: 0.03710249487545281, score: 9
Scores: (min: 9, avg: 27, max: 45)

Run: 3, exploration: 0.015109160222299656, score: 10
Scores: (min: 9, avg: 21.333333333333332, max: 45)

Run: 4, exploration: 0.01, score: 8
Scores: (min: 8, avg: 18, max: 45)

Run: 5, exploration: 0.01, score: 10
Scores: (min: 8, avg: 16.4, max: 45)

Run: 6, exploration: 0.01, score: 9
Scores: (min: 8, avg: 15.166666666666666, max: 45)

Run: 7, exploration: 0.01, score: 10
Scores: (min: 8, avg: 14.428571428571429, max: 45)

Run: 8, exploration: 0.01, score: 11
Scores: (min: 8, avg: 14, max: 45)

Run: 9, exploration: 0.01, score: 9
Scores: (min: 8, avg: 13.444444444444445, max: 45)

Run: 10, exploration: 0.01, score: 12
Scores: (min: 8, avg: 13.3, max: 45)

Run: 11, exploration: 0.01, score: 13
Scores: (min: 8, avg: 13.272727272727273, max: 45)

Run: 12, exploration: 0.01, score: 15
Scores: (min: 8, avg: 13.41666

Run: 94, exploration: 0.01, score: 144
Scores: (min: 8, avg: 137.32978723404256, max: 354)

Run: 95, exploration: 0.01, score: 124
Scores: (min: 8, avg: 137.18947368421053, max: 354)

Run: 96, exploration: 0.01, score: 286
Scores: (min: 8, avg: 138.73958333333334, max: 354)

Run: 97, exploration: 0.01, score: 100
Scores: (min: 8, avg: 138.340206185567, max: 354)

Run: 98, exploration: 0.01, score: 99
Scores: (min: 8, avg: 137.9387755102041, max: 354)

Run: 99, exploration: 0.01, score: 222
Scores: (min: 8, avg: 138.78787878787878, max: 354)

Run: 100, exploration: 0.01, score: 188
Scores: (min: 8, avg: 139.28, max: 354)

Run: 101, exploration: 0.01, score: 221
Scores: (min: 8, avg: 141.04, max: 354)

Run: 102, exploration: 0.01, score: 155
Scores: (min: 8, avg: 142.5, max: 354)

Run: 103, exploration: 0.01, score: 223
Scores: (min: 8, avg: 144.63, max: 354)

Run: 104, exploration: 0.01, score: 193
Scores: (min: 9, avg: 146.48, max: 354)

Run: 105, exploration: 0.01, score: 80
Scores: (

Run: 194, exploration: 0.01, score: 11
Scores: (min: 11, avg: 177.66, max: 500)

Run: 195, exploration: 0.01, score: 13
Scores: (min: 11, avg: 176.55, max: 500)

Run: 196, exploration: 0.01, score: 210
Scores: (min: 11, avg: 175.79, max: 500)

Run: 197, exploration: 0.01, score: 122
Scores: (min: 11, avg: 176.01, max: 500)

Run: 198, exploration: 0.01, score: 184
Scores: (min: 11, avg: 176.86, max: 500)

Run: 199, exploration: 0.01, score: 116
Scores: (min: 11, avg: 175.8, max: 500)

Run: 200, exploration: 0.01, score: 294
Scores: (min: 11, avg: 176.86, max: 500)

Run: 201, exploration: 0.01, score: 163
Scores: (min: 11, avg: 176.28, max: 500)

Run: 202, exploration: 0.01, score: 450
Scores: (min: 11, avg: 179.23, max: 500)

Run: 203, exploration: 0.01, score: 195
Scores: (min: 11, avg: 178.95, max: 500)

Run: 204, exploration: 0.01, score: 113
Scores: (min: 11, avg: 178.15, max: 500)

Run: 205, exploration: 0.01, score: 133
Scores: (min: 11, avg: 178.68, max: 500)

Run: 206, explorati

Run: 295, exploration: 0.01, score: 154
Scores: (min: 9, avg: 183.39, max: 500)

Run: 296, exploration: 0.01, score: 248
Scores: (min: 9, avg: 183.77, max: 500)

Run: 297, exploration: 0.01, score: 466
Scores: (min: 9, avg: 187.21, max: 500)

Run: 298, exploration: 0.01, score: 171
Scores: (min: 9, avg: 187.08, max: 500)

Run: 299, exploration: 0.01, score: 107
Scores: (min: 9, avg: 186.99, max: 500)

Run: 300, exploration: 0.01, score: 25
Scores: (min: 9, avg: 184.3, max: 500)

Run: 301, exploration: 0.01, score: 19
Scores: (min: 9, avg: 182.86, max: 500)

Run: 302, exploration: 0.01, score: 18
Scores: (min: 9, avg: 178.54, max: 500)

Run: 303, exploration: 0.01, score: 136
Scores: (min: 9, avg: 177.95, max: 500)

Run: 304, exploration: 0.01, score: 157
Scores: (min: 9, avg: 178.39, max: 500)

Run: 305, exploration: 0.01, score: 500
Scores: (min: 9, avg: 182.06, max: 500)

Run: 306, exploration: 0.01, score: 107
Scores: (min: 9, avg: 180.76, max: 500)

Run: 307, exploration: 0.01, sco

NameError: name 'exit' is not defined

In [10]:
cartpole() # Changed exploration min to 0.013
           # Solved in 23 runs, 123 total runs

Run: 1, exploration: 0.851801859600347, score: 52
Scores: (min: 52, avg: 52, max: 52)

Run: 2, exploration: 0.7292124703704616, score: 32
Scores: (min: 32, avg: 42, max: 52)

Run: 3, exploration: 0.6900935609921609, score: 12
Scores: (min: 12, avg: 32, max: 52)

Run: 4, exploration: 0.6274028820538087, score: 20
Scores: (min: 12, avg: 29, max: 52)

Run: 5, exploration: 0.5967292370047992, score: 11
Scores: (min: 11, avg: 25.4, max: 52)

Run: 6, exploration: 0.567555222460375, score: 11
Scores: (min: 11, avg: 23, max: 52)

Run: 7, exploration: 0.5032248303978422, score: 25
Scores: (min: 11, avg: 23.285714285714285, max: 52)

Run: 8, exploration: 0.4738479773082268, score: 13
Scores: (min: 11, avg: 22, max: 52)

Run: 9, exploration: 0.43296668905325736, score: 19
Scores: (min: 11, avg: 21.666666666666668, max: 52)

Run: 10, exploration: 0.40974000909221303, score: 12
Scores: (min: 11, avg: 20.7, max: 52)

Run: 11, exploration: 0.39166620452737816, score: 10
Scores: (min: 10, avg: 19.7272

Run: 88, exploration: 0.013, score: 307
Scores: (min: 8, avg: 145.9431818181818, max: 500)

Run: 89, exploration: 0.013, score: 351
Scores: (min: 8, avg: 148.24719101123594, max: 500)

Run: 90, exploration: 0.013, score: 230
Scores: (min: 8, avg: 149.15555555555557, max: 500)

Run: 91, exploration: 0.013, score: 168
Scores: (min: 8, avg: 149.36263736263737, max: 500)

Run: 92, exploration: 0.013, score: 203
Scores: (min: 8, avg: 149.94565217391303, max: 500)

Run: 93, exploration: 0.013, score: 227
Scores: (min: 8, avg: 150.7741935483871, max: 500)

Run: 94, exploration: 0.013, score: 193
Scores: (min: 8, avg: 151.22340425531914, max: 500)

Run: 95, exploration: 0.013, score: 280
Scores: (min: 8, avg: 152.57894736842104, max: 500)

Run: 96, exploration: 0.013, score: 219
Scores: (min: 8, avg: 153.27083333333334, max: 500)

Run: 97, exploration: 0.013, score: 279
Scores: (min: 8, avg: 154.56701030927834, max: 500)

Run: 98, exploration: 0.013, score: 206
Scores: (min: 8, avg: 155.091836

NameError: name 'exit' is not defined

In [6]:
cartpole() # Changed exploration max to 0.8 results below
           # Solved in 14 runs, 114 total runs

Run: 1, exploration: 0.7685544348603495, score: 28
Scores: (min: 28, avg: 28, max: 28)

Run: 2, exploration: 0.6712708882964634, score: 28
Scores: (min: 28, avg: 28, max: 28)

Run: 3, exploration: 0.6102901313127972, score: 20
Scores: (min: 20, avg: 25.333333333333332, max: 28)

Run: 4, exploration: 0.5717897909465046, score: 14
Scores: (min: 14, avg: 22.5, max: 28)

Run: 5, exploration: 0.5224585615557593, score: 19
Scores: (min: 14, avg: 21.8, max: 28)

Run: 6, exploration: 0.4894991027424384, score: 14
Scores: (min: 14, avg: 20.5, max: 28)

Run: 7, exploration: 0.38098329833827305, score: 51
Scores: (min: 14, avg: 24.857142857142858, max: 51)

Run: 8, exploration: 0.3412036856786414, score: 23
Scores: (min: 14, avg: 24.625, max: 51)

Run: 9, exploration: 0.3055775820842425, score: 23
Scores: (min: 14, avg: 24.444444444444443, max: 51)

Run: 10, exploration: 0.26159955042365357, score: 32
Scores: (min: 14, avg: 25.2, max: 51)

Run: 11, exploration: 0.16330767032038201, score: 95
Scor

Run: 90, exploration: 0.01, score: 170
Scores: (min: 13, avg: 162.65555555555557, max: 500)

Run: 91, exploration: 0.01, score: 190
Scores: (min: 13, avg: 162.95604395604394, max: 500)

Run: 92, exploration: 0.01, score: 189
Scores: (min: 13, avg: 163.2391304347826, max: 500)

Run: 93, exploration: 0.01, score: 186
Scores: (min: 13, avg: 163.48387096774192, max: 500)

Run: 94, exploration: 0.01, score: 500
Scores: (min: 13, avg: 167.06382978723406, max: 500)

Run: 95, exploration: 0.01, score: 165
Scores: (min: 13, avg: 167.0421052631579, max: 500)

Run: 96, exploration: 0.01, score: 280
Scores: (min: 13, avg: 168.21875, max: 500)

Run: 97, exploration: 0.01, score: 159
Scores: (min: 13, avg: 168.1237113402062, max: 500)

Run: 98, exploration: 0.01, score: 275
Scores: (min: 13, avg: 169.21428571428572, max: 500)

Run: 99, exploration: 0.01, score: 119
Scores: (min: 13, avg: 168.7070707070707, max: 500)

Run: 100, exploration: 0.01, score: 185
Scores: (min: 13, avg: 168.87, max: 500)

R

NameError: name 'exit' is not defined

In [2]:
cartpole() # No changes made - Results below
           # Solved in 12 runs, 112 total runs

Run: 1, exploration: 1.0, score: 11
Scores: (min: 11, avg: 11, max: 11)

Run: 2, exploration: 0.8603841919146962, score: 39
Scores: (min: 11, avg: 25, max: 39)

Run: 3, exploration: 0.7901049725470279, score: 18
Scores: (min: 11, avg: 22.666666666666668, max: 39)

Run: 4, exploration: 0.736559652908221, score: 15
Scores: (min: 11, avg: 20.75, max: 39)

Run: 5, exploration: 0.7076077347272662, score: 9
Scores: (min: 9, avg: 18.4, max: 39)

Run: 6, exploration: 0.6763948591909945, score: 10
Scores: (min: 9, avg: 17, max: 39)

Run: 7, exploration: 0.6088145090359074, score: 22
Scores: (min: 9, avg: 17.714285714285715, max: 39)

Run: 8, exploration: 0.5790496471185967, score: 11
Scores: (min: 9, avg: 16.875, max: 39)

Run: 9, exploration: 0.5562889678716474, score: 9
Scores: (min: 9, avg: 16, max: 39)

Run: 10, exploration: 0.5344229416520513, score: 9
Scores: (min: 9, avg: 15.3, max: 39)

Run: 11, exploration: 0.4883155414435353, score: 19
Scores: (min: 9, avg: 15.636363636363637, max: 39

Run: 89, exploration: 0.01, score: 328
Scores: (min: 8, avg: 160.26966292134833, max: 500)

Run: 90, exploration: 0.01, score: 232
Scores: (min: 8, avg: 161.06666666666666, max: 500)

Run: 91, exploration: 0.01, score: 179
Scores: (min: 8, avg: 161.26373626373626, max: 500)

Run: 92, exploration: 0.01, score: 198
Scores: (min: 8, avg: 161.66304347826087, max: 500)

Run: 93, exploration: 0.01, score: 228
Scores: (min: 8, avg: 162.3763440860215, max: 500)

Run: 94, exploration: 0.01, score: 247
Scores: (min: 8, avg: 163.27659574468086, max: 500)

Run: 95, exploration: 0.01, score: 415
Scores: (min: 8, avg: 165.92631578947368, max: 500)

Run: 96, exploration: 0.01, score: 241
Scores: (min: 8, avg: 166.70833333333334, max: 500)

Run: 97, exploration: 0.01, score: 327
Scores: (min: 8, avg: 168.36082474226805, max: 500)

Run: 98, exploration: 0.01, score: 263
Scores: (min: 8, avg: 169.3265306122449, max: 500)

Run: 99, exploration: 0.01, score: 216
Scores: (min: 8, avg: 169.7979797979798, ma

NameError: name 'exit' is not defined

Note: If the code is running properly, you should begin to see output appearing above this code block. It will take several minutes, so it is recommended that you let this code run in the background while completing other work. When the code has finished, it will print output saying, "Solved in _ runs, _ total runs."

You may see an error about not having an exit command. This error does not affect the program's functionality and results from the steps taken to convert the code from Python 2.x to Python 3. Please disregard this error.

# Markdown - Assignment 5

## Analysis of CartPole Problem and Q-learning

### Reinforcement Learning Concepts for the Cartpole Problem
Reinforcement learning is applied to the CartPole problem by training an agent to balance a pole on a moving cart. The agent's goal is to maximize its cumulative reward over time. The state values represent the current state of the environment, which includes the cart's position, velocity, pole angle, and pole angular velocity. Possible actions include moving the cart left or right. The reinforcement algorithm used is a form of Q-learning, where the agent learns a Q-value function to estimate the expected future rewards for each state-action pair.

### Experience Replay in the Cartpole Problem
Experience replay is applied to the CartPole problem by storing past experiences (state, action, reward, next state, and a flag indicating whether the episode ended) in a replay buffer. During training, random batches of experiences are sampled from the buffer, breaking the temporal correlation between consecutive samples and thus improving the stability of the learning process.

### Neural Networks in the Cartpole Problem
Deep Q-learning employs a neural network to approximate the Q-value function. The architecture typically consists of input nodes representing the state space, hidden layers for complex feature representation, and output nodes for each possible action. This neural network makes the Q-learning algorithm more efficient by generalizing Q-values across similar states.

### Effect of Modifying Parameters

#### Modifying Discount Factor (Gamma)
*Changed gamma to 0.9.*
**Result:** Solved in 76 runs out of 176 total runs.
**Analysis:** A discount factor of 0.9 likely emphasizes short-term rewards slightly more than the default, influencing the agent to prioritize immediate gains.

#### Modifying Learning Rate and Decay
*Changed learning rate to 0.004 and decay to 0.905.*
**Result:** Force stopped after 1195 runs over an hour.
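**Analysis:** A higher learning rate (0.004 vs. the default 0.001) combined with a smaller exploration decay factor (0.905 vs. the default 0.995, so the exploration rate drops to its minimum much sooner) likely made learning less stable, and in this case, it resulted in a prolonged training time without finding a solution.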

#### Modifying Exploration Max
*Changed exploration max to 0.8.*
**Result:** Solved in 14 runs out of 114 total runs.
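**Analysis:** Decreasing exploration max from the default 1.0 to 0.8 means the agent starts with less random exploration and relies on its learned Q-values sooner. The agent still reached a successful solution, but with a slightly higher number of runs than the default configuration (14 vs. 12).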

#### Modifying Exploration Min
*Changed exploration min to 0.013.*
**Result:** Solved in 23 runs out of 123 total runs.
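**Analysis:** Increasing exploration min from the default 0.01 to 0.013 keeps slightly more random exploration late in training. In this run the agent needed more runs to find a solution than with the default settings (23 vs. 12), although with only one run per setting the difference may be due to random variation.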

#### Modifying Exploration Decay
*Changed exploration decay to 0.905.*
**Result:** Solved in 243 runs out of 343 total runs.
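**Analysis:** Decreasing the exploration decay factor from the default 0.995 to 0.905 makes the exploration rate fall to its minimum within the first few runs, so the agent explores far less early in training. It still found a solution, but needed many more runs than the default configuration (243 vs. 12).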

#### No Changes Made
**Result:** Solved in 12 runs out of 112 total runs.
**Analysis:** The default parameters seem to perform reasonably well, solving the problem in a relatively low number of runs.

### Citations and Attributions
[Surma, G. (2018). Cartpole. Github repository.](https://github.com/gsurma/cartpole)
