In [1]:
import csv
import math
import pickle
import time

import numpy as np
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

# Algorithms and Functions

In [73]:
WATERMARK = False
GATECH_USERNAME = 'DO NOT STEAL'
TERM = 'SPRING 2019'


def watermark(p):
    """Optionally stamp a repeated text watermark onto the current axes.

    When the module-level ``WATERMARK`` flag is falsy, ``p`` is returned
    untouched.  Otherwise ten semi-transparent copies of
    ``'<GATECH_USERNAME> <TERM>'`` are written through ``p.text`` in axes
    coordinates, stacked from top to bottom.

    :param p: object exposing a ``text`` method (callers pass ``plt``).
    :return: the same ``p`` that was passed in.
    """
    if WATERMARK:
        axes = plt.gca()
        for row in range(1, 11):
            # Each row sits a further tenth of the axes height below the top.
            p.text(0.95, 0.95 - (row * (1.0/10)), '{} {}'.format(GATECH_USERNAME, TERM),
                   transform=axes.transAxes, fontsize=32, color='gray',
                   ha='right', va='bottom', alpha=0.2)
    return p

def plot_policy_map(title, policy, map_desc, color_map, direction_map):
    """Draw the grid with one coloured square and one action glyph per cell.

    :param title: figure title.
    :param policy: 2-D array; ``policy[i, j]`` is looked up in ``direction_map``.
    :param map_desc: 2-D array of cell codes; ``map_desc[i, j]`` is looked up
        in ``color_map``.
    :param color_map: mapping from cell code to a matplotlib colour.
    :param direction_map: mapping from policy entry to the text drawn in the cell.
    :return: whatever ``watermark(plt)`` returns (the ``plt`` module), so the
        caller can ``savefig`` / ``close``.
    """
    fig = plt.figure()
    n_rows, n_cols = policy.shape[0], policy.shape[1]
    ax = fig.add_subplot(111, xlim=(0, n_cols), ylim=(0, n_rows))
    # Wide grids need a smaller glyph to fit inside a cell.
    font_size = 'x-small' if n_cols >= 16 else 'x-large'
    plt.title(title)

    for i, j in np.ndindex(n_rows, n_cols):
        # Row 0 of the array is drawn at the top of the figure.
        x, y = j, n_rows - i - 1
        cell = plt.Rectangle([x, y], 1, 1)
        cell.set_facecolor(color_map[map_desc[i, j]])
        ax.add_patch(cell)

        glyph = ax.text(x+0.5, y+0.5, direction_map[policy[i, j]], weight='bold', size=font_size,
                        horizontalalignment='center', verticalalignment='center', color='w')
        # Black outline keeps the white glyph readable on light cells.
        glyph.set_path_effects([path_effects.Stroke(linewidth=2, foreground='black'),
                                path_effects.Normal()])

    plt.axis('off')
    plt.xlim((0, n_cols))
    plt.ylim((0, n_rows))
    plt.tight_layout()

    return watermark(plt)


def plot_value_map(title, v, map_desc, color_map):
    """Draw the grid with each cell's value printed inside it.

    Values are rounded to two decimals for display.  Text colour runs from
    white toward red according to which of 100 equal-width bins between the
    minimum and maximum of ``v`` the cell's value falls in.

    :param title: figure title.
    :param v: 2-D array of state values.
    :param map_desc: 2-D array of cell codes; ``map_desc[i, j]`` is looked up
        in ``color_map``.
    :param color_map: mapping from cell code to a matplotlib colour.
    :return: whatever ``watermark(plt)`` returns (the ``plt`` module).
    """
    fig = plt.figure()
    n_rows, n_cols = v.shape[0], v.shape[1]
    ax = fig.add_subplot(111, xlim=(0, n_cols), ylim=(0, n_rows))
    font_size = 'x-small' if n_cols >= 16 else 'x-large'

    bins = np.linspace(np.min(v), np.max(v), 100)
    redness = np.digitize(v, bins)/100.0

    plt.title(title)
    for i, j in np.ndindex(n_rows, n_cols):
        # Row 0 of the array is drawn at the top of the figure.
        x, y = j, n_rows - i - 1
        cell = plt.Rectangle([x, y], 1, 1)
        cell.set_facecolor(color_map[map_desc[i, j]])
        ax.add_patch(cell)

        shown = np.round(v[i, j], 2)
        fade = 1.0 - redness[i, j]
        label = ax.text(x+0.5, y+0.5, shown, size=font_size,
                        horizontalalignment='center', verticalalignment='center',
                        color=(1.0, fade, fade))
        label.set_path_effects([path_effects.Stroke(linewidth=1, foreground='black'),
                                path_effects.Normal()])

    plt.axis('off')
    plt.xlim((0, n_cols))
    plt.ylim((0, n_rows))
    plt.tight_layout()

    return watermark(plt)

In [74]:
class ExperimentStats(object):
    """Per-step history of a solver run, with CSV / pickle / image export.

    Each call to :meth:`add` appends one entry to every history list, so all
    lists stay index-aligned.  ``elapsed_time`` and ``optimal_policy`` are
    plain attributes filled in by whoever drives the solver.
    """

    def __init__(self):
        self.policies = list()
        self.vs = list()
        self.steps = list()
        self.step_times = list()
        self.rewards = list()
        self.deltas = list()
        self.converged_values = list()
        self.elapsed_time = 0
        self.optimal_policy = None

    def add(self, policy, v, step, step_time, reward, delta, converged):
        """Append one solver step to the history (arguments are stored as given)."""
        self.policies.append(policy)
        self.vs.append(v)
        self.steps.append(step)
        self.step_times.append(step_time)
        self.rewards.append(reward)
        self.deltas.append(delta)
        self.converged_values.append(converged)

    def to_csv(self, file_name):
        """Write steps, time, reward, delta and converged columns to ``file_name``."""
        # newline='' is what the csv module requires; without it some
        # platforms emit an extra blank line after every row.
        with open(file_name, 'w', newline='') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(['steps', 'time', 'reward', 'delta', 'converged'])
            writer.writerows(zip(self.steps, self.step_times, self.rewards, self.deltas, self.converged_values))

    @staticmethod
    def _policy_as_grid(policy, map_shape):
        """Return ``policy`` as a ``map_shape`` grid of action indices.

        A policy that already has the map's shape is passed through; anything
        else is treated as a per-state action table and reduced with
        ``argmax`` over axis 1 before reshaping.
        """
        policy = np.asarray(policy)
        if policy.shape == tuple(map_shape):
            return policy
        return np.reshape(np.argmax(policy, axis=1), map_shape)

    def _snapshot_indices(self, step_size, only_last):
        """Indices of the history entries to export.

        With ``only_last`` just the final index is returned.  Otherwise every
        ``step_size``-th entry plus the final one; a ``step_size`` of 1 on a
        history longer than 20 entries is thinned to ``len // 20``.
        An empty history yields no indices.
        """
        count = len(self.policies)
        if count == 0:
            return []
        if only_last:
            return [count - 1]
        if step_size == 1 and count > 20:
            step_size = count // 20
        return [i for i in range(count) if i % step_size == 0 or i == count - 1]

    def pickle_results(self, file_name_base, map_shape, step_size=1, only_last=False):
        """Pickle ``{'policy': grid, 'v': grid}`` snapshots.

        :param file_name_base: template with one ``{}`` placeholder, filled
            with the step index or ``'Last'`` for the final entry.
        :param map_shape: ``(rows, cols)`` shape the arrays are reshaped to.
        :param step_size: sampling stride (see :meth:`_snapshot_indices`).
        :param only_last: export only the final entry.
        """
        last = len(self.policies) - 1
        for i in self._snapshot_indices(step_size, only_last):
            label = 'Last' if i == last else i
            snapshot = {'policy': self._policy_as_grid(self.policies[i], map_shape),
                        'v': np.reshape(self.vs[i], map_shape)}
            with open(file_name_base.format(label), 'wb') as f:
                pickle.dump(snapshot, f)

    def plot_policies_on_map(self, file_name_base, map_desc, color_map, direction_map, experiment, step_preamble,
                             details, step_size=1, only_last=False):
        """Save policy and value map images for selected history entries.

        :param file_name_base: template with two ``{}`` placeholders: the
            first receives ``'Policy'`` or ``'Value'``, the second the step
            index or ``'Last'``.
        :param map_desc: 2-D array of cell codes; its shape is the grid shape.
        :param color_map: passed through to the plotting helpers.
        :param direction_map: passed through to ``plot_policy_map``.
        :param experiment: experiment name used in the figure title.
        :param step_preamble: word used for a step in the title (e.g. 'Step').
        :param details: object with an ``env_readable_name`` attribute.
        :param step_size: sampling stride (see :meth:`_snapshot_indices`).
        :param only_last: plot only the final entry.
        """
        last = len(self.policies) - 1
        for i in self._snapshot_indices(step_size, only_last):
            label = 'Last' if i == last else i
            if only_last:
                title = '{}: {} - {} {}'.format(details.env_readable_name, experiment, 'Last', step_preamble)
            else:
                title = '{}: {} - {} {}'.format(details.env_readable_name, experiment, step_preamble, i)

            policy = self._policy_as_grid(self.policies[i], map_desc.shape)
            v = np.reshape(self.vs[i], map_desc.shape)

            p = plot_policy_map(title, policy, map_desc, color_map, direction_map)
            p.savefig(file_name_base.format('Policy', label), format='png', dpi=150)
            p.close()

            p = plot_value_map(title, v, map_desc, color_map)
            p.savefig(file_name_base.format('Value', label), format='png', dpi=150)
            p.close()

    def __str__(self):
        return 'policies: {}, vs: {}, steps: {}, step_times: {}, deltas: {}, converged_values: {}'.format(
            self.policies,
            self.vs,
            self.steps,
            self.step_times,
            self.deltas,
            self.converged_values
        )

In [5]:
class GridWorld:
    """Rectangular grid MDP with a start cell, two terminal cells and a wall.

    Moves are noisy: the chosen direction is taken with probability 0.8 and
    each of the two perpendicular directions with probability 0.1.  Moves off
    the grid are clipped to the edge and moves into the blocked cell leave
    the agent where it was.
    """

    def __init__(self, size, reward, start=None, terminal_1=None, terminal_2=None, blocked=None):
        """Build the grid.

        :param size: ``(rows, cols)``.
        :param reward: reward returned for every ordinary cell.
        :param start: start cell; random row in column 0 when falsy.
        :param terminal_1: +1 terminal; random row in the last column when falsy.
        :param terminal_2: -1 terminal; random cell in the right half
            (excluding the last column) when falsy.
        :param blocked: wall cell; random cell in the left half (excluding
            column 0) when falsy, re-drawn anywhere on the grid while it
            collides with one of the other special cells.
        """
        self.board = np.zeros(size)
        self.rows, self.cols = size[0], size[1]
        self.reward = reward
        self.actions = ['north', 'east', 'south', 'west']

        up, right, down, left = (-1, 0), (0, 1), (1, 0), (0, -1)
        # action -> [(row/col offset, probability)], intended move first.
        self.model = {'north': [(up, 0.8), (right, 0.1), (left, 0.1)],
                      'east': [(right, 0.8), (up, 0.1), (down, 0.1)],
                      'south': [(down, 0.8), (right, 0.1), (left, 0.1)],
                      'west': [(left, 0.8), (up, 0.1), (down, 0.1)]}

        self.start = start if start else (np.random.choice(self.rows), 0)
        self.terminal_1 = terminal_1 if terminal_1 else (np.random.choice(self.rows), self.cols-1)
        self.terminal_2 = terminal_2 if terminal_2 else (
            np.random.choice(self.rows), np.random.choice(range(self.cols//2, self.cols-1)))

        if not blocked:
            blocked = (np.random.choice(self.rows), np.random.choice(range(1, self.cols//2)))
            while blocked in (self.start, self.terminal_1, self.terminal_2):
                blocked = (np.random.choice(self.rows), np.random.choice(self.cols))
        self.blocked = blocked

        # One-byte cell codes: W(alkable), S(tart), G(oal), R (-1 terminal), B(locked).
        self.desc = np.asarray([['W'] * self.cols for _ in range(self.rows)], dtype='c')
        self.desc[self.start] = 'S'
        self.desc[self.terminal_1] = 'G'
        self.desc[self.terminal_2] = 'R'
        self.desc[self.blocked] = 'B'

    def colors(self):
        """Map each cell code in ``desc`` to the colour used when plotting."""
        return {
            b'S': 'yellow',
            b'W': 'lightslategray',
            b'B': 'black',
            b'G': 'green',
            b'R': 'red'
        }

    def directions(self):
        """Map an action index (position in ``actions``) to an arrow glyph."""
        return {
            0: '\u2191',
            1: '\u2192',
            2: '\u2193',
            3: '\u2190'
        }

    def rewards(self, state):
        """Reward attached to ``state``: +1 / -1 at the terminals, 0 at the wall, else ``reward``."""
        if state == self.terminal_1:
            return 1
        if state == self.terminal_2:
            return -1
        if state == self.blocked:
            return 0
        return self.reward

    def step(self, state, action):
        """Sample one transition.

        :return: ``(next_state, reward, done)``.  Acting from a terminal cell
            returns ``('terminal', 0, True)``; otherwise ``done`` is False and
            ``reward`` is the reward of the cell that was entered.
        """
        if state in (self.terminal_1, self.terminal_2):
            return 'terminal', 0, True

        outcomes = self.model[action]
        (delta_i, delta_j), _ = outcomes[np.random.choice(range(3), p=[0.8, 0.1, 0.1])]
        i, j = state
        next_state = (np.clip(i + delta_i, 0, self.rows - 1),
                      np.clip(j + delta_j, 0, self.cols - 1))

        # Bumping into the wall leaves the agent in place.
        if next_state == self.blocked:
            next_state = state

        return next_state, self.rewards(next_state), False

In [6]:
class ValueIteration:
    """Synchronous value iteration on a grid environment.

    The environment must expose ``board``, ``rows``, ``cols``, ``actions``,
    ``model``, ``terminal_1``, ``terminal_2``, ``blocked`` and
    ``rewards(state)``.  The backup uses the reward of the *current* state:
    ``Q(s, a) = sum_p p * (rewards(s) + gamma * V(s'))``.
    """

    def __init__(self, env, gamma=0.9, theta=1e-5):
        """
        :param env: grid environment (see class docstring).
        :param gamma: discount factor.
        :param theta: convergence threshold on the per-cell value change.
        """
        self.env = env
        self.gamma = gamma
        self.theta = theta
        # Start at 1 everywhere so has_converged() is False before the first step.
        self.delta = np.ones(self.env.board.shape)
        self.value = np.zeros(self.env.board.shape)
        self.policy = np.zeros(self.env.board.shape)
        self.steps = 0
        self.step_times = []

    def _is_fixed(self, state):
        """True for cells whose value is pinned to their reward (terminals and the wall)."""
        return state in (self.env.terminal_1, self.env.terminal_2, self.env.blocked)

    def _action_values(self, state):
        """Expected return of every action from ``state`` under the current value estimate."""
        i, j = state
        values = []
        for action in self.env.actions:
            total = 0
            for (delta_i, delta_j), prob in self.env.model[action]:
                next_state = (np.clip(i + delta_i, 0, self.env.rows - 1),
                              np.clip(j + delta_j, 0, self.env.cols - 1))
                # Moving into the wall leaves the agent where it is.
                if next_state == self.env.blocked:
                    next_state = state
                total += prob * (self.env.rewards(state) + self.gamma * self.value[next_state])
            values.append(total)
        return values

    def step(self):
        """Run one full sweep over the grid.

        :return: ``(policy, value, steps, step_time, reward, max_delta, converged)``
            where ``reward`` is the sum of the best action values over all
            non-fixed cells and ``max_delta`` the largest value change.
            ``policy`` and ``value`` are fresh arrays on every call, so a
            caller may keep them as a history.
        """
        start = time.time()
        reward = 0
        new_value = np.zeros(self.value.shape)
        # A new array per sweep: updating self.policy in place would make every
        # previously returned policy alias the latest one.
        new_policy = np.zeros(self.policy.shape)
        for i in range(self.env.rows):
            for j in range(self.env.cols):
                state = i, j
                if self._is_fixed(state):
                    new_value[state] = self.env.rewards(state)
                    continue
                vals = self._action_values(state)
                best = max(vals)
                new_value[state] = best
                new_policy[state] = np.argmax(vals)
                reward += best
        self.delta = np.abs(self.value - new_value)
        max_delta = np.max(self.delta)
        self.value = new_value
        self.policy = new_policy
        self.step_times.append(time.time() - start)
        self.steps += 1

        return self.policy, self.value, self.steps, self.step_times[-1], reward, max_delta, self.has_converged()

    def has_converged(self):
        """True once every cell changed by less than ``theta`` in the last sweep."""
        return np.all(self.delta < self.theta)

    def run_to_convergence(self):
        """Call :meth:`step` until :meth:`has_converged` is true."""
        while not self.has_converged():
            self.step()

In [66]:
class PolicyIteration:
    
    def __init__(self, env, gamma=0.9, theta=1e-5, max_conv_steps=3):
        self.env = env
        self.gamma = gamma
        self.theta = theta
        self.delta = np.ones(self.env.board.shape)
        self.policy_delta = 1
        self.value = np.zeros(self.env.board.shape)
        self.policy = np.random.choice(len(self.env.actions), (self.env.rows, self.env.cols))
        self.policy[self.env.start] = 0
        self.policy[self.env.terminal_1] = 0
        self.policy[self.env.terminal_2] = 0
        self.policy[self.env.blocked] = 0
        self.steps = 0
        self.step_times = []
        self.conv_steps = 0
        self.max_conv_steps = max_conv_steps
        
    def evaluate_policy(self):
        converged = False
        while not converged:
            new_value = np.zeros(self.value.shape)
            for i in range(self.env.rows):
                for j in range(self.env.cols):
                    state = i, j
                    if state == self.env.terminal_1 or state == self.env.terminal_2 or state == self.env.blocked:
                        new_value[i, j] = self.env.rewards(state)
                    else:
                        val = 0
                        for transition, prob in self.env.model[self.env.actions[int(self.policy[state])]]:
                            delta_i, delta_j = transition
                            next_state = np.clip(i + delta_i, 0, self.env.rows - 1), np.clip(j + delta_j, 0, self.env.cols - 1)
                            if next_state == self.env.blocked:
                                next_state = state                            
                            val += prob * (self.env.rewards(state) + self.gamma * self.value[next_state])
                        new_value[i, j] = val
            delta = np.abs(self.value - new_value)
            converged = np.all(delta < self.theta)
            self.value = new_value
        
    def step(self):
        start = time.time()
        self.evaluate_policy()
        reward = 0
        new_value = np.zeros(self.value.shape)
        new_policy = np.zeros(self.value.shape)
        for i in range(self.env.rows):
            for j in range(self.env.cols):
                vals = []
                state = i, j
                if state == self.env.terminal_1 or state == self.env.terminal_2 or state == self.env.blocked:
                    continue
                else:
                    for action in self.env.actions:
                        val = 0
                        for transition, prob in self.env.model[action]:
                            delta_i, delta_j = transition
                            next_state = np.clip(i + delta_i, 0, self.env.rows - 1), np.clip(j + delta_j, 0, self.env.cols - 1)
                            if next_state == self.env.blocked:
                                next_state = state                            
                            val += prob * (self.env.rewards(state) + self.gamma * self.value[next_state])
                        vals.append(val)
                    new_policy[state] = np.argmax(vals)
                    new_value[state] = max(vals)
        
        self.policy_delta = np.abs(self.policy - new_policy)
        self.delta = np.abs(self.value - new_value)
        max_delta = np.max(self.delta)
        self.policy = new_policy
        self.steps += 1
        self.step_times.append(time.time() - start)
        
#         print(self.policy_delta)
#         print(self.delta)
#         print(self.delta < self.theta)
#         print(np.all(self.delta < self.theta))
        
        if np.all(self.policy_delta == 0):
            self.conv_steps += 1
        else:
            self.conv_steps = 0
        
        return self.policy, self.value, self.steps, self.step_times[-1], reward, max_delta, self.has_converged()
         
    def has_converged(self):
        return self.conv_steps >= self.max_conv_steps or np.all(self.delta < self.theta)
    
    def run_to_convergence(self):
        while not self.has_converged():
            self.step()

In [61]:
class QLearning:
    """Tabular Q-learning on a grid environment; one ``step()`` is one episode.

    The environment must expose ``board``, ``rows``, ``cols``, ``actions``,
    ``start`` and ``step(state, action) -> (next_state, reward, done)``.

    Note that ``epsilon`` here is the probability of acting *randomly*: the
    greedy action is taken with probability ``1 - epsilon``.
    """

    def __init__(self, env, gamma=0.9, theta=1e-5, epsilon=0.99999, epsilon_decay=0.99995, alpha=0.5,
                 steps_per_episode=100, max_steps=100000, strategy='epsilon-greedy'):
        """
        :param env: grid environment (see class docstring).
        :param gamma: discount factor.
        :param theta: an episode counts toward convergence when its largest
            absolute TD error is below this.
        :param epsilon: initial probability of a random action.
        :param epsilon_decay: factor applied to ``epsilon`` after every episode.
        :param alpha: learning rate.
        :param steps_per_episode: stored but not enforced; an episode only
            ends when the environment reports ``done``.
        :param max_steps: maximum number of episodes.
        :param strategy: ``'epsilon-greedy'`` starts the Q-table at 0 and
            explores randomly; any other value starts the table at 1 and
            always acts greedily.
        """
        self.env = env
        self.gamma = gamma
        self.theta = theta
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.alpha = alpha
        self.steps_per_episode = steps_per_episode
        self.steps = 0
        self.max_steps = max_steps
        self.step_times = []
        self.conv_steps = 0
        self.max_conv_steps = 10
        self.strategy = strategy

        initial_q = 0 if strategy == 'epsilon-greedy' else 1
        n_actions = len(self.env.actions)
        self.q_function = {(i, j): [initial_q] * n_actions
                           for i in range(self.env.rows) for j in range(self.env.cols)}
        # Pseudo-state returned by the environment once a terminal cell is left.
        self.q_function['terminal'] = [initial_q] * n_actions

        # Make delta / value / policy available before the first episode.
        self.delta = 0
        self.get_value_policy()

    def _choose_action(self, state):
        """Pick an action name for ``state`` according to the exploration strategy."""
        greedy = self.strategy != 'epsilon-greedy' or np.random.random() < 1 - self.epsilon
        if greedy:
            return self.env.actions[np.argmax(self.q_function[state])]
        return np.random.choice(self.env.actions)

    def step(self):
        """Run one episode from ``env.start`` until the environment reports ``done``.

        :return: ``(policy, value, steps, step_time, avg_reward, delta, converged)``
            where ``avg_reward`` is the mean reward per transition and
            ``delta`` the largest absolute TD error seen in the episode.
        """
        start = time.time()
        total_reward = 0
        episode_steps = 0
        state = self.env.start
        self.delta = 0
        done = False

        while not done:
            action = self._choose_action(state)
            action_idx = self.env.actions.index(action)
            next_state, reward, done = self.env.step(state, action)
            total_reward += reward
            td_error = reward + self.gamma * np.max(self.q_function[next_state]) - self.q_function[state][action_idx]
            self.q_function[state][action_idx] += (self.alpha * td_error)
            # Magnitude, not sign: a large negative correction is still a large change.
            self.delta = max(self.delta, abs(td_error))
            state = next_state
            episode_steps += 1

        self.steps += 1
        self.step_times.append(time.time() - start)
        self.epsilon *= self.epsilon_decay
        avg_reward = total_reward / episode_steps
        self.get_value_policy()

        if self.delta < self.theta:
            self.conv_steps += 1
        else:
            self.conv_steps = 0

        return self.policy, self.value, self.steps, self.step_times[-1], avg_reward, self.delta, self.has_converged()

    def get_value_policy(self):
        """Rebuild ``value`` (max Q) and ``policy`` (argmax Q) grids from the Q-table."""
        self.value = np.zeros(self.env.board.shape)
        self.policy = np.zeros(self.env.board.shape)
        for i in range(self.value.shape[0]):
            for j in range(self.value.shape[1]):
                self.value[i, j] = np.max(self.q_function[(i, j)])
                self.policy[i, j] = np.argmax(self.q_function[(i, j)])

    def has_converged(self):
        """True after ``max_conv_steps`` consecutive quiet episodes or ``max_steps`` episodes."""
        return self.conv_steps >= self.max_conv_steps or \
               self.steps >= self.max_steps

    def run_to_convergence(self):
        """Call :meth:`step` until :meth:`has_converged` is true."""
        while not self.has_converged():
            self.step()

In [62]:
def run_solver_and_collect(solver):
    """Drive ``solver`` to convergence and record every step.

    :param solver: object with ``has_converged()`` and ``step()``, where
        ``step()`` returns
        ``(policy, v, steps, step_time, reward, delta, converged)``.
    :return: an ``ExperimentStats`` holding one entry per step, the total
        wall-clock time in ``elapsed_time`` and the last recorded policy in
        ``optimal_policy`` (``None`` when the solver needed no steps).
    """
    stats = ExperimentStats()

    started = time.time()
    while not solver.has_converged():
        policy, v, steps, step_time, reward, delta, converged = solver.step()
        stats.add(policy, v, steps, step_time, reward, delta, converged)
    stats.elapsed_time = time.time() - started

    # The final policy is reported as the optimal one; guard the case where
    # the solver was already converged and nothing was recorded.
    stats.optimal_policy = stats.policies[-1] if stats.policies else None
    return stats

In [63]:
class ExperimentDetails(object):
    """Plain record describing one experiment environment.

    Stores the environment object, a short name, a human-readable name,
    a thread count and a seed exactly as given.
    """

    def __init__(self, env, env_name, env_readable_name, threads, seed):
        self.env, self.env_name, self.env_readable_name = env, env_name, env_readable_name
        self.threads, self.seed = threads, seed

# Solve MDPs with Value Iteration (VI), Policy Iteration (PI), and Q-Learning (QL)

In [67]:
# Root folder for every image and CSV written by the sweep.
OUTPUT_DIRECTORY = './output'

# Hyper-parameter grids: reward for ordinary cells, discount factor, Q-learning rate.
rewards = [-2, -0.5, -0.05, -0.005]
gammas = [0.1, 0.5, 0.9]
alphas = [0.25, 0.5, 0.75]

# Fixed (row, col) layout of the 4x4 grid.
sm_start = (3, 0)
sm_terminal_1 = (0, 3)
sm_terminal_2 = (1, 3)
sm_blocked = (2, 1)

# Fixed (row, col) layout of the 16x16 grid.
lg_start = (15, 0)
lg_terminal_1 = (0, 15)
lg_terminal_2 = (4, 14)
lg_blocked = (7, 7)

In [75]:
# Sweep every reward / discount combination on both grids with all three
# solvers, saving the final policy and value maps plus a per-step CSV.
for reward in rewards:

    sm_gridworld = GridWorld((4, 4), reward, sm_start, sm_terminal_1, sm_terminal_2, sm_blocked)
    lg_gridworld = GridWorld((16, 16), reward, lg_start, lg_terminal_1, lg_terminal_2, lg_blocked)

    # Trim both lists (keeping them aligned) to run a single grid.
    envs = [sm_gridworld, lg_gridworld]

    small_details = ExperimentDetails(sm_gridworld, 'sm_gridworld', 'Gridworld (4x4)', 1, 1)
    large_details = ExperimentDetails(lg_gridworld, 'lg_gridworld', 'Gridworld (16x16)', 1, 1)

    details = [small_details, large_details]

    for gamma in gammas:

        for env, env_details in zip(envs, details):

            print('Running VI on {} with reward: {}, gamma: {}'.format(env_details.env_name, reward, gamma))

            # Re-seed before every solver so each run is reproducible on its own.
            np.random.seed(env_details.seed)
            vi = ValueIteration(env, gamma=gamma)
            vi_stats = run_solver_and_collect(vi)
            vi_stats.plot_policies_on_map(
                '{}/images/VI/{}_{}_{}_{}.png'.format(OUTPUT_DIRECTORY, env_details.env_name,
                                                      gamma, reward, '{}_{}'),
                env.desc, env.colors(), env.directions(),
                'Value Iteration', 'Step', env_details, only_last=True)
            vi_stats.to_csv('{}/VI/{}_{}_{}.csv'.format(OUTPUT_DIRECTORY, env_details.env_name, reward, gamma))

            print('Running PI on {} with reward: {}, gamma: {}'.format(env_details.env_name, reward, gamma))

            np.random.seed(env_details.seed)
            pi = PolicyIteration(env, gamma=gamma, max_conv_steps=1)
            pi_stats = run_solver_and_collect(pi)
            pi_stats.plot_policies_on_map(
                '{}/images/PI/{}_{}_{}_{}.png'.format(OUTPUT_DIRECTORY, env_details.env_name,
                                                      gamma, reward, '{}_{}'),
                env.desc, env.colors(), env.directions(),
                'Policy Iteration', 'Step', env_details, only_last=True)
            pi_stats.to_csv('{}/PI/{}_{}_{}.csv'.format(OUTPUT_DIRECTORY, env_details.env_name, reward, gamma))

            for alpha in alphas:

                print('Running Q-Learning on {} with reward: {}, gamma: {}, alpha: {}'.format(
                    env_details.env_name, reward, gamma, alpha))

                np.random.seed(env_details.seed)
                q_learner = QLearning(env, gamma=gamma, alpha=alpha)
                q_stats = run_solver_and_collect(q_learner)
                # Q-values of the terminal cells are never updated toward their
                # rewards, so overwrite them for display purposes only.
                q_learner.value[env.terminal_1] = 1
                q_learner.value[env.terminal_2] = -1
                q_stats.plot_policies_on_map(
                    '{}/images/QL/{}_{}_{}_{}_{}_{:.3f}_{:.3f}_{}.png'.format(
                        OUTPUT_DIRECTORY, env_details.env_name, reward, 'zeros', gamma, alpha,
                        q_learner.epsilon, q_learner.epsilon_decay, '{}_{}'),
                    env.desc, env.colors(), env.directions(),
                    'Q-Learning', 'Episode', env_details, only_last=True)
                # Same fields as the image name.  The earlier template was one
                # placeholder short, which shifted alpha into an epsilon slot
                # and dropped epsilon_decay from the file name.
                q_stats.to_csv('{}/QL/{}_{}_{}_{}_{}_{:.3f}_{:.3f}.csv'.format(
                    OUTPUT_DIRECTORY, env_details.env_name, reward, 'zeros', gamma, alpha,
                    q_learner.epsilon, q_learner.epsilon_decay))

Running VI with reward: -2, gamma: 0.1
Running VI with reward: -2, gamma: 0.1
Running VI with reward: -2, gamma: 0.5
Running VI with reward: -2, gamma: 0.5
Running VI with reward: -2, gamma: 0.9
Running VI with reward: -2, gamma: 0.9
Running VI with reward: -0.5, gamma: 0.1
Running VI with reward: -0.5, gamma: 0.1
Running VI with reward: -0.5, gamma: 0.5
Running VI with reward: -0.5, gamma: 0.5
Running VI with reward: -0.5, gamma: 0.9
Running VI with reward: -0.5, gamma: 0.9
Running VI with reward: -0.05, gamma: 0.1
Running VI with reward: -0.05, gamma: 0.1
Running VI with reward: -0.05, gamma: 0.5
Running VI with reward: -0.05, gamma: 0.5
Running VI with reward: -0.05, gamma: 0.9
Running VI with reward: -0.05, gamma: 0.9
Running VI with reward: -0.005, gamma: 0.1
Running VI with reward: -0.005, gamma: 0.1
Running VI with reward: -0.005, gamma: 0.5
Running VI with reward: -0.005, gamma: 0.5
Running VI with reward: -0.005, gamma: 0.9
Running VI with reward: -0.005, gamma: 0.9


# Single Tests

## Value Iteration

In [539]:
# Value iteration with default gamma/theta on the second environment of the last sweep iteration.
value = ValueIteration(envs[1])

In [540]:
# Run the solver until it reports convergence, keeping the per-step history.
v_stats = run_solver_and_collect(value)

In [541]:
# Largest state value in the final value grid.
np.max(v_stats.vs[-1])

1.0

In [546]:
# Label the files with the reward the environment was actually built with
# instead of a hard-coded number.
v_stats.plot_policies_on_map('{}/images/VI/{}_{}_{}_{}.png'.format(OUTPUT_DIRECTORY, 'gridworld', value.gamma,
                                                                    value.env.reward, '{}_{}'),
                           value.env.desc, value.env.colors(), value.env.directions(),
                           'Value Iteration', 'Step', details[1], only_last=True)

x-small


## Policy Iteration

In [315]:
# `gridworld` is not defined anywhere in this notebook; use the same
# environment as the value-iteration test above so the cell runs on a fresh kernel.
policy = PolicyIteration(envs[1])

In [316]:
# Run policy iteration until it reports convergence, keeping the per-step history.
p_stats = run_solver_and_collect(policy)

In [317]:
# `details` is a list; the plot title needs one ExperimentDetails object
# (the entry matching envs[1]).
p_stats.plot_policies_on_map('{}/images/PI/{}_{}_{}.png'.format(OUTPUT_DIRECTORY, 'gridworld', policy.gamma, '{}_{}'),
                           policy.env.desc, policy.env.colors(), policy.env.directions(),
                           'Policy Iteration', 'Step', details[1], only_last=True)

## Q-Learning

In [46]:
# Use the last configured reward explicitly rather than the loop variable
# that happens to be left over from the sweep cell.
q_learner = QLearning(GridWorld((16, 16), rewards[-1], lg_start, lg_terminal_1, lg_terminal_2, lg_blocked),
                      max_steps=100000)

In [47]:
# Time a full Q-learning run; the last expression displays the elapsed seconds.
start = time.time()
q_stats = run_solver_and_collect(q_learner)
time.time() - start

573.0980224609375

In [439]:
# `gridworld` is not defined in this notebook; take the terminal cells from
# the learner's own environment.  Their values are overwritten for display only.
q_learner.value[q_learner.env.terminal_1] = 1
q_learner.value[q_learner.env.terminal_2] = -1
# `details` is a list; pass the entry for the 16x16 grid.
q_stats.plot_policies_on_map('{}/images/QL/{}_{}_{}.png'.format(OUTPUT_DIRECTORY, 'gridworld', q_learner.gamma, '{}_{}'),
                           q_learner.env.desc, q_learner.env.colors(), q_learner.env.directions(),
                           'Q-Learning', 'Step', details[1], only_last=True)

In [48]:
# Random-action probability left after training (it is decayed once per episode).
q_learner.epsilon

0.13533122319589064

In [49]:
# Number of episodes that were run.
q_learner.steps

100000