# Introduction
This notebook contains the 4 (+2 bonus) agents used in the paper "Using Hippocampal Replay to Consolidate Experiences in Memory-Augmented Reinforcement Learning":
- Random
- TD (bonus)
- Q-Learning (bonus)
- Go-Explore
- Go-Explore-Count
- Explore-Count

It also contains the 2 discrete-state environments used in the paper:
- Unwalled Maze
- Walled Maze

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import copy
from collections import defaultdict

# Agent Definition

In [None]:
# Global memory shared by every agent that needs one; it is rebound to a fresh
# defaultdict before each experiment. Missing keys read as 0, which the TD and
# Q-learning agents use as their initial value estimate. The Go-Explore and
# count agents store one dict of statistics per state key instead.
memory = defaultdict(lambda: 0)

## Agent 1: Random Agent

In [None]:
def RandomAgent(env, **kwargs):
    """Return whatever move ``env.sample()`` draws.

    Keyword arguments are accepted so that every agent can be called the same
    way, but this agent ignores all of them.
    """
    sampled_move = env.sample()
    return sampled_move

## Agent 2: TD Agent

TD-error: $\delta_t = r_{t} + \gamma \max_{a\in A, a: s_t \rightarrow s_{t+1}}V(s_{t+1}) - V(s_t)$

Value update: $V(s_t) \leftarrow V(s_t) + \alpha\delta_t$

In [None]:
def TDAgent(env, **kwargs):
    """One-step TD agent operating on the global ``memory`` value table.

    Keyword arguments:
        eps: a random move from ``env.sample()`` replaces the greedy move
            whenever ``eps > np.random.rand()`` (default 1).
        statehistory: state representations visited so far (default ``[]``).
        repeatedstate: when true, state keys get a visit-count suffix derived
            from ``statehistory`` (default False).

    Returns the chosen move, or ``None`` when ``env.reward == 1`` (in which
    case the current state's value is simply set to 1).
    """
    discount = 0.99
    step_size = 1

    explore_threshold = kwargs.get('eps', 1)
    visited = kwargs.get('statehistory', [])
    count_repeats = kwargs.get('repeatedstate', False)

    def state_key(some_env, extra):
        # key is the state representation, optionally suffixed with how often
        # that representation occurs in the history (plus ``extra``)
        label = some_env.staterep()
        if not count_repeats:
            return label
        return label + str(visited.count(label) + extra)

    here = state_key(env, 0)

    # rewarded state: pin its value and stop
    if env.reward == 1:
        memory[here] = 1
        return None

    # greedy one-step lookahead; the first move with the highest value wins
    chosen = None
    top_value = -1
    for candidate in env.getvalidmoves():
        lookahead = copy.deepcopy(env)
        lookahead.step(candidate)
        # the successor would be visited once more, hence the extra 1
        candidate_value = memory[state_key(lookahead, 1)]
        if candidate_value > top_value:
            top_value = candidate_value
            chosen = candidate

    # exploration overrides the greedy choice, but not the backup below
    if explore_threshold > np.random.rand():
        chosen = env.sample()

    # back up the best one-step lookahead value into the current state
    delta = env.reward + discount * top_value - memory[here]
    memory[here] = memory[here] + step_size * delta

    return chosen

## Agent 3: Q-Learning Agent

TD-error: $\delta_t = r_t + \gamma\max_{a\in A}Q(s_{t+1},a) - Q(s_t, a_t)$

Q-learning update: $Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha\delta_t$

In [None]:
def QAgent(env, **kwargs):
    """Tabular Q-learning agent operating on the global ``memory`` table.

    Q-values are stored under ``(state_key, move)`` keys.

    Keyword arguments:
        eps: a random move from ``env.sample()`` is taken whenever
            ``eps > np.random.rand()``; otherwise the greedy move is used
            (default 1).
        statehistory: state representations visited so far (default ``[]``).
        repeatedstate: when true, state keys get a visit-count suffix derived
            from ``statehistory`` (default False).

    Returns the chosen move, or ``None`` when ``env.reward == 1`` (in which
    case every valid move of the current state gets a Q-value of 1).
    """
    discount = 0.99
    step_size = 1

    explore_threshold = kwargs.get('eps', 1)
    visited = kwargs.get('statehistory', [])
    count_repeats = kwargs.get('repeatedstate', False)

    def state_key(some_env, extra):
        # state representation, optionally suffixed with its visit count
        label = some_env.staterep()
        if not count_repeats:
            return label
        return label + str(visited.count(label) + extra)

    here = state_key(env, 0)

    # rewarded state: pin all of its Q-values and stop
    if env.reward == 1:
        for candidate in env.getvalidmoves():
            memory[(here, candidate)] = 1
        return None

    options = env.getvalidmoves()
    chosen = None

    if explore_threshold > np.random.rand():
        # explore
        chosen = env.sample()
    else:
        # exploit: the first move with the highest Q-value wins
        top_value = -1
        for candidate in options:
            candidate_value = memory[(here, candidate)]
            if candidate_value > top_value:
                top_value = candidate_value
                chosen = candidate

    # simulate the chosen move on a copy to find the successor state
    lookahead = copy.deepcopy(env)
    lookahead.step(chosen)
    successor = state_key(lookahead, 1)
    successor_values = [memory[(successor, candidate)] for candidate in lookahead.getvalidmoves()]
    delta = env.reward + discount * np.max(successor_values) - memory[(here, chosen)]

    # Q-learning update
    memory[(here, chosen)] = memory[(here, chosen)] + step_size * delta

    return chosen

## Agent 4: Go-Explore

In [None]:
def reward_formula(reward = 0, intrinsic = 0, moves = 0, numselected = 0, numvisits = 0, eps = 1e-20):
    """Heuristic score used to rank stored states and candidate moves.

    The score rewards extrinsic reward (weight 1000), intrinsic value
    (weight 10) and the square root of the move count, and penalises the
    square root of the combined selection and visit counts (weight 100).
    ``eps`` is accepted but not used by the computation.
    """
    extrinsic_term = 1000 * reward
    intrinsic_term = 10 * intrinsic
    depth_term = 1 * np.sqrt(moves)
    familiarity_penalty = 100 * np.sqrt(numselected + numvisits)
    return extrinsic_term + intrinsic_term + depth_term - familiarity_penalty

In [None]:
def ChooseState(env):
    """Choose the next stored state to explore from and replay the path to it.

    Every entry of the global ``memory`` whose reward is not 1 is scored with
    ``reward_formula``; the first entry with the highest score is selected.
    The stored action history of that entry is then replayed on ``env``
    (which is stepped in place), and the entry's ``numselected`` counter is
    incremented when it has one.

    Returns:
        A tuple ``(actionhistory, statehistory, env_copy)`` where
        ``statehistory`` holds the state representation seen before each
        replayed move and ``env_copy`` is a deep copy of the replayed env.
        When memory holds no eligible entry, the histories are empty and the
        copy is of the unmodified ``env``.
    """
    bestvalue = -1e20
    bestkey = None

    # choose the best memory based on heuristics
    for key, cell in memory.items():
        # do not choose final state as there is nothing left to explore
        if cell['reward'] == 1:
            continue

        value = reward_formula(
            reward = cell['reward'],
            intrinsic = cell['intrinsic'],
            moves = cell['moves'],
            # entries written by the count agent carry no selection counter
            numselected = cell.get('numselected', 0),
            numvisits = cell['numvisits'],
        )
        if value > bestvalue:
            bestvalue = value
            bestkey = key

    # Nothing eligible: explore from the given env instead of indexing memory
    # with None, which would insert a junk key into the defaultdict and fail.
    if bestkey is None:
        return ([], [], copy.deepcopy(env))

    chosen = memory[bestkey]

    # replay the stored trajectory to bring the environment into that state
    actionhistory = []
    statehistory = []
    for move in chosen['actionhistory']:
        statehistory.append(env.staterep())
        env.step(move)
        actionhistory.append(move)

    # increment the selection count
    if 'numselected' in chosen:
        chosen['numselected'] = chosen['numselected'] + 1

    return (actionhistory, statehistory, copy.deepcopy(env))
        

In [None]:
def GoExplore(env, **kwargs):
    """Update the global ``memory`` around the current state and pick a move.

    Keyword arguments:
        intrinsic_fn: optional callable ``env -> number`` giving the intrinsic
            value of a state; without it all intrinsic values are 0.
        replay: when true, the current state's visit and selection counters
            are reset to 0 instead of the visit counter being incremented.
        getbestmove: when true, return the move with the highest
            ``reward_formula`` score; otherwise return a random valid move
            drawn with ``np.random.choice``.
        statehistory / actionhistory: states and moves leading to ``env``.
        repeatedstate: when true, state keys get a visit-count suffix derived
            from ``statehistory``.

    Returns the chosen move, or ``None`` when the env is done or has no valid
    moves.
    """
    intrinsic_fn = kwargs.get('intrinsic_fn', None)
    replay = kwargs.get('replay', False)
    getbestmove = kwargs.get('getbestmove', False)
    statehistory = kwargs.get('statehistory', [])
    actionhistory = kwargs.get('actionhistory', [])
    repeatedstate = kwargs.get('repeatedstate', False)

    # if no intrinsic guiding value, then do without intrinsic motivation
    intrinsic_value = intrinsic_fn(env) if intrinsic_fn is not None else 0

    curmoves = env.numsteps
    curreward = env.reward

    curstate = env.staterep()
    if repeatedstate:
        curstate += str(statehistory.count(env.staterep()))

    # if this state is not present in memory (should only happen for start state), add it in
    if curstate not in memory:
        memory[curstate] = {'statehistory': statehistory+[], 'reward': curreward, 'intrinsic': intrinsic_value, 'moves': curmoves, 'numvisits': 0, 'numselected': 0, 'actionhistory': actionhistory+[]}

    curmemory = memory[curstate]

    # only increment the visit count if not doing replay
    if replay:
        curmemory['numvisits'] = 0
        curmemory['numselected'] = 0
    else:
        curmemory['numvisits'] = curmemory['numvisits'] + 1

    # if completed, no need to continue to next move selection
    if env.done:
        # if there is positive reward, then make intrinsic become extrinsic reward
        if env.reward > 0 and intrinsic_fn is not None:
            curmemory['intrinsic'] = env.reward
        return None

    # if no valid moves, no need to continue to next move selection
    validmoves = env.getvalidmoves()
    if not validmoves:
        return None

    bestmove = None
    bestvalue = -1e20
    bestintrinsic = -1e20

    for move in validmoves:
        # simulate the move on a copy so the real env is untouched
        newenv = copy.deepcopy(env)
        newenv.step(move)
        nextreward = newenv.reward

        nextstate = newenv.staterep()
        if repeatedstate:
            # the successor would be visited once more, hence the +1
            nextstate += str(statehistory.count(newenv.staterep())+1)

        if nextstate in memory:
            nextmemory = memory[nextstate]
            better_reward = nextreward > nextmemory['reward']
            shorter_path = nextreward == nextmemory['reward'] and nextmemory['moves'] > curmoves + 1
            # a better reward, or the same reward in fewer moves, replaces the
            # stored route and resets the counters for that state
            if better_reward or shorter_path:
                if better_reward:
                    nextmemory['reward'] = nextreward
                nextmemory['moves'] = curmoves + 1
                nextmemory['numvisits'] = 0
                nextmemory['numselected'] = 0
                nextmemory['actionhistory'] = actionhistory+[move]
                nextmemory['statehistory'] = statehistory+[newenv.staterep()]
        else:
            # start a new memory if this is a new state
            next_intrinsic_value = intrinsic_fn(newenv) if intrinsic_fn is not None else 0
            memory[nextstate] = {'statehistory': statehistory+[newenv.staterep()], 'reward': nextreward, 'intrinsic': next_intrinsic_value, 'moves': curmoves + 1, 'numvisits': 0, 'numselected': 0, 'actionhistory': actionhistory+[move]}
            nextmemory = memory[nextstate]

        # best intrinsic is the highest intrinsic value of all 1-step connections
        bestintrinsic = max(bestintrinsic, nextmemory['intrinsic'])

        # score the successor; the selection count is deliberately left out here
        totalvalue = reward_formula(reward = nextmemory['reward'], intrinsic = nextmemory['intrinsic'], moves = nextmemory['moves'], numselected = 0, numvisits = nextmemory['numvisits'])

        if totalvalue > bestvalue:
            bestvalue = totalvalue
            bestmove = move

    # update the one-step lookahead for intrinsic value
    curmemory['intrinsic'] = bestintrinsic*0.99

    if getbestmove:
        return bestmove
    return np.random.choice(validmoves)

In [None]:
def GoExploreCount(env, **kwargs):
    """Run ``GoExplore`` with ``getbestmove`` forced to True.

    A ``getbestmove`` value supplied by the caller is overridden rather than
    raising a duplicate-keyword ``TypeError``.
    """
    options = dict(kwargs)
    options['getbestmove'] = True
    return GoExplore(env, **options)

## Agent 5: Count Agent (Explore-Count)

In [None]:
def CountAgent(env, **kwargs):
    """Update the global ``memory`` around the current state and return the best move.

    Works like ``GoExplore`` with ``getbestmove`` enabled, except that memory
    entries carry no ``numselected`` counter.

    Keyword arguments:
        intrinsic_fn: optional callable ``env -> number`` giving the intrinsic
            value of a state; without it all intrinsic values are 0.
        replay: when true, the current state's visit counter is reset to 0
            instead of being incremented.
        statehistory / actionhistory: states and moves leading to ``env``.
        repeatedstate: when true, state keys get a visit-count suffix derived
            from ``statehistory``.

    Returns the move with the highest ``reward_formula`` score, or ``None``
    when the env is done or has no valid moves.
    """
    intrinsic_fn = kwargs.get('intrinsic_fn', None)
    replay = kwargs.get('replay', False)
    statehistory = kwargs.get('statehistory', [])
    actionhistory = kwargs.get('actionhistory', [])
    repeatedstate = kwargs.get('repeatedstate', False)

    # if no intrinsic guiding value, then do without intrinsic motivation
    intrinsic_value = intrinsic_fn(env) if intrinsic_fn is not None else 0

    curmoves = env.numsteps
    curreward = env.reward

    curstate = env.staterep()
    if repeatedstate:
        curstate += str(statehistory.count(env.staterep()))

    # if this state is not present in memory (should only happen for start state), add it in
    if curstate not in memory:
        memory[curstate] = {'statehistory': statehistory+[], 'reward': curreward, 'intrinsic': intrinsic_value, 'moves': curmoves, 'numvisits': 0, 'actionhistory': actionhistory+[]}

    curmemory = memory[curstate]

    # only increment the visit count if not doing replay
    if replay:
        curmemory['numvisits'] = 0
    else:
        curmemory['numvisits'] = curmemory['numvisits'] + 1

    # if completed, no need to continue to next move selection
    if env.done:
        # if there is positive reward, then make intrinsic become extrinsic reward
        if env.reward > 0 and intrinsic_fn is not None:
            curmemory['intrinsic'] = env.reward
        return None

    # if no valid moves, no need to continue to next move selection
    validmoves = env.getvalidmoves()
    if not validmoves:
        return None

    bestmove = None
    bestvalue = -1e20
    bestintrinsic = -1e20

    for move in validmoves:
        # simulate the move on a copy so the real env is untouched
        newenv = copy.deepcopy(env)
        newenv.step(move)
        nextreward = newenv.reward

        nextstate = newenv.staterep()
        if repeatedstate:
            # the successor would be visited once more, hence the +1
            nextstate += str(statehistory.count(newenv.staterep())+1)

        if nextstate in memory:
            nextmemory = memory[nextstate]
            better_reward = nextreward > nextmemory['reward']
            shorter_path = nextreward == nextmemory['reward'] and nextmemory['moves'] > curmoves + 1
            # a better reward, or the same reward in fewer moves, replaces the
            # stored route and resets the visit counter for that state
            if better_reward or shorter_path:
                if better_reward:
                    nextmemory['reward'] = nextreward
                nextmemory['moves'] = curmoves + 1
                nextmemory['numvisits'] = 0
                nextmemory['actionhistory'] = actionhistory+[move]
                nextmemory['statehistory'] = statehistory+[newenv.staterep()]
        else:
            # start a new memory if this is a new state
            next_intrinsic_value = intrinsic_fn(newenv) if intrinsic_fn is not None else 0
            memory[nextstate] = {'statehistory': statehistory+[newenv.staterep()], 'reward': nextreward, 'intrinsic': next_intrinsic_value, 'moves': curmoves + 1, 'numvisits': 0, 'actionhistory': actionhistory+[move]}
            nextmemory = memory[nextstate]

        # best intrinsic is the highest intrinsic value of all 1-step connections
        bestintrinsic = max(bestintrinsic, nextmemory['intrinsic'])

        # score the successor with the shared heuristic
        totalvalue = reward_formula(reward = nextmemory['reward'], intrinsic = nextmemory['intrinsic'], moves = nextmemory['moves'], numselected = 0, numvisits = nextmemory['numvisits'])

        if totalvalue > bestvalue:
            bestvalue = totalvalue
            bestmove = move

    # update the one-step lookahead for intrinsic value once all successors
    # have been inspected
    curmemory['intrinsic'] = bestintrinsic * 0.99

    return bestmove

# Helper Functions
These functions perform hippocampal replay and evaluate an agent on an environment.
- MemoryReplay: Implements hippocampal replay
- Game: Plays an environment for a single run (episode)
- MultiGame: Plays an environment for multiple runs (100 by default)

## Memory Replay

In [None]:
def MemoryReplay(env, bestactionhistory = None, agent = RandomAgent, maxsteps = 500, seed = None, **kwargs):
    """Replay a stored trajectory forwards and then backwards through ``agent``.

    The moves in ``bestactionhistory`` are applied to ``env`` one by one (the
    env is stepped in place). After every move the agent is called on a copy
    of the env with ``replay=True``; afterwards the agent is called again on
    the env as it was before each move, in reverse order. The agent's return
    values are ignored, so only its side effects matter.

    In every call, ``statehistory`` and ``actionhistory`` describe the path
    that leads to the env handed to the agent. ``maxsteps`` and ``seed`` are
    accepted but not used.
    """
    if bestactionhistory is None:
        bestactionhistory = []

    statehistory = [env.staterep()]
    actionhistory = []
    snapshots = []
    kwargs['replay'] = True

    # do forward replay
    for move in bestactionhistory:
        # store copies of the histories: the live lists keep growing, and the
        # backward pass needs the path as it was at this point
        snapshots.append((copy.deepcopy(env), list(statehistory), list(actionhistory)))
        env.step(move)
        statehistory.append(env.staterep())
        # record the move before calling the agent so the action history
        # matches the state the env is now in
        actionhistory.append(move)
        kwargs['statehistory'] = statehistory
        kwargs['actionhistory'] = actionhistory
        agent(copy.deepcopy(env), **kwargs)

    # do backward replay
    for pastenv, paststates, pastactions in reversed(snapshots):
        kwargs['statehistory'] = paststates
        kwargs['actionhistory'] = pastactions
        agent(copy.deepcopy(pastenv), **kwargs)

## A Single Game

In [None]:
def Game(env, agent = RandomAgent, actionhistory = None, statehistory = None, maxsteps = 500, seed = None, verbose = True, **kwargs):
    """Play one episode of ``env`` with ``agent``.

    The numpy random generator is seeded with ``seed`` (0 when ``seed`` is
    None). The agent is queried until the env is done or ``maxsteps`` steps
    have been taken, and is called one last time on the final state so that
    learning agents can record it.

    Args:
        actionhistory / statehistory: moves and states that led to ``env``.
            Lists supplied by the caller are extended in place; when omitted,
            a fresh list is created for every call.
        verbose: print ``done``, ``reward`` and ``numsteps`` at the end.
        **kwargs: forwarded to the agent on every call.

    Returns:
        ``(env.done, env.reward, env.numsteps)`` after the episode.
    """
    # fresh lists per call; a shared default list would leak history between games
    if actionhistory is None:
        actionhistory = []
    if statehistory is None:
        statehistory = []

    np.random.seed(0 if seed is None else seed)

    while not env.done and env.numsteps < maxsteps:
        statehistory.append(env.staterep())
        kwargs['statehistory'] = statehistory
        kwargs['actionhistory'] = actionhistory
        move = agent(env, **kwargs)
        env.step(move)
        actionhistory.append(move)

    # to update final state for RL agents
    statehistory.append(env.staterep())
    kwargs['statehistory'] = statehistory
    kwargs['actionhistory'] = actionhistory
    agent(env, **kwargs)

    if verbose:
        print(env.done, env.reward, env.numsteps)
    return env.done, env.reward, env.numsteps

## Multiple Games

In [None]:
def MultiGame(env, numtries = 100, hippocampal_replay = True, **kwargs):
    """Play ``numtries`` games on copies of ``env`` and print a summary line.

    Args:
        env: environment to play; every game runs on a deep copy.
        numtries: number of games.
        hippocampal_replay: after each solved game, replay a stored solving
            trajectory through ``MemoryReplay`` (only for GoExplore,
            GoExploreCount and CountAgent).
        **kwargs: forwarded to ``Game`` and ``MemoryReplay``. ``agent``
            selects the agent and falls back to ``RandomAgent`` when omitted.

    For GoExplore and GoExploreCount, every game after the first starts from
    the state returned by ``ChooseState``. A game counts as solved when its
    reward is 1. Nothing is returned; the statistics are printed.
    """
    agent = kwargs.get('agent', RandomAgent)
    restarts_from_memory = agent in (GoExplore, GoExploreCount)
    replays_solution = hippocampal_replay and agent in (GoExplore, GoExploreCount, CountAgent)

    solvedcount = 0
    stephistory = []
    bestmemory = 0
    firstsolve = None
    tries = numtries

    for i in range(tries):
        # choose a new state for GoExplore or GoExploreCount
        if restarts_from_memory and i > 0:
            actionhistory, statehistory, nextenv = ChooseState(env = copy.deepcopy(env))
            done, reward, steps = Game(seed = i, env = copy.deepcopy(nextenv), actionhistory = actionhistory+[], statehistory = statehistory+[], **kwargs)
        else:
            done, reward, steps = Game(seed = i, env = copy.deepcopy(env), actionhistory = [], statehistory = [], **kwargs)

        if reward != 1:
            continue

        solvedcount += 1
        stephistory.append(steps)

        # if first solve, note how much memory is used
        if solvedcount == 1:
            bestmemory = len(memory)
            firstsolve = i+1

        if replays_solution:
            # the last rewarded entry in memory supplies the trajectory
            solution = None
            for cell in memory.values():
                if cell['reward'] == 1:
                    solution = cell['actionhistory']

            # MemoryReplay to improve chance of optimal path being followed
            if solution is not None:
                MemoryReplay(env = copy.deepcopy(env), bestactionhistory = solution, **kwargs)

    display_names = {
        'RandomAgent': 'Random',
        'QAgent': 'Q-Learning',
        'TDAgent': 'TD-Learning',
        'GoExplore': 'Go-Explore',
        'GoExploreCount': 'Go-Explore-Count',
        'CountAgent': 'Explore-Count',
    }
    name = display_names.get(agent.__name__, agent.__name__)
    # the random agent reports no memory figure
    if agent.__name__ == 'RandomAgent':
        bestmemory = '-'

    if agent in (QAgent, TDAgent):
        if kwargs.get('eps', 1) == 0:
            name += ' (Test)'
        else:
            name += ' (Train)'

    if kwargs.get('intrinsic_fn', None) is not None:
        name += ' GDIR'

    # LaTeX table row variant of the summary, kept for reference:
    # if solvedcount == 0:
    #     print(f"{name} & {solvedcount}/{tries} & - & - & - & - & - \\\\")
    # else:
    #     print(f"{name} & {solvedcount}/{tries} & {firstsolve} & {bestmemory} & {sum(stephistory)/len(stephistory):.1f} & {min(stephistory):.1f} & {max(stephistory):.1f} \\\\")
    if solvedcount == 0:
        print(f'Agent: {name}, No solves at all, First Solve Memory: {bestmemory}, Total Memory: {len(memory)}')
    else:
        print(f'Agent: {name}, Solve rate: {solvedcount}/{tries} ({solvedcount/tries*100:.1f}%), First Solve: {firstsolve}, First Solve Memory: {bestmemory}, Steps: Avg {sum(stephistory)/len(stephistory):.1f}, Min {min(stephistory):.1f}, Max {max(stephistory):.1f}')

# Discrete Environments 1 & 2 - Maze Environment (Unwalled, Walled)

In [None]:
class MazeEnv:
    """Grid maze in which an agent has to reach a door.

    Cell codes: 0 empty ('.'), 1 agent ('X'), 2 door ('D'), 3 brick ('#').
    Positions are ``(row, column)`` tuples. Reaching the door sets ``done``
    to True and ``reward`` to 1.
    """

    def __init__(self, height=20, width=20, numbricks = 10, grid = None, doorpos = None, agentpos = None, randomseed = None):
        """Create a maze, either randomly or from a predefined ``grid``.

        Args:
            height, width: grid size for a random maze (ignored when ``grid``
                is given, whose shape is used instead).
            numbricks: number of bricks placed on random empty cells of a
                random maze.
            grid: optional 2-D array of cell codes. It is copied, so the
                caller's array is never modified. A missing door or agent is
                placed on a random empty cell.
            doorpos, agentpos: optional fixed positions for a random maze;
                random empty cells are used when omitted.
            randomseed: when not None, seeds numpy's global random generator
                here and on every ``reset``.
        """
        self.height = height
        self.width = width
        self.doorpos = doorpos
        self.agentpos = agentpos
        self.numbricks = numbricks
        self.randomseed = randomseed
        self.numsteps = 0
        self.done = False
        self.reward = 0
        if self.randomseed is not None:
            np.random.seed(self.randomseed)
        self.mapping = {0: '.', 1: 'X', 2: 'D', 3: '#'}

        # if grid not defined, do a random initialization of maze
        if grid is None:
            self.grid = np.zeros((self.height, self.width))

            # Step 1: get a door position that is valid
            if doorpos is None:
                self.doorpos = self.getvalidpos()
            self.grid[self.doorpos] = 2

            # Step 2: get a start position that is valid
            if agentpos is None:
                self.agentpos = self.getvalidpos()
            self.grid[self.agentpos] = 1

            # Step 3: fill in the bricks
            for _ in range(self.numbricks):
                self.grid[self.getvalidpos()] = 3

        # if grid predefined, get the parameters from there instead
        else:
            # work on a private copy so stepping never alters the caller's grid
            self.grid = np.array(grid)
            self.height, self.width = self.grid.shape

            self.doorpos = self._locate(2)
            self.grid[self.doorpos] = 2

            self.agentpos = self._locate(1)
            self.grid[self.agentpos] = 1

        # some variables to reset the environment
        self.startgrid = self.grid.copy()
        self.startagentpos = self.agentpos
        self.startdoorpos = self.doorpos

    def _locate(self, code):
        """Return the first cell holding ``code``, or a random empty cell if none does."""
        rows, cols = np.where(self.grid == code)
        if len(rows) == 0:
            return self.getvalidpos()
        # plain ints keep staterep() identical to that of randomly built mazes
        return (int(rows[0]), int(cols[0]))

    def reset(self):
        """Restore the maze to the state it had right after construction."""
        self.grid = self.startgrid.copy()
        self.agentpos = self.startagentpos
        self.doorpos = self.startdoorpos
        self.done = False
        self.reward = 0
        self.numsteps = 0
        if self.randomseed is not None:
            np.random.seed(self.randomseed)

    # gets state representation
    def staterep(self):
        """Return the agent position as a string."""
        return str(self.agentpos)

    # gets a valid position
    def getvalidpos(self):
        """Return a random empty cell; raises ValueError when none is left."""
        # argwhere lists the cells row by row, left to right
        free = np.argwhere(self.grid == 0)
        if len(free) == 0:
            raise ValueError('no empty cell left in the maze')
        row, col = free[np.random.randint(len(free))]
        return (int(row), int(col))

    # checks if a position is valid that is not out of the grid and not occupied
    def isvalid(self, pos, allowdoor = False):
        """Return True when ``pos`` is inside the grid and empty (or the door, if ``allowdoor``)."""
        if pos is None or len(pos) != 2:
            return False
        row, col = pos
        if row < 0 or row >= self.height or col < 0 or col >= self.width:
            return False
        cell = self.grid[row, col]
        return bool(cell == 0 or (allowdoor and cell == 2))

    def step(self, move):
        """Move the agent one cell and count the step.

        A move that is not currently valid is replaced by a random valid one.
        When the agent has no valid move at all, it stays where it is and
        only the step counter advances.
        """
        validmoves = self.getvalidmoves()
        if not validmoves:
            # boxed in: nothing to sample from, so just let the step pass
            self.numsteps += 1
            return
        # randomly sample a move if not in validmoves
        if move not in validmoves:
            move = validmoves[np.random.randint(len(validmoves))]
        self.numsteps += 1
        self.grid[self.agentpos] = 0
        self.agentpos = self.movedir(self.agentpos, move)
        if self.agentpos == self.doorpos:
            self.done = True
            self.reward = 1
        self.grid[self.agentpos] = 1

    def movedir(self, pos, d):
        """Return the position one cell from ``pos`` in direction ``d``.

        Returns False when ``pos`` is None or not a pair, and None when ``d``
        is not one of 'left', 'right', 'up', 'down'.
        """
        if pos is None or len(pos) != 2:
            return False
        row, col = pos
        if d == 'left':
            return (row, col-1)
        if d == 'right':
            return (row, col+1)
        if d == 'up':
            return (row-1, col)
        if d == 'down':
            return (row+1, col)
        return None

    def getvalidmoves(self):
        """Return the directions the agent can move in, in left/right/up/down order."""
        return [
            move
            for move in ['left', 'right', 'up', 'down']
            if self.isvalid(self.movedir(self.agentpos, move), allowdoor = True)
        ]

    def sample(self):
        """Return a random valid move drawn with ``np.random.choice``."""
        return np.random.choice(self.getvalidmoves())

    def print(self):
        """Print the maze, one text line per grid row."""
        for row in self.grid:
            print(''.join(self.mapping[cell] for cell in row))

In [None]:
def Manhattan(env):
    ''' Negated, normalised Manhattan distance between the agent and the door.

    The distance is divided by (env.width + env.height - 2), the distance
    between opposite corners of the grid, and negated. For positions inside
    the grid the result is 0 when the agent is on the door and -1 when agent
    and door are in opposite corners.

    env: object exposing agentpos, doorpos, width and height.
    '''
    pointA = env.agentpos
    pointB = env.doorpos
    distance = abs(pointA[0]-pointB[0]) + abs(pointA[1]-pointB[1])
    span = env.width + env.height - 2
    if span == 0:
        # single-cell grid: agent and door coincide, and dividing would raise
        return 0.0
    return -distance/span

## Unwalled maze (10x10)

In [None]:
# This is what the maze looks like: agent in the top-left corner, door in the
# bottom-right corner, one brick per ten cells
height = width = 10
env = MazeEnv(
    height = height,
    width = width,
    agentpos = (0, 0),
    doorpos = (height-1, width-1),
    randomseed = 1,
    numbricks = height*width//10,
)
env.print()

In [None]:
# Baseline agents. Every agent starts from an empty memory; TDAgent and QAgent
# are then run a second time with eps = 0, reusing the memory left by the
# first run, to report their final number of solves.
for agent in (RandomAgent, TDAgent, QAgent):
    memory = defaultdict(lambda: 0)
    MultiGame(
        numtries = 100,
        env = copy.deepcopy(env),
        agent = agent,
        maxsteps = env.height*env.width,
        verbose = False,
    )
    if agent in (TDAgent, QAgent):
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            eps = 0,
        )

# Exploration agents. Each one is run twice from a cleared memory: first
# without intrinsic motivation, then with the Manhattan intrinsic function.
for agent in (GoExplore, GoExploreCount, CountAgent):
    for im_fn in (None, Manhattan):
        memory = defaultdict(lambda: 0)
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            intrinsic_fn = im_fn,
        )

## Unwalled maze (20x20)

In [None]:
# This is what the maze looks like: agent in the top-left corner, door in the
# bottom-right corner, one brick per ten cells
height = width = 20
env = MazeEnv(
    height = height,
    width = width,
    agentpos = (0, 0),
    doorpos = (height-1, width-1),
    randomseed = 1,
    numbricks = height*width//10,
)
env.print()

In [None]:
# Baseline agents. Every agent starts from an empty memory; TDAgent and QAgent
# are then run a second time with eps = 0, reusing the memory left by the
# first run, to report their final number of solves.
for agent in (RandomAgent, TDAgent, QAgent):
    memory = defaultdict(lambda: 0)
    MultiGame(
        numtries = 100,
        env = copy.deepcopy(env),
        agent = agent,
        maxsteps = env.height*env.width,
        verbose = False,
    )
    if agent in (TDAgent, QAgent):
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            eps = 0,
        )

# Exploration agents. Each one is run twice from a cleared memory: first
# without intrinsic motivation, then with the Manhattan intrinsic function.
for agent in (GoExplore, GoExploreCount, CountAgent):
    for im_fn in (None, Manhattan):
        memory = defaultdict(lambda: 0)
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            intrinsic_fn = im_fn,
        )

## Unwalled maze (100x100)

In [None]:
# This is what the maze looks like: agent in the top-left corner, door in the
# bottom-right corner, one brick per ten cells
height = width = 100
env = MazeEnv(
    height = height,
    width = width,
    agentpos = (0, 0),
    doorpos = (height-1, width-1),
    randomseed = 1,
    numbricks = height*width//10,
)
env.print()

In [None]:
# Baseline agents. Every agent starts from an empty memory; TDAgent and QAgent
# are then run a second time with eps = 0, reusing the memory left by the
# first run, to report their final number of solves.
for agent in (RandomAgent, TDAgent, QAgent):
    memory = defaultdict(lambda: 0)
    MultiGame(
        numtries = 100,
        env = copy.deepcopy(env),
        agent = agent,
        maxsteps = env.height*env.width,
        verbose = False,
    )
    if agent in (TDAgent, QAgent):
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            eps = 0,
        )

# Exploration agents. Each one is run twice from a cleared memory: first
# without intrinsic motivation, then with the Manhattan intrinsic function.
for agent in (GoExplore, GoExploreCount, CountAgent):
    for im_fn in (None, Manhattan):
        memory = defaultdict(lambda: 0)
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            intrinsic_fn = im_fn,
        )

## Extra: Without hippocampal replay

In [None]:
# Same protocol as the runs above, but with hippocampal replay switched off.
# Baseline agents: every agent starts from an empty memory; TDAgent and QAgent
# are then run a second time with eps = 0, reusing the memory left by the
# first run, to report their final number of solves.
for agent in (RandomAgent, TDAgent, QAgent):
    memory = defaultdict(lambda: 0)
    MultiGame(
        numtries = 100,
        env = copy.deepcopy(env),
        agent = agent,
        maxsteps = env.height*env.width,
        hippocampal_replay = False,
        verbose = False,
    )
    if agent in (TDAgent, QAgent):
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            hippocampal_replay = False,
            verbose = False,
            eps = 0,
        )

# Exploration agents. Each one is run twice from a cleared memory: first
# without intrinsic motivation, then with the Manhattan intrinsic function.
for agent in (GoExplore, GoExploreCount, CountAgent):
    for im_fn in (None, Manhattan):
        memory = defaultdict(lambda: 0)
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            hippocampal_replay = False,
            verbose = False,
            intrinsic_fn = im_fn,
        )

## Walled maze (10x10)

In [None]:
# Create the walled game environment from an explicit grid.
# Cell codes as used below and by MazeEnv.isvalid/step:
# 0 = free, 1 = agent, 2 = door, 3 = wall (not enterable).
size = 10
grid = np.zeros((size,size))
maxheight, maxwidth = grid.shape
# Walls first: a full column just left of the centre, a full row at
# mid-height, and the second-to-last column from row 1 downwards.
grid[:, maxwidth//2-1] = 3
grid[maxheight//2,:] = 3
grid[1:maxheight, maxwidth-2] = 3
# Then the gaps. These must come after the walls because they overwrite
# wall cells with 0: three gaps in the horizontal wall and one in the
# central vertical wall.
grid[maxheight//2, maxwidth//4] = 0
grid[maxheight//2, 3*maxwidth//4-1] = 0
grid[3*maxheight//4, maxwidth//2-1] = 0
grid[maxheight//2, maxwidth-1] = 0
# Agent in the top-left corner, door in the bottom-right corner.
grid[0, 0] = 1
grid[maxheight-1, maxwidth-1] = 2
env = MazeEnv(grid = grid)
env.print()

In [None]:
# Baseline agents. Every agent starts from an empty memory; TDAgent and QAgent
# are then run a second time with eps = 0, reusing the memory left by the
# first run, to report their final number of solves.
for agent in (RandomAgent, TDAgent, QAgent):
    memory = defaultdict(lambda: 0)
    MultiGame(
        numtries = 100,
        env = copy.deepcopy(env),
        agent = agent,
        maxsteps = env.height*env.width,
        verbose = False,
    )
    if agent in (TDAgent, QAgent):
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            eps = 0,
        )

# Exploration agents. Each one is run twice from a cleared memory: first
# without intrinsic motivation, then with the Manhattan intrinsic function.
for agent in (GoExplore, GoExploreCount, CountAgent):
    for im_fn in (None, Manhattan):
        memory = defaultdict(lambda: 0)
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            intrinsic_fn = im_fn,
        )

## Walled maze (20x20)

In [None]:
# Create the walled game environment from an explicit grid.
# Cell codes as used below and by MazeEnv.isvalid/step:
# 0 = free, 1 = agent, 2 = door, 3 = wall (not enterable).
size = 20
grid = np.zeros((size,size))
maxheight, maxwidth = grid.shape
# Walls first: a full column just left of the centre, a full row at
# mid-height, and the second-to-last column from row 1 downwards.
grid[:, maxwidth//2-1] = 3
grid[maxheight//2,:] = 3
grid[1:maxheight, maxwidth-2] = 3
# Then the gaps. These must come after the walls because they overwrite
# wall cells with 0: three gaps in the horizontal wall and one in the
# central vertical wall.
grid[maxheight//2, maxwidth//4] = 0
grid[maxheight//2, 3*maxwidth//4-1] = 0
grid[3*maxheight//4, maxwidth//2-1] = 0
grid[maxheight//2, maxwidth-1] = 0
# Agent in the top-left corner, door in the bottom-right corner.
grid[0, 0] = 1
grid[maxheight-1, maxwidth-1] = 2
env = MazeEnv(grid = grid)
env.print()

In [None]:
# Baseline agents. Every agent starts from an empty memory; TDAgent and QAgent
# are then run a second time with eps = 0, reusing the memory left by the
# first run, to report their final number of solves.
for agent in (RandomAgent, TDAgent, QAgent):
    memory = defaultdict(lambda: 0)
    MultiGame(
        numtries = 100,
        env = copy.deepcopy(env),
        agent = agent,
        maxsteps = env.height*env.width,
        verbose = False,
    )
    if agent in (TDAgent, QAgent):
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            eps = 0,
        )

# Exploration agents. Each one is run twice from a cleared memory: first
# without intrinsic motivation, then with the Manhattan intrinsic function.
for agent in (GoExplore, GoExploreCount, CountAgent):
    for im_fn in (None, Manhattan):
        memory = defaultdict(lambda: 0)
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            intrinsic_fn = im_fn,
        )

## Walled maze (100x100)

In [None]:
# Create the walled game environment from an explicit grid.
# Cell codes as used below and by MazeEnv.isvalid/step:
# 0 = free, 1 = agent, 2 = door, 3 = wall (not enterable).
size = 100
grid = np.zeros((size,size))
maxheight, maxwidth = grid.shape
# Walls first: a full column just left of the centre, a full row at
# mid-height, and the second-to-last column from row 1 downwards.
grid[:, maxwidth//2-1] = 3
grid[maxheight//2,:] = 3
grid[1:maxheight, maxwidth-2] = 3
# Then the gaps. These must come after the walls because they overwrite
# wall cells with 0: three gaps in the horizontal wall and one in the
# central vertical wall.
grid[maxheight//2, maxwidth//4] = 0
grid[maxheight//2, 3*maxwidth//4-1] = 0
grid[3*maxheight//4, maxwidth//2-1] = 0
grid[maxheight//2, maxwidth-1] = 0
# Agent in the top-left corner, door in the bottom-right corner.
grid[0, 0] = 1
grid[maxheight-1, maxwidth-1] = 2
env = MazeEnv(grid = grid)
env.print()

In [None]:
# Baseline agents. Every agent starts from an empty memory; TDAgent and QAgent
# are then run a second time with eps = 0, reusing the memory left by the
# first run, to report their final number of solves.
for agent in (RandomAgent, TDAgent, QAgent):
    memory = defaultdict(lambda: 0)
    MultiGame(
        numtries = 100,
        env = copy.deepcopy(env),
        agent = agent,
        maxsteps = env.height*env.width,
        verbose = False,
    )
    if agent in (TDAgent, QAgent):
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            eps = 0,
        )

# Exploration agents. Each one is run twice from a cleared memory: first
# without intrinsic motivation, then with the Manhattan intrinsic function.
for agent in (GoExplore, GoExploreCount, CountAgent):
    for im_fn in (None, Manhattan):
        memory = defaultdict(lambda: 0)
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            verbose = False,
            intrinsic_fn = im_fn,
        )

## Extra: Without hippocampal replay

In [None]:
# Same protocol as the runs above, but with hippocampal replay switched off.
# Baseline agents: every agent starts from an empty memory; TDAgent and QAgent
# are then run a second time with eps = 0, reusing the memory left by the
# first run, to report their final number of solves.
for agent in (RandomAgent, TDAgent, QAgent):
    memory = defaultdict(lambda: 0)
    MultiGame(
        numtries = 100,
        env = copy.deepcopy(env),
        agent = agent,
        maxsteps = env.height*env.width,
        hippocampal_replay = False,
        verbose = False,
    )
    if agent in (TDAgent, QAgent):
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            hippocampal_replay = False,
            verbose = False,
            eps = 0,
        )

# Exploration agents. Each one is run twice from a cleared memory: first
# without intrinsic motivation, then with the Manhattan intrinsic function.
for agent in (GoExplore, GoExploreCount, CountAgent):
    for im_fn in (None, Manhattan):
        memory = defaultdict(lambda: 0)
        MultiGame(
            numtries = 100,
            env = copy.deepcopy(env),
            agent = agent,
            maxsteps = env.height*env.width,
            hippocampal_replay = False,
            verbose = False,
            intrinsic_fn = im_fn,
        )