707 Coursework- Task 1: The Environment

This environment recreates the original Pacman game. The environment is a grid filled with pellets (or breadcrumbs), which the Pacman agent must collect. It also contains barriers, which the agent cannot pass through and must therefore learn to walk around. The element of stochasticity in this environment is a ghost, which moves around the grid at random. The Pacman agent must avoid the ghost in order to win the game.
In addition, the breadcrumbs are placed at random locations in the grid at the start of each episode.

If the Pacman agent collects all the pellets in the environment without encountering the ghost, Pacman wins and the episode terminates. However, if Pacman and the ghost end up in the same cell, Pacman dies and the episode terminates.


In [13]:
#import statements

import numpy as np
import random
import matplotlib.pyplot as plt

In [14]:
class Environment(object):
    '''
    Grid-world recreation of Pacman.

    ``self.environment`` is an ``m x n`` numpy array whose cells hold
    0 (empty), 1 (pacman) or 2 (barrier). Positions are flat, row-major
    indices: ``position = row * n + column``. The ghost position and the
    list of breadcrumb positions are stored separately from the array.

    The inner barrier layout and the start cells are fixed coordinates
    designed for the 7x7 grid, so ``m`` and ``n`` must both be at least 7.
    '''

    # Cell codes stored in self.environment.
    EMPTY = 0
    PACMAN = 1
    BARRIER = 2

    # Fixed layout, expressed as (row, column) pairs.
    AGENT_START = (3, 3)
    GHOST_START = (5, 4)
    INNER_BARRIERS = ((2, 3), (2, 4), (3, 4), (4, 3), (4, 4))
    MIN_SIZE = 7

    # Number of breadcrumbs scattered at the start of every episode.
    BREADCRUMB_COUNT = 10

    def __init__(self, m, n):
        '''
        Build an m-row by n-column environment and put it in its start state.

        Raises ValueError if the grid is too small for the fixed layout.
        '''
        if m < self.MIN_SIZE or n < self.MIN_SIZE:
            raise ValueError(
                'grid must be at least %dx%d, got %dx%d'
                % (self.MIN_SIZE, self.MIN_SIZE, m, n))
        self.m = m
        self.n = n
        self.state_space = list(range(self.m * self.n))
        # Row-major layout: one row up/down is a jump of n cells.
        self.action_space = {'U': -self.n, 'D': self.n, 'L': -1, 'R': 1}
        self.possible_actions = ['U', 'D', 'L', 'R']
        self.reset()

    def add_barriers(self):
        '''
        this function adds barriers in the grid space: the whole outer
        perimeter plus the fixed inner barrier cells
        '''
        last_row = self.m - 1
        last_col = self.n - 1
        for row in range(self.m):
            self.environment[row][0] = self.BARRIER
            self.environment[row][last_col] = self.BARRIER
        for col in range(self.n):
            self.environment[0][col] = self.BARRIER
            self.environment[last_row][col] = self.BARRIER
        for row, col in self.INNER_BARRIERS:
            self.environment[row][col] = self.BARRIER

    def add_breadcrumbs(self):
        '''
        this function adds breadcrumbs/pellets in the grid space which pacman has to collect

        Positions are drawn uniformly at random and rejected unless the cell
        is empty (not a barrier, not pacman's cell) and not already chosen.
        '''
        self.breadcrumbs = []
        last_cell = self.m * self.n - 1
        while len(self.breadcrumbs) < self.BREADCRUMB_COUNT:
            position = random.randint(0, last_cell)
            x, y = self.get_row_and_column(position)
            if self.environment[x][y] == self.EMPTY and position not in self.breadcrumbs:
                self.breadcrumbs.append(position)

    def is_terminal_state(self, state):
        '''
        This function returns True if the episode is over, False otherwise.
        The episode is over when pacman has collected all pellets or when
        pacman and the ghost are in the same cell.

        ``state`` is accepted for interface compatibility; the check uses the
        environment's own agent and ghost positions.
        '''
        return bool(self.number_of_breadcrumbs() == 0
                    or self.ghost_position == self.agent_position)

    def number_of_breadcrumbs(self):
        '''
        Returns the number of pellets left in the environment
        '''
        return len(self.breadcrumbs)

    def get_row_and_column(self, position):
        '''
        Returns the (row, column) coordinates of a flat position on the grid
        '''
        # Both quotient and remainder use the number of columns (row-major).
        return divmod(position, self.n)

    def set_state(self, state):
        '''
        When pacman makes a new move, its position becomes the new state
        and the old one becomes 0 (empty cell)
        '''
        x, y = self.get_row_and_column(self.agent_position)
        self.environment[x][y] = self.EMPTY

        self.agent_position = state
        x, y = self.get_row_and_column(self.agent_position)
        self.environment[x][y] = self.PACMAN

    def off_grid_move(self, newState, oldState):
        '''
        Returns True if ``newState`` is a barrier cell, i.e. pacman has tried
        to make an illegal move. The outer perimeter of the grid is formed by
        barriers, so this also covers attempts to leave the playable area.
        ``oldState`` is not used.
        '''
        x, y = self.get_row_and_column(newState)
        if self.environment[x][y] == self.BARRIER:
            print("off grid")
            return True
        return False

    def step(self, action):
        '''
        This function implements the agent's movement from one cell to the next.

        ``action`` is an index into ``self.possible_actions``.
        If the target cell is a barrier the agent stays where it is, the ghost
        does not move and the reward is -100. Otherwise the agent moves,
        collects the breadcrumb on the new cell (if any), the ghost moves
        (unless the agent has just stepped onto it) and the reward is
        computed by ``reward``.

        Returns ``(observation, reward, done)`` where ``observation`` is
        ``(agent (row, col), breadcrumb info, ghost (row, col))``. The
        breadcrumb info is the tuple of remaining positions when fewer than
        two are left, otherwise just their count.
        '''
        resulting_state = self.agent_position + self.action_space[self.possible_actions[action]]
        reward = -100
        breadcrumbs_before = self.number_of_breadcrumbs()
        if not self.off_grid_move(resulting_state, self.agent_position):
            self.set_state(resulting_state)
            if self.agent_position in self.breadcrumbs:
                self.breadcrumbs.remove(self.agent_position)
            if self.agent_position != self.ghost_position:
                self.ghost_move()
            reward = self.reward(self.agent_position, action, breadcrumbs_before)

        if len(self.breadcrumbs) < 2:
            breadcrumb_info = tuple(self.breadcrumbs)
        else:
            breadcrumb_info = len(self.breadcrumbs)
        observation = (
            self.get_row_and_column(self.agent_position),
            breadcrumb_info,
            self.get_row_and_column(self.ghost_position),
        )
        return observation, reward, self.is_terminal_state(self.agent_position)

    def ghost_move(self):
        '''
        This method's functionality is to move the ghost around the grid.
        The ghost takes one random step that does not lead into a barrier.
        '''
        action = self.action_space_sample(self.ghost_position)
        self.ghost_position = self.ghost_position + self.action_space[self.possible_actions[action]]

    def reward(self, state, action, numberOfBreadcrumbs):
        '''
        The reward function is responsible for assigning the rewards to the agent.

        - pacman and the ghost share a cell: -1000;
        - the episode ended successfully (no breadcrumbs left): 1000;
        - the ghost is within three cells straight ahead in the direction of
          ``action`` with no barrier in between: -100 / distance;
        - a breadcrumb was collected on this step (``numberOfBreadcrumbs``,
          the count before the move, differs from the current count): 10;
        - otherwise: -1.
        '''
        if self.agent_position == self.ghost_position:
            return -1000
        if self.is_terminal_state(state):
            return 1000

        reward = -1
        offset = self.action_space[self.possible_actions[action]]
        cell_count = self.m * self.n
        look_ahead = state
        for distance in range(1, 4):
            look_ahead += offset
            if not 0 <= look_ahead < cell_count:
                break
            # Inspect the cell being looked at, so that barriers block the
            # line of sight and the scan cannot wrap onto another row.
            x, y = self.get_row_and_column(look_ahead)
            if self.environment[x][y] == self.BARRIER:
                break
            if look_ahead == self.ghost_position:
                reward = -100 / distance
                break
        if reward == -1 and numberOfBreadcrumbs != self.number_of_breadcrumbs():
            reward = 10
        return reward

    def reset(self):
        '''
        This method resets the whole environment from the start
        for each episode and returns the agent's start position
        '''
        self.environment = np.zeros((self.m, self.n))
        self.add_barriers()
        row, col = self.AGENT_START
        self.agent_position = row * self.n + col
        self.environment[row][col] = self.PACMAN
        row, col = self.GHOST_START
        self.ghost_position = row * self.n + col
        self.add_breadcrumbs()
        return self.agent_position

    def render(self):
        '''
        This function is a simple rendering of the environment in the console,
        this function is mostly used for debugging and to give a visual demonstration
        of how the environment looks like.
        G = ghost, P = pacman, | = barrier, * = breadcrumb, - = empty cell
        '''
        print('--------------------------------------------------')
        i = 0
        for row in self.environment:
            for col in row:
                if i == self.ghost_position:
                    print('G', end='\t')
                elif col == self.PACMAN:
                    print('P', end='\t')
                elif col == self.BARRIER:
                    print('|', end='\t')
                elif i in self.breadcrumbs:
                    print('*', end='\t')
                elif col == self.EMPTY:
                    print('-', end='\t')
                i += 1
            print('\n')
        print('--------------------------------------------------')

    def _leads_into_barrier(self, position, action):
        '''
        Returns True if taking ``action`` from ``position`` ends on a barrier
        '''
        next_state = position + self.action_space[self.possible_actions[action]]
        x, y = self.get_row_and_column(next_state)
        return self.environment[x][y] == self.BARRIER

    def action_space_sample(self, position):
        '''
        This function returns a random sample of the possible actions, as long as
        the action does not result in encountering a barrier
        '''
        actions = [action for action in range(len(self.possible_actions))
                   if not self._leads_into_barrier(position, action)]
        return np.random.choice(actions)

    def max_action(self, Q, state):
        '''
        This function returns the action with the maximum Q-value in the Q-table,
        ignoring actions that would move pacman into a barrier. Ties are
        broken uniformly at random.

        Barrier checks use the environment's current agent position, not
        ``state``.

        NOTE: blocked actions are overwritten with -inf directly in
        ``Q[state]``, so the caller's Q-table entry is modified when it is a
        mutable sequence. This is kept as-is because the learning update of
        a caller may rely on it.
        '''
        values = Q[state]
        for action in range(len(values)):
            if self._leads_into_barrier(self.agent_position, action):
                values[action] = -np.inf
        max_reward = np.max(values)
        actions = [action for action in range(len(values))
                   if values[action] == max_reward]
        return np.random.choice(actions)


In [15]:
if __name__ == '__main__':
    # Build a 7-row by 7-column environment.
    env = Environment(7, 7)

    # Print the initial grid layout to the console.
    env.render()

--------------------------------------------------
|	|	|	|	|	|	|	

|	*	*	*	*	-	|	

|	*	-	|	|	*	|	

|	-	*	P	|	*	|	

|	*	-	|	|	-	|	

|	*	-	-	G	-	|	

|	|	|	|	|	|	|	

--------------------------------------------------
