In [10]:
import numpy as np

### Q_train_test

In [11]:
class Q_train_test:
    """Tabular Q-learning agent with epsilon-greedy exploration.

    The agent keeps a ``states x actions`` table of Q-values, remembers the
    last state/action pair it acted on, and decays its random-action rate
    every time ``fit_model`` is called.

    Relies on the module-level names ``np`` (numpy), ``rand`` (the stdlib
    ``random`` module) and ``pickle`` being available when methods are called.
    """

    # File name (relative to the working directory) used by ``dump``.
    Q_DUMP = "qlearner.pickle"

    def __init__(self, states:int, actions:int, alpha:float, gamma:float, rar:float, radr:float, verbose:bool) -> None:
        """Create a learner with an all-zero Q-table.

        Args:
            states: number of rows in the Q-table.
            actions: number of columns in the Q-table.
            alpha: learning rate used to blend old and new estimates.
            gamma: discount applied to the best next-state value.
            rar: random action rate, the probability of exploring.
            radr: multiplicative decay applied to ``rar`` on every ``fit_model``.
            verbose: when true, each step is printed.
        """
        self.states = states
        self.actions = actions
        self.alpha = alpha
        self.gamma = gamma
        self.rar = rar
        self.radr = radr
        self.verbose = verbose
        self.current_state = 0
        self.current_action = 0
        self.q_values = np.zeros((self.states, self.actions))

    def _select_action(self, state) -> tuple:
        """Pick an action for ``state`` epsilon-greedily.

        Returns ``(action, random_flag)`` where ``random_flag`` is True when
        the action was drawn at random rather than taken from the Q-table.
        """
        if rand.random() < self.rar:
            # random.randint is inclusive on both ends, hence ``actions - 1``.
            return int(rand.randint(0, self.actions - 1)), True
        return int(np.argmax(self.q_values[state])), False

    def predict(self, new_state:int) -> tuple:
        """Move to ``new_state`` and choose an action WITHOUT updating the Q-table.

        A random action is chosen when a uniform draw falls below ``rar``;
        otherwise the action with the highest Q-value for the state is used.
        ``rar`` is not decayed here.

        Returns:
            ``(action, random_flag)``.
        """
        self.current_state = new_state
        action, random_flag = self._select_action(self.current_state)
        self.current_action = action

        if self.verbose:
            print("RAR --> ", self.rar)
            print(f"state = {new_state}, action = {action}")
        return action, random_flag

    def fit_model(self, new_pos:int, immediate_reward:float) -> tuple:
        """Update the Q-table for the last state/action, then choose the next action.

        The stored value for ``(current_state, current_action)`` is blended
        with ``immediate_reward + gamma * max_a Q[new_pos, a]`` using
        ``alpha``. Afterwards ``rar`` is multiplied by ``radr`` and the
        learner's current state/action are advanced.

        Returns:
            ``(action, random_flag)`` for the step to take from ``new_pos``.
        """
        impr_est = immediate_reward + (self.gamma* self.q_values[new_pos, np.argmax(self.q_values[new_pos])])
        self.q_values[self.current_state, self.current_action] = (1-self.alpha) * self.q_values[self.current_state, self.current_action] + (self.alpha * impr_est)

        # Selection happens after the update so it sees the refreshed table.
        action, random_flag = self._select_action(new_pos)

        self.rar = (self.rar*self.radr)
        self.current_state = new_pos
        self.current_action = action

        if self.verbose:
            print(f"s={new_pos}, a={action}, r={immediate_reward}")
        return action, random_flag

    def dump(self):
        """Pickle this learner to ``Q_DUMP`` so it can be reloaded for testing."""
        with open(
            self.Q_DUMP, "wb"
        ) as serialized_learner_state_write:
            pickle.dump(
                self,
                serialized_learner_state_write,
                protocol=pickle.DEFAULT_PROTOCOL,
            )

### Create a maze from csv

In [12]:
def csv_to_maze(file_path: str = "maze.csv") -> np.ndarray:
    """Load a comma-separated grid of numbers from ``file_path`` as a float array.

    Args:
        file_path: path of the CSV file; defaults to ``"maze.csv"`` so that
            existing zero-argument calls keep working.

    Returns:
        An array with one row per non-blank line of the file and one float
        per comma-separated field.

    Raises:
        ValueError: if a field cannot be converted to ``float``.
    """
    with open(file_path, encoding="utf8") as maze_file:
        rows = [
            [float(cell) for cell in line.strip().split(",")]
            for line in maze_file
            # Blank lines (e.g. a trailing newline at EOF) would otherwise
            # hit float("") and abort the whole load.
            if line.strip()
        ]
    return np.array(rows)

### Render maze

In [13]:
def render_maze(csv: np.ndarray):
    """Print a 2-D grid as text, framed by a dotted line above and below.

    Each cell value is translated to a one-character symbol (0 blank,
    1 ``*``, 2 ``X``, 3 ``Y``, 9 ``.``) and printed followed by a space.
    A value outside that set raises ``KeyError``.
    """
    border = ".........."
    symbols = {0: " ", 1: "*", 2: "X", 3: "Y", 9: "."}

    print(border)
    for grid_row in csv:
        for cell in grid_row:
            print(symbols[cell], end=" ")
        print()
    print(border)
# Load the maze grid with csv_to_maze() and print it with render_maze().
maze_grid = csv_to_maze()
render_maze(maze_grid)

..........
    Y     
      X   
  X     X 
    X     
X       X 
  X   X   
          
  X X X   
        X 
  X X   * 
..........


### Goals, obstacles and check for out of bounds 

In [14]:
from typing import Tuple, List

In [15]:
def get_goal_position(csv:np.ndarray) -> Tuple:
    """Locate every cell of the grid whose value is 3 (the goal marker).

    Returns a pair of index arrays ``(rows, columns)``; with a single goal
    each array holds exactly one element.
    """
    goal_rows, goal_cols = np.nonzero(csv == 3)
    return goal_rows, goal_cols

def get_obstacle_positions(csv:np.ndarray) -> List:
    """Collect the ``[row, column]`` index of every grid cell equal to 2.

    Positions are listed in row-major order (top row first, left to right).
    """
    return [
        [row_idx, col_idx]
        for row_idx, grid_row in enumerate(csv)
        for col_idx, cell in enumerate(grid_row)
        if cell == 2
    ]

def out_of_bounds(csv:np.ndarray, new_x:int, new_y:int) -> bool:
    """Report whether ``(new_x, new_y)`` lies outside the grid.

    ``new_x`` is checked against the first axis and ``new_y`` against the
    second; the row is tested first and the column only if the row is valid.
    """
    for coordinate, extent in zip((new_x, new_y), csv.shape):
        if (coordinate < 0) or (coordinate > extent - 1):
            return True
    return False

def discretize(curr_x:int, curr_y:int, row_stride:int = 10):
    """Convert a grid position ``[row, column]`` into a single integer state.

    The state is ``curr_x * row_stride + curr_y``. The mapping is only
    collision-free while every column index is smaller than ``row_stride``,
    so grids wider than the default of 10 columns must pass a larger stride.

    Args:
        curr_x: row index.
        curr_y: column index.
        row_stride: multiplier applied to the row index; defaults to 10,
            which reproduces the original fixed encoding.
    """
    return curr_x * row_stride + curr_y

### Actions, rewards and penalty

In [16]:
def maze_game_action(csv: np.ndarray, curr_x:int, curr_y:int, action:int, obstacles:List):
    """Apply one move to the grid and score it.

    Action codes: 0 moves left, 1 moves right, 2 moves up and any other
    value moves down.

    Scoring:
        * leaving the grid costs -10; the grid and position stay unchanged;
        * stepping onto a cell listed in ``obstacles`` costs -100;
        * any other step costs -1.

    For every in-bounds move (obstacle or not) the grid is modified in
    place: the old cell is set to 9 and the new cell to 1.

    Returns:
        ``(csv, rewards, x, y)`` where ``x, y`` is the position after the
        move (the old position if the move was rejected).
    """
    # (row change, column change) for the explicit action codes;
    # the fallback (1, 0) is the "move down" branch.
    offsets = {0: (0, -1), 1: (0, 1), 2: (-1, 0)}
    row_change, col_change = offsets.get(action, (1, 0))
    new_x = curr_x + row_change
    new_y = curr_y + col_change

    # Rejected move: penalise and report the unchanged position.
    if out_of_bounds(csv, new_x, new_y):
        return csv, -10, curr_x, curr_y

    rewards = -100 if [new_x, new_y] in obstacles else -1
    csv[curr_x, curr_y] = 9
    csv[new_x, new_y] = 1
    return csv, rewards, new_x, new_y

### Training & Persistence

In [21]:
def train(start_positions, maze):
    """Train a Q-learner to reach the goal of ``maze`` while avoiding obstacles.

    For each start position the start marker (value 1) is moved there and
    999 episodes are played. An episode ends as soon as the agent lands on
    an obstacle or on the goal. After the episodes for a start position, the
    grid of the last episode is rendered and it is recorded whether that
    episode finished on the goal.

    Args:
        start_positions: iterable of ``(row, column)`` start cells.
        maze: 2-D grid; NOTE it is modified in place (the start marker is
            moved for every start position).

    Returns:
        ``(goal_x, goal_y, Qlearner)`` where the goal coordinates are the
        index arrays produced by ``get_goal_position``.

    Side effects:
        Prints progress and pickles the trained learner via ``Qlearner.dump()``.
    """
    # Size of the state space as given by the positional encoding.
    states = discretize(maze.shape[0]-1, maze.shape[1])

    Qlearner = Q_train_test(
        states = states,
        actions = 4,
        alpha = 0.2,
        gamma = 0.9,
        rar = 0.98,
        radr = 0.99,
        verbose = False
    )
    obstacles = get_obstacle_positions(maze)
    goal_x, goal_y = get_goal_position(maze)
    print(" ")
    print("Goal at",goal_x, ",", goal_y)
    print(" ")
    print("Maze grid dimensions: ", maze.shape[0], "x", maze.shape[1])
    print("....................")

    # One boolean per start position: did its final episode end on the goal?
    convergence = []

    for start_x, start_y in start_positions:
        # Move the start marker: clear the previous one, then set the new one.
        x, y = np.where(maze==1)
        maze[x,y] = 0
        maze[start_x, start_y] = 1
        print(" ")
        print("Training maze with starting position at: ", "[", start_x, ",", start_y, "]")
        print("       -----      ")
        print(maze)
        render_maze(maze)

        for _ in trange(1, 1000, unit = "Episode"):

            # Each episode plays on a private copy so the trail markers do
            # not leak into the next episode.
            maze_a = maze.copy()
            done = False
            curr_x, curr_y = np.where(maze==1)

            # First action of the episode: chosen without a Q-table update.
            action, random_flag = Qlearner.predict(discretize(curr_x, curr_y))

            while not done:
                curr_x, curr_y = np.where(maze_a==1)
                maze_a, rewards, curr_x, curr_y = maze_game_action(maze_a, curr_x, curr_y, action, obstacles)
                new_pos = discretize(curr_x, curr_y)
                action, random_flag = Qlearner.fit_model(new_pos, rewards)
                if [curr_x, curr_y] in obstacles or (curr_x==goal_x) & (curr_y == goal_y):
                    done=True
        render_maze(maze_a)

        final_x, final_y = np.where(maze_a == 1)
        # Compare coordinates as a pair; concatenating them as strings would
        # make e.g. (1, 12) and (11, 2) indistinguishable.
        convergence.append((final_x[0], final_y[0]) == (goal_x[0], goal_y[0]))
        print(" ")

    if convergence:
        a = sum(convergence)
        b = len(convergence)
        print(round((a/b)*100,2), "% convergence rate in this training cycle")
    else:
        # Nothing was trained, so a rate would be a division by zero.
        print("No start positions supplied --> convergence rate not available")

    # Persist the learner
    Qlearner.dump()
    print("Training complete --> Learner persisted ")
    return goal_x, goal_y, Qlearner

### Train Q Learner

In [22]:
from tqdm import trange
import random as rand
import pickle
# Load a maze grid for training; train() moves the start marker (value 1)
# inside this array in place.
maze = csv_to_maze()
# (row, column) cells used, in order, as the agent's starting position.
start_positions = [
    (1, 0),
    (1, 4),
    (3, 3),
    (5, 2),
    (6, 0),
    (6, 4),
    (8, 2),
    (9, 0),
    (9, 4),
]
# As the last expression of the cell, the returned (goal_x, goal_y, learner)
# tuple is echoed as the cell's output.
train(start_positions, maze)

 
Goal at [0] , [2]
 
Maze grid dimensions:  10 x 5
....................
 
Training maze with starting position at:  [ 1 , 0 ]
       -----      
[[0. 0. 3. 0. 0.]
 [1. 0. 0. 2. 0.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 0. 2. 0.]
 [0. 0. 0. 0. 0.]
 [0. 2. 2. 2. 0.]
 [0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 0.]]
..........
    Y     
*     X   
  X     X 
    X     
X       X 
  X   X   
          
  X X X   
        X 
  X X     
..........


100%|██████████| 999/999 [00:00<00:00, 2581.20Episode/s]


..........
    *     
. . . X   
  X     X 
    X     
X       X 
  X   X   
          
  X X X   
        X 
  X X     
..........
 
 
Training maze with starting position at:  [ 1 , 4 ]
       -----      
[[0. 0. 3. 0. 0.]
 [0. 0. 0. 2. 1.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 0. 2. 0.]
 [0. 0. 0. 0. 0.]
 [0. 2. 2. 2. 0.]
 [0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 0.]]
..........
    Y     
      X * 
  X     X 
    X     
X       X 
  X   X   
          
  X X X   
        X 
  X X     
..........


100%|██████████| 999/999 [00:00<00:00, 2685.50Episode/s]


..........
    * . . 
      X . 
  X     X 
    X     
X       X 
  X   X   
          
  X X X   
        X 
  X X     
..........
 
 
Training maze with starting position at:  [ 3 , 3 ]
       -----      
[[0. 0. 3. 0. 0.]
 [0. 0. 0. 2. 0.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 1. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 0. 2. 0.]
 [0. 0. 0. 0. 0.]
 [0. 2. 2. 2. 0.]
 [0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 0.]]
..........
    Y     
      X   
  X     X 
    X *   
X       X 
  X   X   
          
  X X X   
        X 
  X X     
..........


100%|██████████| 999/999 [00:00<00:00, 1550.94Episode/s]


..........
    *     
    . X   
  X . . X 
    X .   
X       X 
  X   X   
          
  X X X   
        X 
  X X     
..........
 
 
Training maze with starting position at:  [ 5 , 2 ]
       -----      
[[0. 0. 3. 0. 0.]
 [0. 0. 0. 2. 0.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 1. 2. 0.]
 [0. 0. 0. 0. 0.]
 [0. 2. 2. 2. 0.]
 [0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 0.]]
..........
    Y     
      X   
  X     X 
    X     
X       X 
  X * X   
          
  X X X   
        X 
  X X     
..........


100%|██████████| 999/999 [00:00<00:00, 1075.71Episode/s]


..........
    *     
    . X   
  X . . X 
    X .   
X   . . X 
  X . X   
          
  X X X   
        X 
  X X     
..........
 
 
Training maze with starting position at:  [ 6 , 0 ]
       -----      
[[0. 0. 3. 0. 0.]
 [0. 0. 0. 2. 0.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 0. 2. 0.]
 [1. 0. 0. 0. 0.]
 [0. 2. 2. 2. 0.]
 [0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 0.]]
..........
    Y     
      X   
  X     X 
    X     
X       X 
  X   X   
*         
  X X X   
        X 
  X X     
..........


100%|██████████| 999/999 [00:01<00:00, 828.39Episode/s]


..........
    *     
    . X   
  X . . X 
    X .   
X   . . X 
  X . X   
. . .     
  X X X   
        X 
  X X     
..........
 
 
Training maze with starting position at:  [ 6 , 4 ]
       -----      
[[0. 0. 3. 0. 0.]
 [0. 0. 0. 2. 0.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 0. 2. 0.]
 [0. 0. 0. 0. 1.]
 [0. 2. 2. 2. 0.]
 [0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 0.]]
..........
    Y     
      X   
  X     X 
    X     
X       X 
  X   X   
        * 
  X X X   
        X 
  X X     
..........


100%|██████████| 999/999 [00:01<00:00, 908.11Episode/s]


..........
    *     
    . X   
  X . . X 
    X .   
X   . . X 
  X . X   
    . . . 
  X X X   
        X 
  X X     
..........
 
 
Training maze with starting position at:  [ 8 , 2 ]
       -----      
[[0. 0. 3. 0. 0.]
 [0. 0. 0. 2. 0.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 0. 2. 0.]
 [0. 0. 0. 0. 0.]
 [0. 2. 2. 2. 0.]
 [0. 0. 1. 0. 2.]
 [0. 2. 2. 0. 0.]]
..........
    Y     
      X   
  X     X 
    X     
X       X 
  X   X   
          
  X X X   
    *   X 
  X X     
..........


100%|██████████| 999/999 [00:01<00:00, 550.00Episode/s]


..........
    *     
    . X   
  X . . X 
    X .   
X   . . X 
  X . X   
. . .     
. X X X   
. . .   X 
  X X     
..........
 
 
Training maze with starting position at:  [ 9 , 0 ]
       -----      
[[0. 0. 3. 0. 0.]
 [0. 0. 0. 2. 0.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 0. 2. 0.]
 [0. 0. 0. 0. 0.]
 [0. 2. 2. 2. 0.]
 [0. 0. 0. 0. 2.]
 [1. 2. 2. 0. 0.]]
..........
    Y     
      X   
  X     X 
    X     
X       X 
  X   X   
          
  X X X   
        X 
* X X     
..........


100%|██████████| 999/999 [00:01<00:00, 669.27Episode/s]


..........
    *     
    . X   
  X . . X 
    X .   
X   . . X 
  X . X   
. . .     
. X X X   
.       X 
. X X     
..........
 
 
Training maze with starting position at:  [ 9 , 4 ]
       -----      
[[0. 0. 3. 0. 0.]
 [0. 0. 0. 2. 0.]
 [0. 2. 0. 0. 2.]
 [0. 0. 2. 0. 0.]
 [2. 0. 0. 0. 2.]
 [0. 2. 0. 2. 0.]
 [0. 0. 0. 0. 0.]
 [0. 2. 2. 2. 0.]
 [0. 0. 0. 0. 2.]
 [0. 2. 2. 0. 1.]]
..........
    Y     
      X   
  X     X 
    X     
X       X 
  X   X   
          
  X X X   
        X 
  X X   * 
..........


100%|██████████| 999/999 [00:02<00:00, 471.39Episode/s]

..........
    *     
    . X   
  X . . X 
    X .   
X   . . X 
  X . X   
. . .     
. X X X   
. . . . X 
  X X . . 
..........
 
100.0 % convergence rate in this training cycle
Training complete --> Learner persisted 





(array([0], dtype=int64),
 array([2], dtype=int64),
 <__main__.Q_train_test at 0x213b922bac0>)