In [None]:
import numpy as np
import pygame
import time

In [None]:
# Unit moves on the grid as (dx, dy) offsets, keyed by name.
# NOTE(review): cells are drawn at (x*size, y*size) in screen coordinates, so
# with pygame's top-left origin 'UP' = (0, 1) appears as a move towards the
# bottom of the window.
directions = {
    'UP': (0, 1),
    'RIGHT': (1, 0),
    'DOWN': (0, -1),
    'LEFT': (-1, 0),
}
# Action index -> offset. Derived from the dict (insertion-ordered) so the two
# tables cannot drift apart.
directions_list = list(directions.values())


class MazeEnv():
    """Grid maze with a start at (0, 0) and a goal at (m-1, n-1).

    Positions are ``(x, y)`` tuples with ``0 <= x < m`` and ``0 <= y < n``.
    An episode ends with reward ``1`` when the agent reaches the goal and with
    reward ``-1`` when it tries to leave the grid or enter an obstacle.

    Args:
        obstacles: Container of blocked ``(x, y)`` cells.
        m: Grid width in cells.
        n: Grid height in cells.
        headless: When true, no pygame window is created and ``render`` does
            nothing. Useful for fast training or running without a display.

    Raises:
        ValueError: If the start or goal cell is listed as an obstacle.
    """

    CELL_SIZE = 50  # side length of one grid cell, in pixels

    def __init__(self, obstacles, m, n, headless=False):
        if (0, 0) in obstacles or (m - 1, n - 1) in obstacles:
            raise ValueError("Obstacle cannot be at (0,0) or (m-1,n-1)")
        self.obstacles = obstacles
        self.m = m
        self.n = n
        self.cur_pos = (0, 0)
        self.end_pos = (m - 1, n - 1)
        self.done = False
        self.headless = headless
        self.screen = None

        if not headless:
            pygame.init()
            self.screen = pygame.display.set_mode(
                (self.m * self.CELL_SIZE, self.n * self.CELL_SIZE)
            )
            pygame.display.set_caption("Maze")

        self.render()

    def action_space(self):
        """Return the number of discrete actions (indices into ``directions_list``)."""
        return len(directions)

    def observation_space(self):
        """Return the dimensionality of an observation (the ``(x, y)`` position)."""
        return 2

    def _is_free(self, pos):
        """Return True if ``pos`` lies inside the grid and is not an obstacle."""
        x, y = pos
        return 0 <= x < self.m and 0 <= y < self.n and pos not in self.obstacles

    def step(self, action):
        """Apply ``action`` and return ``(position, reward, done)``.

        An illegal move (off the grid or into an obstacle) ends the episode
        with reward ``-1`` and leaves the agent where it was, so the returned
        position is always a valid cell that can be used as a table index.

        Raises:
            ValueError: If called after the episode has finished.
        """
        if self.done:
            raise ValueError("Episode is done")
        dx, dy = directions_list[action]
        target = (self.cur_pos[0] + dx, self.cur_pos[1] + dy)
        if not self._is_free(target):
            # Crash: terminate, but never store an off-grid/blocked position.
            self.done = True
            reward = -1
        else:
            self.cur_pos = target
            if target == self.end_pos:
                self.done = True
                reward = 1
            else:
                reward = 0
        self.render()
        return self.cur_pos, reward, self.done

    def reset(self):
        """Put the agent back on the start cell and return that position."""
        self.cur_pos = (0, 0)
        self.done = False
        self.render()
        return self.cur_pos

    def render(self):
        """Redraw the maze: obstacles black, start green, goal red, agent blue."""
        if self.screen is None:
            # Headless mode: nothing to draw on.
            return
        # Service the window-system event queue so the OS does not flag the
        # window as unresponsive during long training/replay loops.
        pygame.event.pump()

        size = self.CELL_SIZE
        self.screen.fill((255, 255, 255))
        for i in range(self.m):
            for j in range(self.n):
                if (i, j) in self.obstacles:
                    pygame.draw.rect(self.screen, (0, 0, 0), (i * size, j * size, size, size))
        pygame.draw.rect(self.screen, (0, 255, 0), (0, 0, size, size))
        pygame.draw.rect(
            self.screen,
            (255, 0, 0),
            ((self.m - 1) * size, (self.n - 1) * size, size, size),
        )
        # The agent is drawn last so it stays visible on the start/goal cells.
        pygame.draw.rect(
            self.screen,
            (0, 0, 255),
            (self.cur_pos[0] * size, self.cur_pos[1] * size, size, size),
        )
        pygame.display.update()

    



In [None]:
class QTable():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Args:
        env: Environment exposing ``m``, ``n``, ``action_space()``,
            ``reset() -> (x, y)`` and ``step(action) -> ((x, y), reward, done)``.
        eps: Probability of taking a uniformly random action.
        alpha: Learning rate.
        gamma: Discount factor applied to the next state's value.

    Attributes:
        q_table: Array of shape ``(env.m, env.n, env.action_space())`` holding
            the action-value estimates, initialised to zero.
    """

    def __init__(self, env, eps, alpha, gamma):
        self.env = env
        self.q_table = np.zeros((env.m, env.n, env.action_space()))
        self.eps = eps
        self.alpha = alpha
        self.gamma = gamma

    def train(self, num_episodes):
        """Run ``num_episodes`` episodes, updating ``q_table`` after every step.

        Update rule: ``Q[s, a] += alpha * (target - Q[s, a])`` where
        ``target = reward`` on a terminal transition and
        ``target = reward + gamma * max_a' Q[s', a']`` otherwise.
        """
        n_actions = self.env.action_space()
        for _ in range(num_episodes):
            state = self.env.reset()
            done = False
            while not done:
                x, y = state
                if np.random.random() < self.eps:
                    action = np.random.randint(n_actions)
                else:
                    action = np.argmax(self.q_table[x, y])

                new_state, reward, done = self.env.step(action)

                if done:
                    # No future return after a terminal transition; this also
                    # avoids indexing the table with a position that may lie
                    # outside the grid.
                    target = reward
                else:
                    target = reward + self.gamma * np.max(
                        self.q_table[new_state[0], new_state[1]]
                    )
                self.q_table[x, y, action] += self.alpha * (
                    target - self.q_table[x, y, action]
                )
                state = new_state
        
                
                

    



In [None]:
# Blocked cells of the 10x10 maze, as (x, y) tuples.
# NOTE(review): this list used to contain a stray `(9.8)` entry, which is the
# float 9.8 rather than a coordinate and therefore never matched any cell.
# It is dropped instead of being rewritten as (9, 8): with (8, 9) already
# blocked, (9, 8) is the only remaining neighbour of the goal (9, 9), so
# blocking it would make the maze unsolvable.
obstacles = [
    (0, 1), (1, 1), (2, 1), (4, 0), (2, 2), (2, 3),
    (2, 4), (2, 5), (4, 2), (4, 3), (4, 4), (4, 6),
    (5, 4), (6, 4), (7, 5), (3, 7), (2, 7), (1, 7),
    (9, 7), (8, 6), (5, 6), (6, 7), (7, 8),
    (8, 9), (0, 7), (1, 5), (0, 5),
]

In [None]:
# Text preview of the 10x10 grid: one output line per first coordinate, with
# free cells shown as "(i,j) " and obstacle cells as six blanks.
for row in range(10):
    cells = [
        "      " if (row, col) in obstacles else f"({row},{col}) "
        for col in range(10)
    ]
    # Each line starts with the newline (as before), so the output begins with
    # an empty line and has no trailing newline.
    print("\n" + "".join(cells), end="")

In [None]:
# 10x10 maze and a Q-learning agent: 10% exploration, learning rate 0.1,
# discount factor 0.9.
env = MazeEnv(obstacles=obstacles, m=10, n=10)
q_table = QTable(env=env, eps=0.1, alpha=0.1, gamma=0.9)

In [None]:
# Train the agent for 3000 episodes.
q_table.train(num_episodes=3000)

In [None]:
# Replay the learned policy greedily (no exploration), one move per second.
env.reset()
done = False
# Both the environment and the greedy policy are deterministic, so a run that
# has not terminated after m*n moves must have revisited a cell and would loop
# forever; stop instead of hanging.
for _step in range(env.m * env.n):
    action = np.argmax(q_table.q_table[env.cur_pos])
    _, _, done = env.step(action)
    time.sleep(1)
    if done:
        break
else:
    print("Greedy policy did not reach a terminal state (it is cycling); stopping replay.")