In [1]:
import numpy as np

In [2]:
class FrozenLakeEnv:
    """Deterministic Frozen-Lake-style grid world.

    The map is a rectangular 2-D grid of single-character cells. ``'H'``
    marks a hole, ``'G'`` the goal and ``'S'`` the start cell; every other
    symbol is ordinary walkable ice. Positions are ``(row, col)`` tuples.

    Attributes:
        grid: The map as a 2-D NumPy array.
        n_rows, n_cols: Grid dimensions.
        n_states: Total number of cells (``n_rows * n_cols``).
        size: Number of rows; kept for callers that assume a square grid.
        actions: Action names, in the order used for Q-table columns.
        holes: List of ``(row, col)`` hole positions in row-major order.
        goal: ``(row, col)`` of the first ``'G'`` cell, or ``None``.
        start: ``(row, col)`` of the first ``'S'`` cell, or ``(0, 0)`` when
            the grid has no ``'S'``.
        move: Mapping from action name to ``(d_row, d_col)`` offset.
    """

    def __init__(self, grid):
        """Build the environment from a rectangular 2-D sequence of symbols.

        Raises:
            ValueError: If ``grid`` is not two-dimensional.
        """
        self.grid = np.array(grid)
        if self.grid.ndim != 2:
            raise ValueError("grid must be a rectangular 2-D sequence of cells")

        # Track both dimensions so non-square maps are handled correctly.
        self.n_rows, self.n_cols = self.grid.shape
        self.n_states = self.n_rows * self.n_cols
        self.size = self.n_rows

        self.actions = ['up', 'down', 'left', 'right']
        self.holes = self.grid_to_holes()
        self.goal = self.grid_to_goal()
        # Honour an explicit 'S' marker; fall back to the top-left corner.
        self.start = next(self._iter_cells('S'), (0, 0))

        self.move = {
            'up': (-1, 0),
            'down': (1, 0),
            'left': (0, -1),
            'right': (0, 1)
        }

    def _iter_cells(self, symbol):
        """Yield ``(row, col)`` of every cell equal to ``symbol``, row-major."""
        for cell in np.ndindex(*self.grid.shape):
            if self.grid[cell] == symbol:
                yield cell

    def grid_to_holes(self):
        """Return the list of ``(row, col)`` positions of all ``'H'`` cells."""
        return list(self._iter_cells('H'))

    def grid_to_goal(self):
        """Return the position of the first ``'G'`` cell, or ``None``."""
        return next(self._iter_cells('G'), None)

    def state_to_index(self, state):
        """Flatten a ``(row, col)`` position into a row-major integer index."""
        # Multiply by the column count (not the row count) so that every cell
        # of a rectangular grid maps to a distinct index.
        return state[0] * self.n_cols + state[1]

    def get_new_position(self, position, action):
        """Return the position reached by taking ``action`` from ``position``.

        A move that would leave the grid is a no-op: the original position
        is returned unchanged.
        """
        d_row, d_col = self.move[action]
        row, col = position[0] + d_row, position[1] + d_col

        if 0 <= row < self.n_rows and 0 <= col < self.n_cols:
            return (row, col)
        return position

    def is_hole(self, position):
        """Return whether ``position`` is a hole cell."""
        return position in self.holes

    def is_goal(self, position):
        """Return whether ``position`` is the goal cell."""
        return position == self.goal

    def reset(self):
        """Return the starting position for a new episode."""
        return self.start

In [3]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Attributes:
        env: The environment; must provide ``actions``, ``size`` and
            ``state_to_index``.
        epsilon: Probability of taking a uniformly random action.
        learning_rate: Step size applied to each temporal-difference update.
        discount_factor: Weight given to the bootstrapped next-state value.
        q_table: Array of shape ``(number of states, number of actions)``,
            initialised to zero.
    """

    def __init__(self, environment, epsilon=0.1, learning_rate=0.1, discount_factor=0.99):
        self.env = environment
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor

        # Prefer a state count advertised by the environment so rectangular
        # grids get one row per cell; otherwise assume a square size x size grid.
        n_states = getattr(self.env, 'n_states', None)
        if n_states is None:
            n_states = self.env.size * self.env.size
        self.q_table = np.zeros((n_states, len(self.env.actions)))

    def choose_action(self, state):
        """Pick an action name for ``state`` using epsilon-greedy selection."""
        state_idx = self.env.state_to_index(state)
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.env.actions)

        # Break ties between equally valued actions at random; a plain argmax
        # would always return the first maximal action.
        q_values = self.q_table[state_idx]
        best_indices = np.where(q_values == np.max(q_values))[0]
        return self.env.actions[np.random.choice(best_indices)]

    def update_q_table(self, state, action, reward, new_state):
        """Apply one Q-learning update for ``(state, action, reward, new_state)``."""
        state_idx = self.env.state_to_index(state)
        new_state_idx = self.env.state_to_index(new_state)
        action_idx = self.env.actions.index(action)

        # TD target: immediate reward plus discounted best next-state value.
        td_target = reward + self.discount_factor * np.max(self.q_table[new_state_idx])
        td_error = td_target - self.q_table[state_idx, action_idx]
        self.q_table[state_idx, action_idx] += self.learning_rate * td_error

In [4]:
def _pick_moving_action(agent, state, max_attempts=100):
    """Choose an action that actually changes the agent's position.

    The agent's own policy is sampled up to ``max_attempts`` times, rejecting
    actions that leave ``state`` unchanged. If every attempt is rejected, one
    of the actions that does move the agent is drawn uniformly at random.

    Returns:
        ``(action, new_state)``, or ``(None, state)`` when no action moves
        the agent from ``state``.
    """
    env = agent.env
    for _ in range(max_attempts):
        action = agent.choose_action(state)
        new_state = env.get_new_position(state, action)
        if new_state != state:
            return action, new_state

    # The policy keeps proposing blocked moves (for example a purely greedy
    # policy whose best-valued action is blocked); bounding the retries
    # guarantees the training loop cannot spin forever.
    candidates = []
    for action in env.actions:
        new_state = env.get_new_position(state, action)
        if new_state != state:
            candidates.append((action, new_state))
    if not candidates:
        return None, state
    return candidates[np.random.randint(len(candidates))]


def train(agent, episodes=10000, max_steps=100):
    """Train ``agent`` in its environment with episodic Q-learning.

    Each episode starts from ``agent.env.reset()`` and runs for at most
    ``max_steps`` moves. Reaching the goal gives reward 1, entering a hole
    gives reward -1, and any other move gives 0; both goal and hole end the
    episode. Actions that would not move the agent are never executed.
    The total reward of every 1000th episode is printed.

    Args:
        agent: Object exposing ``env``, ``choose_action`` and ``update_q_table``.
        episodes: Number of training episodes.
        max_steps: Maximum number of moves per episode.
    """
    env = agent.env
    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0

        for _ in range(max_steps):
            action, new_state = _pick_moving_action(agent, state)
            if action is None:
                # No action moves the agent from here; nothing more to learn
                # in this episode.
                break

            if env.is_goal(new_state):
                reward, done = 1, True
            elif env.is_hole(new_state):
                reward, done = -1, True
            else:
                reward = 0

            agent.update_q_table(state, action, reward, new_state)

            state = new_state
            total_reward += reward

            if done:
                break

        if (episode + 1) % 1000 == 0:
            print(f"Episode {episode + 1}: Total reward = {total_reward}")


In [5]:
# Seed NumPy's global RNG before training: the agent's exploration and
# tie-breaking draw from np.random, so without a seed every run of this cell
# learns a different Q-table and prints different rewards.
SEED = 42
np.random.seed(SEED)

# 8x8 map. 'H' = hole, 'G' = goal; the 'S' in the top-left corner marks the
# start cell (0, 0).
grid = [
    ['S', 'F', 'F', 'F', 'F', 'F', 'F', 'F'],
    ['F', 'H', 'H', 'H', 'H', 'F', 'H', 'H'],
    ['F', 'H', 'H', 'H', 'F', 'F', 'F', 'H'],
    ['F', 'H', 'H', 'H', 'F', 'F', 'F', 'H'],
    ['F', 'H', 'H', 'H', 'F', 'F', 'H', 'F'],
    ['F', 'H', 'H', 'H', 'F', 'H', 'F', 'F'],
    ['F', 'H', 'H', 'H', 'F', 'F', 'F', 'F'],
    ['H', 'F', 'F', 'F', 'H', 'F', 'F', 'G']
]
env = FrozenLakeEnv(grid)
agent = QLearningAgent(environment=env, epsilon=0.1, learning_rate=0.1, discount_factor=0.99)
train(agent)

Episode 1000: Total reward = -1
Episode 2000: Total reward = 1
Episode 3000: Total reward = 1
Episode 4000: Total reward = 1
Episode 5000: Total reward = 1
Episode 6000: Total reward = 1
Episode 7000: Total reward = 1
Episode 8000: Total reward = -1
Episode 9000: Total reward = 1
Episode 10000: Total reward = 1


In [6]:
# Dump the learned Q-table. Each row is one grid cell in row-major order (see
# FrozenLakeEnv.state_to_index); the columns follow env.actions:
# up, down, left, right.
print(agent.q_table)

[[ 0.          0.84294319  0.          0.86005835]
 [ 0.         -1.          0.85145777  0.86874581]
 [ 0.         -1.          0.86005835  0.87752102]
 [ 0.         -1.          0.86874581  0.88638487]
 [ 0.         -1.          0.87752102  0.89533825]
 [ 0.          0.90438208  0.88638487  0.88638487]
 [ 0.         -0.9282102   0.89533825  0.04527402]
 [ 0.         -0.79410887  0.30294332  0.        ]
 [ 0.85145777  0.20116837  0.         -0.99990595]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.89533825  0.91351725 -1.         -1.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.52894095  0.          0.         -0.99954322]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.        ]
 [ 0.          0.          0.  

In [7]:
def evaluate(agent, episodes=3, max_steps=100):
    """Roll out the greedy policy from the Q-table and print every step.

    For each episode the agent starts from ``agent.env.reset()`` and always
    takes the action with the highest Q-value (first one on ties). The
    episode ends when the goal or a hole is reached; if neither happens
    within ``max_steps`` moves, "Something wrong!" is printed instead.

    Args:
        agent: Object exposing ``env`` and ``q_table``.
        episodes: Number of rollouts to print.
        max_steps: Maximum number of moves per rollout.
    """
    env = agent.env
    for _ in range(episodes):
        state = env.reset()
        for steps in range(max_steps):
            state_idx = env.state_to_index(state)
            action = env.actions[np.argmax(agent.q_table[state_idx])]
            state = env.get_new_position(state, action)
            print(f"Step {steps}: {state} -> Action: {action}")
            if env.is_goal(state):
                print("Goal reached!\n")
                break
            if env.is_hole(state):
                print("Fell into a hole!\n")
                break
        else:
            # Runs only when the step budget is exhausted without reaching a
            # terminal cell, so a rollout that ends exactly on its last
            # allowed step is not also reported as a failure.
            print("Something wrong!\n")

In [8]:
# Replay the greedy policy from the start cell (three rollouts by default) and
# print each step taken.
evaluate(agent)

Step 0: (0, 1) -> Action: right
Step 1: (0, 2) -> Action: right
Step 2: (0, 3) -> Action: right
Step 3: (0, 4) -> Action: right
Step 4: (0, 5) -> Action: right
Step 5: (1, 5) -> Action: down
Step 6: (2, 5) -> Action: down
Step 7: (3, 5) -> Action: down
Step 8: (3, 4) -> Action: left
Step 9: (4, 4) -> Action: down
Step 10: (5, 4) -> Action: down
Step 11: (6, 4) -> Action: down
Step 12: (6, 5) -> Action: right
Step 13: (7, 5) -> Action: down
Step 14: (7, 6) -> Action: right
Step 15: (7, 7) -> Action: right
Goal reached!

Step 0: (0, 1) -> Action: right
Step 1: (0, 2) -> Action: right
Step 2: (0, 3) -> Action: right
Step 3: (0, 4) -> Action: right
Step 4: (0, 5) -> Action: right
Step 5: (1, 5) -> Action: down
Step 6: (2, 5) -> Action: down
Step 7: (3, 5) -> Action: down
Step 8: (3, 4) -> Action: left
Step 9: (4, 4) -> Action: down
Step 10: (5, 4) -> Action: down
Step 11: (6, 4) -> Action: down
Step 12: (6, 5) -> Action: right
Step 13: (7, 5) -> Action: down
Step 14: (7, 6) -> Action: righ