# Reinforcement Learning Workbook

The goal of this workbook is to get a broad sense of how Q-learning works.
Scenario: Moo Deng (the pygmy hippo) is trying to escape the zoo!
As an aspiring QSS major, she is trying to simulate her escape using Q-learning.

In [None]:
import numpy as np
import random
import time

from IPython.display import clear_output

In [None]:
class SimpleGridEnv:
    """Square grid world that the agent walks through, starting at (0, 0).

    Positions are ``(row, col)`` tuples. An episode step moves the agent one
    cell (or leaves it in place when the move would cross a wall) and reports
    whether the goal or the pitfall was reached.

    Workbook note: the spots marked ``your code`` are exercises. They raise
    ``NotImplementedError`` until you replace them, so this cell always parses
    and an unfinished exercise fails loudly at the exact spot left to do.
    """

    def __init__(self, size=5, goal=(4, 4), pitfall=(2, 2)):
        """Create the grid and place the agent on the start cell.

        Args:
            size: Number of rows and of columns.
            goal: ``(row, col)`` of the goal cell.
            pitfall: ``(row, col)`` of the pitfall cell.

        Raises:
            ValueError: If ``goal`` or ``pitfall`` is not a pair of indices
                inside the ``size`` x ``size`` grid.
        """
        self.size = size
        # Stored as tuples so the equality checks against agent_pos in step()
        # also work when a caller passes a list.
        self.goal = tuple(goal)
        self.pitfall = tuple(pitfall)

        # Fail early with a clear message instead of an IndexError in render()
        # (e.g. the default goal (4, 4) does not exist on a 3x3 grid).
        for label, cell in (("goal", self.goal), ("pitfall", self.pitfall)):
            if len(cell) != 2 or not all(0 <= index < size for index in cell):
                raise ValueError(
                    f"{label} {cell} is outside the {size}x{size} grid"
                )

        self.reset()

    def reset(self):
        """Move the agent back to the start cell (0, 0) and return that position."""
        self.agent_pos = (0, 0)
        return self.agent_pos

    def step(self, action):
        """Apply one action and return ``(position, reward, done)``.

        Args:
            action: 0=Up, 1=Down, 2=Left, 3=Right. A move that would leave
                the grid matches no branch, so the agent stays where it is.

        Returns:
            The agent's new ``(row, col)`` position, the reward for landing
            there, and whether the episode has ended.

        Raises:
            NotImplementedError: While an exercise blank is still unfilled.
        """
        row, col = self.agent_pos

        # Actions: 0=Up, 1=Down, 2=Left, 3=Right
        if action == 0 and row > 0:  # Up
            # your code | Think of what to add or minus to your row and column
            raise NotImplementedError("your code: update row/col for the Up move")

        elif action == 1 and row < self.size - 1:  # Down
            # your code
            raise NotImplementedError("your code: update row/col for the Down move")

        elif action == 2 and col > 0:  # Left
            # your code
            raise NotImplementedError("your code: update row/col for the Left move")

        elif action == 3 and col < self.size - 1:  # Right
            # your code
            raise NotImplementedError("your code: update row/col for the Right move")

        self.agent_pos = (row, col)

        # Determine rewards
        # This block of code returns the agent's position, a reward value, and termination
        if self.agent_pos == self.goal:
            return self.agent_pos, 1, True  # Reward of +1 for reaching the goal
        elif self.agent_pos == self.pitfall:
            # your code | return self.agent_pos, <your reward>, <your ending condition>
            raise NotImplementedError(
                "your code: choose the reward and ending condition for the pitfall"
            )
        else:
            # your code | return self.agent_pos, <your reward>, <your ending condition>
            raise NotImplementedError(
                "your code: choose the reward and ending condition for an ordinary step"
            )

    def render(self):
        """Print the grid: 'G' goal, 'X' pitfall, 'M' Moo Deng's position."""
        grid = np.full((self.size, self.size), ' ')
        grid[self.goal] = 'G'       # Goal
        grid[self.pitfall] = 'X'    # Pitfall
        # Drawn last so the agent stays visible when standing on G or X.
        grid[self.agent_pos] = 'M'  # Agent Moo Deng's position
        print(grid)

In [None]:
class QLearningAgent:
    """Tabular Q-learning agent for a square grid environment.

    The Q-table has one entry per (row, col, action) with four actions per
    cell, and starts out as all zeros.

    Workbook note: the explore branch of ``choose_action`` is an exercise. It
    raises ``NotImplementedError`` until you replace it, so this cell always
    parses and an unfinished exercise fails loudly.
    """

    def __init__(self, env, learning_rate=0.1, discount_factor=0.9, epsilon=0.1):
        """Store the hyperparameters and allocate the zero-initialised Q-table.

        Args:
            env: Environment exposing ``size``, ``reset()``, ``step(action)``
                and ``render()``.
            learning_rate: Weight of the new estimate in each Q-value update.
            discount_factor: Weight of the best next-state Q-value.
            epsilon: Probability of exploring instead of exploiting.
        """
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.q_table = np.zeros((env.size, env.size, 4))  # Q-table for each state-action pair

    def choose_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        Args:
            state: ``(row, col)`` position of the agent.

        Returns:
            The index of the chosen action.

        Raises:
            NotImplementedError: On the explore branch, until it is filled in.
        """
        if random.uniform(0, 1) < self.epsilon:
            ####
            # Your code for Explore: generate a random action by sampling the possible actions
            ####
            raise NotImplementedError("your code: return a randomly sampled action")
        else:
            row, col = state
            return np.argmax(self.q_table[row, col])  # Exploit: best action based on Q-table

    def update_q_value(self, state, action, reward, next_state):
        """Blend the observed outcome of one step into the Q-table.

        Args:
            state: ``(row, col)`` the action was taken from.
            action: Index of the action that was taken.
            reward: Reward received for the step.
            next_state: ``(row, col)`` reached after the step.
        """
        row, col = state
        next_row, next_col = next_state
        best_future_q = np.max(self.q_table[next_row, next_col])

        # Q-learning update rule | Bellman Equation
        old_estimate = self.q_table[row, col, action]
        target = reward + self.discount_factor * best_future_q
        self.q_table[row, col, action] = (
            (1 - self.learning_rate) * old_estimate + self.learning_rate * target
        )

    def train(self, episodes=50, step_delay=0.3, episode_delay=3):
        """Run training episodes, drawing the grid before every step.

        Args:
            episodes: Number of episodes to run.
            step_delay: Seconds to pause after each step; lower it for
                faster training.
            episode_delay: Seconds to pause after each episode summary.
        """
        for episode in range(episodes):
            state = self.env.reset()
            done = False
            steps = 0

            while not done:
                clear_output(wait=True)
                # Printed after the clear (not once before the loop) so the
                # header is not erased by the first clear_output call.
                print(f"\nEpisode {episode + 1}/{episodes}")
                self.env.render()
                action = self.choose_action(state)
                next_state, reward, done = self.env.step(action)
                self.update_q_value(state, action, reward, next_state)
                state = next_state
                steps += 1
                time.sleep(step_delay)
            print(f"Episode finished in {steps} steps")
            time.sleep(episode_delay)

# Interactive Demo
def interactive_q_learning_demo():
    """Prompt for the settings, train an agent, and print the final Q-table.

    The goal is placed in the bottom-right corner and the pitfall in the
    middle of the grid, so both always lie inside the grid whatever size is
    entered. For the suggested size of 5 this gives goal (4, 4) and pitfall
    (2, 2).

    Raises:
        ValueError: If an entry is not a valid number, or the grid size is
            smaller than 3.
    """
    print("Welcome to the Q-Learning Demo! Help Moo Deng Escape!")
    size = int(input("Enter grid size (e.g., 5): "))
    # Below 3 the start, pitfall and goal cells would overlap.
    if size < 3:
        raise ValueError("Grid size must be at least 3")
    learning_rate = float(input("Enter learning rate (α) (e.g., 0.1): "))
    discount_factor = float(input("Enter discount factor (γ) (e.g., 0.9): "))
    epsilon = float(input("Enter exploration rate (ε) (e.g., 0.1): "))
    episodes = int(input("Enter number of training episodes: "))

    # Derive both special cells from the size instead of relying on the fixed
    # defaults, which do not exist on grids smaller than 5x5.
    env = SimpleGridEnv(
        size=size,
        goal=(size - 1, size - 1),
        pitfall=(size // 2, size // 2),
    )
    agent = QLearningAgent(env, learning_rate=learning_rate, discount_factor=discount_factor, epsilon=epsilon)

    print("Training Moo Deng...")
    agent.train(episodes)
    print("Training complete! Here is the final Q-table:")
    print(agent.q_table)

#### Once you have filled in the blanks in the two code cells above (`SimpleGridEnv` and `QLearningAgent`), try running the demo with 3 training episodes
What do you observe?

In [None]:
# Run the interactive demo (prompts for the grid size, the hyperparameters,
# and the number of training episodes)
interactive_q_learning_demo()

#### Now try running it for many more episodes (shorten the `time.sleep` delays in `train` for faster training)
- What do you observe about the final Q-tables?
- Afterward, experiment with a) the rewards, b) the grid size, and c) the exploration rate (ε)
- What do you observe?