# Zadanie 5

Celem ćwiczenia jest implementacja algorytmu Q-learning.

Następnie należy stworzyć agenta rozwiązującego problem [Taxi](https://gymnasium.farama.org/environments/toy_text/taxi/). Problem dostępny jest w pakiecie **gym**.

Punktacja (max 7 pkt):
- Implementacja algorytmu Q-learning. [3 pkt]
- Eksperymenty dla różnych wartości hiperparametrów [2 pkt]
- Jakość kodu [1 pkt]
- Wnioski [1 pkt]


In [87]:
import numpy as np
import random as rd

In [88]:
class QLearningSolver:
    """Class containing the Q-learning algorithm that might be used for different discrete environments."""

    def __init__(
        self,
        observation_space: int,
        action_space: int,
        learning_rate: float = 0.1,
        gamma: float = 0.9,
        epsilon: float = 0.1,
    ):
        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = np.zeros((observation_space, action_space))

    def __call__(self, state: np.ndarray, action: np.ndarray) -> np.ndarray:
        """Return Q-value of given state and action."""
        return self.q_table[state, action]

    def update(self, state: np.ndarray, action: np.ndarray, reward: float, next_state: int) -> None:
        """Update Q-value of given state and action."""
        current_q: np.ndarray = self.q_table[state, action]
        max_next_q: float = np.max(self.q_table[next_state])
        new_q: np.ndarray = current_q + self.learning_rate * (reward + self.gamma * max_next_q - current_q)
        self.q_table[state, action] = new_q

    def get_best_action(self, state: np.ndarray) -> np.ndarray:
        """Return action that maximizes Q-value for a given state."""
        # print(self.q_table[state])
         
        return np.argmax(self.q_table[state])

    def __repr__(self):
        """Elegant representation of Q-learning solver."""
        return f"QLearningSolver(observation_space={self.observation_space}, action_space={self.action_space}, learning_rate={self.learning_rate}, gamma={self.gamma}, epsilon={self.epsilon})"

    def __str__(self):
        return self.__repr__()
    

def tester_3000():
    pass

In [91]:
import gymnasium as gym
import numpy as np

# Inicjalizacja środowiska Taxi
env = gym.make('Taxi-v3')
# env.reset(seed=42)
observation_space = env.observation_space.n
action_space = env.action_space.n

# Inicjalizacja Q-learning solver
#0.1 ; 0.9 ; 0.1
q_solver = QLearningSolver(observation_space, action_space, 1)

# Trening agenta
num_episodes = 1000
for episode in range(num_episodes):
    state = env.reset()[0]
    done = False
    total_reward = 0
    
    while not done:
        # Wybór akcji na podstawie epsilon-greedy
        if q_solver.epsilon < rd.random():
            action = env.action_space.sample()
        else:
            action = q_solver.get_best_action(state)
        
        # Wykonanie akcji w środowisku
        next_state, reward, done= env.step(action)[0:3]
        
        # Aktualizacja Q-wartości na podstawie otrzymanego wyniku
        q_solver.update(state, action, reward, next_state)
        
        total_reward += reward
        state = next_state
    
    # Wyświetlenie wyników po zakończeniu epizodu
    if episode % 100 == 0:
        print(f"Episode: {episode}, Total Reward: {total_reward}")

# Testowanie agenta po treningu
test_episodes = 5
for episode in range(test_episodes):
    state = env.reset()[0]
    done = False
    total_reward = 0
    
    while not done:
        action = q_solver.get_best_action(state)
        next_state, reward, done= env.step(action)[0:3]
        total_reward += reward
        state = next_state
    
    print(f"Test Episode: {episode}, Total Reward: {total_reward}")


Episode: 0, Total Reward: -4177
Episode: 100, Total Reward: -380
Episode: 200, Total Reward: -480
Episode: 300, Total Reward: -209
Episode: 400, Total Reward: -743
Episode: 500, Total Reward: -1013
Episode: 600, Total Reward: -628
Episode: 700, Total Reward: -384
Episode: 800, Total Reward: -725
Episode: 900, Total Reward: -1237
Test Episode: 0, Total Reward: 8
Test Episode: 1, Total Reward: 8
Test Episode: 2, Total Reward: 4
Test Episode: 3, Total Reward: 10
Test Episode: 4, Total Reward: 10


# Eksperymenty

# Wnioski