<a href="https://colab.research.google.com/github/ushio2580/Gradient2/blob/main/6x6Maze.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Import required libraries


In [2]:
# Importar bibliotecas necesarias
import gym
from gym import spaces
import numpy as np
import tensorflow as tf
from collections import deque
import random
import matplotlib.pyplot as plt
import argparse

# Definir el entorno del laberinto
class MazeEnv(gym.Env):
    def __init__(self):
        super(MazeEnv, self).__init__()
        # Laberinto 6x6: 0=espacio libre, 1=pared, 2=inicio, 3=fin
        self.maze = np.array([
            [2, 0, 0, 0, 0, 0],
            [1, 1, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 0],
            [0, 1, 1, 1, 0, 1],
            [0, 0, 0, 0, 0, 0],
            [0, 1, 0, 1, 1, 3]
        ])
        self.start_pos = (0, 0)  # Inicio en (0,0)
        self.end_pos = (5, 5)    # Fin en (5,5)
        self.agent_pos = self.start_pos
        self.max_steps = 100     # Límite de pasos por episodio
        self.current_step = 0
        self.action_space = spaces.Discrete(4)  # Arriba, derecha, abajo, izquierda
        self.observation_space = spaces.Tuple((spaces.Discrete(6), spaces.Discrete(6)))

    def reset(self):
        self.agent_pos = self.start_pos
        self.current_step = 0
        return self.agent_pos

    def step(self, action):
        self.current_step += 1
        row, col = self.agent_pos
        # Movimientos posibles
        if action == 0 and row > 0:      # Arriba
            new_pos = (row - 1, col)
        elif action == 1 and col < 5:    # Derecha
            new_pos = (row, col + 1)
        elif action == 2 and row < 5:    # Abajo
            new_pos = (row + 1, col)
        elif action == 3 and col > 0:    # Izquierda
            new_pos = (row, col - 1)
        else:
            new_pos = self.agent_pos  # Movimiento inválido

        # Verificar si el movimiento es válido (no pared)
        if self.maze[new_pos] != 1:
            self.agent_pos = new_pos

        # Calcular recompensa y estado de finalización
        if self.agent_pos == self.end_pos:
            reward = 1.0
            done = True
        elif self.current_step >= self.max_steps:
            reward = -0.1
            done = True
        else:
            reward = -0.1
            done = False
        return self.agent_pos, reward, done, {}

# Definir el agente DQN
class DQNAgent:
    def __init__(self, state_size=2, action_size=4):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # Memoria de repetición
        self.gamma = 0.95                 # Factor de descuento
        self.epsilon = 1.0                # Tasa de exploración inicial
        self.epsilon_min = 0.01           # Tasa mínima de exploración
        self.epsilon_decay = 0.995        # Decaimiento de epsilon
        self.learning_rate = 0.001        # Tasa de aprendizaje
        self.model = self._build_model()  # Red principal
        self.target_model = self._build_model()  # Red objetivo
        self.update_target_model()

    def _build_model(self):
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate), loss='mse')
        return model

    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)  # Exploración
        state = np.array(state).reshape(1, -1)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])  # Explotación

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states = np.array([t[0] for t in minibatch])
        next_states = np.array([t[3] for t in minibatch])
        targets = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)
        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            target = reward if done else reward + self.gamma * np.max(next_q_values[i])
            targets[i][action] = target
        self.model.fit(states, targets, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, path):
        self.model.save_weights(path)

    def load(self, path):
        self.model.load_weights(path)
        self.target_model.load_weights(path)

# Función de entrenamiento
def train_dqn(episodes=200, batch_size=32, model_path='dqn_model.h5'):
    env = MazeEnv()
    agent = DQNAgent()
    rewards = []

    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)
        rewards.append(total_reward)
        if episode % 10 == 0:
            agent.update_target_model()
        print(f"Episodio {episode+1}/{episodes}, Recompensa: {total_reward:.2f}, Epsilon: {agent.epsilon:.3f}")

    agent.save(model_path)
    plt.plot(rewards)
    plt.xlabel('Episodio')
    plt.ylabel('Recompensa Total')
    plt.title('Progreso del Entrenamiento')
    plt.show()
    return rewards

# Función de evaluación con visualización
def evaluate_dqn(model_path='dqn_model.h5'):
    env = MazeEnv()
    agent = DQNAgent()
    agent.load(model_path)
    agent.epsilon = 0  # Sin exploración en evaluación

    for episode in range(10):
        state = env.reset()
        path = [state]
        total_reward = 0
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            path.append(next_state)
            total_reward += reward
            state = next_state

        # Visualizar el camino
        fig, ax = plt.subplots()
        ax.imshow(env.maze, cmap='gray')
        for pos in path:
            ax.plot(pos[1], pos[0], 'bo')  # Puntos azules para el camino
        ax.plot(path[0][1], path[0][0], 'go')  # Inicio en verde
        ax.plot(path[-1][1], path[-1][0], 'ro')  # Fin en rojo
        plt.title(f"Episodio {episode+1}")
        plt.show()

        print(f"\nEpisodio {episode+1}:")
        print(f"Camino: {path}")
        print(f"Longitud del camino: {len(path)}")
        print(f"Recompensa Total: {total_reward:.2f}")

# Ejecutar entrenamiento y evaluación
if __name__ == "__main__":
    # Entrenar el modelo
    print("Entrenando el agente...")
    train_dqn(episodes=200, batch_size=32, model_path='dqn_model.weights.h5')

    # Evaluar el modelo
    print("\nEvaluando el agente...")
    evaluate_dqn(model_path='dqn_model.weights.h5')

Entrenando el agente...
Episodio 1/200, Recompensa: -10.00, Epsilon: 0.711
Episodio 2/200, Recompensa: -10.00, Epsilon: 0.431
Episodio 3/200, Recompensa: -10.00, Epsilon: 0.261
Episodio 4/200, Recompensa: -10.00, Epsilon: 0.158
Episodio 5/200, Recompensa: -10.00, Epsilon: 0.096
Episodio 6/200, Recompensa: -10.00, Epsilon: 0.058
Episodio 7/200, Recompensa: -10.00, Epsilon: 0.035
Episodio 8/200, Recompensa: -10.00, Epsilon: 0.021
Episodio 9/200, Recompensa: -10.00, Epsilon: 0.013
Episodio 10/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 11/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 12/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 13/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 14/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 15/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 16/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 17/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 18/200, Recompensa: -10.00, Epsilon: 0.010
Episodio 19/200, Recompensa: -10.00, Epsilon: 0.0

KeyboardInterrupt: 



```
# Cell 1: Imports and Class Definitions
```



In [None]:
# Importar bibliotecas necesarias
import gym
from gym import spaces
import numpy as np
import tensorflow as tf
from collections import deque
import random
import matplotlib.pyplot as plt
import argparse

# Definir el entorno del laberinto
class MazeEnv(gym.Env):
    def __init__(self):
        super(MazeEnv, self).__init__()
        # Laberinto 6x6: 0=espacio libre, 1=pared, 2=inicio, 3=fin
        self.maze = np.array([
            [2, 0, 0, 0, 0, 0],
            [1, 1, 0, 1, 1, 0],
            [0, 0, 0, 0, 0, 0],
            [0, 1, 1, 1, 0, 1],
            [0, 0, 0, 0, 0, 0],
            [0, 1, 0, 1, 1, 3]
        ])
        self.start_pos = (0, 0)  # Posición inicial
        self.end_pos = (5, 5)    # Posición final
        self.agent_pos = self.start_pos
        self.max_steps = 100     # Máximo de pasos por episodio
        self.current_step = 0
        self.action_space = spaces.Discrete(4)  # Acciones: 0=arriba, 1=derecha, 2=abajo, 3=izquierda
        self.observation_space = spaces.Tuple((spaces.Discrete(6), spaces.Discrete(6)))

    def reset(self):
        """Reinicia el entorno al estado inicial."""
        self.agent_pos = self.start_pos
        self.current_step = 0
        return self.agent_pos

    def step(self, action):
        """Realiza un paso en el entorno basado en la acción."""
        self.current_step += 1
        row, col = self.agent_pos
        # Definir movimientos posibles
        if action == 0 and row > 0:      # Arriba
            new_pos = (row - 1, col)
        elif action == 1 and col < 5:    # Derecha
            new_pos = (row, col + 1)
        elif action == 2 and row < 5:    # Abajo
            new_pos = (row + 1, col)
        elif action == 3 and col > 0:    # Izquierda
            new_pos = (row, col - 1)
        else:
            new_pos = self.agent_pos  # Movimiento inválido, no cambia posición

        # Verificar si el movimiento es válido (no es pared)
        if self.maze[new_pos] != 1:
            self.agent_pos = new_pos

        # Determinar recompensa y estado de finalización
        if self.agent_pos == self.end_pos:
            reward = 1.0
            done = True
        elif self.current_step >= self.max_steps:
            reward = -0.1
            done = True
        else:
            reward = -0.1
            done = False
        return self.agent_pos, reward, done, {}

# Definir el agente DQN
class DQNAgent:
    def __init__(self, state_size=2, action_size=4):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # Memoria de repetición
        self.gamma = 0.95                 # Factor de descuento
        self.epsilon = 1.0                # Tasa de exploración inicial
        self.epsilon_min = 0.01           # Tasa mínima de exploración
        self.epsilon_decay = 0.995        # Decaimiento de epsilon
        self.learning_rate = 0.001        # Tasa de aprendizaje
        self.model = self._build_model()  # Modelo principal
        self.target_model = self._build_model()  # Modelo objetivo
        self.update_target_model()

    def _build_model(self):
        """Construye la red neuronal para el agente."""
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate), loss='mse')
        return model

    def update_target_model(self):
        """Actualiza los pesos del modelo objetivo."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Almacena una experiencia en la memoria."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Elige una acción basada en exploración o explotación."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)  # Exploración
        state = np.array(state).reshape(1, -1)
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])  # Explotación

    def replay(self, batch_size):
        """Entrena el modelo con una muestra de la memoria."""
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        states = np.array([t[0] for t in minibatch])
        next_states = np.array([t[3] for t in minibatch])
        targets = self.model.predict(states, verbose=0)
        next_q_values = self.target_model.predict(next_states, verbose=0)
        for i, (state, action, reward, next_state, done) in enumerate(minibatch):
            target = reward if done else reward + self.gamma * np.max(next_q_values[i])
            targets[i][action] = target
        self.model.fit(states, targets, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save(self, path):
        """Guarda los pesos del modelo."""
        self.model.save_weights(path)

    def load(self, path):
        """Carga los pesos del modelo."""
        self.model.load_weights(path)
        self.target_model.load_weights(path)