## Imports

In [1]:
import numpy as np
import random
import datetime
from TreasureMaze import TreasureMaze
from GameExperience import GameExperience
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer

## Maze

In [2]:
maze = np.array([
    [1., 1., 1., 1.],
    [0., 1., 0., 1.],
    [0., 1., 0., 1.],
    [1., 1., 1., 1.]
])

## Deep Q-Network Model

In [3]:
def build_model(maze):
    input_size = maze.size
    output_size = 4  # LEFT, UP, RIGHT, DOWN

    model = Sequential()
    model.add(InputLayer(input_shape=(input_size,)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(output_size, activation='linear'))
    model.compile(optimizer='adam', loss='mse')
    return model

## Display Helper

In [4]:
def show(qmaze):
    print(qmaze.draw_env())

## Completion Check Function

In [5]:
def completion_check(model, qmaze):
    for cell in qmaze.free_cells:
        qmaze.reset(cell)
        envstate = qmaze.observe()
        game_over = False
        steps = 0
        while not game_over and steps < 100:
            q = model.predict(envstate)[0]
            q = [q[a] if a in qmaze.valid_actions() else -np.inf for a in range(4)]
            action = int(np.argmax(q))
            envstate, _, game_status = qmaze.act(action)
            game_over = (game_status != 'not_over')
            steps += 1
        if game_status != 'win':
            return False
    return True

## Time Formatting Utility

In [6]:
def format_time(seconds):
    if seconds < 400:
        return "%.1f seconds" % seconds
    elif seconds < 4000:
        return "%.2f minutes" % (seconds / 60.0)
    else:
        return "%.2f hours" % (seconds / 3600.0)

## Q-Learning Training Function

In [7]:
def qtrain(model, maze, **opt):
    global epsilon
    n_epoch = opt.get('epochs', 15000)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)

    qmaze = TreasureMaze(maze)
    experience = GameExperience(model, max_memory=max_memory)

    start_time = datetime.datetime.now()
    win_history = []
    hsize = qmaze.maze.size // 2
    win_rate = 0.0

    for epoch in range(n_epoch):
        loss = 0.0
        n_episodes = 0

        agent_cell = random.choice(qmaze.free_cells)
        qmaze.reset(agent_cell)
        envstate = qmaze.observe()
        game_over = False

        while not game_over:
            valid_actions = qmaze.valid_actions()
            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                q = experience.predict(envstate)
                q = [q[a] if a in valid_actions else -np.inf for a in range(4)]
                action = int(np.argmax(q))

            prev_envstate = envstate
            envstate, reward, game_status = qmaze.act(action)
            game_over = (game_status != 'not_over')

            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)

            inputs, targets = experience.get_data(data_size=data_size)
            h = model.fit(inputs, targets, epochs=1, verbose=0)
            loss += h.history['loss'][0]

        n_episodes += 1
        win_history.append(1 if game_status == 'win' else 0)
        if len(win_history) > hsize:
            del win_history[0]
        win_rate = sum(win_history) / len(win_history)

        dt = datetime.datetime.now() - start_time
        t = format_time(dt.total_seconds())
        print(f"Epoch: {epoch:03d}/{n_epoch - 1} | Loss: {loss:.4f} | Episodes: {n_episodes} | Win count: {sum(win_history)} | Win rate: {win_rate:.3f} | time: {t}")

        if win_rate > 0.9:
            epsilon = 0.05

        if sum(win_history[-hsize:]) == hsize and completion_check(model, qmaze):
            print("Reached 100% win rate at epoch:", epoch)
            break

    total_time = datetime.datetime.now() - start_time
    print("Training completed in:", format_time(total_time.total_seconds()))
    print(f"n_epoch: {epoch}, max_mem: {max_memory}, data: {data_size}")
    return total_time.total_seconds()

## Train the Agent

In [8]:
epsilon = 1.0
model = build_model(maze)
qtrain(model, maze, epochs=1000, max_memory=8 * maze.size, data_size=32)

Epoch: 000/999 | Loss: 0.6997 | Episodes: 1 | Win count: 1 | Win rate: 1.000 | time: 11.8 seconds
Epoch: 001/999 | Loss: 0.9511 | Episodes: 1 | Win count: 1 | Win rate: 0.500 | time: 83.5 seconds
Epoch: 002/999 | Loss: 0.1846 | Episodes: 1 | Win count: 2 | Win rate: 0.667 | time: 118.1 seconds
Epoch: 003/999 | Loss: 0.0441 | Episodes: 1 | Win count: 3 | Win rate: 0.750 | time: 124.9 seconds
Epoch: 004/999 | Loss: 0.6479 | Episodes: 1 | Win count: 4 | Win rate: 0.800 | time: 208.4 seconds
Epoch: 005/999 | Loss: 0.3412 | Episodes: 1 | Win count: 5 | Win rate: 0.833 | time: 261.5 seconds
Epoch: 006/999 | Loss: 0.0448 | Episodes: 1 | Win count: 6 | Win rate: 0.857 | time: 268.4 seconds
Epoch: 007/999 | Loss: 0.0641 | Episodes: 1 | Win count: 7 | Win rate: 0.875 | time: 278.3 seconds
Epoch: 008/999 | Loss: 0.1515 | Episodes: 1 | Win count: 7 | Win rate: 0.875 | time: 290.5 seconds
Epoch: 009/999 | Loss: 0.1381 | Episodes: 1 | Win count: 8 | Win rate: 1.000 | time: 297.9 seconds
Reached 100%

299.052286

## Completion Check & Maze Display

In [2]:
qmaze = TreasureMaze(maze)
completion_check(model, qmaze)
show(qmaze)

NameError: name 'maze' is not defined