In [None]:
import numpy as np
import torch
import torch.nn as nn
import random

In [None]:
class PegSolitaire:
    """English (cross-shaped) peg solitaire on a flattened 7x7 grid.

    Cell encoding: -1 = not part of the board, 0 = empty hole, 1 = peg.
    An action is a row index into ``self.moves``; every row holds the flat
    cell indices ``[from, middle, to]`` of one jump.
    """

    def __init__(self):
        # Standard start position: every hole filled except the centre (index 24).
        self.board_layout = np.array([
            [-1, -1,  1,  1,  1, -1, -1],
            [-1, -1,  1,  1,  1, -1, -1],
            [ 1,  1,  1,  1,  1,  1,  1],
            [ 1,  1,  1,  0,  1,  1,  1],
            [ 1,  1,  1,  1,  1,  1,  1],
            [-1, -1,  1,  1,  1, -1, -1],
            [-1, -1,  1,  1,  1, -1, -1]
        ]).flatten()

        # Flat indices of the cells that belong to the board.
        self.actions = np.where(self.board_layout != -1)[0]
        self.moves = self.get_moves()
        self.reset(0, 1)

    def get_moves(self):
        """Enumerate every geometrically possible jump as ``[from, middle, to]``.

        Returns:
            ``int8`` array of shape ``(n_moves, 3)`` with flat cell indices.
        """
        moves = []
        directions = [(-2, 0), (2, 0), (0, -2), (0, 2)]
        for f in range(49):
            if self.board_layout[f] == -1:
                continue
            x, y = divmod(f, 7)
            for dx, dy in directions:
                tx, ty = x + dx, y + dy
                if 0 <= tx < 7 and 0 <= ty < 7:
                    t = tx * 7 + ty
                    if self.board_layout[t] != -1:
                        # The jumped-over cell lies halfway between f and t.
                        m = (x + dx // 2) * 7 + y + dy // 2
                        moves.append([f, m, t])
        return np.array(moves, dtype=np.int8)

    def get_mask(self):
        """Return a boolean array marking which rows of ``self.moves`` are legal now.

        A jump is legal when ``from`` and ``middle`` hold pegs and ``to`` is empty.
        """
        b = self.board
        return (b[self.moves[:, 0]] == 1) & (b[self.moves[:, 1]] == 1) & (b[self.moves[:, 2]] == 0)

    def step(self, action):
        """Apply jump ``action`` (no legality check is performed).

        Returns:
            ``(board_copy, reward, done, mask)`` where ``reward`` is always 0,
            ``done`` is True when no legal move remains and ``mask`` is the
            legality mask of the new position.
        """
        self.board[self.moves[action]] = [0, 0, 1]
        self.remain -= 1
        mask = self.get_mask()
        self.done = not np.any(mask)
        return self.board.copy(), 0, self.done, mask

    def _random_walk(self, steps, reverse):
        """Play ``steps`` random moves on ``self.board`` in place.

        With ``reverse`` the moves are un-jumps (two empty cells become pegs,
        the landing peg is removed); otherwise they are ordinary legal jumps.
        Returns False as soon as no candidate move exists, True otherwise.
        """
        src, mid, dst = self.moves[:, 0], self.moves[:, 1], self.moves[:, 2]
        for _ in range(steps):
            b = self.board
            if reverse:
                candidates = (b[src] == 0) & (b[mid] == 0) & (b[dst] == 1)
                new_cells = [1, 1, 0]
            else:
                candidates = self.get_mask()
                new_cells = [0, 0, 1]
            possible = np.where(candidates)[0]
            if len(possible) == 0:
                return False
            idx = np.random.choice(possible)
            self.board[self.moves[idx]] = new_cells
        return True

    def reset(self, steps, mode=0):
        """Generate a random position and make it the current board.

        Args:
            steps: number of random moves to apply to the starting position.
            mode: 0 starts from the solved board (single peg at index 24) and
                applies ``steps`` random reverse moves; any other value starts
                from ``board_layout`` and applies ``steps`` random legal jumps.

        Returns:
            A copy of the generated board.

        Raises:
            ValueError: if ``steps`` lies outside ``[0, len(self.actions) - 2]``.
                Every move changes the peg count by one and the count stays
                between 1 and ``len(self.actions) - 1``, so such a request can
                never be satisfied and the retry loop below would never end.
        """
        max_steps = len(self.actions) - 2
        if not 0 <= steps <= max_steps:
            raise ValueError(f"steps must be between 0 and {max_steps}, got {steps}")

        reverse = mode == 0
        # Rejection sampling: restart whenever the random walk gets stuck early.
        while True:
            if reverse:
                self.board = np.full(49, -1, dtype=np.int8)
                self.board[self.actions] = 0
                self.board[24] = 1
            else:
                self.board = self.board_layout.copy()
            if self._random_walk(steps, reverse):
                break

        self.remain = np.sum(self.board == 1)
        # A generated position may already be stuck (e.g. the lone-peg board for
        # steps == 0 in reverse mode); report that instead of claiming a playable
        # state, so callers looping on `done` never request a move that does not exist.
        self.done = not np.any(self.get_mask())
        return self.board.copy()

In [None]:
class Agent:
    """Greedy policy that picks the legal action with the highest Q-value."""

    def __init__(self, weights_path='/kaggle/input/peg-solitaire-model-weights/model.pth'):
        """Build a ``DQN`` on CPU and load its weights from ``weights_path``.

        Args:
            weights_path: location of the saved ``state_dict``; defaults to the
                Kaggle dataset path this notebook was written against.
        """
        self.device = torch.device("cpu")
        self.q_network = DQN().to(self.device)
        # NOTE(review): torch.load unpickles the file, so only point this at
        # checkpoints you trust.
        state_dict = torch.load(weights_path, map_location=self.device)
        self.q_network.load_state_dict(state_dict)
        self.q_network.eval()

    def act(self, state, mask):
        """Choose the best legal action for ``state``.

        Args:
            state: board observation accepted by the Q-network (one sample,
                no batch dimension).
            mask: boolean array, True where the action is legal.

        Returns:
            ``(action, rank)``. ``action`` is the legal action index with the
            highest Q-value and ``rank`` is the position of that Q-value among
            all Q-values sorted in descending order (0 means the chosen action
            is also the network's overall favourite). When no action is legal,
            ``(None, 0)`` is returned.
        """
        actions = np.where(mask)[0]
        if len(actions) == 0:
            # Nothing to choose from, so skip the forward pass entirely.
            return None, 0

        with torch.no_grad():
            state_t = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
            q_values = self.q_network(state_t).cpu().numpy()[0]

        best_idx = np.argmax(q_values[actions])
        action = actions[best_idx]

        # Rank of the chosen value among *all* actions, legal or not.
        sorted_q = np.sort(q_values)[::-1]
        chosen_q = q_values[action]
        rank = np.where(sorted_q == chosen_q)[0][0]

        return action, rank

In [None]:
class DQN(nn.Module):
    """Dueling Q-network: a shared encoder feeding a value head and an advantage head.

    Takes 49 input features and produces 76 Q-values per sample.
    """

    def __init__(self):
        super().__init__()
        width = 128

        # Shared trunk.
        self.feature_layer = nn.Sequential(nn.Linear(49, width), nn.ReLU())

        # Scalar state-value head.
        self.value_stream = nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, 1),
        )

        # Per-action advantage head.
        self.advantage_stream = nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, 76),
        )

    def forward(self, x):
        """Return Q-values as value + advantage, with the advantage mean-centred
        over the last dimension."""
        hidden = self.feature_layer(x)
        state_value = self.value_stream(hidden)
        adv = self.advantage_stream(hidden)
        adv_mean = adv.mean(dim=-1, keepdim=True)
        return state_value + adv - adv_mean

In [None]:
def test(step, mode, episodes=1000):
    """Evaluate the global ``agent`` on positions generated by the global ``env``.

    Each episode resets the environment with ``env.reset(step, mode)``, lets the
    agent play greedily until the game ends, and records the outcome. One
    summary line is printed at the end.

    Args:
        step: number of random moves used to generate each start position.
        mode: forwarded to ``env.reset`` (0 = reverse generation, 1 = forward).
        episodes: number of games to play (default 1000).

    Raises:
        ValueError: if ``episodes`` is not positive.
    """
    if episodes < 1:
        raise ValueError(f"episodes must be positive, got {episodes}")

    wins = 0
    remains = []
    q_ranks = []

    for _ in range(episodes):
        state = env.reset(step, mode)
        mask = env.get_mask()

        game_ranks = []

        while not env.done:
            action, rank = agent.act(state, mask)
            if action is None:
                # No legal move exists although the env has not flagged the
                # game as over; passing None to env.step would index every
                # move at once and scramble the board, so stop here.
                break
            game_ranks.append(rank)

            state, _, _, mask = env.step(action)

        # A win is a single remaining peg sitting on the centre cell.
        if env.remain == 1 and env.board[24] == 1:
            wins += 1
        remains.append(env.remain)
        if game_ranks:
            q_ranks.append(np.mean(game_ranks))

    if mode == 1:
        # Forward mode reports 31 - step; presumably the number of moves still
        # left to play, so both modes share one scale - TODO confirm.
        step = 31 - step

    mean_rank = np.mean(q_ranks) if q_ranks else float('nan')
    print(f'Step {step:<3}  |  Win {wins/episodes:<6.1%}  |  Remain {np.mean(remains):<6.2f}  |  Q Rank {mean_rank:.1f}')

In [None]:
# Shared environment and pretrained agent used by `test`.
env = PegSolitaire()
agent = Agent()

# Each suite: (banner, reset mode, step counts to evaluate).
for title, mode, steps in (
    ('Reverse Test', 0, range(1, 27)),
    ('Forward Test', 1, range(1, 11)),
):
    print(title)
    for step in steps:
        test(step, mode)