# NNUE training

## Model architecture

Current model architecture is a sparse, binary array of length 768. Each element of the array represents a possible combination of piece type (6), piece_color (2) and position (64) (6*2*64 = 768).  

The fully connected feedfoward network has 3 hidden layers: 768 -> 8, 8 -> 8 and 8 -> 1.

The output is a single scalar.

Currently used training data: https://www.kaggle.com/competitions/train-your-own-stockfish-nnue/data

Great source on NNUE: https://official-stockfish.github.io/docs/nnue-pytorch-wiki/docs/nnue.html

## Input data
### Load input data

In [1]:
import pandas as pd
import numpy as np

input_path = '/home/yvlaere/projects/yvl-chess/NNUE_training/training_data/train.csv'

training_df = pd.read_csv(input_path)

print(training_df.head())

print(np.mean(training_df['Evaluation']))
print(np.std(training_df['Evaluation']))
print(np.min(training_df['Evaluation']))
print(np.max(training_df['Evaluation']))
print(np.median(training_df['Evaluation']))

                                                 FEN  Evaluation
0  r1b2rk1/ppp2pbp/3q1np1/n3p1B1/2B5/1Q3N2/PP1N1P...        -135
1  8/1pp2p2/6k1/4P2p/p1PR1K1P/2r2P2/6P1/8 w - - 0 33         -57
2  r2qk1nr/1b3pbp/n3p1p1/1pp1P3/p2PN3/2P2N2/PPB3P...         541
3  2b2rk1/5pp1/p2q1n1p/P2pn3/3N4/3BP1B1/2Q2PPP/Rr...         163
4  r2qkb1r/ppp2ppb/2n1p3/3n2PQ/3Pp3/2P4P/PP6/RNB1...        -332
18.06853701380683
619.097853227714
-6462
7462
14.0


### Turn FEN into input layer

In [2]:
piece_dict = {'P': 0, 'N': 1, 'B': 2, 'R': 3, 'Q': 4, 'K': 5, 'p': 6, 'n': 7, 'b': 8, 'r': 9, 'q': 10, 'k': 11}

def FEN_to_input(fen):
    """
    Convert a FEN string to an NNUE input vector.
    """
    # Split the FEN string into its components
    sub_FEN = fen.split(' ')
    board = sub_FEN[0]
    ranks = board.split('/')

    # Convert the board to a 1D boolean array
    # in the chess engine, position 0 corresponds to a1, so the ranks in the FEN string will need to be reversed
    input_layer = np.zeros(768, dtype = np.bool)
    position = 0
    for rank in ranks[::-1]:
        for char in rank:
            if char.isdigit():
                position += int(char)
            else:
                input_layer[position*piece_dict[char]] = True

    return input_layer

In [3]:
#print(FEN_to_input(training_df['FEN'][0]))

import torch
import torch.nn as nn

class SimpleNNUE(nn.Module):
    def __init__(self):
        super(SimpleNNUE, self).__init__()
        # three fully connected layers
        self.fc1 = nn.Linear(768, 1024)
        self.fc2 = nn.Linear(1024, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.clamp(self.fc1(x), min = 0, max = 1)
        x = torch.clamp(self.fc2(x), min = 0, max = 1)
        x = torch.clamp(self.fc3(x), min = 0, max = 1)
        x = self.fc4(x)  # output can be raw score
        return x


In [4]:
# general information on data
input_data = np.array([FEN_to_input(fen) for fen in training_df['FEN'].values])
output_data = np.array(training_df['Evaluation'].values)
print(input_data.shape)
print(output_data.shape)

#1 979 383 entries
# if converted to torch tensors, it becomes too large for the RAM (bool -> float32)
# so it needs to be fed in batches

(1979383, 768)
(1979383,)


In [5]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, np_input, np_output):
        self.input = np_input
        self.output = np_output

    def __len__(self):
        return self.input.shape[0]
    
    def __getitem__(self, idx):
        # get one data point
        np_x = self.input[idx]
        np_y = self.output[idx]

        # convert to torch tensors
        x = torch.tensor(np_x, dtype=torch.float32)
        y = torch.tensor(np_y, dtype=torch.float32)
        return x, y

In [6]:
from torch.utils.data import DataLoader

# tranport data to torch tensors
input_data = np.array([FEN_to_input(fen) for fen in training_df['FEN'].values])
output_data = np.array(training_df['Evaluation'].values)
dataset = MyDataset(input_data, output_data)
loader = DataLoader(dataset, batch_size = 2048, shuffle = True, num_workers = 4, pin_memory = True)

# about 2.4G VRAM available

https://talkchess.com/viewtopic.php?start=60&t=75724

In [7]:
from torch.optim.lr_scheduler import StepLR

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SimpleNNUE().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
criterion = nn.MSELoss()
MAE_loss = nn.L1Loss()
lowest_MAE = 10000

for epoch in range(200):
    for batch_x, batch_y in loader:
        # move data to GPU
        batch_x = batch_x.to(device, non_blocking = True)
        batch_y = batch_y.to(device, non_blocking = True)
        pred = model(batch_x).squeeze(1)  # remove the last dimension
        loss = criterion(pred, batch_y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # calculate MAE
        MAE = MAE_loss(pred, batch_y)
        if MAE < lowest_MAE:
            lowest_MAE = MAE
            torch.save(model.state_dict(), 'best_model.pth')
            print(f"New best model saved with MAE: {lowest_MAE.item():.4f}")
        
    scheduler.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
    print(f"Epoch {epoch+1}, MAE: {MAE.item():.4f}, lowest MAE: {lowest_MAE:.4f}")

Epoch 1, Loss: 407106.0312
Epoch 1, MAE: 351.9447
Epoch 2, Loss: 445752.2812
Epoch 2, MAE: 357.8810
Epoch 3, Loss: 397980.4375
Epoch 3, MAE: 357.7122
Epoch 4, Loss: 392404.8125
Epoch 4, MAE: 351.9115
Epoch 5, Loss: 372451.7500
Epoch 5, MAE: 331.7231
Epoch 6, Loss: 353362.3125
Epoch 6, MAE: 331.5839
Epoch 7, Loss: 300653.7188
Epoch 7, MAE: 321.7122
Epoch 8, Loss: 320223.1562
Epoch 8, MAE: 328.6672
Epoch 9, Loss: 397044.7812
Epoch 9, MAE: 344.6224
Epoch 10, Loss: 378886.5625
Epoch 10, MAE: 346.0128


KeyboardInterrupt: 