In [1]:
import numpy as np # type: ignore
import time
import torch
import torch.nn as nn # type: ignore
import torch.optim as optim # type: ignore
from torch.utils.data import random_split, DataLoader # type: ignore
from torch.optim.lr_scheduler import MultiStepLR
from chess import pgn # type: ignore
from tqdm import tqdm # type: ignore
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from auxiliary_func import check_memory, load_dataset, encode_moves
from dataset import ChessDataset
from model import ChessModel
from model2 import ChessModel2
from model3 import ChessModel3
from model4 import ChessModel4
from model5 import ChessModel5
from model7 import ChessModel7
from MiniMaia import MiniMaia
import pickle

2025-11-08 16:41:17.600868: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-08 16:41:17.622002: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-08 16:41:17.622037: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-08 16:41:17.634968: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:


# Prefer the GPU when CUDA is usable; otherwise everything stays on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build a MiniMaia and print its layer summary. In the printed summary the
# constructor argument (1000) shows up as fc2's out_features.
model = MiniMaia(1000)
model = model.to(device)
print(model)

MiniMaia(
  (init_layer): Conv2d(16, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (final_layer): Conv2d(64, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (init_bn): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (final_bn): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=5120, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1000, bias=True)
  (mid_layers): Sequential(
    (0): MiniMaiaBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
      (sigmoid): Sigmoid()
      (avg_pool): AvgPool2d(kernel_size=8, stride=8, padding=0)


In [None]:
# Report whatever check_memory() returns before loading the games.
total_mem = check_memory()
print(total_mem)

# Load positions and moves from the Lichess Elite PGN folder.
# NOTE(review): pgn_memory_mark and file_limit are passed straight through to
# load_dataset; their exact semantics live in auxiliary_func — confirm there.
X, y, games_parsed, files_parsed = load_dataset(
    data_folder="../../data/Lichess_Elite_Database",
    pgn_memory_mark=1.0,
    file_limit=2,
)

# Materialise both as NumPy arrays; the features are forced to float32.
X = np.array(X, dtype=np.float32)
y = np.array(y)

# Replace the raw moves with encoded labels; the size of the returned mapping
# is used as the number of output classes.
y, move_to_int = encode_moves(y)
num_classes = len(move_to_int)

In [None]:

# Convert features and labels to tensors with the dtypes the model and
# CrossEntropyLoss need (float32 inputs, int64 class indices).
#
# torch.as_tensor is used instead of torch.tensor because:
#   * it reuses the NumPy buffer when the dtype already matches (X is cast to
#     float32 when it is built), so the dataset is not copied a second time;
#   * it returns an existing tensor unchanged, so re-running this cell is a
#     no-op rather than a copy plus a "copy construct from a tensor" warning.
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.long)


In [None]:
# Display the feature tensor's shape as a quick sanity check.
X.shape
# To inspect the label tensor instead, evaluate `y.shape` here.

In [None]:
# Wrap the feature/label tensors in the project's Dataset.
dataset = ChessDataset(X, y)
# Loader over the full, unsplit dataset. Nothing in this cell uses it; it is
# kept so that any other cell relying on `dataloader` keeps working.
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# 90/10 train/validation split; the validation side absorbs the rounding remainder.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split with its own generator so the same samples land in the
# validation set on every run. Without this, validation numbers from different
# runs are not comparable, and a model restored from a checkpoint could be
# "validated" on samples it was trained on.
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Shuffle only the training batches; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}', flush=True)

# Model, loss and optimizer. The output size follows the number of encoded moves.
model = ChessModel7(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)

# Multiply the learning rate by 0.2 at each milestone. The training loop calls
# scheduler.step() once per batch, so the milestones count optimizer steps,
# not epochs.
scheduler = MultiStepLR(optimizer, milestones=[50000, 250000, 400000], gamma=0.2)

In [None]:
# One pass over the training set: forward, loss, backward, clipped update.
# Per-batch loss is shown on the tqdm bar instead of being printed, so the
# cell output stays a single progress line rather than thousands of lines.
model.train()
running_loss = 0.0
progress_bar = tqdm(train_loader)
for inputs, labels in progress_bar:
    inputs, labels = inputs.to(device), labels.to(device)  # Move data to the model's device
    optimizer.zero_grad()

    outputs = model(inputs)  # Raw logits (CrossEntropyLoss applies log-softmax itself)

    # Compute loss
    loss = criterion(outputs, labels)
    loss.backward()

    # Gradient clipping: cap the global gradient norm at 1.0 before the update
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()
    scheduler.step()  # stepped per batch; the milestones are expressed in batches

    # Read the scalar once: every .item() call copies the value back to the host.
    batch_loss = loss.item()
    progress_bar.set_postfix(loss=f"{batch_loss:.4f}")
    running_loss += batch_loss