In [2]:
import opendatasets as od
od.download('https://www.kaggle.com/datasets/arevel/chess-games')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Dataset URL: https://www.kaggle.com/datasets/arevel/chess-games
Downloading chess-games.zip to ./chess-games


100%|██████████| 1.45G/1.45G [00:01<00:00, 838MB/s]





In [5]:
import re

import numpy as np
import pandas as pd

import preprocessing
from constants import PIECE_TYPES

import torch
from torch.utils.data import TensorDataset, DataLoader


In [None]:
def adapt_moves(game_history):
    no_numbers = re.sub(r'\d+\.', '', game_history)

    moves = [move.strip() for move in no_numbers.split() if move.strip()]
    moves.pop() # remove winner / loser

    return ' '.join(moves)

In [4]:
FILE_PATH = '../data/chess_games.csv'
MOVES_COLUMN = 'AN'
ELO_COLUMN = 'WhiteElo'
RESULT_COLUMN = 'Result'

ELO_THRESHOLD = 1600
WHITE_WIN = '1-0'

BATCH_SIZE = 32
SHUFFLE = True

In [9]:
def load_dataloaders(original_headers, limit=1_000_000, offset=0):
    data_df = pd.read_csv(FILE_PATH, nrows=limit, skiprows=offset+1, names=original_headers)

    piece_selection_X = []
    piece_selection_y = []

    move_selection = {}

    for index, raw in enumerate(data_df[MOVES_COLUMN]):
        if data_df[ELO_COLUMN][index] <= ELO_THRESHOLD:
          continue

        if data_df[RESULT_COLUMN][index] != WHITE_WIN:
          continue

        try:
            (current_ps_X, current_ps_y, current_ms) = preprocessing.preprocess_game(adapt_moves(raw))

            print(f"Loaded & preprocessed game from the dataset {index}/{len(data_df)}")

            piece_selection_X.append(current_ps_X)
            piece_selection_y.append(current_ps_y)

            move_selection = {
                k: move_selection.get(k, []) + current_ms.get(k, []) for k in set(move_selection) | set(current_ms)
            }
        except Exception as ex:
            continue

    ps_dataloader = DataLoader(
        TensorDataset(
            torch.from_numpy(np.concatenate(piece_selection_X, axis=0)).float(),
            torch.from_numpy(np.concatenate(piece_selection_y, axis=0)).long(),
        ),
        batch_size=BATCH_SIZE,
        shuffle=SHUFFLE,
    )

    ms_dataloaders = []

    for piece_type in PIECE_TYPES:
        index = PIECE_TYPES[piece_type]
        ms_X, ms_y = zip(*move_selection[index])
        ms_dataloaders.append(
            DataLoader(
                TensorDataset(
                    torch.tensor(ms_X, dtype=torch.float32),
                    torch.tensor(ms_y, dtype=torch.long),
                ),
                batch_size=BATCH_SIZE,
                shuffle=SHUFFLE,
            )
        )

    return ps_dataloader, ms_dataloaders

In [10]:
def train(dataloader, model, device):
    model.train()
    total_loss = 0

    for batch_idx, (X, y) in enumerate(dataloader):
        size = len(dataloader.dataset)
        X, y = X.to(device), y.to(device)

        prediction = model(X)
        loss = model.loss_fn(prediction, y)
        total_loss += loss

        loss.backward()
        model.optimizer.step()
        model.optimizer.zero_grad()

        if batch_idx % 100 == 0:
            loss, current = loss.item(), (batch_idx + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, device):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    model.eval()
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += model.loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size

    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [11]:
from models import PieceSelector, MoveSelector

In [12]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
training_limit = 2_000_000
dataset_batch_step = 10_000

piece_selector = PieceSelector().to(device)
move_selectors = []

for piece_type in PIECE_TYPES:
    move_selectors.append(MoveSelector().to(device))

header_df = pd.read_csv(FILE_PATH, nrows=0)
original_headers = list(header_df.columns)

epochs = 15

for epoch in range(epochs):
  print(f"Epoch: {epoch+1}/{epochs}")

  for dataset_batch in range(0, training_limit, dataset_batch_step):
    print(f"Processing dataset batch: {dataset_batch} / {training_limit}")
    ps_dataloader, ms_dataloaders = load_dataloaders(original_headers, dataset_batch, dataset_batch)

    print("Training piece selector model")
    train(ps_dataloader, piece_selector, device)
    test(ps_dataloader, piece_selector, device)

    for piece_type in PIECE_TYPES:
      print(f"Training move selector model for {piece_type}")
      index = PIECE_TYPES[piece_type]
      move_selector = move_selectors[index]
      train(ms_dataloaders[index], move_selector, device)
      test(ms_dataloaders[index], move_selector, device)

  torch.save(piece_selector.state_dict(), 'model/piece-selector.pth')

  for piece_type in PIECE_TYPES:
    index = PIECE_TYPES[piece_type]
    move_selector = move_selectors[index]
    torch.save(move_selector.state_dict(), f'model/{piece_type}.pth')


Epoch: 1/15
Processing dataset batch: 0 / 2000000
Loaded & preprocessed game from the dataset 0/5000
Loaded & preprocessed game from the dataset 2/5000
Loaded & preprocessed game from the dataset 3/5000
Loaded & preprocessed game from the dataset 7/5000
Loaded & preprocessed game from the dataset 9/5000
Loaded & preprocessed game from the dataset 11/5000
Loaded & preprocessed game from the dataset 12/5000
Loaded & preprocessed game from the dataset 13/5000
Loaded & preprocessed game from the dataset 16/5000
Loaded & preprocessed game from the dataset 17/5000
Loaded & preprocessed game from the dataset 19/5000
Loaded & preprocessed game from the dataset 20/5000
Loaded & preprocessed game from the dataset 22/5000
Loaded & preprocessed game from the dataset 25/5000
Loaded & preprocessed game from the dataset 27/5000
Loaded & preprocessed game from the dataset 28/5000
Loaded & preprocessed game from the dataset 29/5000
Loaded & preprocessed game from the dataset 38/5000
Loaded & preprocess

KeyboardInterrupt: 