In [1]:
# Imports 

import os
import chess
import chess.pgn
import chess.engine
import numpy as np
import pandas as pd
import time
import copy

In [2]:
# Split large pgn file into separate pgns 

# num_games_per_file = 1000
# file_no = 1221
# count = 0
# while True:
#     if count == num_games_per_file:
#         file_no = file_no + 1
#         count = 0
#         print(str(file_no) + " files completed!")
#     game = chess.pgn.read_game(pgn)
#     filepath = "data/chess_games/" + str(file_no) + ".pgn" 
#     print(game, file=open(filepath, "a"), end="\n\n")
#     count = count + 1
#     if count % 10 == 0:
#         print(str(count // 10) + "%")

# while file_no < 1222:
#     filepath = "data/chess_games/" + str(file_no) + ".pgn"
#     newpath = "data/games/" + str(file_no) + ".pgn"
#     os.rename(filepath, newpath)
#     print(file_no)
#     file_no = file_no + 1
    
# filepath = "data/games/" + str(file_no) + ".pgn"
# with open(filepath, "r") as f:
#     lines = f.readlines()
# with open(filepath, "w") as f:
#     for line in lines:
#         if line.strip("\n") != "None":
#             f.write(line)

In [9]:
# Constants

# One empty 64-square plane per piece symbol. The key order fixes the order of
# the board columns below: upper-case symbols first, then lower-case ones
# (python-chess uses upper case for White and lower case for Black).
init_piece_bitstrings = {symbol: np.zeros(64) for symbol in "KQRBNPkqrbnp"}

# Integer label stored for each PGN result string.
result_dict = {'1-0': 2, '1/2-1/2': 1, '0-1': 0}

# One dtype name per output column.
fmt = ['int64'] * 777

# Column names, in the same order as the values of each generated row.
header_cols = ["WhiteElo", "BlackElo", "MovesPlayed"]
board_cols = [
    f"{symbol}_{square_name}"
    for symbol in init_piece_bitstrings
    for square_name in chess.SQUARE_NAMES
]
extra_cols = ["Turn", "WKC", "WQC", "BKC", "BQC", "Result"]
cols = np.concatenate([header_cols, board_cols, extra_cols])

# Displayed as the cell output: total number of columns per row.
len(cols)

777

# Selecting Data

Data from each game is collected in the following way:

1. Header information: the Elo ratings of White and Black, the number of moves (counted in half-moves) played so far, and the result of the game, which will be used as the label. The ratings and move count will mainly be used to select data for training if needed.
2. Bitboards for each piece type of each color (6 piece types × 2 colors × 64 squares = 768 binary features).
3. Other board information (castling rights for each side, and whose turn it is to move).

In [4]:
# # Engine
# engine = chess.engine.SimpleEngine.popen_uci("/usr/local/Cellar/stockfish/13/bin/stockfish")
# evaltime = 0.01
# score = engine.analyse(board, chess.engine.Limit(time=evaltime))['score'].white().score(mate_score=1000000)

In [10]:
# Parse a pgn file into the desired form, and save data in .csv file
def _encode_position(board):
    """Return the concatenated 0/1 piece-placement planes for `board`.

    One 64-element plane is built per piece symbol, in the key order of
    `init_piece_bitstrings`; a 1 marks a square occupied by that piece.
    """
    # Fresh integer planes per call, so the shared template is never mutated.
    planes = {symbol: np.zeros(64, dtype=int) for symbol in init_piece_bitstrings}
    # piece_map() yields only occupied squares, so empty squares are never visited.
    for square, piece in board.piece_map().items():
        planes[piece.symbol()][square] = 1
    return np.concatenate(list(planes.values()))


def parse(pgn_path, csv_path = None):
    """Turn every position of every game in a PGN file into an integer row.

    Each row holds, in order: White Elo, Black Elo, number of half-moves
    played, the piece-placement planes, side to move, the four castling
    rights, and the game result mapped through `result_dict`.

    A game is skipped as a whole (with a message) when it cannot be encoded,
    e.g. a missing or non-numeric Elo header or a result string that is not
    a key of `result_dict`.

    Parameters
    ----------
    pgn_path : str
        Path of the PGN file to read.
    csv_path : str, optional
        If given, the rows are also written to this path as CSV with the
        names in `cols` as the header line.

    Returns
    -------
    numpy.ndarray
        The generated rows, one per position (empty if no game was usable).
    """
    # Data generation
    data = []
    count = 0
    start = time.time()

    # The context manager closes the PGN file even if reading fails.
    with open(pgn_path) as pgn:
        # Label each position in each game appropriately
        while True:
            game = chess.pgn.read_game(pgn)
            if game is None:
                break

            try:
                # Convert up front so a bad header rejects the game before
                # any of its positions are encoded.
                white_elo = int(game.headers['WhiteElo'])
                black_elo = int(game.headers['BlackElo'])
                result = result_dict[game.headers['Result']]
                board = game.board()

                game_rows = []
                for moves_played, move in enumerate(game.mainline_moves(), start=1):
                    board.push(move)

                    header = [white_elo, black_elo, moves_played]
                    extras = [
                        int(board.turn),
                        int(board.has_kingside_castling_rights(chess.WHITE)),
                        int(board.has_queenside_castling_rights(chess.WHITE)),
                        int(board.has_kingside_castling_rights(chess.BLACK)),
                        int(board.has_queenside_castling_rights(chess.BLACK)),
                        result,
                    ]
                    row = np.concatenate((header, _encode_position(board), extras)).astype(int)
                    game_rows.append(row)
            except Exception as exc:
                # Best effort: drop the whole game, but report why.
                print("Skipped game:", repr(exc))
                continue

            # Rows are added only once the full game encoded cleanly, so a
            # failing game never leaves partial data behind.
            data.extend(game_rows)

            count = count + 1
            if count % 10 == 0:
                print("Finished " + str(count) + " in ", time.time() - start, " sec")

    save_start = time.time()
    data = np.asarray(data)
    if csv_path:
        # NOTE(review): header names are joined with ", " while the data uses
        # "," as delimiter, so readers may see a leading space in every column
        # name but the first. Left unchanged to keep the existing file format.
        np.savetxt(csv_path, data, delimiter=",", fmt='%d', header=", ".join(cols), comments='')
        print("Finished csv in ", time.time() - save_start, " sec")

    return data

In [21]:
# Range of numbered PGN files to convert: start_no inclusive, end_no exclusive.
start_no = 24
end_no = 29

# Create the output folder up front so a missing directory does not make the
# CSV write fail only after a whole file has been parsed.
os.makedirs("data/ccrl/csv", exist_ok=True)

# Iterating over a range leaves start_no untouched, so the cell can be re-run
# and the configured bounds stay readable afterwards.
for file_no in range(start_no, end_no):
    pgn_path = "data/ccrl/pgn/" + str(file_no) + ".pgn"
    csv_path = "data/ccrl/csv/" + str(file_no) + ".csv"

    start = time.time()
    parse(pgn_path, csv_path)
    print("Finished " + str(file_no) + " in ", time.time() - start, " sec")
    print("-----")

Finished 10 in  1.2903690338134766  sec
Finished 20 in  2.456040859222412  sec
Finished 30 in  3.568775177001953  sec
Finished 40 in  4.705726861953735  sec
Finished 50 in  5.797083139419556  sec
Finished 60 in  7.140123128890991  sec
Finished 70 in  8.826962947845459  sec
Finished 80 in  9.878767013549805  sec
Finished 90 in  11.548470973968506  sec
Finished 100 in  12.949291944503784  sec
Finished 110 in  14.053181886672974  sec
Finished 120 in  15.166216850280762  sec
Finished 130 in  17.086151838302612  sec
Finished 140 in  18.43408703804016  sec
Finished 150 in  20.01570200920105  sec
Finished 160 in  21.505189180374146  sec
Finished 170 in  23.12090015411377  sec
Finished 180 in  25.191262006759644  sec
Finished 190 in  26.843657970428467  sec
Finished 200 in  27.801297187805176  sec
Finished 210 in  29.008444786071777  sec
Finished 220 in  30.16166090965271  sec
Finished 230 in  31.570312976837158  sec
Finished 240 in  32.923091888427734  sec
Finished 250 in  34.20402717590332  

Finished 10 in  1.856031894683838  sec
Finished 20 in  3.3231961727142334  sec
Finished 30 in  4.779452085494995  sec
Finished 40 in  5.99655294418335  sec
Finished 50 in  7.378483057022095  sec
Finished 60 in  8.714419841766357  sec
Finished 70 in  9.991978883743286  sec
Finished 80 in  11.26970887184143  sec
Finished 90 in  12.986999034881592  sec
Finished 100 in  15.00292706489563  sec
Finished 110 in  16.443952083587646  sec
Finished 120 in  17.63641095161438  sec
Finished 130 in  18.814032077789307  sec
Finished 140 in  20.194665908813477  sec
Finished 150 in  21.12075400352478  sec
Finished 160 in  22.222764015197754  sec
Finished 170 in  23.266947984695435  sec
Finished 180 in  24.23186492919922  sec
Finished 190 in  25.8182110786438  sec
Finished 200 in  26.972168922424316  sec
Finished 210 in  28.307039976119995  sec
Finished 220 in  29.329511165618896  sec
Finished 230 in  30.62019395828247  sec
Finished 240 in  31.736363887786865  sec
Finished 250 in  32.84928798675537  sec


Finished 10 in  1.4123609066009521  sec
Finished 20 in  2.689586877822876  sec
Finished 30 in  4.0359838008880615  sec
Finished 40 in  5.100332736968994  sec
Finished 50 in  6.532705783843994  sec
Finished 60 in  7.705802917480469  sec
Finished 70 in  8.960648775100708  sec
Finished 80 in  10.296727657318115  sec
Finished 90 in  11.791110754013062  sec
Finished 100 in  12.981093883514404  sec
Finished 110 in  14.136890888214111  sec
Finished 120 in  15.22600769996643  sec
Finished 130 in  16.62199091911316  sec
Finished 140 in  17.818126678466797  sec
Finished 150 in  19.002081632614136  sec
Finished 160 in  20.21742081642151  sec
Finished 170 in  21.74813175201416  sec
Finished 180 in  22.886149883270264  sec
Finished 190 in  23.996795892715454  sec
Finished 200 in  26.309820890426636  sec
Finished 210 in  27.720574855804443  sec
Finished 220 in  29.37729787826538  sec
Finished 230 in  30.541982889175415  sec
Finished 240 in  31.825390815734863  sec
Finished 250 in  32.95066690444946 

Finished 10 in  1.1837728023529053  sec
Finished 20 in  2.7599737644195557  sec
Finished 30 in  4.4514570236206055  sec
Finished 40 in  5.727691888809204  sec
Finished 50 in  7.053158760070801  sec
Finished 60 in  8.821571826934814  sec
Finished 70 in  10.792003870010376  sec
Finished 80 in  13.006967067718506  sec
Finished 90 in  14.989871740341187  sec
Finished 100 in  17.53551197052002  sec
Finished 110 in  19.992339849472046  sec
Finished 120 in  22.654520750045776  sec
Finished 130 in  24.292890787124634  sec
Finished 140 in  26.32473874092102  sec
Finished 150 in  27.925020933151245  sec
Finished 160 in  29.258193969726562  sec
Finished 170 in  31.06448984146118  sec
Finished 180 in  32.1482949256897  sec
Finished 190 in  33.58384680747986  sec
Finished 200 in  34.68425989151001  sec
Finished 210 in  36.71548080444336  sec
Finished 220 in  38.77115082740784  sec
Finished 230 in  40.3659987449646  sec
Finished 240 in  41.64542269706726  sec
Finished 250 in  43.40211081504822  sec
