In [1]:
from pathlib import Path
from typing import Dict, Iterator, Optional, Any
import chess.pgn

In [2]:
def iter_games(pgn_path: Path) -> Iterator[chess.pgn.Game]:
    """Lazily produce the games stored in a single PGN file, in file order.

    Args:
        pgn_path: Path of the file to read; its suffix must be ``.pgn``
            (compared case-insensitively).

    Yields:
        Each game returned by ``chess.pgn.read_game`` until it returns ``None``.

    Raises:
        ValueError: If ``pgn_path`` does not have a ``.pgn`` suffix. Because
            this is a generator, the check runs when the first item is
            requested, not when the function is called.
    """
    has_pgn_suffix = pgn_path.suffix.lower() == ".pgn"
    if not has_pgn_suffix:
        raise ValueError(f"Expected a .pgn file, got: {pgn_path.suffix}")

    # errors="replace" substitutes undecodable bytes instead of raising, so a
    # file that is not clean UTF-8 can still be read.
    with open(pgn_path, "r", encoding="utf-8", errors="replace") as handle:
        current = chess.pgn.read_game(handle)
        while current is not None:
            yield current
            # Only read ahead once the consumer asks for the next game.
            current = chess.pgn.read_game(handle)

In [None]:
def pgn_to_dict(chess_games_folder: Path) -> Dict[str, Optional[int]]:
    """Collect every distinct position reached in the PGN files of a folder.

    Each game is replayed along its mainline. The FEN of the game's starting
    position and of the position after every move becomes a key of the
    result; repeated FENs collapse into a single key and every value is
    ``None``.

    Args:
        chess_games_folder: Directory to scan. Only regular files whose suffix
            is ``.pgn`` (compared case-insensitively) are read; any other
            entry is skipped so that one stray file or sub-directory does not
            abort the whole load.

    Returns:
        Dictionary mapping each FEN string encountered to ``None``.
    """
    fens: Dict[str, Optional[int]] = {}

    # iterdir() gives no ordering guarantee; sorting makes the processing
    # order, and therefore the key insertion order, the same on every run.
    pgn_files = sorted(
        entry
        for entry in chess_games_folder.iterdir()
        if entry.is_file() and entry.suffix.lower() == ".pgn"
    )

    for pgn_file in pgn_files:
        for game in iter_games(pgn_file):
            board = game.board()

            # Record the position the game starts from before any move is played.
            fens.setdefault(board.fen(), None)

            for move in game.mainline_moves():
                try:
                    board.push(move)
                    fens.setdefault(board.fen(), None)
                except ValueError as e:
                    # NOTE(review): the "illegal san ... while parsing" lines in
                    # the notebook output do not match this message's format, so
                    # they appear to come from the PGN parser rather than from
                    # here - confirm whether this handler is ever reached.
                    print(f"Skipping illegal move in {pgn_file}:{e}")
                    # After a failed push the board no longer tracks the real
                    # game, so later positions would be unreliable: stop
                    # replaying this game.
                    break

    print(f"{len(fens)} positions loaded.")
    return fens

In [6]:
chess_games_folder = Path("./chess_games")
# Bind the result to a name instead of leaving the call as the cell's last
# expression: pgn_to_dict is annotated to return the position mapping, and
# echoing a mapping with millions of entries would swamp the notebook output.
positions = pgn_to_dict(chess_games_folder)

illegal san: 'dxe6' in rn1q1rk1/pbp1b1pp/1p1p4/3Ppp2/2P1P3/2B2NP1/PP3PBP/R2Q1RK1 w - - 1 12 while parsing <Game at 0x23afaf11990 ('Burger Karl (USA)' vs. 'Tai', '1990.??.??' at 'USA')>
illegal san: 'Nb4' in 4Qk2/pppb2p1/2np3p/2b5/2B2Bnq/2N5/PP4PP/4RR1K b - - 0 18 while parsing <Game at 0x23afb76b280 ('Charousek R' vs. 'Volner', '1893.??.??' at 'Vienna')>
illegal san: 'Qcd7' in 2QQ4/5ppk/8/7r/1q1Pq3/3BP2P/8/4K3 w - - 4 53 while parsing <Game at 0x23a85f62470 ('Short' vs. 'Hebden, M.', '1982.??.??' at 'Hastings')>
illegal san: 'Qd3' in 8/p5kp/1p4p1/P1P5/7P/2P1Q3/3K2q1/8 w - - 2 43 while parsing <Game at 0x23aef5323e0 ('Kamsky,Gata' vs. 'Short,Nigel', '1990.??.??' at 'Tilburg')>
illegal san: 'Rxa8' in R3r1k1/2b3p1/2p3qp/1p1n1pN1/3P4/2P2QP1/1P1BrP1P/3R2K1 w - - 0 27 while parsing <Game at 0x23aef29fd90 ('Anand Viswanathan' vs. 'Kamsky Gata', '1994.??.??' at 'Sanghi Nagar (India)')>
illegal san: 'Rxd2' in R3r1k1/2b3p1/2p3qp/1p1n1pN1/3P4/2P2QP1/1P1BrP1P/3R2K1 w - - 0 27 while parsing <Game a

6083549 positions loaded.
