In [1]:
import logging

# Configure root logging for the whole notebook.
# - datefmt uses %H (24-hour clock). The previous %I printed 12-hour times
#   without an AM/PM marker, so morning and evening runs looked identical.
# - force=True replaces any handlers already attached to the root logger, so
#   this call still takes effect when the cell is re-run or when something
#   else configured logging before this cell executed.
logging.basicConfig(
    format="%(asctime)s %(levelname)s:%(name)s:%(message)s",
    level=logging.INFO,
    datefmt="%H:%M:%S",
    force=True,
)

In [None]:
import os
from pathlib import Path

import blunder._internal.pipeline as pl

In [3]:
# Notebook-local logger, named after the running module.
logger = logging.getLogger(__name__)

# Detach it from the root logger so its records bypass the root handler.
# NOTE(review): no handler is added to this logger in the visible cells, so
# INFO records sent to it would not be emitted anywhere - confirm intended.
logger.propagate = False

In [4]:
# --- Run configuration ------------------------------------------------------
# Root of the local checkout. Set BLUNDER_PROJECT_ROOT to run the notebook from
# another location; when unset (or empty) the original path is used.
project_root = Path(
    os.environ.get("BLUNDER_PROJECT_ROOT") or "/home/vandy/Work/MATH6310/blunder-analysis"
)
data_dir = project_root / "data"

# Input PGN dump and its companion offset index.
pgn_path = data_dir / "raw" / "lichess_db_standard_rated_2025-08.pgn"
index_path = data_dir / "raw" / "lichess_db_standard_rated_2025-08.idx"

# The engine executable comes from the environment. Fail fast when it is
# missing: Path("") silently collapses to "." (the current directory), which
# would otherwise be handed to the pipeline as the engine executable.
engine_path = os.environ.get("STOCKFISH_PATH", "")
if not engine_path:
    raise RuntimeError(
        "STOCKFISH_PATH is not set; point it at the engine executable before running this notebook."
    )

engine_config = pl.EngineConfig(executable_path=Path(engine_path))
# Engine tuning for this test run (attribute meanings are defined by
# pl.EngineConfig; presumably hash size in MB, thread count and search depth).
engine_config.config_hash_mb = 1024
engine_config.config_threads = 2
engine_config.depth = 14

# Small test run: 64 games, no checkpoint resume, outputs under the "test" dirs.
config = pl.ProcessingConfig(
    pgn_path=pgn_path,
    index_path=index_path,
    workers=6,
    max_games=64,
    resume_from_checkpoint=False,
    engine=engine_config,
    output_parquet_dir=data_dir / "silver" / "test",
    checkpoint_path=data_dir / "checkpoint" / "test" / "checkpoint.json",
)

In [5]:
# Obtain the game offsets for the configured PGN via pl.ensure_index, then pass
# them through pl.game_offsets with the configured max_games limit.
offsets = pl.game_offsets(pl.ensure_index(config), max_games=config.max_games)

# Display how many offsets were selected.
len(offsets)

10:20:46 INFO:blunder._internal.pipeline:Loading offsets from /home/vandy/Work/MATH6310/blunder-analysis/data/raw/lichess_db_standard_rated_2025-08.idx


64

In [6]:
# Optionally resume: load a checkpoint and let it filter the offsets.
# `checkpoint` stays None when resuming is disabled.
checkpoint: pl.OffsetCheckpoint | None = None
if config.resume_from_checkpoint:
    checkpoint_path = config.checkpoint_path
    if not checkpoint_path:
        # No explicit checkpoint path configured: derive one next to the PGN file.
        checkpoint_path = config.pgn_path.with_suffix(".checkpoint.json")
    checkpoint = pl.OffsetCheckpoint.load(checkpoint_path)
    offsets = checkpoint.filter_offsets(offsets)

In [7]:
# Writer for the run's Parquet output. Arguments are positional, taken from the
# processing config in this order: output directory, compression, row-group size.
parquet_writer = pl.ParquetWriter(
    config.output_parquet_dir,
    config.parquet_compression,
    config.parquet_row_group_size,
)

# Use the engine settings carried by the processing config from here on.
engine_config = config.engine

In [8]:
# Quieten the "chess.engine" logger: only WARNING and above pass its level check.
chess_engine_logger = logging.getLogger("chess.engine")
chess_engine_logger.setLevel(logging.WARNING)
# Optional: keep its records from propagating to the root logger.
chess_engine_logger.propagate = False

In [9]:
# Run a single worker (id 0) in-process over the selected offsets.
worker_kwargs = {
    "engine_config": engine_config,
    "parquet_writer": parquet_writer,
    "worker_id": 0,
    # The worker is handed a plain list rather than the offsets object itself.
    "offsets": offsets.tolist(),
}
records = pl.worker_process(config.pgn_path, **worker_kwargs)

In [None]:
# Spot-check the first returned record: element 1 exposes `.moves`; show the
# `piece_moved` attribute of its first move.
record = records[0]
first_move = record[1].moves[0]
first_move.piece_moved