In [1]:
import io
import os
from concurrent.futures import ProcessPoolExecutor, as_completed

import chess
import chess.pgn


In [2]:
pgn_file = "/home/vandy/Work/MATH6310/blunder-analysis/data/raw/lichess_db_standard_rated_2025-08.pgn"

In [3]:
def stream_games(pgn_path):
    """Stream raw PGN game texts from file. Yields one game's full text (string) at a time.

    Heuristic: a game contains tag-pairs (lines starting with '[') and moves.
    A blank line after tags+moves marks the boundary.
    """
    with open(pgn_path, encoding="utf-8", errors="replace") as f:
        buf = []
        seen_tag = False
        seen_move = False
        for line in f:
            buf.append(line)
            if line.startswith("["):
                seen_tag = True
            elif line.strip() == "":
                # blank line potentially ends a game
                if seen_tag and seen_move:
                    yield "".join(buf)
                    buf = []
                    seen_tag = False
                    seen_move = False
            # non-empty non-tag line -> move text (or comments)
            elif not line.startswith("["):
                seen_move = True
        # EOF: if any buffered lines, yield them as last game
        if buf:
            yield "".join(buf)

In [4]:
def stream_chunks(pgn_path, chunk_size_bytes=8 * 1024 * 1024) -> str:
    """Yield PGN text chunks sized for parallel parsing without splitting games."""
    if chunk_size_bytes <= 0:
        raise ValueError("chunk_size_bytes must be positive")
    with open(pgn_path, encoding="utf-8", errors="replace") as f:
        buffer = []
        size_acc = 0
        for line in f:
            buffer.append(line)
            size_acc += len(line.encode("utf-8"))
            if size_acc >= chunk_size_bytes and line.strip() == "":
                yield "".join(buffer)
                buffer = []
                size_acc = 0
        if buffer:
            yield "".join(buffer)


def chunk_to_games(chunk_text):
    """Split a chunk of PGN text into complete game strings."""
    games = []
    buf = []
    seen_tag = False
    seen_move = False
    for line in chunk_text.splitlines(keepends=True):
        buf.append(line)
        if line.startswith("["):
            seen_tag = True
        elif line.strip() == "":
            if seen_tag and seen_move:
                games.append("".join(buf))
                buf = []
                seen_tag = False
                seen_move = False
        else:
            seen_move = True
    if buf and "".join(buf).strip():
        games.append("".join(buf))
    return games


def parse_game_text(game_text):
    """Worker: parse a single game PGN text and return a small summary dict.

    Keep it small to reduce pickling cost.
    """
    try:
        game = chess.pgn.read_game(io.StringIO(game_text))
        if game is None:
            return None
        headers = dict(game.headers)
        # count moves
        move_count = sum(1 for _ in game.mainline_moves())
        return {
            "Event": headers.get("Event"),
            "White": headers.get("White"),
            "Black": headers.get("Black"),
            "Result": headers.get("Result"),
            "Moves": move_count,
        }
    except Exception as e:
        # return minimal error info, or you can log to file
        return {"error": str(e)}


def parse_chunk(chunk_text):
    """Worker helper: parse every game in a chunk."""
    return parse_batch(chunk_to_games(chunk_text))


def process_pgn_parallel(pgn_path, max_workers=None, chunk_size_bytes=8 * 1024 * 1024, max_games=10_000):
    """Read games from pgn_path and parse them in parallel by chunk."""
    results = []
    max_workers = max_workers or max(1, os.cpu_count() - 1)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for chunk in stream_chunks(pgn_path, chunk_size_bytes=chunk_size_bytes):
            futures.append(executor.submit(parse_chunk, chunk))
        stop = False
        for fut in as_completed(futures):
            chunk_results = fut.result()
            for record in chunk_results:
                if record is None:
                    continue
                results.append(record)
                if len(results) >= max_games:
                    stop = True
                    break
            if stop:
                break
        if stop:
            for fut in futures:
                if not fut.done():
                    fut.cancel()
    return results[:max_games]


def parse_batch(game_texts):
    """Helper executed in worker process: parse a list of game texts."""
    out = []
    for gt in game_texts:
        out.append(parse_game_text(gt))
    return out


def parse_games_to_records(game_summaries):
    """Dummy hook: convert parsed game summaries into DB-ready records."""
    return game_summaries


def load_records_to_db(records, connection_params=None):
    """Dummy hook: persist records into a database."""
    return len(records)


# summaries = process_pgn_parallel(pgn_file, max_workers=6, max_games=10_000)
# Example: count games and aggregate results
# print(f"Parsed {len(summaries)} games")
# # e.g. count results

# c = Counter(s.get("Result") for s in summaries)
# print(c)

In [5]:
pgn_file

'/home/vandy/Work/MATH6310/blunder-analysis/data/raw/lichess_db_standard_rated_2025-08.pgn'

In [None]:
%%time
i = 0
events = set()
for game in stream_games(pgn_file):
    game_text = parse_game_text(game)
    events.add(game_text.get("Event"))
    i += 1

    if i > 100000:
        break

print(f"Total Games Processed: {i}")

Total Games Processed: 100001
CPU times: user 1min 56s, sys: 155 ms, total: 1min 56s
Wall time: 1min 56s


In [None]:
MAX_GAMES = 100000
results = []
max_workers = max(1, os.cpu_count() - 1)
chunk_iter = stream_chunks(pgn_file, chunk_size_bytes=8 * 1024 * 1024)
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    in_flight = set()

    def submit_next_chunk():
        try:
            chunk = next(chunk_iter)
        except StopIteration:
            return False
        future = executor.submit(parse_chunk, chunk)
        in_flight.add(future)
        return True

    for _ in range(max_workers):
        if not submit_next_chunk():
            break

    stop = False
    while in_flight:
        fut = next(as_completed(in_flight))
        in_flight.remove(fut)
        chunk_results = fut.result()
        for record in chunk_results:
            if record is None:
                continue
            results.append(record)
            if len(results) >= MAX_GAMES:
                stop = True
                break
        if stop:
            break
        submit_next_chunk()

    if stop:
        for fut in in_flight:
            if not fut.done():
                fut.cancel()
results = results[:MAX_GAMES]

In [12]:
len(results)

100000

In [15]:
# ...existing code...
def stream_chunks2(pgn_path, chunk_size_bytes=8 * 1024 * 1024):
    """Yield PGN text chunks sized for parallel parsing without splitting games."""
    if chunk_size_bytes <= 0:
        raise ValueError("chunk_size_bytes must be positive")
    with open(pgn_path, encoding="utf-8", errors="replace") as f:
        buffer = []
        size_acc = 0
        seen_tag = False
        seen_move = False
        for line in f:
            buffer.append(line)
            size_acc += len(line.encode("utf-8"))
            if line.startswith("["):
                seen_tag = True
            elif line.strip() == "":
                # only cut if we've seen a complete game (tags + moves)
                if size_acc >= chunk_size_bytes and seen_tag and seen_move:
                    yield "".join(buffer)
                    buffer = []
                    size_acc = 0
                    seen_tag = False
                    seen_move = False
            # non-empty non-tag line => moves (or comments)
            elif not line.startswith("["):
                seen_move = True
        if buffer:
            yield "".join(buffer)


# ...existing code...

In [43]:
chunk_iter = stream_chunks(pgn_file, chunk_size_bytes=8 * 1024 * 1024)
chunk_iter2 = stream_chunks2(pgn_file, chunk_size_bytes=8 * 1024 * 1024)

In [50]:
next(chunk_iter2).splitlines(keepends=True)[-2:]


['[Termination "Time forfeit"]\n', '\n']

In [51]:
events = set()
with open(pgn_file, encoding="utf-8", errors="replace") as f:
    i=0
    while True:
        game = chess.pgn.read_game(f)
        if game is None:
            break
        events.add(game.headers.get("Event"))
        i += 1
        if i > 100000:
            break
events

{'Blitz swiss https://lichess.org/swiss/2YtVa2l2',
 'Blitz swiss https://lichess.org/swiss/5P80WKBV',
 'Blitz swiss https://lichess.org/swiss/ACHYV6sJ',
 'Blitz swiss https://lichess.org/swiss/AZIRf2ev',
 'Blitz swiss https://lichess.org/swiss/AbevL0Q9',
 'Blitz swiss https://lichess.org/swiss/BBzlRwQD',
 'Blitz swiss https://lichess.org/swiss/Gnpu9Apt',
 'Blitz swiss https://lichess.org/swiss/MFUUIgH4',
 'Blitz swiss https://lichess.org/swiss/MhzSBSXH',
 'Blitz swiss https://lichess.org/swiss/Qv6afZ0z',
 'Blitz swiss https://lichess.org/swiss/VxZrwpC5',
 'Blitz swiss https://lichess.org/swiss/ZAAcAy8B',
 'Blitz swiss https://lichess.org/swiss/mIbOfMJP',
 'Blitz swiss https://lichess.org/swiss/r0rkgfqy',
 'Blitz swiss https://lichess.org/swiss/ym1dmEas',
 'Bullet swiss https://lichess.org/swiss/KtR4gySL',
 'Bullet swiss https://lichess.org/swiss/LsNw6aHy',
 'Bullet swiss https://lichess.org/swiss/fAVl6Fb3',
 'Bullet swiss https://lichess.org/swiss/qrlLuyEU',
 'Classical swiss https://l

In [52]:
len(events)

220

In [5]:
def discover_offsets(path, encoding="utf-8", show_progress=True):
    import os
    from tqdm.auto import tqdm
    offsets = []
    total = os.path.getsize(path)
    with open(path, "r", encoding=encoding, errors="replace") as f:
        pbar = tqdm(total=total, unit="B", unit_scale=True, disable=not show_progress)
        while True:
            start = f.tell()
            headers = chess.pgn.read_headers(f)  # consumes a whole game
            end = f.tell()
            if headers is None:
                break
            offsets.append(start)
            pbar.update(end - start)
        pbar.update(total - pbar.n)  # finish if needed
        pbar.close()
    return offsets

In [6]:
offsets = discover_offsets(pgn_file)
len(offsets)

  0%|          | 0.00/216G [00:00<?, ?B/s]

92695519

In [None]:
import numpy as np

def save_offsets_npy(path, offsets):
    np.save(path, np.asarray(offsets, dtype=np.uint64))

def load_offsets_npy(path):
    return np.load(path).tolist()

In [None]:
save_offsets_npy("/home/vandy/Work/MATH6310/blunder-analysis/data/raw/dataoffsets.npy", offsets)

In [1]:
import numpy as np, os

idx_path = "/home/vandy/Work/MATH6310/blunder-analysis/data/raw/lichess_db_standard_rated_2025-08.idx"
mm = np.memmap(idx_path, dtype=np.int64, mode="r")

len(mm)

92695519

In [2]:
diff = np.diff(mm, prepend=mm[0]-1)
invalid = np.nonzero(diff <= 0)[0]
valid_count = int(invalid[0]) if invalid.size else mm.size
valid_count

92695519

In [7]:
del mm  # close the mapping
os.truncate(idx_path, valid_count * np.dtype(np.int64).itemsize)