In [10]:
import os

# Point JAVA_HOME to your Java 17.
# NOTE: this is a machine-specific Homebrew path; adjust it when running elsewhere.
os.environ["JAVA_HOME"] = "/opt/homebrew/Cellar/openjdk@17/17.0.16/libexec/openjdk.jdk/Contents/Home"

# Update PATH so the 'java' command uses Java 17: the JDK's bin directory goes first.
# Any copy of that entry already on PATH is dropped before prepending, so re-running
# this cell does not keep growing PATH with duplicates. A missing PATH is treated as empty.
_java_bin = os.path.join(os.environ["JAVA_HOME"], "bin")
_current_path = os.environ.get("PATH", "")
_other_entries = (
    [entry for entry in _current_path.split(os.pathsep) if entry != _java_bin]
    if _current_path
    else []
)
os.environ["PATH"] = os.pathsep.join([_java_bin] + _other_entries)

# Check: run `java -version` through IPython's `!` shell escape to confirm
# which Java the `java` command now resolves to.
!java -version


openjdk version "17.0.16" 2025-07-15
OpenJDK Runtime Environment Homebrew (build 17.0.16+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.16+0, mixed mode, sharing)


In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, BooleanType

# Spark session for this notebook: application name "LichessPGN", with 16g
# requested for both the driver and the executor memory settings.
spark = (
    SparkSession.builder
    .appName("LichessPGN")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)



In [31]:
import requests
import zstandard as zstd
import chess.pgn
import io
from tqdm.notebook import tqdm
import pandas as pd

# Lichess database dump of rated standard games for 2025-10, a zstd-compressed PGN file.
url = "https://database.lichess.org/standard/lichess_db_standard_rated_2025-10.pgn.zst"

# Accumulator: the streaming loop below appends one dict per parsed game.
rows = []

def extract_moves(game):
    """Return the game's main-line moves as a list of UCI strings.

    Starting from ``game``, the first entry of ``variations`` is followed at
    every step until a node with no variations is reached; side variations
    are ignored. A game without any moves yields an empty list.
    """
    uci_moves = []
    current = game
    while True:
        children = current.variations
        if not children:
            # End of the main line.
            return uci_moves
        current = children[0]
        uci_moves.append(current.move.uci())

def game_has_evals(game):
    """Return True if any main-line move carries a comment containing "%eval".

    Only nodes reached by repeatedly following the first variation are
    inspected; the starting node's own comment and side variations are not.
    """

    def _main_line(start):
        # Yield each successive first-variation node after ``start``.
        position = start
        while position.variations:
            position = position.variations[0]
            yield position

    return any(
        bool(step.comment) and "%eval" in step.comment
        for step in _main_line(game)
    )

# Maximum number of games to parse (limit for demo).
MAX_GAMES = 5_000_000

# (connect, read) timeouts in seconds, so a stalled connection raises instead of
# hanging the cell indefinitely.
REQUEST_TIMEOUT = (10, 60)


def _parse_elo(raw_value):
    """Return an Elo header as an int, or 0 when it is missing or not numeric.

    0 matches the default the row already uses for an absent header; a
    non-numeric placeholder (e.g. "?") is mapped to the same value instead of
    aborting the whole parse with a ValueError.
    """
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        return 0


# Stream the compressed dump over HTTP and decompress/parse it on the fly,
# without writing the file to disk.
with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as r:
    r.raise_for_status()
    dctx = zstd.ZstdDecompressor()
    with dctx.stream_reader(r.raw) as reader:
        text_stream = io.TextIOWrapper(reader, encoding="utf-8")

        # Two-argument iter(): keep calling read_game() until it returns the
        # sentinel None, so no explicit "game is None" check is needed in the loop.
        games_iter = iter(lambda: chess.pgn.read_game(text_stream), None)

        # Wrap the iterator in tqdm for a live progress bar.
        progress = tqdm(games_iter, total=MAX_GAMES, desc="Parsing games", unit="games")

        # zip() consults range() first, so once MAX_GAMES games have been taken
        # the loop ends without parsing one more game from the stream.
        for _, game in zip(range(MAX_GAMES), progress):
            headers = game.headers
            moves = extract_moves(game)

            row = {
                "white_rating": _parse_elo(headers.get("WhiteElo", 0)),
                "black_rating": _parse_elo(headers.get("BlackElo", 0)),
                "time_control": headers.get("TimeControl"),
                "opening": headers.get("Opening"),
                "result": headers.get("Result"),
                "moves": moves,
                "move_count": len(moves),
                "has_evals": game_has_evals(game)
            }

            rows.append(row)

        # The bar's iterator may be left unfinished when the limit is hit; close it explicitly.
        progress.close()

# Convert to pandas DataFrame (or Spark later)
df = pd.DataFrame(rows)
df.head()


Parsing games: 0games [00:00, ?games/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x111e47490>>
Traceback (most recent call last):
  File "/opt/miniconda3/envs/chessenv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


KeyboardInterrupt: 

In [17]:
# Preview the first 5 rows of the two rating columns.
# NOTE(review): `sdf` is not defined in any visible cell, and this cell's execution
# count (17) is lower than the cells above (30, 31) — presumably a Spark DataFrame
# created in a cell that was removed or run out of order. Confirm where `sdf` comes
# from so the notebook survives Restart Kernel -> Run All.
sdf.select("white_rating", "black_rating").show(5)

+------------+------------+
|white_rating|black_rating|
+------------+------------+
|        1217|        1232|
|        2322|        2202|
|        1533|        1525|
|        1313|        1291|
|        1358|        1436|
+------------+------------+
only showing top 5 rows
