In [1]:
import os
import requests
import zstandard as zstd
import chess.pgn
import io
from tqdm.notebook import tqdm
import pandas as pd

In [3]:
# Point JAVA_HOME to your Java 17
os.environ["JAVA_HOME"] = "/opt/homebrew/Cellar/openjdk@17/17.0.16/libexec/openjdk.jdk/Contents/Home"

# Update PATH so 'java' command uses Java 17
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Check
!java -version

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, BooleanType

# Create the Spark session used by the rest of the notebook, or reuse one that
# already exists. Driver and executor memory are both requested at 16 GB.
spark = (
    SparkSession.builder
    .appName("LichessPGN")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)


openjdk version "17.0.16" 2025-07-15
OpenJDK Runtime Environment Homebrew (build 17.0.16+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.16+0, mixed mode, sharing)


In [4]:
import requests
import zstandard as zstd
import chess.pgn
import io
from tqdm import tqdm

# --- Source and destination -------------------------------------------------
# Zstandard-compressed PGN dump that is streamed over HTTP.
url = "https://database.lichess.org/standard/lichess_db_standard_rated_2025-10.pgn.zst"
# Directory that receives the Parquet part files.
parquet_dir = "data/01_Raw/lichess_1M_games"

# --- Sizing -----------------------------------------------------------------
MAX_GAMES = 1_000_000   # stop after this many games have been recorded
BATCH_SIZE = 10_000     # number of games buffered in memory between writes

# --- Mutable ingestion state ------------------------------------------------
# batch: rows waiting to be written; count: games recorded so far.
# file_index is initialised here but not referenced in the visible cells.
batch, count, file_index = [], 0, 0

def extract_moves(game):
    """Return the main line of ``game`` as a list of UCI move strings.

    Starting at ``game``, repeatedly step into the first entry of the current
    node's ``variations`` and record ``node.move.uci()``. Side variations
    (any entry other than the first) are ignored. A node with no variations
    ends the walk, so a game without moves yields an empty list.
    """
    uci_moves = []
    current = game
    while True:
        children = current.variations
        if not children:
            return uci_moves
        current = children[0]
        uci_moves.append(current.move.uci())

def game_has_evals(game):
    """Report whether any main-line node of ``game`` has an ``%eval`` comment.

    Walks the first variation from ``game`` downwards and returns True as soon
    as a node's ``comment`` is non-empty and contains the substring ``%eval``.
    The starting node's own comment is not inspected. Returns False when the
    walk ends without a match.
    """
    current = game
    while current.variations:
        current = current.variations[0]
        note = current.comment
        if not note:
            continue
        if "%eval" in note:
            return True
    return False

def _parse_elo(raw):
    """Convert a PGN Elo tag value to an int, falling back to 0.

    A missing tag already defaulted to 0; non-numeric values such as "?" or
    an empty string now map to 0 as well instead of raising ValueError and
    aborting the whole streaming run.
    """
    try:
        return int(raw)
    except (TypeError, ValueError):
        return 0


def _flush_batch(rows):
    """Append ``rows`` (a list of dicts) to the Parquet dataset at ``parquet_dir``."""
    spark.createDataFrame(rows).write.mode("append").parquet(parquet_dir)


# Stream the compressed PGN dump over HTTP, decompress it on the fly, parse it
# game by game, and write the extracted rows to Parquet in fixed-size batches
# so that only BATCH_SIZE games are held in memory at any time.
with requests.get(url, stream=True) as r:
    r.raise_for_status()
    dctx = zstd.ZstdDecompressor()

    with dctx.stream_reader(r.raw) as reader:
        text_stream = io.TextIOWrapper(reader, encoding="utf-8")
        # iter(callable, sentinel) stops by itself once read_game returns None
        # (end of stream), so no explicit None check is needed in the loop.
        games_iter = iter(lambda: chess.pgn.read_game(text_stream), None)

        for game in tqdm(games_iter, desc="Parsing games", total=MAX_GAMES):
            if count >= MAX_GAMES:
                break

            headers = game.headers
            # Walk the main line once and reuse it for both columns.
            moves = extract_moves(game)
            batch.append({
                "white_rating": _parse_elo(headers.get("WhiteElo", 0)),
                "black_rating": _parse_elo(headers.get("BlackElo", 0)),
                "time_control": headers.get("TimeControl"),
                "opening": headers.get("Opening"),
                "result": headers.get("Result"),
                "moves": moves,
                "move_count": len(moves),
                "has_evals": game_has_evals(game)
            })

            count += 1

            # ---------- write each full batch to parquet ----------
            if len(batch) >= BATCH_SIZE:
                _flush_batch(batch)
                batch = []   # start a fresh buffer to release memory

# Write whatever is left over after the last full batch.
if batch:
    _flush_batch(batch)

print("Done. Saved:", count, "games.")


Parsing games: 1000000it [13:35, 1225.80it/s]                                   

Done. Saved: 1000000 games.





In [5]:
# Read the Parquet dataset back, print its row count, and preview five rows.
sdf = spark.read.parquet("data/01_Raw/lichess_1M_games")
game_total = sdf.count()
print(game_total)
sdf.show(n=5)

1000000
+------------+---------+----------+--------------------+--------------------+------+------------+------------+
|black_rating|has_evals|move_count|               moves|             opening|result|time_control|white_rating|
+------------+---------+----------+--------------------+--------------------+------+------------+------------+
|        1884|    false|        54|[e2e4, e7e5, g1f3...|     Elephant Gambit|   0-1|       180+0|        1646|
|        2022|    false|        93|[d2d4, d7d5, g1f3...|Queen's Pawn Game...|   1-0|        15+0|        2163|
|        1903|    false|        49|[e2e4, e7e5, f1c4...|Italian Game: Par...|   1-0|       180+0|        1901|
|        1690|    false|        80|[d2d4, c7c5, c2c4...|Benoni Defense: O...|   0-1|       180+0|        1622|
|        1222|    false|        49|[b2b3, g7g6, c1b2...| Nimzo-Larsen Attack|   1-0|       180+0|        1253|
+------------+---------+----------+--------------------+--------------------+------+------------+-------