In [6]:
import huggingface_hub
import datasets


In [None]:
# Load using streaming mode
dataset = datasets.load_dataset(
    "Lichess/standard-chess-games",
    split="train",
    streaming=True,
)

# Keep the rows at 0-based stream positions 10001..200000 (190,000 games);
# everything before that window is read and discarded.
sample = []
for i, row in enumerate(dataset):
    if i > 200000:
        # Past the end of the window: stop consuming the stream.
        break
    if i > 10000:
        sample.append(row)

In [3]:
import datasets
import itertools

# Load using streaming mode
dataset_iterable = datasets.load_dataset("Lichess/standard-chess-games", split="train", streaming=True)

# Window of the stream to keep, expressed as 0-based positions.
# islice(iterable, start, stop) yields positions start .. stop - 1, so this
# skips positions 0..10000 and keeps 10001..200000 (190,000 rows).
start_index_for_slice = 10001  # first position kept
stop_index_for_slice = 200001  # exclusive bound: last position kept is 200000

print(f"Creating a slice from item {start_index_for_slice} to item {stop_index_for_slice -1}...")

# Lazy view over the stream; nothing is read until the loop below iterates it.
sliced_data = itertools.islice(dataset_iterable, start_index_for_slice, stop_index_for_slice)

# Materialise the window into a list, reporting progress as it fills.
sample = []
for i, row in enumerate(sliced_data):  # i counts rows within the slice, from 0
    sample.append(row)
    if (i + 1) % 10000 == 0:
        print(f"Collected {i+1} items from the slice...")

print(f"Finished. Total items in sample: {len(sample)}")
if sample:
    # Report the position from the configured start instead of a hard-coded
    # ordinal (the old text said "10001st", but position 10001 is the 10002nd row).
    print(f"First item of the sample (dataset index {start_index_for_slice}):", sample[0])

Creating a slice from item 10001 to item 200000...
Collected 10000 items from the slice...
Collected 20000 items from the slice...
Collected 30000 items from the slice...
Collected 40000 items from the slice...
Collected 50000 items from the slice...
Collected 60000 items from the slice...
Collected 70000 items from the slice...
Collected 80000 items from the slice...
Collected 90000 items from the slice...
Collected 100000 items from the slice...
Collected 110000 items from the slice...
Collected 120000 items from the slice...
Collected 130000 items from the slice...
Collected 140000 items from the slice...
Collected 150000 items from the slice...
Collected 160000 items from the slice...
Collected 170000 items from the slice...
Collected 180000 items from the slice...
Collected 190000 items from the slice...
Finished. Total items in sample: 190000
First item of the sample (which was the 10001st item of the dataset): {'Event': 'Rated Bullet game', 'Site': 'https://lichess.org/tcobg87g'

In [4]:
import re
import chess
import torch
from datetime import datetime

# ------------------- CONFIG: integer IDs for piece types ------------------- #
# Labels are numbered 1..14 in the order listed; 0 is left free so that
# mapping_to_tensor can use it for "no piece / unknown label".
piece_label_to_id = {
    label: piece_id
    for piece_id, label in enumerate(
        (
            "WR", "WN", "WB1", "WQ", "WK", "WB2", "WP",
            "BP", "BR", "BN", "BB1", "BQ", "BK", "BB2",
        ),
        start=1,
    )
}

# ------------------- INITIAL BOARD ------------------- #
# Square index -> piece label for the starting position. Squares 0-15 hold the
# white back rank followed by the white pawns; squares 48-63 hold the black
# pawns followed by the black back rank. Squares 16-47 are absent (empty).
initial_map = {
    square: colour + kind
    for back_rank in [("R", "N", "B1", "Q", "K", "B2", "N", "R")]
    for colour, first_square, kinds in (
        ("W", 0, back_rank + ("P",) * 8),
        ("B", 48, ("P",) * 8 + back_rank),
    )
    for square, kind in enumerate(kinds, start=first_square)
}

# ------------------- Move Parser ------------------- #
# Move-number tokens such as "12." (white) or "12..." (black continuation).
_move_num = re.compile(r"^\d+\.(\.\.)?$")
# PGN brace comments, e.g. "{ [%eval 0.17] [%clk 0:00:30] }"; they may span lines.
_brace_comment = re.compile(r"\{[^}]*\}")
# Game-termination markers that can close a movetext section.
_result_tokens = frozenset({"1-0", "0-1", "1/2-1/2", "*"})


def san_stream(movetext: str):
    """Yield the bare SAN move tokens contained in a PGN movetext string.

    Everything that is not a move is dropped: move numbers, result markers,
    brace comments (evaluation / clock annotations), numeric annotation
    glyphs such as ``$1``, and trailing ``!`` / ``?`` suffixes on a move.
    Without this, a comment fragment or an annotated move like ``c5?!`` would
    be handed to the SAN parser as if it were a move.

    Args:
        movetext: Movetext of one game; may contain newlines.

    Yields:
        Move tokens in playing order, e.g. ``"e4"``, ``"Nf3"``, ``"O-O"``.
    """
    # Remove comments first: their contents split into several whitespace
    # separated tokens that cannot be recognised one by one afterwards.
    cleaned = _brace_comment.sub(" ", movetext)

    # str.split() with no argument also splits on newlines.
    for tok in cleaned.split():
        if _move_num.match(tok) or tok in _result_tokens:
            continue
        if tok.startswith("$"):
            # Numeric annotation glyph, not a move.
            continue
        tok = tok.rstrip("!?")
        if tok:
            yield tok

# ------------------- Convert mapping to tensor ------------------- #
def mapping_to_tensor(mapping: dict) -> torch.LongTensor:
    """Encode a square -> piece-label mapping as a 64-element long tensor.

    Element ``i`` is the id that ``piece_label_to_id`` assigns to the label
    stored under key ``i``. Squares missing from ``mapping`` and labels that
    have no id are both encoded as 0.
    """
    ids = []
    for square in range(64):
        label = mapping.get(square)
        ids.append(piece_label_to_id.get(label, 0))
    return torch.tensor(ids, dtype=torch.long)

# ------------------- Determine winner string ------------------- #
def result_to_winner(result: str) -> str:
    """Translate a PGN result marker into "white", "black" or "draw".

    "1-0" maps to "white" and "0-1" to "black"; every other value, including
    "1/2-1/2" and "*", is reported as "draw".
    """
    decisive = (("1-0", "white"), ("0-1", "black"))
    for marker, side in decisive:
        if result == marker:
            return side
    return "draw"
    
    
def create_dataset_from_games(game_dicts: list[dict]) -> list[dict]:
    """Turn game records into one training example per move played.

    Each game is replayed with python-chess while a parallel square ->
    piece-label mapping (seeded from ``initial_map``) is kept in sync with the
    board. The mapping is snapshotted as a tensor before every move.

    Args:
        game_dicts: Game records. Each one must provide a ``"movetext"`` string
            (read through ``san_stream``) and a ``"Result"`` string.

    Returns:
        A flat list of dicts, one per move, with the keys:
            ``"input"``  - 64-element long tensor of the position before the move,
            ``"output"`` - long tensor ``[from_sq, to_sq]`` of the move played,
            ``"turn"``   - ``"white"`` or ``"black"``, the side making the move,
            ``"winner"`` - ``"white"``, ``"black"`` or ``"draw"`` for the game.

    Games that raise while being replayed (for example unparsable or illegal
    movetext) are dropped in full. A single warning reports how many were
    skipped.
    """
    import warnings  # local import: only used for the skipped-games summary

    full_dataset = []
    total_games = 0
    skipped_games = 0

    for game in game_dicts:
        total_games += 1
        try:
            board = chess.Board()
            mapping = initial_map.copy()
            states = [mapping_to_tensor(mapping)]
            turns = ["white"]  # starting with white
            move_vectors = []

            for san in san_stream(game["movetext"]):
                move = board.parse_san(san)
                from_sq, to_sq = move.from_square, move.to_square

                # Lift the moving piece and clear whatever stood on the target.
                moving_label = mapping.pop(from_sq, None)
                mapping.pop(to_sq, None)

                if board.is_en_passant(move):
                    # The captured pawn is one rank behind the target square
                    # from the mover's point of view (board.turn is True for white).
                    ep_target = to_sq + (-8 if board.turn else 8)
                    mapping.pop(ep_target, None)

                mapping[to_sq] = moving_label

                if board.is_castling(move):
                    # The move object only describes the king's step, so the
                    # rook has to be relocated explicitly. Otherwise it stays
                    # as a phantom on its corner square and vanishes from the
                    # mapping the next time it moves.
                    rank = chess.square_rank(from_sq)
                    if board.is_kingside_castling(move):
                        rook_from, rook_to = chess.square(7, rank), chess.square(5, rank)
                    else:
                        rook_from, rook_to = chess.square(0, rank), chess.square(3, rank)
                    rook_label = mapping.pop(rook_from, None)
                    if rook_label is not None:
                        mapping[rook_to] = rook_label

                # NOTE(review): a promoting pawn keeps its pawn label here. The
                # label set has no unambiguous choice for a promoted bishop
                # (B1 vs B2), so this is left unchanged - confirm desired encoding.

                # All board queries above need the pre-move position, so the
                # move is only pushed now.
                board.push(move)

                states.append(mapping_to_tensor(mapping))
                turns.append("white" if board.turn else "black")
                move_vectors.append([from_sq, to_sq])

        except Exception:
            # Best effort: one bad record must not abort the whole conversion.
            skipped_games += 1
            continue

        winner = result_to_winner(game["Result"])

        # states/turns carry one extra trailing entry (the final position);
        # zip stops at the number of moves, pairing each move with the
        # position and side to move that preceded it.
        for state, turn, move_vector in zip(states, turns, move_vectors):
            full_dataset.append({
                "input": state,
                "output": torch.tensor(move_vector, dtype=torch.long),  # output is [from_sq, to_sq]
                "turn": turn,
                "winner": winner
            })

    if skipped_games:
        warnings.warn(
            f"Skipped {skipped_games} of {total_games} games that could not be replayed",
            stacklevel=2,
        )

    return full_dataset


In [5]:
# Replay every sampled game and flatten the result into per-move examples.
ds = create_dataset_from_games(sample)

In [20]:
# Total number of per-move examples built from the sample.
len(ds)

12761032

In [21]:
import os
import torch
from datasets import Dataset, load_from_disk

# ------------------- CONFIG ------------------- #
HUB_PATH = "youngchiller40/chessset2"  # target dataset repo on the Hub
SAVE_DIR = "hf_chess_tmp"              # local staging directory for shards
SHARD_SIZE = 500_000                   # examples per shard
os.makedirs(SAVE_DIR, exist_ok=True)

# ------------------- Shard & Push ------------------- #
# Ceiling division: the last shard may be smaller than SHARD_SIZE.
num_shards = -(-len(ds) // SHARD_SIZE)

for shard_idx, start in enumerate(range(0, len(ds), SHARD_SIZE)):
    print(f"Processing shard {shard_idx + 1}/{num_shards}...")

    end = min(start + SHARD_SIZE, len(ds))
    shard_data = ds[start:end]

    # Tensors are turned into plain Python lists before building the Dataset.
    hf_ready = [
        dict(
            input=record["input"].tolist(),
            output=record["output"].tolist(),
            turn=record["turn"],
            winner=record["winner"],
        )
        for record in shard_data
    ]
    new_ds = Dataset.from_list(hf_ready)

    # Stage the shard on disk under a zero-padded name.
    shard_path = os.path.join(SAVE_DIR, f"shard_{shard_idx:04d}")
    new_ds.save_to_disk(shard_path)

    # Read the staged shard back and upload it. Every shard goes to its own
    # split name; the original author noted that this avoided an upload error.
    ds_loaded = load_from_disk(shard_path)
    shard_split_name = f"train_shard{shard_idx:04d}"
    ds_loaded.push_to_hub(HUB_PATH, split=shard_split_name)


Processing shard 1/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 134899.37 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 87.82ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.54s/it]


Processing shard 2/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:04<00:00, 122882.49 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 96.46ba/s] 
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.07s/it]


Processing shard 3/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 128203.40 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 91.03ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.26s/it]


Processing shard 4/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:04<00:00, 124649.30 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 89.68ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.57s/it]


Processing shard 5/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 130130.45 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 93.07ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:08<00:00,  8.39s/it]


Processing shard 6/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 132578.17 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:06<00:00, 79.51ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:08<00:00,  8.02s/it]


Processing shard 7/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 144375.11 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 90.81ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.33s/it]


Processing shard 8/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 135977.41 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 93.39ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.23s/it]


Processing shard 9/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 127751.99 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 85.68ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it]


Processing shard 10/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 136505.42 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:04<00:00, 100.90ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.81s/it]


Processing shard 11/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 128717.78 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 92.35ba/s] 
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]


Processing shard 12/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 140525.22 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 88.67ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.42s/it]


Processing shard 13/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 142392.48 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 92.16ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.20s/it]


Processing shard 14/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 130779.47 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 89.56ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.39s/it]


Processing shard 15/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 139212.59 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 92.84ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.18s/it]


Processing shard 16/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 140951.54 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 92.00ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.82s/it]


Processing shard 17/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 142019.22 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:07<00:00, 65.99ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:08<00:00,  8.43s/it]


Processing shard 18/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 137322.37 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 91.22ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.24s/it]


Processing shard 19/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 145571.34 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 93.37ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.14s/it]


Processing shard 20/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 130646.80 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:10<00:00, 48.26ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:12<00:00, 12.18s/it]


Processing shard 21/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 147834.50 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 92.47ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.15s/it]


Processing shard 22/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 140196.23 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 92.92ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.14s/it]


Processing shard 23/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 148235.16 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 93.23ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.30s/it]


Processing shard 24/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 146291.88 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 91.94ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]


Processing shard 25/26...


Saving the dataset (1/1 shards): 100%|██████████| 500000/500000 [00:03<00:00, 147139.62 examples/s]
Creating parquet from Arrow format: 100%|██████████| 500/500 [00:05<00:00, 93.20ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.19s/it]


Processing shard 26/26...


Saving the dataset (1/1 shards): 100%|██████████| 261032/261032 [00:01<00:00, 143204.73 examples/s]
Creating parquet from Arrow format: 100%|██████████| 262/262 [00:02<00:00, 90.25ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.45s/it]
