In [1]:
from pathlib import Path
from typing import List, Dict, Iterator, Optional, Any, Generator, Tuple
from enum import Enum
import sys, asyncio
import chess
import chess.pgn
import chess.engine

In [2]:
# Select the project-local Stockfish binary for the current operating system.
# The name is bound on every platform: unsupported platforms (anything that is
# neither Windows nor macOS) get None instead of leaving the variable undefined.
if sys.platform.startswith("win"):
    stockfish_executable_path = Path("./stockfish/stockfish-windows-x86-64-avx2.exe")
elif sys.platform.startswith("darwin"):
    stockfish_executable_path = Path("./stockfish/stockfish-macos-m1-apple-silicon")
else:
    stockfish_executable_path = None

if stockfish_executable_path is None:
    print(f"No Stockfish executable configured for platform: {sys.platform}")
else:
    print(f"Using Stockfish executable: {stockfish_executable_path}")

Using Stockfish executable: stockfish\stockfish-windows-x86-64-avx2.exe


In [3]:
def iter_games(pgn_path: Path) -> Iterator[chess.pgn.Game]:
    """Lazily stream the games stored in a PGN file.

    Args:
        pgn_path: Path of the file to read; its suffix must be ``.pgn``
            (compared case-insensitively).

    Yields:
        Each game returned by ``chess.pgn.read_game``, in file order, until
        the reader returns ``None``.

    Raises:
        ValueError: If the suffix is not ``.pgn``. Because this is a
            generator, the check runs when iteration starts.
    """
    extension = pgn_path.suffix
    if extension.lower() != ".pgn":
        raise ValueError(f"Expected a .pgn file, got: {extension}")

    # Undecodable bytes are replaced rather than aborting the whole read.
    with open(pgn_path, "r", encoding="utf-8", errors="replace") as handle:
        while (parsed := chess.pgn.read_game(handle)) is not None:
            yield parsed

In [4]:
"""
Why this cell exists:
- python-chess launches Stockfish via asyncio.subprocess_exec.
- On Windows, the Selector event loop cannot create subprocesses, it raises NotImplementedError.
- Some Jupyter kernels on Windows start with the Selector policy by default.
- Switching to WindowsProactorEventLoopPolicy enables subprocess support in this notebook.

How to use:
- Run this cell once before creating the engine.
- On macOS or Linux this does nothing and is safe.
"""
# Only Windows needs the switch; on every other platform the body is skipped.
if sys.platform.startswith("win"):
    # Print the policy class name before and after so the cell output records the change.
    print(f"Initial Policy: {type(asyncio.get_event_loop_policy()).__name__}")
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
    print(f"New Policy: {type(asyncio.get_event_loop_policy()).__name__}")


Initial Policy: WindowsSelectorEventLoopPolicy
New Policy: WindowsProactorEventLoopPolicy


In [5]:
class PositionLabel(Enum):
    """Seven evaluation buckets, numbered 0 (White winning) through 6 (Black winning)."""

    # Members take consecutive values 0..6 in the order listed.
    (
        WHITE_WINNING,
        WHITE_DECISIVE,
        WHITE_BETTER,
        EQUAL,
        BLACK_BETTER,
        BLACK_DECISIVE,
        BLACK_WINNING,
    ) = range(7)

class GameStage(Enum):
    """Coarse phase of a game: opening (0), middlegame (1) or endgame (2)."""

    # Members take consecutive values 0..2 in the order listed.
    OPENING, MIDDLEGAME, ENDGAME = range(3)

In [6]:
def get_game_result(game: chess.pgn.Game) -> float | None:
    """Translate the PGN ``Result`` header into a numeric outcome.

    Args:
        game: Game whose ``headers`` mapping is consulted.

    Returns:
        1.0 for "1-0", 0.0 for "0-1", 0.5 for "1/2-1/2", and ``None`` for any
        other value, including a missing header (treated as "*").
    """
    outcome_by_tag = {"1-0": 1.0, "0-1": 0.0, "1/2-1/2": 0.5}
    return outcome_by_tag.get(game.headers.get("Result", "*"))

In [None]:
def get_tapered_phase_score(board: chess.Board) -> float:
    """Measure how much non-pawn material is left, as a fraction of the start position.

    Knights and bishops weigh 1, rooks 2 and queens 4; the weighted count over
    both colours is capped at 24 and divided by 24.

    Args:
        board: Position to measure.

    Returns:
        A value in [0.0, 1.0]: 1.0 for a full set of pieces (opening/middlegame
        side of the scale), 0.0 when no knights, bishops, rooks or queens remain.

    Credits: Stockfish
    """
    full_material = 24
    weighted_pieces = (
        (chess.KNIGHT, 1),
        (chess.BISHOP, 1),
        (chess.ROOK, 2),
        (chess.QUEEN, 4),
    )

    material = sum(
        weight * (len(board.pieces(kind, chess.WHITE)) + len(board.pieces(kind, chess.BLACK)))
        for kind, weight in weighted_pieces
    )

    # Cap at the starting total so extra promoted pieces cannot push the ratio above 1.0.
    return min(material, full_material) / full_material
    

In [8]:
import random

def process_game(
    game: chess.pgn.Game,
    rng: Optional[random.Random] = None,
) -> Generator[Dict[str, Any], None, None]:
    """Yield one augmented position per mainline move of a game.

    After each mainline move is played, ONE randomly chosen legal reply is
    pushed on a copy of the board and the resulting (fictitious) position is
    described. The game's own positions are therefore not yielded themselves.
    Games whose result is unknown are skipped entirely, and mainline positions
    without legal moves produce nothing.

    Args:
        game: Parsed PGN game.
        rng: Optional ``random.Random`` used to pick the random reply. Pass a
            seeded instance for reproducible output; when omitted the global
            ``random`` module is used, as before.

    Yields:
        Dicts with the keys ``fen``, ``game_result``, ``game_phase``,
        ``game_stage`` (0 opening, 1 middlegame, 2 endgame) and ``is_check``.
    """
    result = get_game_result(game)
    if result is None:
        return

    chooser = random if rng is None else rng

    # Phase thresholds separating opening / middlegame / endgame.
    opening_above = 0.66
    middlegame_above = 0.15

    board = game.board()

    for move in game.mainline_moves():
        try:
            board.push(move)

            # Work on a copy so the random reply never leaks into the real game line.
            board_copy = board.copy()
            legal_moves = list(board_copy.legal_moves)
            if not legal_moves:
                continue

            # Apply ONE random legal move (includes captures, checks, etc.)
            board_copy.push(chooser.choice(legal_moves))

            # Describe the new, fictitious position.
            aug_phase = get_tapered_phase_score(board_copy)
            if aug_phase > opening_above:
                aug_stage = 0  # Opening
            elif aug_phase > middlegame_above:
                aug_stage = 1  # Middlegame
            else:
                aug_stage = 2  # Endgame

            yield {
                "fen": board_copy.fen(),
                "game_result": result,  # Kept for schema compatibility, but unreliable now
                "game_phase": aug_phase,
                "game_stage": aug_stage,
                "is_check": board_copy.is_check(),
            }

        except ValueError:
            continue  # Skip illegal moves if any

In [9]:
def worker_evaluator(fen_batch: List[str]) -> List[Tuple[Optional[int], Optional[int]]]:
    """Evaluate a batch of FENs with a private Stockfish process.

    Each position is analysed at depth 10 and at depth 20; scores are taken
    from White's point of view, with mates mapped to +/-10000-based values via
    ``mate_score=10000``.

    Args:
        fen_batch: FEN strings to evaluate.

    Returns:
        One ``(depth_10_score, depth_20_score)`` tuple per input FEN, in input
        order. If the worker fails part-way, scores already computed are kept
        and every remaining position is reported as ``(None, None)``, so the
        result always has the same length as ``fen_batch``.
    """
    if not fen_batch:
        return []  # Nothing to analyse; do not spawn an engine process.

    if sys.platform.startswith("win"):
        engine_path = Path("./stockfish/stockfish-windows-x86-64-avx2.exe")
    else:
        engine_path = Path("./stockfish/stockfish-macos-m1-apple-silicon")

    results: List[Tuple[Optional[int], Optional[int]]] = []
    engine = None

    try:
        engine = chess.engine.SimpleEngine.popen_uci(str(engine_path))

        for fen in fen_batch:
            board = chess.Board(fen)

            # Depth 10 (Impulsive)
            info_depth_10 = engine.analyse(board, chess.engine.Limit(depth=10))
            score_depth_10 = info_depth_10["score"].white().score(mate_score=10000)

            # Depth 20 (Truth)
            info_depth_20 = engine.analyse(board, chess.engine.Limit(depth=20))
            score_depth_20 = info_depth_20["score"].white().score(mate_score=10000)

            results.append((score_depth_10, score_depth_20))

    except Exception as e:
        print(f"Worker Error: {e}")
        # Keep what was evaluated successfully; pad the rest so the output
        # stays aligned with the input batch.
        results.extend([(None, None)] * (len(fen_batch) - len(results)))

    finally:
        if engine is not None:
            try:
                engine.quit()
            except Exception as e:
                # A dead engine must not turn a handled failure into a crash.
                print(f"Worker Error: engine did not quit cleanly: {e}")
                engine.close()

    return results

In [10]:
import os

# Ryzen 5900X 12 Core
# os.cpu_count() reports logical CPUs, which is why this 12-core machine shows 24.
os.cpu_count()

24

In [11]:
def get_score_label(cp: float) -> int:
    """
    Classifies centipawn score into an integer label (0-6).

    0: White Winning  (cp >= 500)
    1: White Decisive (300 <= cp < 500)
    2: White Better   (100 <= cp < 300)
    3: Equal          (-100 < cp < 100)
    4: Black Better   (-300 < cp <= -100)
    5: Black Decisive (-500 < cp <= -300)
    6: Black Winning  (cp <= -500)

    The boundaries mirror each other around zero and match
    get_chess_class_vectorized. Values that satisfy no comparison (NaN) fall
    through to 6, as before.
    """
    # Descending cascade: each test only needs its lower bound because every
    # higher bucket has already returned.
    if cp >= 500:
        return 0
    if cp >= 300:
        return 1
    if cp >= 100:
        return 2
    if cp > -100:
        return 3
    if cp > -300:
        return 4
    if cp > -500:
        return 5
    return 6

In [12]:
import concurrent.futures
import tqdm

def run_parallel_evaluation(
    position_dataset: List[Dict],
    chunk_size: int,
    num_workers: int = 10,
) -> List[Dict]:
    """
    Splits data into chunks, runs parallel evaluation.
    Uses a 'temporary' progress bar that vanishes when done.

    Args:
        position_dataset: Position dicts; each must contain a "fen" key.
        chunk_size: Number of FENs handed to one worker call. Must be > 0.
        num_workers: Size of the thread pool (each task starts its own engine).

    Returns:
        The input rows that were evaluated successfully, each extended in place
        with "stockfish_score_depth_10", "stockfish_score_depth_20" and
        "stockfish_label_depth_20". Rows whose worker reported a failure
        (missing or None scores) are dropped instead of crashing the run.

    Raises:
        ValueError: If chunk_size is not positive.
    """
    # Imported here because the cell-level `import tqdm` binds the module,
    # which is not callable.
    from tqdm import tqdm as progress_bar

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got: {chunk_size}")

    fens = [d["fen"] for d in position_dataset]
    chunks = [fens[i:i + chunk_size] for i in range(0, len(fens), chunk_size)]

    flat_results = []

    # executor.map yields results in the same order as the submitted chunks,
    # so flat_results stays aligned with position_dataset.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        all_batch_results = progress_bar(
            executor.map(worker_evaluator, chunks),
            total=len(chunks),
            desc="Stockfish Eval",
            unit="chunk",
            leave=False,
        )

        for batch_result in all_batch_results:
            flat_results.extend(batch_result)

    # Merge Scores into Metadata
    labeled_data = []

    for row, scores in zip(position_dataset, flat_results):
        # Failed evaluations arrive as (None, None); skip them.
        if scores is None or scores[0] is None or scores[1] is None:
            continue

        score_10 = float(scores[0])
        score_20 = float(scores[1])
        row["stockfish_score_depth_10"] = score_10
        row["stockfish_score_depth_20"] = score_20
        row["stockfish_label_depth_20"] = get_score_label(score_20)
        labeled_data.append(row)

    return labeled_data

In [13]:
import pandas as pd
import gc
from tqdm import tqdm
from pathlib import Path

def get_next_batch_index(existing_parts) -> int:
    """Return the index to use for the next part file.

    Args:
        existing_parts: Iterable of path-like objects exposing ``.stem``; the
            text after the last underscore of the stem is read as the index.

    Returns:
        One more than the largest parseable index, or 0 when no stem ends in
        an integer (including when the iterable is empty).
    """
    def _trailing_index(part):
        # Stems whose last "_" segment is not an integer are ignored.
        try:
            return int(part.stem.rsplit("_", 1)[-1])
        except ValueError:
            return None

    found = [idx for idx in map(_trailing_index, existing_parts) if idx is not None]
    if not found:
        return 0
    return max(found) + 1

def make_save_path(output_dir: Path, batch_index: int) -> Path:
    """Build the parquet path for a batch: ``data_part_<index>.parquet``, index zero-padded to 4 digits."""
    file_name = f"data_part_{batch_index:04d}.parquet"
    return output_dir.joinpath(file_name)

def evaluate_and_save_batch(batch, output_dir: Path, batch_index: int, chunk_size: int) -> int:
    """Evaluate one batch of positions and persist it as a parquet part.

    Args:
        batch: Position dicts passed to ``run_parallel_evaluation``.
        output_dir: Directory receiving the part file.
        batch_index: Index used in the part file name.
        chunk_size: Forwarded to ``run_parallel_evaluation``.

    Returns:
        ``batch_index + 1``, i.e. the index for the following batch.
    """
    labeled_rows = run_parallel_evaluation(batch, chunk_size)
    frame = pd.DataFrame(labeled_rows)

    destination = make_save_path(output_dir, batch_index)
    frame.to_parquet(destination, index=False)

    # Drop the large intermediates before the next batch is collected.
    del labeled_rows, frame
    gc.collect()

    return batch_index + 1

def run_batch_pipeline(pgn_folder: Path, target_count: int, batch_size: int, output_dir: Path, chunk_size:int):
    """Collect unique positions from PGN files, evaluate them in batches and save parquet parts.

    The run is resumable: FENs found in existing ``data_part_*.parquet`` files
    are loaded first so they are not collected again, and numbering continues
    after the highest existing part index.

    Args:
        pgn_folder: Folder scanned (non-recursively) for ``*.pgn`` files.
        target_count: Total number of unique positions to collect, counting
            the ones already present in existing parts.
        batch_size: Number of collected positions that triggers an
            evaluate-and-save step.
        output_dir: Folder for the parquet parts; created if missing.
        chunk_size: Forwarded to ``evaluate_and_save_batch``.

    Returns:
        None. Returns early (after printing a message) when no PGN file exists.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # seen_fens de-duplicates across the whole run, including earlier runs.
    seen_fens = set()
    current_batch = []

    existing_parts = list(output_dir.glob("data_part_*.parquet"))

    # Resume
    if existing_parts:
        print(f"Found {len(existing_parts)} existing parts. Loading seen FENs to resume...")
        for file_path in tqdm(existing_parts, desc="Resuming"):
            try:
                # Only the "fen" column is needed to rebuild the seen set.
                df_existing = pd.read_parquet(file_path, columns=["fen"])
                seen_fens.update(df_existing["fen"].tolist())
            except Exception as e:
                # Unreadable parts are reported and skipped; their FENs are not marked as seen.
                print(f"Warning: Could not read {file_path}: {e}")

    batch_index = get_next_batch_index(existing_parts)
    # Progress counts distinct FENs already stored, not rows or files.
    total_collected = len(seen_fens)

    print(f"Resumed with {total_collected} positions. Next Batch Index: {batch_index}")
    print(f"Starting pipeline. Target: {target_count} | Batch Size: {batch_size}")

    # NOTE(review): glob order is not sorted here, so file processing order may vary between runs.
    pgn_files = list(pgn_folder.glob("*.pgn"))
    if not pgn_files:
        print("No PGN files found!")
        return

    pbar = tqdm(total=target_count, initial=total_collected, desc="Total Progress", unit="pos")

    # The target check is repeated at every nesting level so that reaching the
    # target inside the innermost loop unwinds all three loops.
    for pgn_file in pgn_files:
        if total_collected >= target_count:
            break

        for game in iter_games(pgn_file):
            if total_collected >= target_count:
                break

            for position in process_game(game):
                fen = position["fen"]

                if fen in seen_fens:
                    continue

                seen_fens.add(fen)
                current_batch.append(position)
                total_collected += 1
                pbar.update(1)

                # Flush a full batch to disk, then reuse the same list object.
                if len(current_batch) >= batch_size:
                    batch_index = evaluate_and_save_batch(current_batch, output_dir, batch_index, chunk_size)
                    current_batch.clear()

                if total_collected >= target_count:
                    break

    # Final partial batch
    if current_batch:
        print(f"Processing final partial batch of {len(current_batch)}...")
        batch_index = evaluate_and_save_batch(current_batch, output_dir, batch_index, chunk_size)
        print(f"Saved final batch {batch_index - 1}")

    pbar.close()
    print("Run Complete.")


In [14]:
TARGET_POSITIONS = 5000000
BATCH_SIZE = 10000           # Saves 1 file every 10k positions (500 files for the 5M target)
CHUNK_SIZE = 250             # 10,000 / 250 = 40 tasks per batch, shared by the 10 worker threads set in run_parallel_evaluation

GAMES_FOLDER = Path("./cclr/train")
OUTPUT_DIR = Path("./dataset_parts")

# Existing parts in OUTPUT_DIR are picked up automatically, so re-running this cell resumes the run.
run_batch_pipeline(GAMES_FOLDER, TARGET_POSITIONS, BATCH_SIZE, OUTPUT_DIR, CHUNK_SIZE)

Found 100 existing parts. Loading seen FENs to resume...


Resuming: 100%|██████████| 100/100 [00:00<00:00, 128.01it/s]


Resumed with 1000000 positions. Next Batch Index: 100
Starting pipeline. Target: 5000000 | Batch Size: 10000


Total Progress: 100%|██████████| 5000000/5000000 [72:25:23<00:00, 15.34pos/s]     


Run Complete.


In [2]:
import pandas as pd
import numpy as np
import os
import glob
from tqdm import tqdm
from pathlib import Path

# Raw evaluated parts are read from INPUT_DIR; filtered copies go to OUTPUT_DIR.
INPUT_DIR = Path("./dataset_parts")
OUTPUT_DIR = Path("./dataset_clean")

# Create the destination (and any missing parents) up front; a no-op if it exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def get_chess_class_vectorized(scores):
    """Bucket an array-like of centipawn scores into labels 0-6 in one pass.

    Buckets: 0 (>= 500), 1 ([300, 500)), 2 ([100, 300)), 3 ((-100, 100)),
    4 ((-300, -100]), 5 ((-500, -300]), 6 (<= -500). Values matching no
    condition (NaN) receive the default label 3.

    Args:
        scores: Array-like supporting element-wise comparison (e.g. a pandas
            Series or NumPy array).

    Returns:
        NumPy array of integer labels, one per score.
    """
    # np.select takes the first matching condition, so a descending cascade of
    # lower bounds is enough to describe every bucket.
    lower_bound_cascade = [
        scores >= 500,    # 0: White Winning
        scores >= 300,    # 1: White Decisive
        scores >= 100,    # 2: White Better
        scores > -100,    # 3: Equal
        scores > -300,    # 4: Black Better
        scores > -500,    # 5: Black Decisive
        scores <= -500,   # 6: Black Winning
    ]
    labels = list(range(7))

    return np.select(lower_bound_cascade, labels, default=3)

def filter_and_save(file_path):
    """Drop unstable evaluations from one parquet part and write the rest to OUTPUT_DIR.

    A row is removed when either:
    - its depth-10 and depth-20 labels differ by 2 or more classes, or
    - depth 20 reports a mate-sized score (|score| > 9000) while depth 10
      stayed below 300 in absolute value.

    Args:
        file_path: Path of the parquet part to read; the output keeps its file name.

    Returns:
        Tuple ``(rows_read, rows_kept)``.
    """
    frame = pd.read_parquet(file_path)
    rows_in = len(frame)

    score_d10 = frame["stockfish_score_depth_10"]
    score_d20 = frame["stockfish_score_depth_20"]

    # Labels are recomputed from the scores only to compare the two depths.
    label_gap = np.abs(get_chess_class_vectorized(score_d10) - get_chess_class_vectorized(score_d20))
    labels_stable = label_gap < 2

    # "Mate trap": the deep search sees a mate that the shallow search missed.
    mate_trap = ((score_d20.abs() > 9000) & (score_d10.abs() < 300)).to_numpy()

    keep_rows = labels_stable & ~mate_trap

    # The helper label columns are never part of the output.
    kept = frame[keep_rows].drop(columns=['calc_label_d10', 'calc_label_d20'], errors="ignore")

    destination = os.path.join(OUTPUT_DIR, os.path.basename(file_path))
    kept.to_parquet(destination, index=False)

    return rows_in, len(kept)

# -------------------------------------------------------------------
# Filter every part in INPUT_DIR and report how many positions were removed.
parquet_files = glob.glob(os.path.join(INPUT_DIR, "*.parquet"))
total_kept = 0
total_removed = 0

print(f"Starting processing on {len(parquet_files)} files...")

for f in tqdm(parquet_files):
    orig, kept = filter_and_save(f)
    total_kept += kept
    total_removed += (orig - kept)

total_seen = total_kept + total_removed
# With no input files (or only empty ones) there is nothing to divide by.
percentage_lost = (total_removed / total_seen * 100) if total_seen else 0.0

print("-" * 30)
print("DONE!")
print(f"Original Positions: {total_seen}")
print(f"Positions Removed:  {total_removed}")
print(f"Positions Kept:     {total_kept}")
print(f"Percentage Lost:    {percentage_lost:.2f}%")

Starting processing on 500 files...


  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [00:11<00:00, 42.16it/s]

------------------------------
DONE!
Original Positions: 5000000
Positions Removed:  26698
Positions Kept:     4973302
Percentage Lost:    0.53%





In [None]:
"""
With side to move feature
Did not swap black to move to white to move
"""

import pandas as pd
import numpy as np
import chess
import os
from pathlib import Path
from tqdm import tqdm
import shutil

# ==========================================
# CONFIGURATION
# ==========================================
INPUT_DIR = Path("./dataset_clean")
OUTPUT_DIR = Path("./dataset_ready")
RANDOM_SEED = 42

# Define File Indices (Human: 0-99, Computer: 100-499)
HUMAN_FILES = sorted(INPUT_DIR.glob("*part_00[0-9][0-9].parquet"))  # Matches 0000-0099
COMP_FILES = sorted(INPUT_DIR.glob("*part_0[1-4][0-9][0-9].parquet"))  # Matches 0100-0499

all_files = sorted(INPUT_DIR.glob("*.parquet"))

# Split by part index rather than by position in the directory listing, so a
# missing part or an unrelated parquet file cannot shift files between the
# human and computer groups.
human_files_sorted = HUMAN_FILES
comp_files_sorted = COMP_FILES

# 80% / 10% / 10% of each group, taken in index order.
train_files = human_files_sorted[0:80] + comp_files_sorted[0:320]
val_files = human_files_sorted[80:90] + comp_files_sorted[320:360]
test_files = human_files_sorted[90:100] + comp_files_sorted[360:400]

print("Split Summary:")
print(f"Train Files: {len(train_files)}")
print(f"Val Files:   {len(val_files)}")
print(f"Test Files:  {len(test_files)}")

def get_mirrored_fen(fen):
    """Return the FEN of the mirrored position (geometric flip + colour inversion).

    Args:
        fen: FEN string of the position to mirror.

    Returns:
        FEN string produced by ``chess.Board.mirror()`` for that position.
    """
    mirrored_board = chess.Board(fen).mirror()
    return mirrored_board.fen()

def get_mirrored_label(label):
    """Swap a White-based label for its Black-based counterpart.

    Labels 0..6 map to 6..0 (0 <-> 6, 1 <-> 5, 2 <-> 4, 3 stays 3).
    Any other value maps to 3 (Equal).
    """
    flipped = {original: 6 - original for original in range(7)}
    return flipped.get(label, 3)

def load_and_augment(file_list, split_name):
    """
    Loads files, mirrors non-equal classes, and returns a unified DataFrame.

    Every row whose depth-20 label is not 3 (Equal) is duplicated as its
    colour-inverted mirror image. For the mirrored copy the FEN is mirrored,
    the label is swapped, both scores are negated and, when present,
    game_result is inverted (1.0 <-> 0.0, 0.5 unchanged) so that all
    White-relative columns describe the mirrored position.

    Args:
        file_list: Parquet files belonging to the split.
        split_name: Name used in the progress output only.

    Returns:
        DataFrame containing the original rows followed by the mirrored rows.
    """
    print(f"\n--- Processing {split_name} ---")

    df_list = [pd.read_parquet(f) for f in tqdm(file_list, desc="Loading Parquet")]

    full_df = pd.concat(df_list, ignore_index=True)
    print(f"Original Count: {len(full_df)}")

    # Identify rows to mirror
    mask_augment = full_df["stockfish_label_depth_20"] != 3
    df_to_mirror = full_df[mask_augment].copy()

    if len(df_to_mirror) > 0:
        tqdm.pandas(desc="Mirroring FENs")

        df_to_mirror["fen"] = df_to_mirror["fen"].progress_apply(get_mirrored_fen)

        df_to_mirror["stockfish_label_depth_20"] = df_to_mirror["stockfish_label_depth_20"].apply(get_mirrored_label)

        # Invert scores
        for score_column in ("stockfish_score_depth_20", "stockfish_score_depth_10"):
            if score_column in df_to_mirror.columns:
                df_to_mirror[score_column] = -df_to_mirror[score_column]

        # The result is stored from White's point of view, so it swaps with the colours.
        if "game_result" in df_to_mirror.columns:
            df_to_mirror["game_result"] = 1.0 - df_to_mirror["game_result"]

        augmented_df = pd.concat([full_df, df_to_mirror], ignore_index=True)
    else:
        augmented_df = full_df

    print(f"Post-Augmentation Count: {len(augmented_df)}")
    return augmented_df

def balance_classes(df):
    """Downsample every label class to the size of the smallest one.

    Args:
        df: DataFrame with a "stockfish_label_depth_20" column.

    Returns:
        DataFrame holding the same number of rows per label, sampled with
        RANDOM_SEED. Class counts before and after are printed.
    """
    label_column = "stockfish_label_depth_20"
    by_label = df.groupby(label_column)
    class_sizes = by_label.size()

    print("\nClass Distribution (Before Balancing):")
    print(class_sizes)

    smallest_class = class_sizes.min()
    print(f"\nTarget Samples per Class: {smallest_class}")

    # Draw the same number of rows from each label group.
    downsampled = by_label.sample(n=smallest_class, random_state=RANDOM_SEED)

    print("Class Distribution (After Balancing):")
    print(downsampled[label_column].value_counts())

    return downsampled


import gc

# One output folder per split.
for split in ("train", "val", "test"):
    OUTPUT_DIR.joinpath(split).mkdir(parents=True, exist_ok=True)

splits = [
    ("train", train_files),
    ("val", val_files),
    ("test", test_files),
]

# Augment, balance and save each split independently.
# NOTE(review): the "Without side to move feature" cell writes to the same
# <split>_balanced.parquet paths, so running both cells overwrites these files.
for split_name, files in splits:
    df_aug = load_and_augment(files, split_name)
    df_clean = balance_classes(df_aug)

    save_path = OUTPUT_DIR / split_name / f"{split_name}_balanced.parquet"
    print(f"Saving to {save_path}...")
    df_clean.to_parquet(save_path, index=False)

    # Release the split's frames before loading the next one.
    del df_aug, df_clean
    gc.collect()

print("\nProcessing Complete!")

In [None]:
"""
Without side to move feature
Swapped black to move to white to move
"""

import pandas as pd
import numpy as np
import chess
import os
from pathlib import Path
from tqdm import tqdm
import shutil

# ==========================================
# CONFIGURATION
# ==========================================
INPUT_DIR = Path("./dataset_clean")
OUTPUT_DIR = Path("./dataset_ready")
RANDOM_SEED = 42

# Define File Indices (Human: 0-99, Computer: 100-499)
HUMAN_FILES = sorted(INPUT_DIR.glob("*part_00[0-9][0-9].parquet"))  # Matches 0000-0099
COMP_FILES = sorted(INPUT_DIR.glob("*part_0[1-4][0-9][0-9].parquet"))  # Matches 0100-0499

all_files = sorted(INPUT_DIR.glob("*.parquet"))

# Use the index-matched lists for the split: slicing the full directory listing
# by position would mix the two groups as soon as a part is missing or another
# parquet file is present in INPUT_DIR.
human_files_sorted = HUMAN_FILES
comp_files_sorted = COMP_FILES

# 80% / 10% / 10% of each group, taken in index order.
train_files = human_files_sorted[0:80] + comp_files_sorted[0:320]
val_files = human_files_sorted[80:90] + comp_files_sorted[320:360]
test_files = human_files_sorted[90:100] + comp_files_sorted[360:400]

print("Split Summary:")
print(f"Train Files: {len(train_files)}")
print(f"Val Files:   {len(val_files)}")
print(f"Test Files:  {len(test_files)}")

def get_mirrored_fen(fen):
    """Mirror a position (geometric flip + colour inversion) and return its FEN.

    Args:
        fen: FEN string of the source position.

    Returns:
        FEN string of ``chess.Board(fen).mirror()``.
    """
    source_board = chess.Board(fen)
    flipped_board = source_board.mirror()
    return flipped_board.fen()

def get_mirrored_label(label):
    """Convert a White-based label into the matching Black-based label.

    0 <-> 6, 1 <-> 5, 2 <-> 4 and 3 stays 3; values outside 0..6 become 3 (Equal).
    """
    swap_table = dict(zip(range(7), reversed(range(7))))
    return swap_table.get(label, 3)

def load_and_augment(file_list, split_name):
    """
    Loads files, FLIPS all Black-to-move positions to White-to-move (Canonical),
    and returns a unified DataFrame WITHOUT creating duplicates.

    For every flipped row the FEN is mirrored, the label is swapped, both
    scores are negated and, when present, game_result is inverted
    (1.0 <-> 0.0, 0.5 unchanged), so all White-relative columns keep
    describing the stored position.

    Args:
        file_list: Parquet files belonging to the split.
        split_name: Name used in the progress output only.

    Returns:
        DataFrame with the same number of rows as were loaded.
    """
    print(f"\n--- Processing {split_name} ---")

    df_list = [pd.read_parquet(f) for f in tqdm(file_list, desc="Loading Parquet")]

    full_df = pd.concat(df_list, ignore_index=True)
    print(f"Original Count: {len(full_df)}")

    # Identify Black-to-move rows. The side-to-move field is matched as a
    # plain substring, not as a regular expression.
    mask_black = full_df["fen"].str.contains(" b ", regex=False)

    tqdm.pandas(desc="Canonical Flipping")

    # Flip Board
    # We apply get_mirrored_fen ONLY to the Black rows.
    # This converts them to "White to Move" with pieces mirrored.
    full_df.loc[mask_black, "fen"] = full_df.loc[mask_black, "fen"].progress_apply(get_mirrored_fen)

    # Flip Label (6->0, 5->1, etc.)
    full_df.loc[mask_black, "stockfish_label_depth_20"] = full_df.loc[mask_black, "stockfish_label_depth_20"].apply(get_mirrored_label)

    # Flip Score
    for score_column in ("stockfish_score_depth_20", "stockfish_score_depth_10"):
        if score_column in full_df.columns:
            full_df.loc[mask_black, score_column] = -full_df.loc[mask_black, score_column]

    # Flip Result: it is stored from White's point of view, so it swaps with the colours.
    if "game_result" in full_df.columns:
        full_df.loc[mask_black, "game_result"] = 1.0 - full_df.loc[mask_black, "game_result"]

    print(f"Post-Flip Count: {len(full_df)} (Count stays identical)")
    return full_df

def balance_classes(df):
    """Equalise class sizes by downsampling each label to the minority-class count.

    Args:
        df: DataFrame with a "stockfish_label_depth_20" column.

    Returns:
        DataFrame with an identical number of rows for every label, drawn
        with RANDOM_SEED. The distribution is printed before and after.
    """
    target_column = "stockfish_label_depth_20"
    grouped = df.groupby(target_column)
    per_class = grouped.size()

    print("\nClass Distribution (Before Balancing):")
    print(per_class)

    minority_size = per_class.min()
    print(f"\nTarget Samples per Class: {minority_size}")

    # Same sample size for every label group.
    evened_out = grouped.sample(n=minority_size, random_state=RANDOM_SEED)

    print("Class Distribution (After Balancing):")
    print(evened_out[target_column].value_counts())

    return evened_out


import gc

# One output folder per split.
for split in ("train", "val", "test"):
    OUTPUT_DIR.joinpath(split).mkdir(parents=True, exist_ok=True)

splits = [
    ("train", train_files),
    ("val", val_files),
    ("test", test_files),
]

# Canonicalise, balance and save each split independently.
# NOTE(review): the "With side to move feature" cell writes to the same
# <split>_balanced.parquet paths, so the cell run last determines the files on disk.
for split_name, files in splits:
    df_aug = load_and_augment(files, split_name)
    df_clean = balance_classes(df_aug)

    save_path = OUTPUT_DIR / split_name / f"{split_name}_balanced.parquet"
    print(f"Saving to {save_path}...")
    df_clean.to_parquet(save_path, index=False)

    # Release the split's frames before loading the next one.
    del df_aug, df_clean
    gc.collect()

print("\nProcessing Complete!")

Split Summary:
Train Files: 400
Val Files:   50
Test Files:  50

--- Processing train ---


Loading Parquet: 100%|██████████| 400/400 [00:05<00:00, 69.21it/s]


Original Count: 3978504


Canonical Flipping: 100%|██████████| 1990686/1990686 [02:46<00:00, 11942.78it/s]


Post-Flip Count: 3978504 (Count stays identical)

Class Distribution (Before Balancing):
stockfish_label_depth_20
0    1085185
1     568624
2     479232
3    1386704
4     178914
5     125957
6     153888
dtype: int64

Target Samples per Class: 125957
Class Distribution (After Balancing):
stockfish_label_depth_20
0    125957
1    125957
2    125957
3    125957
4    125957
5    125957
6    125957
Name: count, dtype: int64
Saving to dataset_ready\train\train_balanced.parquet...

--- Processing val ---


Loading Parquet: 100%|██████████| 50/50 [00:00<00:00, 69.35it/s]


Original Count: 497423


Canonical Flipping: 100%|██████████| 248764/248764 [00:20<00:00, 12208.33it/s]


Post-Flip Count: 497423 (Count stays identical)

Class Distribution (Before Balancing):
stockfish_label_depth_20
0    129343
1     71917
2     62732
3    178697
4     22869
5     15328
6     16537
dtype: int64

Target Samples per Class: 15328
Class Distribution (After Balancing):
stockfish_label_depth_20
0    15328
1    15328
2    15328
3    15328
4    15328
5    15328
6    15328
Name: count, dtype: int64
Saving to dataset_ready\val\val_balanced.parquet...

--- Processing test ---


Loading Parquet: 100%|██████████| 50/50 [00:00<00:00, 78.74it/s]


Original Count: 497375


Canonical Flipping: 100%|██████████| 248699/248699 [00:20<00:00, 12327.18it/s]


Post-Flip Count: 497375 (Count stays identical)

Class Distribution (Before Balancing):
stockfish_label_depth_20
0    132519
1     71869
2     61043
3    170682
4     23678
5     16961
6     20623
dtype: int64

Target Samples per Class: 16961
Class Distribution (After Balancing):
stockfish_label_depth_20
0    16961
1    16961
2    16961
3    16961
4    16961
5    16961
6    16961
Name: count, dtype: int64
Saving to dataset_ready\test\test_balanced.parquet...

Processing Complete!
