# 08 · Feature Materialization

Este cuaderno genera splits enriquecidos (train/val/test) con todas las variables derivadas del FEN y del target.

**Objetivos:**
- reutilizar las utilidades de `functions/fen_analyzer.py`.
- calcular métricas de material, ventajas por pieza y flags de calidad del objetivo.
- guardar los resultados en `data/processed/enriched/*_enriched.parquet`.

In [1]:
import sys, gc
from pathlib import Path
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

sys.path.append('..')
from functions.fen_analyzer import (
    extract_active_turn,
    extract_castling_rights,
    extract_queen_presence,
    extract_bishop_parity,
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Processed splits live under ../data/processed (relative to the notebook);
# the enriched copies are written to an 'enriched' subfolder created on demand.
DATA_DIR = Path('..', 'data', 'processed')
ENRICHED_DIR = DATA_DIR.joinpath('enriched')
ENRICHED_DIR.mkdir(parents=True, exist_ok=True)

# Split name -> source parquet file, in the order the splits are processed.
SPLITS = {
    split: DATA_DIR / f'{split}_data.parquet'
    for split in ('training', 'validation', 'testing')
}

# Echo the configuration so the run log records what was used.
config_lines = ['Splits detectados:']
for name, path in SPLITS.items():
    config_lines.append(f'  · {name}: {path.name}')
config_lines.append(f'Enriquecidos se guardarán en: {ENRICHED_DIR}')
print('\n'.join(config_lines))


Splits detectados:
  · training: training_data.parquet
  · validation: validation_data.parquet
  · testing: testing_data.parquet
Enriquecidos se guardarán en: ..\data\processed\enriched


In [3]:
def eval_to_centipawns(eval_str: str) -> float:
    """Convert a raw evaluation string into a centipawn score.

    Parsing rules (applied to ``str(eval_str).strip()``):

    * empty or unparseable text -> ``0.0`` (deliberate best-effort fallback);
    * ``#<n>`` (mate marker) -> ``+10000.0`` when ``n`` is non-negative,
      ``-10000.0`` when ``n`` is negative, including the signed zero ``#-0``;
    * a number containing ``'.'`` is treated as pawns and multiplied by 100;
    * any other number is returned unchanged (already centipawns).

    Args:
        eval_str: Raw evaluation value; anything accepted by ``str()``.

    Returns:
        The score as a float.
    """
    s = str(eval_str).strip()
    if not s:
        return 0.0
    if s.startswith('#'):
        mate_part = s[1:].strip()
        try:
            moves = float(mate_part)
        except ValueError:
            return 0.0
        # "-0" parses to -0.0, which still satisfies ``>= 0``; look at the
        # sign character as well so "#-0" is scored for the negative side.
        positive_mate = moves >= 0 and not mate_part.startswith('-')
        return 10000.0 if positive_mate else -10000.0
    try:
        val = float(s)
    except ValueError:
        return 0.0
    return val * 100.0 if '.' in s else val

def cp_to_class(cp: float, thr: int = 150) -> int:
    """Bucket a centipawn score into one of three classes.

    Args:
        cp: Centipawn score.
        thr: Symmetric threshold; scores within ``[-thr, thr]`` are class 1.

    Returns:
        ``0`` when ``cp > thr``, ``2`` when ``cp < -thr``, otherwise ``1``
        (which is also the result for NaN, since both comparisons fail).
    """
    above = cp > thr
    below = cp < -thr
    return 0 if above else (2 if below else 1)

def build_eval_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the target-related columns from ``df['Evaluation']``.

    Args:
        df: Frame with an ``Evaluation`` column of raw evaluation values.

    Returns:
        A frame sharing ``df``'s index with the columns:
        ``evaluation_cp`` (float32 centipawns), ``evaluation_cp_clipped``
        (clipped to [-1500, 1500]), ``eval_class_white`` (int8 class from
        ``cp_to_class``), ``is_mate`` (raw text starts with ``'#'``) and
        ``is_outlier_cp`` (absolute score above 1000).
    """
    raw_eval = df['Evaluation'].astype('string')
    centipawns = raw_eval.map(eval_to_centipawns).astype(np.float32)

    derived = {
        'evaluation_cp': centipawns,
        'evaluation_cp_clipped': centipawns.clip(-1500, 1500),
        'eval_class_white': centipawns.map(cp_to_class).astype('int8'),
        # Missing evaluations are reported as "not a mate".
        'is_mate': raw_eval.str.startswith('#').fillna(False),
        'is_outlier_cp': centipawns.abs() > 1000,
    }
    return pd.DataFrame(derived)

def remap_class_to_active(class_series: pd.Series, side_to_move: pd.Series) -> pd.Series:
    """Flip class labels for rows where black is to move.

    Rows whose ``side_to_move`` equals ``'black'`` get ``2 - class`` (0 <-> 2,
    1 stays 1); every other row, including rows with a missing side, keeps its
    original class. The two inputs are matched by position, not by index, so
    they must have the same length and row order.

    Args:
        class_series: Integer class labels (not modified).
        side_to_move: Per-row side label; compared against ``'black'``.

    Returns:
        A new Series named ``eval_class_active`` on ``class_series``'s index.
    """
    class_arr = class_series.to_numpy().copy()
    # Nullable/string-backed Series can yield an object-dtype or NA-bearing
    # comparison result, which numpy rejects as a boolean index; coerce to a
    # plain bool array and treat missing sides as "not black".
    mask_black = side_to_move.eq('black').to_numpy(dtype=bool, na_value=False)
    class_arr[mask_black] = 2 - class_arr[mask_black]
    return pd.Series(class_arr, index=class_series.index, name='eval_class_active')


In [4]:
# Material weight per piece letter; the king is counted but carries no weight.
PIECE_VALUES = dict(P=1, N=3, B=3, R=5, Q=9, K=0)
# FEN piece letters: uppercase (white) first, then lowercase (black).
PIECE_KEYS = [*'PNBRQK', *'pnbrqk']

def split_fen_columns(fen_series: pd.Series) -> pd.DataFrame:
    """Split FEN strings into their six space-separated fields.

    Args:
        fen_series: Series of FEN strings.

    Returns:
        A frame on ``fen_series``'s index with the columns ``board``, ``turn``,
        ``castling``, ``ep``, ``halfmove`` and ``fullmove``. Fields absent from
        a FEN (truncated records, or an empty input Series) are left missing
        instead of raising.
    """
    field_names = ['board', 'turn', 'castling', 'ep', 'halfmove', 'fullmove']
    parts = fen_series.astype('string').str.split(' ', n=len(field_names) - 1, expand=True)
    # ``expand=True`` only creates as many columns as the longest row has
    # fields, so pad to the full width before naming the columns.
    parts = parts.reindex(columns=range(len(field_names)))
    parts.columns = field_names
    return parts

def board_counters(board_str: str) -> Counter:
    """Count the alphabetic characters (piece letters) of a FEN board field.

    Digits and ``/`` separators are ignored; letter case is preserved.
    """
    piece_letters = filter(str.isalpha, board_str)
    return Counter(piece_letters)

def compute_material_features(board_series: pd.Series) -> pd.DataFrame:
    """Build per-position material features from FEN board fields.

    For every board string the function records, in this column order:
    the count of each piece type per colour (``white_P`` ... ``black_K``),
    ``white_material`` / ``black_material`` (counts weighted by
    ``PIECE_VALUES``), ``material_difference`` (white minus black),
    ``total_pieces``, and the white-minus-black count advantage for pawns,
    knights, bishops, rooks and queens. A categorical ``phase_bucket`` is
    appended from ``total_pieces``: up to 16 -> endgame, 17-24 -> middlegame,
    25-32 -> opening.

    Args:
        board_series: Series of FEN board fields (first FEN token).

    Returns:
        A frame on ``board_series``'s index with the columns listed above.
    """
    advantage_labels = {'P': 'pawn', 'N': 'knight', 'B': 'bishop', 'R': 'rook', 'Q': 'queen'}

    rows = []
    for board in tqdm(board_series, desc='Material features'):
        tally = board_counters(board)

        # Per-colour piece counts, keyed as '<colour>_<UPPERCASE LETTER>'.
        row = {
            f"{'white' if symbol.isupper() else 'black'}_{symbol.upper()}": tally.get(symbol, 0)
            for symbol in PIECE_KEYS
        }

        white_total = sum(weight * row[f'white_{symbol}'] for symbol, weight in PIECE_VALUES.items())
        black_total = sum(weight * row[f'black_{symbol}'] for symbol, weight in PIECE_VALUES.items())
        row['white_material'] = white_total
        row['black_material'] = black_total
        row['material_difference'] = white_total - black_total
        row['total_pieces'] = sum(
            row[f'white_{symbol}'] + row[f'black_{symbol}'] for symbol in PIECE_VALUES
        )

        # White-minus-black count for each non-king piece type.
        for symbol, label in advantage_labels.items():
            row[f'{label}_advantage'] = row[f'white_{symbol}'] - row[f'black_{symbol}']

        rows.append(row)

    features = pd.DataFrame(rows, index=board_series.index)
    phase_edges = [-1, 16, 24, 32]
    phase_names = ['endgame', 'middlegame', 'opening']
    features['phase_bucket'] = pd.cut(
        features['total_pieces'],
        bins=phase_edges,
        labels=phase_names,
        right=True,
        include_lowest=True,
    )
    return features


In [5]:
def enrich_dataframe(df: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    """Assemble the enriched frame for one dataset split.

    The result concatenates, column-wise and by row position (every block's
    index is reset first): the original columns, the evaluation features
    (plus ``eval_class_active`` and ``turn_is_white``), ``side_to_move``, the
    raw ``castling`` / ``ep`` / ``halfmove`` / ``fullmove`` FEN fields, the
    outputs of the ``fen_analyzer`` castling, queen and bishop helpers, and
    the material features. A constant ``dataset_split`` column is added last.

    Args:
        df: Split with at least the ``FEN`` and ``Evaluation`` columns.
        dataset_name: Label stored in ``dataset_split``.

    Returns:
        The enriched frame with a fresh positional index.
    """
    fen_series = df['FEN'].astype('string')
    eval_features = build_eval_frame(df)
    fen_fields = split_fen_columns(fen_series)

    # Helpers imported from functions.fen_analyzer; their exact output
    # columns are defined there and are not visible in this notebook.
    side_to_move = extract_active_turn(fen_series)
    castling = extract_castling_rights(fen_series)
    queens = extract_queen_presence(fen_series)
    bishops = extract_bishop_parity(fen_series)
    material = compute_material_features(fen_fields['board'])

    eval_features['eval_class_active'] = remap_class_to_active(
        eval_features['eval_class_white'], side_to_move
    )
    eval_features['turn_is_white'] = side_to_move.eq('white')

    # Order here fixes the column order of the output file.
    blocks = (
        df,
        eval_features,
        side_to_move.rename('side_to_move'),
        fen_fields[['castling', 'ep', 'halfmove', 'fullmove']],
        castling,
        queens,
        bishops,
        material,
    )
    enriched = pd.concat([block.reset_index(drop=True) for block in blocks], axis=1)
    enriched['dataset_split'] = dataset_name
    return enriched


In [6]:
# Materialize each split in turn: load -> enrich -> write parquet, keeping one
# summary row per split for the table displayed at the end of the cell.
summaries = []
for split_name, path in SPLITS.items():
    print(f'\nProcesando {split_name}...')
    df = pd.read_parquet(path)
    enriched = enrich_dataframe(df, split_name)
    out_path = ENRICHED_DIR.joinpath(f'{split_name}_enriched.parquet')
    enriched.to_parquet(out_path, index=False)
    summaries.append(dict(
        split=split_name,
        rows=enriched.shape[0],
        cols=len(enriched.columns),
        output=out_path.name,
    ))
    # Release the split before loading the next one to limit peak memory.
    del df, enriched
    gc.collect()

pd.DataFrame(summaries)



Procesando training...


Material features: 100%|██████████| 11610568/11610568 [01:46<00:00, 108825.29it/s]



Procesando validation...


Material features: 100%|██████████| 3317307/3317307 [00:30<00:00, 107443.23it/s]



Procesando testing...


Material features: 100%|██████████| 1658652/1658652 [00:16<00:00, 102823.77it/s]


Unnamed: 0,split,rows,cols,output
0,training,11610568,55,training_enriched.parquet
1,validation,3317307,55,validation_enriched.parquet
2,testing,1658652,55,testing_enriched.parquet


In [7]:
# Sanity-check preview: five reproducible rows from the enriched training split.
sample_path = ENRICHED_DIR / 'training_enriched.parquet'
preview_columns = [
    'FEN', 'Evaluation', 'evaluation_cp', 'evaluation_cp_clipped', 'is_mate',
    'side_to_move', 'castling_state', 'ep_available', 'white_material', 'black_material',
    'material_difference', 'total_pieces', 'phase_bucket', 'pawn_advantage', 'queen_advantage'
]
if sample_path.exists():
    # Read only the displayed columns: the file holds millions of rows, and
    # the sampled rows depend on the row count and seed, not on the columns.
    preview = pd.read_parquet(sample_path, columns=preview_columns).sample(5, random_state=42)
    display(preview[preview_columns])
else:
    print('Aún no se ha materializado training_enriched.parquet')


Unnamed: 0,FEN,Evaluation,evaluation_cp,evaluation_cp_clipped,is_mate,side_to_move,castling_state,ep_available,white_material,black_material,material_difference,total_pieces,phase_bucket,pawn_advantage,queen_advantage
10673832,8/6kp/4N1p1/8/4KPP1/8/7P/2b5 b - - 18 50,138,138.0,138.0,False,black,none,False,6,5,1,9,endgame,1,0
8349146,r3b1k1/pq1n1p1p/2Q3p1/3P4/4P1PP/1N3P2/P2B1K2/7...,415,415.0,415.0,False,black,none,False,26,24,2,20,middlegame,2,0
2799363,r3r1k1/5p2/p2p2pp/1ppP4/2n2PbB/6N1/PbQ3BP/2R3K...,0,0.0,0.0,False,black,none,False,27,26,1,23,middlegame,-3,1
5895273,r5k1/p5pp/2p1qr2/8/2b5/2Q3P1/P3P1BP/R4RK1 w - ...,61,61.0,61.0,False,white,none,False,26,26,0,18,middlegame,0,0
1918047,rn2kb1r/pp3ppp/1qp2n2/3p4/3P4/1QNBPP2/PP3P1P/R...,86,86.0,86.0,False,white,both,False,35,35,0,28,opening,0,0
