In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Directory that holds the raw CSV inputs (relative path).
DATA_DIR = Path("../data/raw")

# Extra keyword arguments that load_csv forwards to pd.read_csv.
PD_KW = {"dtype_backend": "pyarrow"}

In [3]:
def load_csv(path: Path, usecols=None, dtypes=None, nrows=None):
    """
    Load a CSV file into a DataFrame.

    - path: location of the CSV file (a ``Path``; plain strings are accepted too).
    - usecols: columns to read, to load only a subset of the file.
    - dtypes: per-column dtype mapping, forwarded to ``read_csv`` as ``dtype``.
    - nrows: read only the first N rows for a quick sample; ``None`` reads
      the whole file.

    Returns the loaded DataFrame.

    Raises FileNotFoundError if ``path`` does not exist.

    The options in the module-level ``PD_KW`` are tried first; if that call
    is rejected, the file is read again without them.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"No existe: {path}")

    # Build the shared arguments once so both attempts stay in sync.
    read_kwargs = dict(
        usecols=usecols,
        dtype=dtypes,
        nrows=nrows,
        low_memory=False,     # avoid chunk-by-chunk dtype inference
        on_bad_lines="warn",  # warn about malformed lines instead of failing
    )

    try:
        return pd.read_csv(path, **read_kwargs, **PD_KW)
    except (TypeError, ImportError):
        # TypeError: this pandas version does not accept an option in PD_KW.
        # ImportError: the backend requested in PD_KW is not installed.
        return pd.read_csv(path, **read_kwargs)


In [4]:
# Locations of the three raw CSV files, resolved against DATA_DIR.
path_chess   = DATA_DIR / "chessData.csv"
path_random  = DATA_DIR / "random_evals.csv"
path_tactic  = DATA_DIR / "tactic_evals.csv"

# Only chessData.csv is loaded here; the loads for the other two files are
# left commented out below.
df_chess  = load_csv(path_chess)
#df_random = load_csv(path_random)
#df_tactic = load_csv(path_tactic)

In [5]:
def overview(df: pd.DataFrame, name: str = "DataFrame", show_cols=20):
    """
    Print and display a quick structural summary of a DataFrame.

    Sections, in order: shape and leading column names, dtypes, the ten
    largest entries of the deep memory usage (in MB), ``df.info()``, the
    first three rows, the ten columns with the most distinct values, and
    the ten columns with the most nulls.

    - df: frame to summarise.
    - name: label printed in the header line.
    - show_cols: how many leading columns to list in the column-name and
      dtype sections.

    Returns None. ``display`` is not defined here; presumably it is the
    built-in that the IPython/Jupyter kernel provides.
    """
    top_n = 10
    bytes_per_mb = 1024**2

    # Header: identity and basic shape.
    banner = "=" * 100
    print(banner)
    print(f"📦 {name}")
    print("- Dimensiones:", df.shape)
    print("- Columnas:", len(df.columns))
    leading_columns = list(df.columns[:show_cols])
    print("- Primeras columnas:", leading_columns)

    # Dtypes of the leading columns.
    print("\n🔹 Tipos de datos:")
    dtype_table = df.dtypes.to_frame("dtype")
    display(dtype_table.head(show_cols))

    # Largest memory consumers, converted from bytes to MB.
    print("\n🔹 Memoria estimada por columna (Top 10):")
    usage_bytes = df.memory_usage(deep=True)
    heaviest = usage_bytes.sort_values(ascending=False).head(top_n)
    usage_mb = (heaviest / bytes_per_mb).round(3).rename("MB")
    display(usage_mb.to_frame())

    print("\n🔹 df.info():")
    df.info(memory_usage="deep")

    print("\n🔹 df.head():")
    first_rows = df.head(3)
    display(first_rows)

    # Columns ranked by number of distinct non-null values.
    print("\n🔹 Valores únicos por columna (muestra):")
    distinct_counts = df.nunique(dropna=True).sort_values(ascending=False)
    display(distinct_counts.head(top_n).to_frame("nunique"))

    # Raw null counts only (minimal complement for "Paso 4").
    print("\n🔹 Conteo de nulos (Top 10):")
    null_counts = df.isna().sum().sort_values(ascending=False)
    display(null_counts.head(top_n).to_frame("nulos"))


In [6]:
# Summarise the chessData.csv frame. The calls for the other two datasets
# are left commented out; their frames are not created in the visible cells.
#overview(df_random, "random_evals.csv")
#overview(df_tactic, "tactic_evals.csv")
overview(df_chess,  "chessData.csv")

📦 chessData.csv
- Dimensiones: (12958035, 2)
- Columnas: 2
- Primeras columnas: ['FEN', 'Evaluation']

🔹 Tipos de datos:


Unnamed: 0,dtype
FEN,string[pyarrow]
Evaluation,string[pyarrow]



🔹 Memoria estimada por columna (Top 10):


Unnamed: 0,MB
FEN,741.373
Evaluation,90.911
Index,0.0



🔹 df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12958035 entries, 0 to 12958034
Data columns (total 2 columns):
 #   Column      Dtype          
---  ------      -----          
 0   FEN         string[pyarrow]
 1   Evaluation  string[pyarrow]
dtypes: string[pyarrow](2)
memory usage: 832.3 MB

🔹 df.head():


Unnamed: 0,FEN,Evaluation
0,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,-10
1,rnbqkbnr/pppp1ppp/4p3/8/4P3/8/PPPP1PPP/RNBQKBN...,56
2,rnbqkbnr/pppp1ppp/4p3/8/3PP3/8/PPP2PPP/RNBQKBN...,-9



🔹 Valores únicos por columna (muestra):


Unnamed: 0,nunique
FEN,12954834
Evaluation,13544



🔹 Conteo de nulos (Top 10):


Unnamed: 0,nulos
FEN,0
Evaluation,0
