In [7]:
%pip install pandas
from pathlib import Path
import pandas as pd

# Folder holding the raw input CSV files, relative to the working directory.
DATA_DIR = Path("data") / "raw"

# Extra keyword arguments forwarded to pandas readers: request the pyarrow dtype backend.
PD_KW = {"dtype_backend": "pyarrow"}


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/86/41/585a168330ff063014880a80d744219dbf1dd7a1c706e75ab3425a987384/pandas-2.3.3-cp312-cp312-win_amd64.whl.metadata
  Downloading pandas-2.3.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Obtaining dependency information for numpy>=1.26.0 from https://files.pythonhosted.org/packages/32/ee/de999f2625b80d043d6d2d628c07d0d5555a677a3cf78fdf868d409b8766/numpy-2.3.3-cp312-cp312-win_amd64.whl.metadata
  Downloading numpy-2.3.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.9 kB ? eta -:--:--
     -------------------- ------------------- 30.7/60.9 kB 1.3 MB/s eta 0:00:01
     ---------------------------------------- 60.9/60.9 kB 1.6 MB/s eta 0:00:00
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429

In [None]:
def load_csv(path: Path, usecols=None, dtypes=None, nrows=None):
    """
    Load a CSV file into a DataFrame.

    Parameters
    ----------
    path : Path or str
        Location of the CSV file.
    usecols : optional
        Columns to read (useful to try things out on a subset).
    dtypes : optional
        Per-column dtype mapping, forwarded to ``pd.read_csv`` as ``dtype``.
    nrows : int, optional
        Read only the first ``nrows`` data rows (quick sampling). ``None``
        reads the whole file.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.

    Notes
    -----
    The first attempt adds the module-level ``PD_KW`` options (pyarrow dtype
    backend). If this pandas version rejects those keywords (``TypeError``)
    or the optional backend cannot be imported (``ImportError``), the file is
    read again with pandas defaults.
    """
    path = Path(path)  # also accept plain string paths
    if not path.exists():
        raise FileNotFoundError(f"No existe: {path}")

    read_kwargs = dict(
        usecols=usecols,
        dtype=dtypes,
        nrows=nrows,          # previously documented but never forwarded
        low_memory=False,     # parse in one pass to avoid chunk-wise mixed-type inference
        on_bad_lines="warn",  # warn about (and skip) malformed lines
    )

    try:
        return pd.read_csv(path, **read_kwargs, **PD_KW)
    except (TypeError, ImportError):
        # TypeError: pandas does not know the extra keywords in PD_KW.
        # ImportError: the requested backend (pyarrow) is not installed.
        return pd.read_csv(path, **read_kwargs)


In [None]:
# Locations of the three raw CSV files, all resolved under DATA_DIR.
path_chess, path_random, path_tactic = (
    DATA_DIR / csv_name
    for csv_name in ("chessData.csv", "random_evals.csv", "tactic_evals.csv")
)

# Read each file in full with the shared loader (one statement per frame so a
# failure on a later file leaves the earlier frames bound).
df_chess = load_csv(path=path_chess)
df_random = load_csv(path=path_random)
df_tactic = load_csv(path=path_tactic)

In [None]:
def overview(df: pd.DataFrame, name: str = "DataFrame", show_cols=20):
    """
    Print and display a quick structural summary of ``df``.

    Sections, in order: shape and column count, the first ``show_cols``
    column names, dtypes (first ``show_cols`` rows), the 10 largest
    ``memory_usage(deep=True)`` entries in MB, ``df.info()``, the first 3
    rows, the 10 columns with most distinct values, and the 10 columns with
    most nulls.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise.
    name : str
        Label printed in the header line.
    show_cols : int
        How many columns to list in the name and dtype sections.

    Returns
    -------
    None

    Notes
    -----
    Relies on ``display`` being available, as it is in an IPython/Jupyter
    session; it is not imported here.
    """
    banner = "=" * 100
    print(banner)
    print(f"ðŸ“¦ {name}")
    print("- Dimensiones:", df.shape)
    column_index = df.columns
    print("- Columnas:", len(column_index))
    print("- Primeras columnas:", list(column_index[:show_cols]))

    print("\nðŸ”¹ Tipos de datos:")
    dtype_table = df.dtypes.to_frame("dtype")
    display(dtype_table.head(show_cols))

    print("\nðŸ”¹ Memoria estimada por columna (Top 10):")
    bytes_used = df.memory_usage(deep=True)
    heaviest = bytes_used.sort_values(ascending=False).head(10)
    megabytes = (heaviest / (1024**2)).round(3)
    display(megabytes.rename("MB").to_frame())

    print("\nðŸ”¹ df.info():")
    df.info(memory_usage="deep")

    print("\nðŸ”¹ df.head():")
    first_rows = df.head(3)
    display(first_rows)

    print("\nðŸ”¹ Valores Ãºnicos por columna (muestra):")
    distinct_counts = df.nunique(dropna=True)
    most_varied = distinct_counts.sort_values(ascending=False).head(10)
    display(most_varied.to_frame("nunique"))

    # Raw null counts only; no further missing-value analysis is done here.
    print("\nðŸ”¹ Conteo de nulos (Top 10):")
    null_counts = df.isna().sum()
    most_missing = null_counts.sort_values(ascending=False).head(10)
    display(most_missing.to_frame("nulos"))
