In [1]:
# Install the packages listed in ../requirements.txt. `%pip` (rather than `!pip`)
# installs into the environment of the running kernel; restart the kernel
# afterwards if already-imported packages were upgraded.
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Environment sanity check: report which interpreter the kernel runs on and
# which pandas / pyarrow builds it picks up.
import sys
import pandas as pd

print(sys.executable, pd.__version__)

# pyarrow is imported only after the first report, so the interpreter path and
# pandas version are still printed if the pyarrow import fails.
import pyarrow as pa

print(pa.__version__, pa.__file__)

c:\Users\samir\AppData\Local\Programs\Python\Python312\python.exe 2.3.3
21.0.0 c:\Users\samir\AppData\Local\Programs\Python\Python312\Lib\site-packages\pyarrow\__init__.py


In [3]:
from pathlib import Path
import pandas as pd

# Folder with the raw CSV inputs (relative path, resolved against the
# kernel's working directory).
DATA_DIR = Path("..") / "data" / "raw"

# Extra keyword arguments forwarded to pd.read_csv: ask for pyarrow-backed dtypes.
PD_KW = {"dtype_backend": "pyarrow"}

In [4]:
def load_csv(path: Path, usecols=None, dtypes=None, nrows=None):
    """
    Load a CSV file into a DataFrame.

    Parameters
    ----------
    path : Path
        Location of the CSV file.
    usecols : optional
        Subset of columns to read; forwarded to ``pd.read_csv(usecols=...)``.
    dtypes : optional
        Per-column dtype mapping, if already known; forwarded to
        ``pd.read_csv(dtype=...)``.
    nrows : int, optional
        Read only the first ``nrows`` data rows (useful for a quick sample).
        ``None`` reads the whole file.

    Returns
    -------
    pd.DataFrame
        The parsed file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    """
    if not path.exists():
        raise FileNotFoundError(f"No existe: {path}")

    # Options shared by the primary call and the fallback call, kept in one
    # place so the two can never drift apart.
    read_kwargs = dict(
        usecols=usecols,
        dtype=dtypes,
        nrows=nrows,          # previously accepted but silently ignored
        low_memory=False,     # parse in one pass to avoid chunk-wise mixed-type inference
        on_bad_lines="warn",  # warn about (and skip) malformed lines instead of raising
    )

    try:
        # PD_KW carries the optional backend settings (e.g. dtype_backend="pyarrow").
        return pd.read_csv(path, **read_kwargs, **PD_KW)
    except TypeError:
        # Fallback for installations whose read_csv rejects the extra keywords
        # in PD_KW (e.g. a pandas version without `dtype_backend`).
        return pd.read_csv(path, **read_kwargs)


In [7]:
# Locations of the three raw evaluation files inside DATA_DIR.
path_chess = DATA_DIR.joinpath("chessData.csv")
path_random = DATA_DIR.joinpath("random_evals.csv")
path_tactic = DATA_DIR.joinpath("tactic_evals.csv")

# Load each file in full; one assignment per frame so that a failure on a
# later file leaves the frames already loaded available in the kernel.
df_chess = load_csv(path_chess)
df_random = load_csv(path_random)
df_tactic = load_csv(path_tactic)

In [6]:
def overview(df: pd.DataFrame, name: str = "DataFrame", show_cols=20):
    """
    Print and display a quick structural summary of ``df``.

    Sections, in order: shape and leading column names, dtypes, the ten
    largest memory consumers (in MB), ``df.info()``, the first three rows,
    the ten columns with most distinct values, and the ten columns with
    most nulls.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise.
    name : str
        Label printed in the header line.
    show_cols : int
        How many leading columns to list and how many dtype rows to show.

    Returns
    -------
    None
        Output goes to stdout and to the notebook's ``display``.
    """
    top_n = 10
    bytes_per_mb = 1024**2

    # Header: separator, label, shape and the first column names.
    separator = "=" * 100
    print(separator)
    print(f"📦 {name}")
    print("- Dimensiones:", df.shape)
    print("- Columnas:", len(df.columns))
    leading_columns = list(df.columns[:show_cols])
    print("- Primeras columnas:", leading_columns)

    print("\n🔹 Tipos de datos:")
    dtype_table = df.dtypes.to_frame("dtype")
    display(dtype_table.head(show_cols))

    print("\n🔹 Memoria estimada por columna (Top 10):")
    heaviest = df.memory_usage(deep=True).sort_values(ascending=False).head(top_n)
    heaviest_mb = (heaviest / bytes_per_mb).round(3).rename("MB")
    display(heaviest_mb.to_frame())

    print("\n🔹 df.info():")
    df.info(memory_usage="deep")

    print("\n🔹 df.head():")
    preview = df.head(3)
    display(preview)

    print("\n🔹 Valores únicos por columna (muestra):")
    distinct_counts = df.nunique(dropna=True).sort_values(ascending=False)
    display(distinct_counts.head(top_n).to_frame("nunique"))

    # Raw null counts only; no further missing-value analysis is done here.
    print("\n🔹 Conteo de nulos (Top 10):")
    null_counts = df.isna().sum().sort_values(ascending=False)
    display(null_counts.head(top_n).to_frame("nulos"))


In [8]:
# Summarise each loaded dataset, labelled with its source file name.
overview(df_chess, name="chessData.csv")
overview(df_random, name="random_evals.csv")
overview(df_tactic, name="tactic_evals.csv")


📦 chessData.csv
- Dimensiones: (12958035, 2)
- Columnas: 2
- Primeras columnas: ['FEN', 'Evaluation']

🔹 Tipos de datos:


Unnamed: 0,dtype
FEN,string[pyarrow]
Evaluation,string[pyarrow]



🔹 Memoria estimada por columna (Top 10):


Unnamed: 0,MB
FEN,741.373
Evaluation,90.911
Index,0.0



🔹 df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12958035 entries, 0 to 12958034
Data columns (total 2 columns):
 #   Column      Dtype          
---  ------      -----          
 0   FEN         string[pyarrow]
 1   Evaluation  string[pyarrow]
dtypes: string[pyarrow](2)
memory usage: 832.3 MB

🔹 df.head():


Unnamed: 0,FEN,Evaluation
0,rnbqkbnr/pppppppp/8/8/4P3/8/PPPP1PPP/RNBQKBNR ...,-10
1,rnbqkbnr/pppp1ppp/4p3/8/4P3/8/PPPP1PPP/RNBQKBN...,56
2,rnbqkbnr/pppp1ppp/4p3/8/3PP3/8/PPP2PPP/RNBQKBN...,-9



🔹 Valores únicos por columna (muestra):


Unnamed: 0,nunique
FEN,12954834
Evaluation,13544



🔹 Conteo de nulos (Top 10):


Unnamed: 0,nulos
FEN,0
Evaluation,0


📦 random_evals.csv
- Dimensiones: (1000273, 2)
- Columnas: 2
- Primeras columnas: ['FEN', 'Evaluation']

🔹 Tipos de datos:


Unnamed: 0,dtype
FEN,string[pyarrow]
Evaluation,string[pyarrow]



🔹 Memoria estimada por columna (Top 10):


Unnamed: 0,MB
FEN,60.42
Evaluation,7.507
Index,0.0



🔹 df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000273 entries, 0 to 1000272
Data columns (total 2 columns):
 #   Column      Non-Null Count    Dtype          
---  ------      --------------    -----          
 0   FEN         1000273 non-null  string[pyarrow]
 1   Evaluation  1000273 non-null  string[pyarrow]
dtypes: string[pyarrow](2)
memory usage: 67.9 MB

🔹 df.head():


Unnamed: 0,FEN,Evaluation
0,rnbqkb1r/pppppppp/B4n2/8/4P3/8/PPPP1PPP/RNBQK1...,-459
1,rnbqkb1r/pppppppp/5n2/1B6/4P3/8/PPPP1PPP/RNBQK...,-125
2,rnbqkbnr/p1pppppp/8/1p6/4P3/8/PPPP1PPP/RNBQKBN...,198



🔹 Valores únicos por columna (muestra):


Unnamed: 0,nunique
FEN,1000273
Evaluation,12297



🔹 Conteo de nulos (Top 10):


Unnamed: 0,nulos
FEN,0
Evaluation,0


📦 tactic_evals.csv
- Dimensiones: (2628219, 3)
- Columnas: 3
- Primeras columnas: ['FEN', 'Evaluation', 'Move']

🔹 Tipos de datos:


Unnamed: 0,dtype
FEN,string[pyarrow]
Evaluation,string[pyarrow]
Move,string[pyarrow]



🔹 Memoria estimada por columna (Top 10):


Unnamed: 0,MB
FEN,143.59
Move,20.057
Evaluation,19.714
Index,0.0



🔹 df.info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2628219 entries, 0 to 2628218
Data columns (total 3 columns):
 #   Column      Dtype          
---  ------      -----          
 0   FEN         string[pyarrow]
 1   Evaluation  string[pyarrow]
 2   Move        string[pyarrow]
dtypes: string[pyarrow](3)
memory usage: 183.4 MB

🔹 df.head():


Unnamed: 0,FEN,Evaluation,Move
0,r2qkbr1/pb1nn3/1ppp3p/8/3P1p2/2PB1N1P/PPQN1PP1...,#+2,d3g6
1,r2qkb2/pb1nn3/1ppp2rp/8/3P1p2/2P2N1P/PPQN1PP1/...,#+1,c2g6
2,r2qkbr1/pb1nn3/1ppp2Bp/8/3P1p2/2P2N1P/PPQN1PP1...,#+1,g8g6



🔹 Valores únicos por columna (muestra):


Unnamed: 0,nunique
FEN,2628219
Evaluation,8014
Move,1942



🔹 Conteo de nulos (Top 10):


Unnamed: 0,nulos
Move,83768
FEN,0
Evaluation,0
