In [9]:
import re
from pathlib import Path
import pandas as pd

def detect_format(path: Path) -> str:
    """Classify a raw export as fixed-width ("fwf") or delimited ("csv").

    Only the second line of the file is inspected. A file is reported as
    fixed-width when that line contains a run of at least three dashes and
    no comma; everything else is reported as CSV.

    Args:
        path: File to inspect. It is decoded as UTF-8, with a leading BOM
            (if any) stripped.

    Returns:
        "fwf" or "csv".
    """
    with path.open(encoding="utf-8-sig") as handle:
        handle.readline()  # header line: not needed for detection
        ruler = handle.readline()

    # A dash ruler has a long dash run and, unlike a CSV row, no commas.
    looks_like_ruler = "," not in ruler and re.search(r"-{3,}", ruler) is not None
    if not looks_like_ruler:
        return "csv"

    print('Detected fixed-width format based on second line dashes.')
    return "fwf"

def fwf_schema_from_header(path: Path):
    """Derive fixed-width column extents and names from the first two lines.

    Line 1 is expected to hold the column titles and line 2 a "ruler" made of
    dash runs; every run of dashes marks the character extent of one column.

    Args:
        path: File to inspect. It is decoded as UTF-8, with a leading BOM
            (if any) stripped.

    Returns:
        A ``(colspecs, names)`` tuple. ``colspecs`` is a list of
        ``(start, end)`` character offsets, one per dash run; the last entry
        has ``end=None`` so the final column extends to the end of each line.
        ``names`` are the stripped header slices for those extents, made
        unique by appending ``_1``, ``_2``, ... to repeated names.

    Raises:
        ValueError: If the second line contains no dash run, i.e. the file
            does not carry a fixed-width ruler.
    """
    with path.open(encoding="utf-8-sig") as f:
        header_line = f.readline().rstrip("\n")
        dash_line = f.readline().rstrip("\n")

    colspecs = [(m.start(), m.end()) for m in re.finditer(r"-+", dash_line)]
    if not colspecs:
        raise ValueError(
            f"No dash ruler found on the second line of {path}; "
            "cannot infer fixed-width columns."
        )
    # Leave the last column open-ended so trailing data is not truncated.
    colspecs[-1] = (colspecs[-1][0], None)

    names, taken, counters = [], set(), {}
    for start, end in colspecs:
        base = header_line[start:end].strip()
        candidate = base
        suffix = counters.get(base, 0)
        # Keep bumping the suffix until the name is unused: a plain
        # "<name>_<n>" could otherwise collide with a real header that is
        # already called "<name>_<n>" and produce duplicate column names.
        while candidate in taken:
            suffix += 1
            candidate = f"{base}_{suffix}"
        counters[base] = suffix
        taken.add(candidate)
        names.append(candidate)
    return colspecs, names


def build_filtered_parquet(
    raw_path: Path,
    parquet_out: Path,
    chunksize: int = 200_000,
    filter_col: str = "C_TypeAppl",
    filter_value=0,
):
    """Stream a raw export into a Parquet file, keeping only matching rows.

    The input is read in chunks (fixed-width or CSV, as reported by
    ``detect_format``), each chunk is reduced to the rows where
    ``filter_col == filter_value``, and the surviving rows are appended to a
    snappy-compressed Parquet file. The full input is never held in memory.

    The output is first written to a sibling ``<name>.tmp`` file and only
    renamed to ``parquet_out`` after the whole input was processed, so a
    failed build never leaves a partial file at ``parquet_out``.

    Args:
        raw_path: Raw export to read.
        parquet_out: Destination Parquet file. Any existing file is removed
            before the build starts.
        chunksize: Number of rows read per chunk.
        filter_col: Column the row filter is applied to.
        filter_value: Value ``filter_col`` must equal for a row to be kept.

    Raises:
        KeyError: If ``filter_col`` is not among the columns that were read.
    """
    # Imported here rather than at cell level, as in the original code.
    import pyarrow as pa
    import pyarrow.parquet as pq

    fmt = detect_format(raw_path)

    if parquet_out.exists():
        parquet_out.unlink()  # rebuild cleanly

    # Build into a temp file next to the target; publish it only on success.
    tmp_out = parquet_out.with_name(parquet_out.name + ".tmp")
    if tmp_out.exists():
        tmp_out.unlink()  # leftover from an interrupted earlier build

    if fmt == "fwf":
        colspecs, names = fwf_schema_from_header(raw_path)
        reader = pd.read_fwf(
            raw_path,
            colspecs=colspecs,
            names=names,
            skiprows=2,  # header line + dash ruler
            na_values=["NULL"],
            encoding="utf-8-sig",
            chunksize=chunksize,
        )
    else:
        SKIP_COLS = {
            "I_Distance",
            "I_Duree",
            "I_EstArretImprevu",
            "I_HeureArret",
            "I_HExploitArret",
            "I_IdArretImprevu",
            "I_Latitude",
            "I_Longitude",
        }

        # If it's a real CSV, tune these for speed/memory:
        reader = pd.read_csv(
            raw_path,
            low_memory=False,
            chunksize=chunksize,
            usecols=lambda c: c not in SKIP_COLS,
            # usecols=[...],           # strongly recommended if you don't need all columns
            # dtype={...},             # strongly recommended to avoid object dtype
        )

    writer = None
    empty_table = None  # schema carrier in case no row ever matches
    total_in, total_out = 0, 0

    try:
        try:
            for i, chunk in enumerate(reader, start=1):
                total_in += len(chunk)

                # Filter per chunk (avoid holding full df)
                if filter_col not in chunk.columns:
                    raise KeyError(f"Column {filter_col} not found in file columns: {list(chunk.columns)[:20]}...")
                chunk = chunk[chunk[filter_col] == filter_value]

                total_out += len(chunk)

                if len(chunk) > 0:
                    table = pa.Table.from_pandas(chunk, preserve_index=False)

                    if writer is None:
                        # The first non-empty chunk fixes the file schema.
                        writer = pq.ParquetWriter(tmp_out, table.schema, compression="snappy")
                    else:
                        # dtypes are inferred per chunk; align later chunks
                        # with the schema the file was opened with.
                        table = table.cast(writer.schema, safe=False)

                    writer.write_table(table)
                elif writer is None and empty_table is None:
                    empty_table = pa.Table.from_pandas(chunk, preserve_index=False)

                # Report progress even when a chunk had no matching rows.
                if i % 10 == 0:
                    print(f"[chunk {i}] read={total_in:,} kept={total_out:,}")

            if writer is None and empty_table is not None:
                # Nothing matched: still produce a (zero-row) file so that
                # callers reading parquet_out afterwards do not fail.
                pq.write_table(empty_table, tmp_out, compression="snappy")
        finally:
            # Always release the writer and the input handle, including on
            # errors; the Parquet footer is only written on close.
            try:
                if writer is not None:
                    writer.close()
            finally:
                reader.close()
    except BaseException:
        # Never leave a partial build behind.
        if tmp_out.exists():
            tmp_out.unlink()
        raise

    if tmp_out.exists():
        tmp_out.replace(parquet_out)
    else:
        print("Warning: no chunks were read; no parquet file was written.")

    print(f"Done. Total read={total_in:,}, kept={total_out:,}. Parquet: {parquet_out}")

In [10]:
from pathlib import Path
import pandas as pd

# Data locations are resolved relative to the parent of the current working
# directory.
HERE = Path.cwd().resolve()
PROJECT_ROOT = HERE.parent
RAW_BIG = PROJECT_ROOT / "data" / "sae_arrets_full.csv"   # change name
PARQUET_CACHE = PROJECT_ROOT / "data" / "sae_arrets_full.parquet"

# Set to True to ignore an existing cache and rebuild it from RAW_BIG.
FORCE_REBUILD_PARQUET = False

# Rebuild when forced or when no cache exists yet; either way the frame is
# loaded from the parquet file afterwards.
rebuild_needed = FORCE_REBUILD_PARQUET or not PARQUET_CACHE.exists()
if rebuild_needed:
    build_filtered_parquet(RAW_BIG, PARQUET_CACHE, chunksize=200_000)
    load_label = "Loaded rebuilt parquet:"
else:
    load_label = "Loaded cached parquet:"

df = pd.read_parquet(PARQUET_CACHE)
print(load_label, df.shape)

print(df.head(3))

[chunk 10] read=2,000,000 kept=1,995,215
[chunk 20] read=4,000,000 kept=3,986,667
[chunk 30] read=6,000,000 kept=5,981,477
[chunk 40] read=8,000,000 kept=7,917,556
[chunk 50] read=10,000,000 kept=9,838,948
Done. Total read=11,704,166, kept=11,476,521. Parquet: /Users/ching-chichou/TPG/data/sae_arrets_full.parquet
Loaded rebuilt parquet: (11476521, 100)
   IdCourse  IdArret  RangArretAsc  RangArretDesc  DateCourse HDepartTheo  \
0  89604463        3            15             18  2025-11-01    00:19:09   
1  89604463        3            15             18  2025-11-01    00:19:09   
2  89604463        3            15             18  2025-11-01    00:19:09   

  HArriveeTheo  DistanceTheo  TempsInterArretTheo  EcartDepart  ...  \
0     00:19:09          5502                   77           18  ...   
1     00:19:09          5502                   77           18  ...   
2     00:19:09          5502                   77           18  ...   

   C_SensTheo  C_ServiceVoiture  C_TempsBattement  