### To run this notebook, install pyarrow: `pip install "pyarrow>=14,<16"` or `uv add "pyarrow>=14,<16"`

### Also install NumPy 1.x: `pip install "numpy<2"` or `uv add "numpy<2"`.

### If using uv, set `requires-python = "==3.12.*"` in `pyproject.toml` (substitute your own Python version if it differs).

In [1]:
import pandas as pd
import pyarrow

# Record the library versions this notebook was executed with (one per line).
print(pd.__version__, pyarrow.__version__, sep="\n")

3.0.1
14.0.2


In [2]:
import pandas as pd
from pathlib import Path


# Resolve every path against the directory the notebook was launched from.
BASE_DIR = Path.cwd()
events_filepath = BASE_DIR / "data/raw/events_data"
output_path = BASE_DIR / "data/clean/events.parquet"

# Explicit per-column dtypes passed to read_csv: nullable "Int64" for the
# integer-like columns and "string" for the text columns.
dtypes = {
    "event_date": "Int64",
    "event_timestamp": "Int64",
    "event_name": "string",
    "platform": "string",
    "language": "string",
    "user_id": "Int64",
    "user_pseudo_id": "string",
    "tour_id": "Int64",
    "story_id": "Int64",
    "lang_id": "Int64",
    "audio_time_played": "string",
    "audio_time_paused": "string",
}

# Text columns that are stored as categoricals to reduce memory.
CATEGORICAL_COLUMNS = ["event_name", "platform", "language"]

# iterdir() yields entries in arbitrary order; sort so the row order of the
# combined frame (and therefore of the Parquet file) is reproducible.
csv_files = sorted(
    path
    for path in events_filepath.iterdir()
    if path.is_file() and path.suffix.lower() == ".csv"
)
if not csv_files:
    # Fail with a clear message instead of the opaque
    # "No objects to concatenate" that pd.concat([]) would raise.
    raise FileNotFoundError(f"No CSV files found in {events_filepath}")

dfs = []

for file in csv_files:
    print(f"Reading file: {file.name}")
    df = pd.read_csv(file, dtype=dtypes)

    # reduce memory immediately
    for c in CATEGORICAL_COLUMNS:
        if c in df.columns:
            df[c] = df[c].astype("category")

    dfs.append(df)

events_data = pd.concat(dfs, ignore_index=True)

# Concatenating categoricals whose category sets differ between files drops
# the category dtype, so restore it on the combined frame. This is a no-op
# for columns that are still categorical.
for c in CATEGORICAL_COLUMNS:
    if c in events_data.columns:
        events_data[c] = events_data[c].astype("category")

# to_parquet does not create missing directories.
output_path.parent.mkdir(parents=True, exist_ok=True)

# save compressed parquet
events_data.to_parquet(output_path, engine="pyarrow", compression="snappy")

print("Saved to Parquet.")

Reading file: events_data_2025-07.csv
Reading file: events_data_2025-08.csv
Reading file: events_data_2025-09.csv
Reading file: events_data_2025-10.csv
Saved to Parquet.


In [3]:
# Rebind events_data to the frame read back from the Parquet file at
# output_path, so later cells work from the saved file's contents.
events_data = pd.read_parquet(output_path)

In [4]:
# Bare last expression: renders the frame as the cell's output.
events_data

Unnamed: 0,event_date,event_timestamp,event_name,platform,language,user_id,user_pseudo_id,tour_id,story_id,lang_id,audio_time_played,audio_time_paused
0,20250718,1752821825892003,click_listen_now,ANDROID,pt-pt,,cdedf0cb5ee77ff6598c1ccc1988e297,616,,2,,
1,20250718,1752869346776007,click_purchases_tab,ANDROID,it-it,,3c63844bb1b9b300c1b2199aaca9fc14,,,,,
2,20250718,1752843552913022,story_listened_20,ANDROID,it-it,,ad7744bf9d7f13f4464dba7b2059aed4,869,53297,4,,
3,20250718,1752818578211035,story_listened_60,ANDROID,pl-pl,,7eceffaeaaaa494e051d46c6ef418572,403,12529,2,,
4,20250718,1752824669559024,tour_download_progress,ANDROID,es-es,,73f18dd1750ad51d6486a1af895471c4,644,,2,,
...,...,...,...,...,...,...,...,...,...,...,...,...
13471330,20251003,1759480602471002,story_listened_80,ANDROID,fr-fr,,fec9afe36ab7454e11c0f763f4afed92,858,52417,6,,
13471331,20251003,1759481420845009,story_listened_20,ANDROID,fr-fr,,fec9afe36ab7454e11c0f763f4afed92,858,52432,6,,
13471332,20251003,1759479561482002,story_completed,ANDROID,fr-fr,,fec9afe36ab7454e11c0f763f4afed92,858,52394,6,,
13471333,20251003,1759478811852000,play,ANDROID,fr-fr,,fec9afe36ab7454e11c0f763f4afed92,858,52380,6,00:07,
