In [None]:
from pathlib import Path
import requests
import zipfile
import pandas as pd
import numpy as np

def fetch_raw_data(year: int, month: int, timeout: float = 60.0) -> str:
    """Download one month of Citi Bike trip data and convert it to Parquet.

    The archive is looked up under two known file-name patterns, extracted
    into ``../data/raw``, and every CSV that came out of the archive is read
    and concatenated. Rows whose start/end station ID is not numeric are
    dropped, the IDs are cast to ``int64``, and the result is written to
    ``../data/raw/rides_<year>_<month>.parquet``. The downloaded ZIP and the
    extracted CSVs are then removed on a best-effort basis.

    Args:
        year: Four-digit year of the data set.
        month: Month number; it is zero-padded to two digits in file names.
        timeout: Seconds ``requests`` waits for the server to connect/respond
            before giving up (this is not a cap on total download time).

    Returns:
        The path of the Parquet file that was written, as a string.

    Raises:
        FileNotFoundError: If neither archive URL returns HTTP 200, or the
            archive contains no CSV files.
        KeyError: If the data lacks ``start_station_id`` or
            ``end_station_id``.
    """
    base_url = "https://s3.amazonaws.com/tripdata"
    patterns = [
        f"{year}{month:02}-citibike-tripdata.csv.zip",
        f"{year}{month:02}-citibike-tripdata.zip",
    ]

    raw_dir = Path("..") / "data" / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

    # 1) Download ZIP
    zip_path = None
    for fname in patterns:
        url = f"{base_url}/{fname}"
        # The context manager releases the streamed connection on every path,
        # and the timeout keeps a stalled server from hanging the notebook.
        with requests.get(url, stream=True, timeout=timeout) as resp:
            if resp.status_code != 200:
                print(f"{url} returned {resp.status_code}")
                continue
            zip_path = raw_dir / fname
            with open(zip_path, "wb") as f:
                for chunk in resp.iter_content(8_192):
                    f.write(chunk)
        print(f"Downloaded {url}")
        break
    if zip_path is None:
        raise FileNotFoundError(f"No CSV ZIP found for {year}-{month:02}")

    # 2) Extract, remembering exactly which files this archive produced.
    #    Globbing raw_dir afterwards could pick up (and later delete) CSVs
    #    that were already there and have nothing to do with this download.
    with zipfile.ZipFile(zip_path, "r") as z:
        extracted = [Path(z.extract(member, raw_dir)) for member in z.infolist()]
    print(f"Extracted to {raw_dir}")

    # 3) Collect the extracted CSV files, sorted so the row order of the
    #    output is deterministic. macOS archive metadata (a "__MACOSX" folder
    #    and "._" resource-fork files), if present, is not real data.
    csvs = sorted(
        {
            path
            for path in extracted
            if path.is_file()
            and path.suffix.lower() == ".csv"
            and "__MACOSX" not in path.parts
            and not path.name.startswith("._")
        }
    )
    if not csvs:
        raise FileNotFoundError(f"No CSVs found after extracting {zip_path}")

    # 4) Read & concatenate
    dfs = []
    for csv_path in csvs:
        print(f"Reading {csv_path.relative_to(raw_dir)}")
        # low_memory=False infers each column's dtype from the whole file,
        # which avoids the mixed-type DtypeWarning of chunked inference.
        dfs.append(pd.read_csv(csv_path, low_memory=False))
    df = pd.concat(dfs, ignore_index=True)

    # 5) Enforce strictly numeric station IDs
    id_cols = ["start_station_id", "end_station_id"]
    missing = [col for col in id_cols if col not in df.columns]
    if missing:
        raise KeyError(f"Expected station ID column(s) missing: {missing}")
    for col in id_cols:
        # Non-numeric IDs become NaN here and are dropped just below.
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df.dropna(subset=id_cols)
    # NOTE(review): the int64 cast truncates any fractional part of an ID;
    # confirm that is intended if IDs can be non-integer numbers.
    for col in id_cols:
        df[col] = df[col].astype(np.int64)

    # 6) Write out Parquet
    out_path = raw_dir / f"rides_{year}_{month:02}.parquet"
    df.to_parquet(out_path, index=False)
    print(f"Converted to parquet: {out_path}")

    # 7) Cleanup ZIP and CSVs (but leave any other files/folders intact).
    #    Best effort: each file is attempted independently so one failure
    #    does not leave the remaining files behind.
    all_removed = True
    for path in [zip_path, *csvs]:
        try:
            path.unlink()
        except OSError as e:
            all_removed = False
            print(f"Cleanup warning: {e}")
    if all_removed:
        print("Cleaned up ZIP and CSV files")

    return str(out_path)


In [None]:
# Fetch and convert the October 2024 data; the cell's displayed value is the
# path of the Parquet file that fetch_raw_data wrote.
fetch_raw_data(2024, 10)

https://s3.amazonaws.com/tripdata/202410-citibike-tripdata.csv.zip returned 404
Downloaded https://s3.amazonaws.com/tripdata/202410-citibike-tripdata.zip
Extracted to ..\data\raw
→ Reading 202410-citibike-tripdata\202410-citibike-tripdata_1.csv


  dfs.append(pd.read_csv(csv))


→ Reading 202410-citibike-tripdata\202410-citibike-tripdata_2.csv


  dfs.append(pd.read_csv(csv))


→ Reading 202410-citibike-tripdata\202410-citibike-tripdata_3.csv


  dfs.append(pd.read_csv(csv))


→ Reading 202410-citibike-tripdata\202410-citibike-tripdata_4.csv


  dfs.append(pd.read_csv(csv))


→ Reading 202410-citibike-tripdata\202410-citibike-tripdata_5.csv


  dfs.append(pd.read_csv(csv))


→ Reading 202410-citibike-tripdata\202410-citibike-tripdata_6.csv


  dfs.append(pd.read_csv(csv))


Converted to parquet: ..\data\raw\rides_2024_10.parquet
Cleaned up ZIP and CSV files


'..\\data\\raw\\rides_2024_10.parquet'