In [1]:
from pathlib import Path
import requests
import pandas as pd
import zipfile
import io

def fetch_raw_data(year: int, month: int, timeout: float = 60.0) -> str:
    """Download one monthly Citi Bike trip archive and save it as Parquet.

    Fetches ``JC-{year}{month:02}-citibike-tripdata.csv.zip`` from the
    ``tripdata`` S3 bucket, parses the CSV it contains with pandas, and writes
    the frame to ``../data/raw/rides_{year}_{month:02}.parquet`` (relative to
    the current working directory). Missing parent directories are created.

    Args:
        year: Year used in the archive name (e.g. ``2023``).
        month: Month used in the archive name; zero-padded to two digits.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            fails instead of blocking the kernel forever.

    Returns:
        The path of the written Parquet file, as a string.

    Raises:
        Exception: If the server answers with a status other than 200, or the
            archive contains no usable file.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    url = f"https://s3.amazonaws.com/tripdata/JC-{year}{month:02}-citibike-tripdata.csv.zip"
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data from {url}")

    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as archive:
        # Ignore directory entries and macOS resource-fork metadata, which can
        # sit next to (or ahead of) the real data file inside the archive.
        members = [
            name
            for name in archive.namelist()
            if not name.endswith("/") and not name.startswith("__MACOSX/")
        ]
        # Prefer an actual CSV; fall back to the first remaining member so an
        # archive holding a single oddly named file still loads.
        csv_members = [name for name in members if name.lower().endswith(".csv")]
        candidates = csv_members or members
        if not candidates:
            raise Exception(f"No data file found in archive from {url}")
        csv_name = candidates[0]

        # Context manager guarantees the member handle is closed after parsing.
        with archive.open(csv_name) as csv_file:
            df = pd.read_csv(csv_file)

    path = Path("..") / "data" / "raw" / f"rides_{year}_{month:02}.parquet"
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(path, index=False)
    print(f"Saved: {path}")
    return str(path)


In [2]:
# Download January 2023; the returned Parquet path is displayed as the cell output.
fetch_raw_data(year=2023, month=1)

Saved: ..\data\raw\rides_2023_01.parquet


'..\\data\\raw\\rides_2023_01.parquet'