## Download and Ingest Data

In [None]:
import duckdb
import requests
from pathlib import Path

In [17]:
# Root of the GitHub release downloads. The download function below appends
# "/<taxi_type>/<file name>.csv.gz" to this URL to fetch each monthly file.
BASE_URL = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download"

In [18]:
def download_and_convert_files(taxi_type, years=(2019, 2020)):
    """Download monthly trip data for one taxi type and store it as Parquet.

    For every month of every requested year, the gzip-compressed CSV is
    fetched from ``{BASE_URL}/{taxi_type}/``, converted to Parquet with
    DuckDB, and written to ``data/<taxi_type>/``. Months whose Parquet file
    already exists are skipped, so the function can be re-run safely.

    Args:
        taxi_type: Name used both as the sub-directory under ``data/`` and
            as the file-name prefix (the notebook calls this with
            ``"yellow"`` and ``"green"``).
        years: Iterable of years to fetch; all twelve months are processed
            for each. Defaults to 2019 and 2020.

    Raises:
        requests.HTTPError: If a download returns an error status.
        requests.RequestException: On connection problems or timeouts.
        duckdb.Error: If the CSV cannot be converted.
    """
    data_dir = Path("data") / taxi_type
    data_dir.mkdir(exist_ok=True, parents=True)

    def sql_quote(path):
        # Double single quotes so the path is a valid SQL string literal.
        return str(path).replace("'", "''")

    for year in years:
        for month in range(1, 13):
            stem = f"{taxi_type}_tripdata_{year}-{month:02d}"
            parquet_filename = f"{stem}.parquet"
            parquet_filepath = data_dir / parquet_filename

            if parquet_filepath.exists():
                print(f"Skipping {parquet_filename} (already exists)")
                continue

            csv_gz_filename = f"{stem}.csv.gz"
            csv_gz_filepath = data_dir / csv_gz_filename
            # Convert into a temporary name first: a half-written file must
            # never satisfy the exists() check above or a "*.parquet" glob.
            partial_filepath = data_dir / f"{parquet_filename}.tmp"

            try:
                # The timeout bounds connection setup and each socket read,
                # so a stalled server cannot hang the notebook forever.
                with requests.get(
                    f"{BASE_URL}/{taxi_type}/{csv_gz_filename}",
                    stream=True,
                    timeout=60,
                ) as response:
                    response.raise_for_status()
                    with open(csv_gz_filepath, "wb") as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)

                print(f"Converting {csv_gz_filename} to Parquet...")
                con = duckdb.connect()
                try:
                    con.execute(f"""
                        COPY (SELECT * FROM read_csv_auto('{sql_quote(csv_gz_filepath)}'))
                        TO '{sql_quote(partial_filepath)}' (FORMAT PARQUET)
                    """)
                finally:
                    con.close()

                # Publish the finished file under its final name.
                partial_filepath.replace(parquet_filepath)
            finally:
                # Remove the CSV.gz file to save space, and any leftover
                # partial output when the download or conversion failed.
                csv_gz_filepath.unlink(missing_ok=True)
                partial_filepath.unlink(missing_ok=True)

            print(f"Completed {parquet_filename}")

In [19]:
def update_gitignore():
    """Ensure ``.gitignore`` in the working directory ignores ``data/``.

    The file is created when missing. An existing file is only appended to,
    and only when no line is already exactly ``data/`` or ``/data/``.
    """
    gitignore_path = Path(".gitignore")

    # Read existing content or start with empty string
    content = gitignore_path.read_text(encoding="utf-8") if gitignore_path.exists() else ""

    # Compare whole lines: a plain substring test would be fooled by entries
    # such as "metadata/" or a comment that merely mentions "data/".
    existing_rules = {line.strip() for line in content.splitlines()}
    if existing_rules & {"data/", "/data/"}:
        return

    with open(gitignore_path, "a", encoding="utf-8") as f:
        # A leading newline separates the new rule from existing content.
        f.write('\n# Data directory\ndata/\n' if content else '# Data directory\ndata/\n')

In [20]:
# Add the data/ ignore rule to .gitignore (if it is not there yet).
update_gitignore()

In [21]:
# Fetch and convert the monthly files for each taxi type in turn.
for taxi_type in ("yellow", "green"):
    download_and_convert_files(taxi_type)

Skipping yellow_tripdata_2019-01.parquet (already exists)
Skipping yellow_tripdata_2019-02.parquet (already exists)
Skipping yellow_tripdata_2019-03.parquet (already exists)
Skipping yellow_tripdata_2019-04.parquet (already exists)
Skipping yellow_tripdata_2019-05.parquet (already exists)
Skipping yellow_tripdata_2019-06.parquet (already exists)
Skipping yellow_tripdata_2019-07.parquet (already exists)
Skipping yellow_tripdata_2019-08.parquet (already exists)
Skipping yellow_tripdata_2019-09.parquet (already exists)
Skipping yellow_tripdata_2019-10.parquet (already exists)
Skipping yellow_tripdata_2019-11.parquet (already exists)
Skipping yellow_tripdata_2019-12.parquet (already exists)
Skipping yellow_tripdata_2020-01.parquet (already exists)
Skipping yellow_tripdata_2020-02.parquet (already exists)
Skipping yellow_tripdata_2020-03.parquet (already exists)
Skipping yellow_tripdata_2020-04.parquet (already exists)
Skipping yellow_tripdata_2020-05.parquet (already exists)
Skipping yello

In [22]:
# Open a connection to the DuckDB database file at this relative path.
con = duckdb.connect("taxi_rides_ny/taxi_rides_ny.duckdb")
# Make sure the "prod" schema exists; the tables created later live in it.
con.execute("CREATE SCHEMA IF NOT EXISTS prod")

<_duckdb.DuckDBPyConnection at 0x107e4f9b0>

In [23]:
# (Re)build one table per taxi type, prod.<taxi_type>_tripdata, from every
# Parquet file under data/<taxi_type>/. CREATE OR REPLACE drops any previous
# version of the table, so re-running this cell starts from scratch.
# NOTE(review): union_by_name=true presumably aligns columns by name across
# monthly files whose schemas differ — confirm against the DuckDB docs.
for taxi_type in ["yellow", "green"]:
        con.execute(f"""
            CREATE OR REPLACE TABLE prod.{taxi_type}_tripdata AS
            SELECT * FROM read_parquet('data/{taxi_type}/*.parquet', union_by_name=true)
        """)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [24]:
# Close the database connection now that the tables have been created.
con.close()