In [None]:
import os
from datetime import datetime, timezone
from io import BytesIO

import boto3
import pandas as pd
import requests as req
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the AWS_* settings read from os.environ further down).
load_dotenv()

# Source selection: zero-padded month numbers "01".."05" of a single year.
months = [f"{month_number:02d}" for month_number in range(1, 6)]
year = "2023"

# Destination S3 bucket for the uploads performed by this script.
bucket_name = "datalake-prd-tlc-trips"


# Ingestion timestamp in UTC, used to build the year=/month=/day= partition
# path of every uploaded object.
# datetime.utcnow() is deprecated since Python 3.12 and returns a naive
# value; datetime.now(timezone.utc) yields the same wall-clock fields.
# NOTE: ingest_dt is now timezone-aware, so it must not be compared or
# subtracted against naive datetimes.
ingest_dt = datetime.now(timezone.utc)

# Zero-padded partition components (e.g. "2024", "03", "07").
ingest_year = f"{ingest_dt.year:04d}"
ingest_month = f"{ingest_dt.month:02d}"
ingest_day = f"{ingest_dt.day:02d}"

# Build the boto3 session from explicit settings held in environment
# variables. A missing variable raises KeyError at this point.
_session_settings = {
    "aws_access_key_id": os.environ["AWS_ACCESS_KEY"],
    "aws_secret_access_key": os.environ["AWS_SECRET_ACCESS_KEY"],
    "region_name": os.environ["AWS_REGION"],
}
session = boto3.Session(**_session_settings)

# S3 client used for the uploads in this script.
s3_client = session.client("s3")

# Download each monthly yellow-taxi trip file and land it in S3. The
# partition path reflects the ingestion date, not the month of the trips;
# the trip year/month only appears in the object's file name.
for month in months:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year}-{month}.parquet"
    resp = req.get(url, timeout=60)
    # Stop on any HTTP error status instead of landing an error response.
    resp.raise_for_status()

    # Parse the payload only as a sanity check: anything that is not
    # readable Parquet raises here, before the bucket is touched.
    df = pd.read_parquet(BytesIO(resp.content))

    s3_key = (
        f"landing-zone/yellow_tripdata/"
        f"year={ingest_year}/month={ingest_month}/day={ingest_day}/"
        f"yellow_tripdata_{year}-{month}.parquet"
    )

    # Upload the bytes exactly as downloaded. Re-serializing the DataFrame
    # with to_parquet() would store a freshly encoded file rather than the
    # source file, and would cost an extra full encode plus a second
    # in-memory copy for no benefit in a landing zone.
    buf = BytesIO(resp.content)
    s3_client.upload_fileobj(buf, bucket_name, s3_key)
    print(f"{year}-{month} -> s3://{bucket_name}/{s3_key}")


# Download the taxi-zone lookup table and land it in S3 under the same
# ingestion-date partition scheme as the trip files.
url_lookup = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
resp_lkp = req.get(url_lookup, timeout=60)
# Stop on any HTTP error status instead of landing an error response.
resp_lkp.raise_for_status()

# Parse the payload as a sanity check (malformed CSV raises before upload);
# df_lookup is also kept in case it is used further on in the session.
df_lookup = pd.read_csv(BytesIO(resp_lkp.content))

s3_key_lookup = (
    f"landing-zone/taxi_zone/"
    f"year={ingest_year}/month={ingest_month}/day={ingest_day}/"
    f"taxi_zone_lookup.csv"
)

# Upload the original bytes, not df_lookup.to_csv(). With default settings
# read_csv() parses tokens such as "N/A" and "NA" as missing values, and
# to_csv() writes missing values as empty fields, so a round trip through
# the DataFrame would silently blank those cells in the landed file.
buf = BytesIO(resp_lkp.content)
s3_client.upload_fileobj(buf, bucket_name, s3_key_lookup)
print(f"lookup -> s3://{bucket_name}/{s3_key_lookup}")