In [1]:
import os
import pandas as pd

# Where the working sample is cached locally, and the upstream CSV it is fetched
# from on a cache miss. Both are bound up front so this cell defines the same
# names on every run, whether or not the cache file already exists.
local_csv = "../data/raw/nyc_taxi_sample.csv"
source_url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/taxis.csv"

if os.path.exists(local_csv):
    # 1) Sample already exists locally -> just load it (fast, no network)
    print("Loading existing NYC taxi sample from disk...")
    df_sample = pd.read_csv(local_csv)
else:
    # 2) First run: download the taxi CSV (no parquet / pyarrow needed)
    print("Downloading NYC taxi sample CSV (this is a real NYC taxi dataset)...")

    # Read straight into df_sample: the dataset is small, so no sub-sampling is
    # done here, and no intermediate frame is left behind that would exist only
    # on the first run. If the source were huge, this is where to sample it.
    df_sample = pd.read_csv(source_url)

    # Make sure the target folder exists. dirname() returns "" for a bare
    # filename and os.makedirs("") raises, so fall back to the current directory.
    os.makedirs(os.path.dirname(local_csv) or ".", exist_ok=True)

    # Write to a temporary file and rename it into place. An interrupted write
    # then never leaves a truncated file at local_csv, which later runs would
    # otherwise load as if it were the complete sample.
    tmp_csv = local_csv + ".tmp"
    try:
        df_sample.to_csv(tmp_csv, index=False)
        os.replace(tmp_csv, local_csv)
    finally:
        # Only present here if the write or the rename failed part-way.
        if os.path.exists(tmp_csv):
            os.remove(tmp_csv)
    print(f"Sample saved to {local_csv}")

print("Data loaded. Shape:", df_sample.shape)
df_sample.head()


Downloading NYC taxi sample CSV (this is a real NYC taxi dataset)...
Sample saved to ../data/raw/nyc_taxi_sample.csv
Data loaded. Shape: (6433, 14)


Unnamed: 0,pickup,dropoff,passengers,distance,fare,tip,tolls,total,color,payment,pickup_zone,dropoff_zone,pickup_borough,dropoff_borough
0,2019-03-23 20:21:09,2019-03-23 20:27:24,1,1.6,7.0,2.15,0.0,12.95,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,Manhattan
1,2019-03-04 16:11:55,2019-03-04 16:19:00,1,0.79,5.0,0.0,0.0,9.3,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,Manhattan
2,2019-03-27 17:53:01,2019-03-27 18:00:25,1,1.37,7.5,2.36,0.0,14.16,yellow,credit card,Alphabet City,West Village,Manhattan,Manhattan
3,2019-03-10 01:23:59,2019-03-10 01:49:51,1,7.7,27.0,6.15,0.0,36.95,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,Manhattan
4,2019-03-30 13:27:42,2019-03-30 13:37:14,3,2.16,9.0,1.1,0.0,13.4,yellow,credit card,Midtown East,Yorkville West,Manhattan,Manhattan
