In [1]:
# Package installation and imports - run this first
import importlib.util
packages = [ 'pandas', 'pyarrow', 'requests', 'tqdm', 'matplotlib', 
            'seaborn', 'numpy', 'boto3', 'botocore', 'geopandas', 'shapely', 
            'fiona', 'pyproj', 'rtree','folium', 'meteostat']
for package_name in packages:
    is_present = importlib.util.find_spec(package_name)
    if is_present is None:
        print(f"{package_name} is not installed")
        !pip install {package_name}
        print(f"{package_name} is now installed")
    else:
        print(f"{package_name} is installed")

pandas is installed
pyarrow is installed
requests is installed
tqdm is installed
matplotlib is installed
seaborn is installed
numpy is installed
boto3 is installed
botocore is installed
geopandas is installed
shapely is installed
fiona is installed
pyproj is installed
rtree is installed
folium is installed
meteostat is installed


In [2]:
from pathlib import Path
from datetime import datetime
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pandas as pd
import pandas as pd
from collections import OrderedDict
import time, os
from meteostat import Point, Hourly
from typing import Optional, Tuple
import requests
try:
    from tqdm import tqdm
    _HAS_TQDM = True
except Exception:
    _HAS_TQDM = False

In [3]:
years_st = [2023,2024]
url_base = 'https://d37ci6vzurychx.cloudfront.net/trip-data'
datasts = ['fhvhv', 'yellow', 'fhv']   # taxi data sets to be downloaded
months = list(range(1, 13))             # all months of the year
file_formt = 'parquet'                 # TLC shares files as parquet
out_put_rt = Path('data/tlc')          # folder to download data to

start = datetime(min(years_st), 1, 1)      # start time for weather download
end   = datetime(max(years_st), 12, 31, 23, 59) 

weather_folder = Path("data/weather/hourly")  # Folder to download weather data to
weather_folder.mkdir(parents=True, exist_ok=True)

skip_flag = True # flag to avoid multiple downloads of the same file
check_remote_size = True  # if no Content-Length from remote server , use local file

out_put_rt.mkdir(parents=True, exist_ok=True)
print('Output root:', out_put_rt.resolve())

Output root: C:\Users\kolobet01\MIT805-Semester-Project-Assignment\data\tlc


In [4]:
# define url that combines the data link, year, months and extention to get correct data fro TLC
def build_url(dataset: str, year: int, month: int, ext: str) -> str:
    ds = dataset.lower()
    return f"{url_base}/{ds}_tripdata_{year}-{month:02d}.{ext}"

In [5]:
#Return remote Content-Length to compare with local lenth if file has previously been downloaded
def Remote_File_content_length(url: str, timeout: float = 30.0):

    try:
        r = requests.head(url, timeout=timeout, allow_redirects=True)
        if r.status_code == 200:
            cl = r.headers.get("Content-Length")
            return int(cl) if cl is not None else None
    except Exception:
        return None
    return None

In [6]:
# function to download file from rmote server
# download file, show progress, retry and save to folder
def data_download(url: str, dest: Path, max_retries: int = 3, backoff: float = 2.0):
    headers = {'User-Agent': 'nyc-tlc-downloader/size-only/1.0'}
    for attempt in range(1, max_retries+1):
        try:
            with requests.get(url, stream=True, timeout=60, headers=headers) as r:
                if r.status_code == 404:
                    print('file not found:', url)
                    return None
                r.raise_for_status()
                total = int(r.headers.get('Content-Length', 0))
                dest.parent.mkdir(parents=True, exist_ok=True)
                pbar = tqdm(total=total, unit='B', unit_scale=True, desc=dest.name) if (_HAS_TQDM and total) else None
                with dest.open('wb') as f:
                    for chunk in r.iter_content(chunk_size=1024*1024):
                        if chunk:
                            f.write(chunk)
                            if pbar: pbar.update(len(chunk))
                if pbar: pbar.close()
                return dest.stat().st_size
        except Exception as e:
            wait = backoff ** attempt
            print(f'Error downloading {url} (attempt {attempt}/{max_retries}): {e}. Retrying in {wait:.1f}s...')
            time.sleep(wait)
    print(f'Failed to download after {max_retries} attempts:', url)
    return None

In [7]:
# search through local file before downloading and return if found

def find_local_file(out_dir: Path, dataset: str, year: int, month: int, prefer_ext: str = 'parquet'):
    cand = out_dir / f"{dataset}_tripdata_{year}-{month:02d}.parquet"
    return (cand, 'parquet') if cand.exists() else None

In [8]:
# =========================
# MAIN LOOP 
# =========================

dt_records = []

for yr in years_st:
    for dtset in datasts:
        out_pt_directy = out_put_rt / str(yr) / dtset
        out_pt_directy.mkdir(parents=True, exist_ok=True)
        for m in months:
            # If local exists, compare local size to remote size (HEAD)
            existing = find_local_file(out_pt_directy, dtset, yr, m, file_formt)
            if skip_flag and existing:
                local_path, local_ext = existing
                url_same = build_url(dtset, yr, m, local_ext)
                
                remote_size = Remote_File_content_length(url_same)
                local_size  = local_path.stat().st_size
                if (remote_size is not None and remote_size == local_size) or (remote_size is None and check_remote_size):
                    print(f"[skip] {local_path.name} (local file exists {'matched' if remote_size else 'not available, use local'})")
                    dt_records.append(OrderedDict([
                        ('dataset', dtset),
                        ('year', yr),
                        ('month', m),
                        ('format', local_ext),
                        ('filename', str(local_path)),
                        ('size_bytes', local_size),
                    ]))
                    continue
                else:
                    print(f"[RE-DOWNLOAD] {local_path.name} (size mismatch: local={local_size}, remote={remote_size})")

            # download file from remote server with format
            url = build_url(dtset, yr, m, file_formt)
            dest = out_pt_directy / f"{dtset}_tripdata_{yr}-{m:02d}.{file_formt}"
            print('\nDownloading:', url)
            size_bytes = data_download(url, dest)

            if size_bytes is None:
                print(f"[Skip No Remote] Missing {yr}-{m:02d} for {dtset} in {file_formt}.")
                continue

            dt_records.append(OrderedDict([
                ('dataset', dtset),
                ('year', yr),
                ('month', m),
                ('format', file_formt),
                ('filename', str(dest)),
                ('size_bytes', size_bytes),
            ]))



[skip] fhvhv_tripdata_2023-01.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-02.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-03.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-04.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-05.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-06.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-07.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-08.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-09.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-10.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-11.parquet (local file exists matched)
[skip] fhvhv_tripdata_2023-12.parquet (local file exists matched)
[skip] yellow_tripdata_2023-01.parquet (local file exists matched)
[skip] yellow_tripdata_2023-02.parquet (local file exists matched)
[skip] yellow_tripdata_2023-03.parquet (local file exists matched)
[skip] 

In [9]:
#  Write file sizes to CSV and cumpute total size file TLC data
df = pd.DataFrame(dt_records)
manifest_csv  = out_put_rt / f"manifest_{min(years_st)}_{max(years_st)}_size_only.csv"
if not df.empty:
    df.to_csv(manifest_csv, index=False)
    total_size_gb = df['size_bytes'].sum() / (1024**3)
    print(f'\nData Record Entries: {len(df)}')
    print(f'Total size : {total_size_gb:.2f} GB')
    try:
        from IPython.display import display
        by_ds = (df.groupby(['dataset','format'])['size_bytes'].sum().reset_index())
        by_ds['size_GB'] = by_ds['size_bytes'] / (1024**3)
        display(by_ds[['dataset','format','size_GB']].sort_values('size_GB', ascending=False))
    except Exception:
        pass
else:
    print(' empty set ')


Data Record Entries: 72
Total size : 12.39 GB


Unnamed: 0,dataset,format,size_GB
1,fhvhv,parquet,10.800386
2,yellow,parquet,1.237491
0,fhv,parquet,0.35601


In [10]:
# NYC point (lat, lon). We'll request records in NYC local time to match TLC timestamps.
nyc = Point(40.7128, -74.0060)

print("Fetching hourly weather from Meteostat… (Jan 2023 → Dec 2024)")
wx = Hourly(nyc, START, END, timezone="America/New_York").fetch()  # hourly DF indexed by time

# Partition & write monthly Parquet for reproducibility / manageable file sizes
wx = wx.copy()
wx["year"]  = wx.index.year
wx["month"] = wx.index.month

written = 0
total_rows = 0
for (y, m), part in wx.groupby(["year", "month"], sort=True):
    dest = OUT_WEATHER / f"weather_hourly_{y}-{m:02d}.parquet"
    if dest.exists():
        print(f"[SKIP] {dest.name} already exists")
        continue
    part.drop(columns=["year", "month"]).to_parquet(dest, index=True)
    written += 1
    total_rows += len(part)

print(f"Done. Wrote {written} files, {total_rows:,} rows total under {OUT_WEATHER}")

Fetching hourly weather from Meteostat… (Jan 2023 → Dec 2024)


NameError: name 'START' is not defined

In [None]:
def download_loopup_file(url: str, dest: Path):
    dest.parent.mkdir(parents=True, exist_ok=True)
    remote = Remote_File_content_length(url)
    if dest.exists():
        local = dest.stat().st_size
        if (remote and remote == local) or (remote is None):
            print(f"[Skip] {dest.name} (exists, size check {'matched' if remote else 'n/a'})")
            return
        print(f"[RE-DOWNLOAD] {dest.name} (local {local} vs remote {remote})")
    print("Downloading:", url)
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(dest, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
    print(f"[OK] {dest.name} → {dest.stat().st_size:,} bytes")


In [None]:
geo_data_dir = Path("data/geo")
geo_data_dir.mkdir(parents=True, exist_ok=True)

url_look_table = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
url_zone_shape  = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"  # Shapefile zip

zone_lookup_csv = geo_data_dir / "taxi_zone_lookup.csv"
zones_zip  = geo_data_dir / "taxi_zones.zip"
zone_shape  = geo_data_dir / "taxi_zones_shp"

download_loopup_file(url_look_table, zone_lookup_csv)
download_loopup_file(url_zone_shape,  zones_zip)

In [None]:
# Collect monthly parquet files
w_files = sorted(weather_folder.glob("*.parquet"))

if not w_files:
    raise SystemExit(f"No weather parquet files found in: {WEATHER_DIR.resolve()}")

# Build single-line summary
size_bytes = sum(p.stat().st_size for p in w_files)
df = pd.DataFrame([{
    "dataset": "weather_hourly",
    "format": "parquet",
    "files": len(w_files),
    "size_GB": size_bytes / (1024**3),
}])


print(f"Hourly Weather Entries: {len(w_files)}")
print(f"Total size : {size_bytes / (1024**3):.2f} GB")


try:
    from IPython.display import display
    display(df[["dataset", "format", "files", "size_GB"]])
except Exception:
    print(df[["dataset", "format", "files", "size_GB"]].to_string(index=False))