In [1]:
#****************MIT805 Assignment part 1 *************************
#****************Name: Tankiso Kolobe     *************************
#****************Student no. 25723007    **************************

In [2]:
# Package installation and imports - run this first
import importlib.util, sys, subprocess

# Modules the notebook needs. The last three (pathlib, datetime, collections)
# belong to the Python standard library, so they are expected to resolve on
# every run and only ever produce an "is installed" line.
packages = ['pandas', 'pyarrow', 'requests', 'tqdm',
            'geopandas', 'shapely', 'IPython',
            'fiona', 'pyproj', 'folium', 'meteostat',
            'pathlib', 'datetime', 'collections']
for package_name in packages:
    # find_spec returns None when the import system cannot locate the module.
    is_present = importlib.util.find_spec(package_name)
    if is_present is None:
        print(f"{package_name} is not installed")
        # Run pip through the interpreter that executes this kernel so the
        # package lands in the environment the notebook actually imports from
        # (a bare `!pip` resolves whatever `pip` is first on PATH).
        pip_result = subprocess.run([sys.executable, "-m", "pip", "install", package_name])
        if pip_result.returncode == 0:
            # Let the import system notice the freshly installed package.
            importlib.invalidate_caches()
            print(f"{package_name} is now installed")
        else:
            # Best effort: report the failure instead of claiming success.
            print(f"{package_name} could not be installed (pip exit code {pip_result.returncode})")
    else:
        print(f"{package_name} is installed")

pandas is installed
pyarrow is installed
requests is installed
tqdm is installed
geopandas is installed
shapely is installed
IPython is installed
fiona is installed
pyproj is installed
folium is installed
meteostat is installed
pathlib is installed
datetime is installed
collections is installed


In [3]:
from pathlib import Path
from datetime import datetime
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pandas as pd
import pandas as pd
from collections import OrderedDict
import time, os
from meteostat import Point, Hourly
from typing import Optional, Tuple
import requests
from IPython.display import display
try:
    from tqdm import tqdm
    _HAS_TQDM = True
except Exception:
    _HAS_TQDM = False

In [4]:
# ---- TLC trip-record download settings ----
url_base = 'https://d37ci6vzurychx.cloudfront.net/trip-data'
years_st = [2023, 2024]                          # years to download
datasts = ['fhvhv', 'yellow', 'fhv']             # taxi data sets to be downloaded
months = [month_no for month_no in range(1, 13)] # all months of the year
file_formt = 'parquet'                           # TLC shares files as parquet

# ---- Local folders ----
out_put_rt = Path('data') / 'tlc'                     # folder to download trip data to
weather_folder = Path('data') / 'weather' / 'hourly'  # folder to download weather data to

# ---- Weather time window: first hour of the first year to the last minute of the last year ----
start = datetime(year=min(years_st), month=1, day=1)
end = datetime(year=max(years_st), month=12, day=31, hour=23, minute=59)

# ---- Download behaviour ----
skip_flag = True          # reuse files that are already on disk instead of downloading again
check_remote_size = True  # when the server reports no Content-Length, accept the local file as-is

# Create both output folders up front (no error if they already exist).
weather_folder.mkdir(parents=True, exist_ok=True)
out_put_rt.mkdir(parents=True, exist_ok=True)
print('Output root:', out_put_rt.resolve())

Output root: C:\Users\kolobet01\MIT805-Semester-Project-Assignment\data\tlc


In [5]:
# Build the TLC download URL from the base link, dataset, year, month and file extension
def build_url(dataset: str, year: int, month: int, ext: str) -> str:
    """Return the URL of one monthly trip-data file.

    The dataset name is lower-cased and the month is zero-padded to two
    digits; the result is ``<url_base>/<dataset>_tripdata_<year>-<MM>.<ext>``.
    """
    file_name = "{}_tripdata_{}-{:02d}.{}".format(dataset.lower(), year, month, ext)
    return "{}/{}".format(url_base, file_name)

In [6]:
# Return the remote Content-Length so it can be compared with the size of a previously downloaded file
def Remote_File_content_length(url: str, timeout: float = 30.0):
    """Ask the server for the size of ``url`` without downloading the body.

    Sends a HEAD request (following redirects) and returns the
    ``Content-Length`` header as an ``int``. Returns ``None`` when the status
    code is not 200, when the header is absent, or when anything in the
    request or the conversion raises.
    """
    try:
        # HEAD request: headers only, redirects allowed
        response = requests.head(url, timeout=timeout, allow_redirects=True)
        if response.status_code != 200:
            return None
        length_header = response.headers.get("Content-Length")
        if length_header is None:
            return None  # server did not report a size
        return int(length_header)
    except Exception:
        return None

In [7]:
# Download a file from the remote server:
# stream it to disk, retry on failure and return the size of the saved file.


def data_download(url: str, dest: Path, max_retries: int = 3, backoff: float = 2.0):
    """Download ``url`` to ``dest`` with retries and return the file size in bytes.

    The body is streamed into a temporary ``<name>.part`` file next to
    ``dest`` and moved onto ``dest`` only after it has been written
    completely. A failed or interrupted attempt therefore never leaves a
    truncated file at ``dest`` and never destroys a file that was already
    there.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    dest : Path
        Target path; its parent folder is created when missing.
    max_retries : int
        Maximum number of attempts.
    backoff : float
        Base of the wait between attempts; attempt ``n`` is followed by a
        sleep of ``backoff ** n`` seconds.

    Returns
    -------
    int or None
        Size of ``dest`` in bytes on success, ``None`` once every attempt has
        failed (the last error is printed).
    """
    # make sure the target folder exists
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = dest.with_name(dest.name + ".part")
    attempt = 0
    while attempt < max_retries:  # retry the download after network cuts
        try:
            # stream the response to keep memory use low; the timeout limits hanging downloads
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()  # turn HTTP error statuses into exceptions
                with tmp_path.open("wb") as file:
                    for chunk in r.iter_content(chunk_size=1024*1024):  # download in chunks
                        if chunk:
                            file.write(chunk)
            # the whole body arrived: publish it under the real name
            tmp_path.replace(dest)
            return dest.stat().st_size  # success
        except Exception as e:
            # drop whatever was written during the failed attempt
            try:
                tmp_path.unlink()
            except OSError:
                pass
            attempt += 1
            if attempt >= max_retries:
                print(f"[Download failed] {url} → {e.__class__.__name__}: {e}")
                return None
            time.sleep(backoff ** attempt)  # exponential backoff
    return None

In [8]:
# Look for an already-downloaded local file before downloading; return it if found

def find_local_file(out_dir: Path, dataset: str, year: int, month: int, prefer_ext: str = 'parquet'):
    """Return ``(path, extension)`` of an existing monthly file, or ``None``.

    Checks ``out_dir`` for ``<dataset>_tripdata_<year>-<MM>.<prefer_ext>``.
    ``prefer_ext`` is the file extension to look for; it defaults to
    ``'parquet'``.
    """
    cand = out_dir / f"{dataset}_tripdata_{year}-{month:02d}.{prefer_ext}"
    return (cand, prefer_ext) if cand.exists() else None

In [9]:
# MAIN LOOP
# Visit every (year, dataset, month) combination, reuse local files that pass
# the size check and download everything else. One record per available file
# is collected in dt_records.

def _tlc_record(dataset, year, month, fmt, path, size):
    """Build one data-record row describing a trip-data file on disk."""
    return OrderedDict([
        ('dataset', dataset),
        ('year', year),
        ('month', month),
        ('format', fmt),
        ('filename', str(path)),
        ('size_bytes', size),
    ])


dt_records = []

for yr in years_st:  # iterate over the years
    for dtset in datasts:  # iterate over the data sets (taxi types)
        # one output folder per year and per taxi type
        out_pt_directy = out_put_rt / str(yr) / dtset
        out_pt_directy.mkdir(parents=True, exist_ok=True)

        for m in months:  # iterate over the months of this data set
            # If a local copy exists, compare its size with the remote size (HEAD request)
            existing_file = find_local_file(out_pt_directy, dtset, yr, m, file_formt)
            if skip_flag and existing_file:
                local_path, local_ext = existing_file
                url_same = build_url(dtset, yr, m, local_ext)  # url for the header check
                remote_size = Remote_File_content_length(url_same)
                local_size = local_path.stat().st_size  # local size for comparison

                # keep the local file when the sizes agree, or when the remote
                # size is unknown and check_remote_size allows trusting the local copy
                sizes_match = remote_size is not None and remote_size == local_size
                trust_local = remote_size is None and check_remote_size
                if sizes_match or trust_local:
                    dt_records.append(_tlc_record(dtset, yr, m, local_ext, local_path, local_size))
                    continue

                # sizes differ: fall through and download the file again
                print(f"[Re-download] {local_path.name} (size mismatch: local={local_size}, remote={remote_size})")

            # build the url and the destination path for the requested file
            url = build_url(dtset, yr, m, file_formt)
            dest = out_pt_directy / f"{dtset}_tripdata_{yr}-{m:02d}.{file_formt}"
            print('\nDownloading:', url)
            size_bytes = data_download(url, dest)

            if size_bytes is not None:
                # add the successful download to the data records
                dt_records.append(_tlc_record(dtset, yr, m, file_formt, dest, size_bytes))
            else:
                print(f"[Skip No Remote] Missing {yr}-{m:02d} for {dtset} in {file_formt}.")



In [10]:
# Write the file sizes to CSV and compute the total size of the TLC data
df = pd.DataFrame(dt_records)
# CSV file named after the first and last configured year
record_csv = out_put_rt / "data_records_{}_{}_size.csv".format(min(years_st), max(years_st))
if df.empty:
    print(' empty set ')
else:
    df.to_csv(record_csv, index=False)  # write the data records to csv
    total_size_GB = df['size_bytes'].sum() / (1024**3)  # total size in GB
    print(f'\nData Record Entries: {len(df)}')
    print(f'Total size : {total_size_GB:.2f} GB')
    # total size per data set (taxi type) and format
    by_ds = (
        df.groupby(['dataset', 'format'])['size_bytes']
        .sum()
        .reset_index()
        .assign(size_GB=lambda totals: totals['size_bytes'] / (1024**3))
    )
    display(by_ds[['dataset', 'format', 'size_GB']].sort_values('size_GB', ascending=False))


Data Record Entries: 72
Total size : 12.39 GB


Unnamed: 0,dataset,format,size_GB
1,fhvhv,parquet,10.800386
2,yellow,parquet,1.237491
0,fhv,parquet,0.35601


In [11]:
#

In [12]:
# Download hourly weather data for NYC and store it as one parquet file per month

# New York City reference point (lat, lon).
nyc = Point(40.7128, -74.0060)

# Report the range that is actually requested (derived from start/end) so the
# message stays correct when the configured years change.
print(f"Fetching hourly weather from Meteostat… ({start:%b %Y} → {end:%b %Y})")
weather_hourly = Hourly(nyc, start, end, timezone="America/New_York").fetch()  # fetch hourly weather for NYC

# Work on a copy and add helper columns used only for partitioning
weather_hourly = weather_hourly.copy()

weather_hourly["year"]  = weather_hourly.index.year
weather_hourly["month"] = weather_hourly.index.month

saved_files = 0
total_rows = 0
# group by year and month: one output file per group
for (y, m), part in weather_hourly.groupby(["year", "month"], sort=True):
    dest = weather_folder / f"weather_hourly_{y}-{m:02d}.parquet"  # parquet file for this month
    if dest.exists():
        # The data has already been fetched above; only the write is skipped
        # so an existing monthly file is never overwritten.
        print(f"[Skip] {dest.name} already exists")
        continue
    # remove the helper columns before writing
    part.drop(columns=["year", "month"]).to_parquet(dest, index=True)
    saved_files += 1
    total_rows += len(part)

# the counters cover only the files written in this run
print(f"Done. {saved_files} files saved, {total_rows:,} rows total under {weather_folder}")

Fetching hourly weather from Meteostat… (Jan 2023 → Dec 2024)
[Skip] weather_hourly_2023-01.parquet already exists
[Skip] weather_hourly_2023-02.parquet already exists
[Skip] weather_hourly_2023-03.parquet already exists
[Skip] weather_hourly_2023-04.parquet already exists
[Skip] weather_hourly_2023-05.parquet already exists
[Skip] weather_hourly_2023-06.parquet already exists
[Skip] weather_hourly_2023-07.parquet already exists
[Skip] weather_hourly_2023-08.parquet already exists
[Skip] weather_hourly_2023-09.parquet already exists
[Skip] weather_hourly_2023-10.parquet already exists
[Skip] weather_hourly_2023-11.parquet already exists
[Skip] weather_hourly_2023-12.parquet already exists
[Skip] weather_hourly_2024-01.parquet already exists
[Skip] weather_hourly_2024-02.parquet already exists
[Skip] weather_hourly_2024-03.parquet already exists
[Skip] weather_hourly_2024-04.parquet already exists
[Skip] weather_hourly_2024-05.parquet already exists
[Skip] weather_hourly_2024-06.parquet

In [13]:
# Download and save a reference file (e.g. the zone look-up table CSV)

def download_loopup_file(url: str, dest: Path):
    """Download ``url`` to ``dest`` unless an acceptable local copy exists.

    When ``dest`` already exists its size is compared with the remote
    ``Content-Length``: the download is skipped if the sizes match or if the
    remote size is unknown. Otherwise the file is streamed into a temporary
    ``<name>.part`` file and moved onto ``dest`` only after the whole body has
    been written, so a failed download cannot leave a truncated ``dest`` that
    a later run would accept as valid.

    Errors from the HTTP request or from writing the file propagate to the
    caller.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        # the remote size is only needed when there is a local file to compare with
        remote = Remote_File_content_length(url)
        local = dest.stat().st_size  # local file size
        if (remote and remote == local) or (remote is None):
            # local copy matches, or the remote size is unknown: keep it
            return
        print(f"[Re-download] {dest.name} (local {local} vs remote {remote})")
    print("Downloading:", url)
    tmp_path = dest.with_name(dest.name + ".part")
    try:
        # perform the HTTP request and stream the body to the temporary file
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024*1024):
                    if chunk:
                        f.write(chunk)
        # complete: publish the file under its real name
        tmp_path.replace(dest)
    finally:
        # after a failure the partial file is still there; remove it
        if tmp_path.exists():
            tmp_path.unlink()
    print(f"[OK] {dest.name} → {dest.stat().st_size:,} bytes")


In [14]:
# Local folder for the taxi-zone reference files
geo_data_dir = Path("data") / "geo"
geo_data_dir.mkdir(parents=True, exist_ok=True)

# Local targets inside the geo folder
zone_lookup_csv = geo_data_dir.joinpath("taxi_zone_lookup.csv")
zones_zip = geo_data_dir.joinpath("taxi_zones.zip")
zone_shape = geo_data_dir.joinpath("taxi_zones_shp")

# Remote sources
url_look_table = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zone_lookup.csv"
url_zone_shape = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"  # Shapefile zip

# Fetch the look-up table first, then the zipped shapefile
download_loopup_file(url=url_look_table, dest=zone_lookup_csv)
download_loopup_file(url=url_zone_shape, dest=zones_zip)

In [15]:
# Print the number of weather files and their total size after download

weather_files = sorted(weather_folder.glob("*.parquet"))

if len(weather_files) == 0:
    print(f"No weather files found in: {weather_folder.resolve()}")

# Build a one-row summary of the weather files on disk
size_bytes = sum(parquet_path.stat().st_size for parquet_path in weather_files)
df = pd.DataFrame({
    "dataset": ["weather_hourly"],
    "format": ["parquet"],
    "files": [len(weather_files)],
    "size_GB": [size_bytes / (1024**3)],
})


print(f"Hourly Weather Entries: {len(weather_files)}")
print(f"Total size : {size_bytes / (1024**3):.4f} GB")

display(df[["dataset", "format", "files", "size_GB"]])


Hourly Weather Entries: 24
Total size : 0.0005 GB


Unnamed: 0,dataset,format,files,size_GB
0,weather_hourly,parquet,24,0.000478
