In [None]:
import os

import pickle
import numpy as np
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt

# Configure warnings and plotting
import warnings
warnings.filterwarnings('ignore')
# %matplotlib inline

In [None]:
# ---------------------------
# Load Reference Data
# Make sure you have downloaded the data/shapefile folder before running this cell
# ---------------------------

# Taxi zone shapefile: only the Manhattan zones form the study region
tz = gpd.read_file('data/shapefile/taxiZone/geo_export_bb555bf4-8fc5-4144-b5f6-615889d80884.shp')
mh_id = tz.loc[tz['borough'] == 'Manhattan', 'location_i'].unique()

# Congestion zone CSV file: its zone IDs drive the zone classification later on
cg = pd.read_csv('data/shapefile/taxiZone/congestZone.csv')
cg['location_i'] = cg['location_i'].round()  # round to whole-number zone IDs
c_id = cg['location_i'].unique()

In [None]:
# ---------------------------
# Define Cleaning Functions
# ---------------------------

def time_check(df):
    """
    Parse the pickup/drop-off timestamps and derive time, duration and speed features.

    Parameters:
        df (DataFrame): Raw taxi trip records containing the columns
            'tpep_pickup_datetime', 'tpep_dropoff_datetime' and 'trip_distance'.

    Returns:
        DataFrame: A renamed copy of the input ('PU_time' / 'DO_time') with the
            added columns date, year, month, day, dow, dayID, pu_hour, pu_min,
            do_hour, do_min, trip_duration (seconds) and speed
            (trip_distance per hour).
    """
    # rename() returns a new frame, so the caller's frame is left untouched
    trips = df.rename(columns={'tpep_pickup_datetime': 'PU_time',
                               'tpep_dropoff_datetime': 'DO_time'})
    for stamp_col in ('PU_time', 'DO_time'):
        trips[stamp_col] = pd.to_datetime(trips[stamp_col])

    pickup = trips['PU_time'].dt
    dropoff = trips['DO_time'].dt

    # Calendar features are all taken from the pickup timestamp
    trips['date'] = pickup.date
    trips['year'] = pickup.year
    trips['month'] = pickup.month
    trips['day'] = pickup.day
    trips['dow'] = pickup.dayofweek

    # Zero-padded 'MMDD' key, used for future table joins
    month_txt = trips['month'].astype(str).str.zfill(2)
    day_txt = trips['day'].astype(str).str.zfill(2)
    trips['dayID'] = month_txt + day_txt

    # Clock features for both ends of the trip
    trips['pu_hour'] = pickup.hour
    trips['pu_min'] = pickup.minute
    trips['do_hour'] = dropoff.hour
    trips['do_min'] = dropoff.minute

    # Duration in seconds; speed is distance divided by duration in hours
    elapsed = trips['DO_time'] - trips['PU_time']
    trips['trip_duration'] = elapsed.dt.total_seconds()
    trips['speed'] = trips['trip_distance'] / (trips['trip_duration'] / 3600)
    return trips

def zones_check(df, congestion_ids=None):
    """
    Classify trips based on whether their pickup and drop-off locations fall within congested regions.
    Refer to the paper for further details.

    Each trip gets a two-letter code: the first letter describes the pickup,
    the second the drop-off; 'a' means the location ID is in the congestion
    zone list and 'b' means it is not (so 'aa', 'ab', 'ba' or 'bb').

    Parameters:
        df (DataFrame): Taxi trip records with 'PULocationID' and 'DOLocationID'.
            The 'zones' column is written onto this frame in place.
        congestion_ids (array-like, optional): Zone IDs counted as congested.
            Defaults to the module-level `c_id` loaded from the congestion zone CSV.

    Returns:
        DataFrame: The same frame, with an added 'zones' column indicating zone combinations.
    """
    if congestion_ids is None:
        congestion_ids = c_id

    # Evaluate each membership test once and reuse it for all four combinations
    pu_inside = df.PULocationID.isin(congestion_ids)
    do_inside = df.DOLocationID.isin(congestion_ids)

    zones_conditions = [
        pu_inside & do_inside,
        pu_inside & ~do_inside,
        ~pu_inside & do_inside,
        ~pu_inside & ~do_inside,
    ]
    zones = ['aa', 'ab', 'ba', 'bb']

    # The four conditions are exhaustive, so the default is never selected.
    # It must still be a string: np.select's implicit default of 0 cannot be
    # promoted to a common dtype with string choices on recent NumPy releases.
    df['zones'] = np.select(zones_conditions, zones, default='')
    return df

def basic_check(df, boundary_id):
    """
    Keep only weekday trips that start and end inside the boundary and whose
    core numeric fields are strictly positive.

    Parameters:
        df (DataFrame): Taxi trip records with 'PULocationID', 'DOLocationID',
            'dow', 'trip_distance', 'trip_duration', 'speed', 'fare_amount'
            and 'total_amount' columns.
        boundary_id (array-like): Valid zone IDs (e.g., Manhattan).

    Returns:
        DataFrame: The rows of `df` that pass every filter.
    """
    # Both ends of the trip must lie within the boundary
    inside_boundary = df.PULocationID.isin(boundary_id) & df.DOLocationID.isin(boundary_id)

    # Monday-Friday only (dow: 0-4)
    is_weekday = df.dow <= 4

    # Every listed quantity must be > 0 (missing values compare as False)
    positive_cols = ['trip_distance', 'trip_duration', 'speed',
                     'fare_amount', 'total_amount']
    all_positive = (df[positive_cols] > 0).all(axis=1)

    return df[inside_boundary & is_weekday & all_positive]

def sanity_check(df):
    """
    Drop trips whose fare, distance, speed or duration fall outside plausible ranges.

    The lower bounds for distance and speed are the 1st percentiles of the
    frame passed in; all ranges are inclusive at both ends.

    Parameters:
        df (DataFrame): Taxi trip records with 'fare_amount', 'trip_distance',
            'speed' and 'trip_duration' (seconds) columns.

    Returns:
        DataFrame: Taxi trip records passing sanity checks.
    """
    # Data-driven lower bounds, computed on the incoming frame
    min_distance = df.trip_distance.quantile(0.01)
    min_speed = df.speed.quantile(0.01)

    fare_ok = df.fare_amount.between(2.5, 300)
    distance_ok = df.trip_distance.between(min_distance, 100)
    speed_ok = df.speed.between(min_speed, 80)
    # Duration must be between 5 and 100 minutes, expressed in seconds
    duration_ok = df.trip_duration.between(5 * 60, 100 * 60)

    return df[fare_ok & distance_ok & speed_ok & duration_ok]

# ---------------------------
# Wrapper Function for Processing and Aggregation
# ---------------------------

def clean_and_aggregate(month, 
                        input_dir="data/taxi", 
                        cleaned_dir="data/taxi/cleaned_taxi", 
                        agg_dir="data/taxi/agg_yellow"):
    """
    Run the full cleaning pipeline for one month of yellow-taxi trips, then
    write the cleaned trips and an hourly per-zone aggregate to CSV.

    Parameters:
        month (str): Month string in the format 'YYYY-MM'.
        input_dir (str): Directory containing raw taxi data in parquet format
            (files named 'yellow_tripdata_<month>.parquet').
        cleaned_dir (str): Directory to save cleaned taxi data.
        agg_dir (str): Directory to save aggregated taxi data.

    Returns:
        None. Both output directories are created if they do not exist.
    """
    print(f"Processing month: {month}")

    # Read the raw parquet file and apply the cleaning steps in sequence
    raw_path = f"{input_dir}/yellow_tripdata_{month}.parquet"
    trips = (
        pd.read_parquet(raw_path)
        .pipe(time_check)
        .pipe(basic_check, boundary_id=mh_id)
        .pipe(sanity_check)
        .pipe(zones_check)
    )

    # Further filter the DataFrame to trips picked up within the specified month.
    # NOTE: this runs after sanity_check, so the quantile thresholds there are
    # computed on every row of the file, including out-of-month pickups.
    month_start = pd.to_datetime(f"{month}-01")
    next_month_start = month_start + pd.offsets.MonthBegin(1)  # start of the next month
    in_month = (trips.PU_time >= month_start) & (trips.PU_time < next_month_start)
    trips = trips[in_month]

    # Save cleaned data
    os.makedirs(cleaned_dir, exist_ok=True)
    trips.to_csv(os.path.join(cleaned_dir, f"yellow_tripdata_{month}.csv"), index=False)

    # Aggregate: trip count and total fare per date, pickup hour, pickup zone and zone class
    group_keys = ['date', 'year', 'dayID', 'pu_hour', 'PULocationID', 'dow', 'zones']
    hourly = (
        trips.groupby(group_keys, as_index=False)
        .agg(PUn_trips=('DOLocationID', 'count'),
             fare_amount=('fare_amount', 'sum'))
        .rename(columns={'PULocationID': 'locID'})  # rename for clarity
    )

    # Save aggregated data
    os.makedirs(agg_dir, exist_ok=True)
    hourly.to_csv(os.path.join(agg_dir, f"yellow_aggtrip_{month}.csv"), index=False)

    print(f"Finished processing month: {month}. Cleaned and aggregated files saved.")


In [None]:
# ---------------------------
# Main Loop: Process All Months
# ---------------------------

if __name__ == "__main__":
    # Months to process: every month from August 2017 through July 2019,
    # formatted as 'YYYY-MM'
    months = [str(period) for period in pd.period_range('2017-08', '2019-07', freq='M')]

    # Process the months one after another
    # may take 15~30 mins to proceed
    for month in months:
        clean_and_aggregate(month)