In [0]:
import os
import pandas as pd
from datetime import datetime

# Directory the trip-data parquet files are read from. The processing loop
# further down joins "<taxi_type>_tripdata_2019-<month>.parquet" onto this path.
local_dir = "/tmp/nyc_taxi_data_2019"

# Function to process data
def process_data(file_path, pickup_col, dropoff_col, distance_col, fare_col):
    # Read parquet file into DataFrame
    df = pd.read_parquet(file_path)
    
    # Select necessary columns and drop missing values
    df = df[[pickup_col, dropoff_col, distance_col, fare_col]].dropna()
    
    # Rename columns
    df.columns = ['pickup_datetime', 'dropoff_datetime', 'trip_distance', 'fare_amount']
    
    # Convert datetime columns
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
    
    # Derive new columns: trip duration (minutes) and average speed (mph)
    df['trip_duration'] = (df['dropoff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60
    df['average_speed'] = df['trip_distance'] / (df['trip_duration'] / 60)
    
    # Remove invalid data
    df = df[(df['trip_duration'] > 0) & (df['average_speed'].notnull())]
    
    # Aggregate data: total trips and average fare per day
    df['pickup_date'] = df['pickup_datetime'].dt.date
    agg_df = df.groupby('pickup_date').agg(
        total_trips=('trip_distance', 'count'),
        average_fare=('fare_amount', 'mean')
    ).reset_index()
    
    return df, agg_df

# Per taxi type: the source column names for (pickup, dropoff, distance, fare)
TAXI_COLUMNS = {
    'yellow': ('tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'fare_amount'),
    'green': ('lpep_pickup_datetime', 'lpep_dropoff_datetime', 'trip_distance', 'fare_amount'),
    'fhvhv': ('pickup_datetime', 'dropoff_datetime', 'trip_miles', 'base_passenger_fare'),
}
MONTHS = ('01', '02', '03')

# Run every month / taxi-type combination (months outermost) and print a
# summary for each file; a failure on one file is reported and the run goes on.
for month in MONTHS:
    for taxi_type, (pickup_col, dropoff_col, distance_col, fare_col) in TAXI_COLUMNS.items():
        # Locate the parquet file for this combination
        file_name = f'{taxi_type}_tripdata_2019-{month}.parquet'
        file_path = os.path.join(local_dir, file_name)

        try:
            df, agg_df = process_data(
                file_path,
                pickup_col,
                dropoff_col,
                distance_col,
                fare_col,
            )

            # Summary statistics of the cleaned per-trip frame
            print(f"Description of {taxi_type} data for 2019-{month}:")
            print(df.describe())

            # First rows of the per-day aggregate
            print(agg_df.head())
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

print("Data processing completed.")

