In [0]:
import os
import pandas as pd
import sqlite3
from datetime import datetime

# Directory holding the 2019 trip-data parquet files; the loading loop below
# joins it with each '<taxi_type>_tripdata_2019-<month>.parquet' file name.
local_dir = "/tmp/nyc_taxi_data_2019"

# Function to process data
def process_data(file_path, pickup_col, dropoff_col, distance_col, fare_col):
    """Load one trip-data parquet file and derive per-trip and per-day metrics.

    Rows with a missing value in any of the four source columns are dropped,
    as are rows whose drop-off is not strictly after the pickup.

    Args:
        file_path: Path of the parquet file to read.
        pickup_col: Name of the pickup-timestamp column in the file.
        dropoff_col: Name of the drop-off-timestamp column in the file.
        distance_col: Name of the trip-distance column in the file.
        fare_col: Name of the fare column in the file.

    Returns:
        A ``(trips, daily)`` tuple of DataFrames.

        ``trips`` has one row per kept trip with the columns
        ``pickup_datetime``, ``dropoff_datetime``, ``trip_distance``,
        ``fare_amount``, ``trip_duration`` (minutes), ``average_speed``
        (distance units per hour) and ``pickup_date``.

        ``daily`` has one row per ``pickup_date`` with ``total_trips`` (row
        count) and ``average_fare`` (mean of ``fare_amount``).
    """
    source_cols = [pickup_col, dropoff_col, distance_col, fare_col]

    # Only these four columns are used, so ask the reader for just them
    # instead of materialising every column of the file in memory.
    df = pd.read_parquet(file_path, columns=source_cols)

    # Re-select to pin the column order before the positional rename below.
    df = df[source_cols].dropna()
    df.columns = ['pickup_datetime', 'dropoff_datetime', 'trip_distance', 'fare_amount']

    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

    # Duration in minutes; speed is distance per hour (duration / 60 -> hours).
    elapsed = df['dropoff_datetime'] - df['pickup_datetime']
    df['trip_duration'] = elapsed.dt.total_seconds() / 60
    df['average_speed'] = df['trip_distance'] / (df['trip_duration'] / 60)
    df['pickup_date'] = df['pickup_datetime'].dt.date

    # Filter last, after every derived column exists, so no column is ever
    # assigned on a boolean-indexed slice. Zero-duration rows (whose speed is
    # inf or NaN) and negative-duration rows are removed here.
    valid = (df['trip_duration'] > 0) & df['average_speed'].notnull()
    df = df[valid]

    agg_df = df.groupby('pickup_date').agg(
        total_trips=('trip_distance', 'count'),
        average_fare=('fare_amount', 'mean'),
    ).reset_index()
    return df, agg_df

# Connect to SQLite database (or create it)
conn = sqlite3.connect('nyc_taxi_data.db')

# One entry per taxi type: (file-name prefix, pickup column, drop-off column,
# distance column, fare column). The source files name these columns
# differently per type. Built once because it does not depend on the month.
taxi_schemas = [
    ('yellow', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'fare_amount'),
    ('green', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'trip_distance', 'fare_amount'),
    ('fhvhv', 'pickup_datetime', 'dropoff_datetime', 'trip_miles', 'base_passenger_fare'),
]

try:
    # Process each file and load into SQLite
    for month in ['01', '02', '03']:
        for taxi_type, pickup_col, dropoff_col, distance_col, fare_col in taxi_schemas:
            file_path = os.path.join(local_dir, f'{taxi_type}_tripdata_2019-{month}.parquet')

            try:
                df, agg_df = process_data(file_path, pickup_col, dropoff_col, distance_col, fare_col)

                # Load data into SQLite tables; an existing table of the same
                # name is replaced, so re-running the cell does not duplicate rows.
                df.to_sql(f'{taxi_type}_trips_{month}', conn, if_exists='replace', index=False)
                agg_df.to_sql(f'{taxi_type}_agg_{month}', conn, if_exists='replace', index=False)

                print(f"Data for {taxi_type} 2019-{month} loaded into SQLite.")
            except Exception as e:
                # Best effort: report the failing file and carry on with the rest.
                print(f"Error processing {file_path}: {e}")
finally:
    # Close the connection even when the run is interrupted (for example by
    # KeyboardInterrupt, which the per-file handler above does not catch).
    conn.close()

print("Data loading completed.")
