In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import geopandas as gpd
import os
import re

In [8]:
import pyarrow.parquet as pq

In [9]:
green_tripdata_2024 = r'C:\Users\yin li\OneDrive\Desktop\taxi_project\green_tripdata_2024'

# Load the January 2024 file on its own to get a first look at the schema.
january_file = green_tripdata_2024 + '/' + 'green_tripdata_2024-01.parquet'
df241 = pd.read_parquet(january_file)

print(df241.columns)

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge'],
      dtype='object')


In [10]:
# Show the dtype pandas assigned to each column of the January frame.
df241.dtypes

VendorID                          int32
lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
store_and_fwd_flag               object
RatecodeID                      float64
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
ehail_fee                       float64
improvement_surcharge           float64
total_amount                    float64
payment_type                    float64
trip_type                       float64
congestion_surcharge            float64
dtype: object

In [11]:
# Summary statistics for the January frame; used to spot implausible values
# (e.g. negative fares, extreme trip distances) before writing the cleaning rules.
df241.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,56551.0,56551,56551,53136.0,56551.0,56551.0,53136.0,56551.0,56551.0,56551.0,56551.0,56551.0,56551.0,0.0,56551.0,56551.0,53136.0,53133.0,53136.0
mean,1.870241,2024-01-16 21:13:00.132941,2024-01-16 21:31:10.415660,1.151611,96.077594,140.49985,1.309169,31.491124,16.929275,0.900947,0.576696,2.25651,0.191202,,0.984902,22.403186,1.323114,1.032673,0.777138
min,1.0,2023-12-31 14:38:47,2023-12-31 14:46:45,1.0,1.0,1.0,0.0,0.0,-70.0,-5.0,-0.5,-1.66,0.0,,-1.0,-76.5,1.0,1.0,-2.75
25%,2.0,2024-01-09 10:57:45,2024-01-09 11:20:21,1.0,74.0,74.0,1.0,1.1,9.3,0.0,0.5,0.0,0.0,,1.0,13.44,1.0,1.0,0.0
50%,2.0,2024-01-17 07:20:09,2024-01-17 07:34:18,1.0,75.0,140.0,1.0,1.79,13.5,0.0,0.5,2.0,0.0,,1.0,18.42,1.0,1.0,0.0
75%,2.0,2024-01-24 15:28:22,2024-01-24 15:40:44,1.0,112.0,225.0,1.0,3.08,19.8,2.5,0.5,3.5,0.0,,1.0,26.6,2.0,1.0,2.75
max,2.0,2024-01-31 23:57:29,2024-02-01 19:17:30,99.0,265.0,265.0,9.0,201421.68,1422.6,10.25,4.25,110.0,24.05,,1.0,1424.1,5.0,2.0,2.75
std,0.336041,,,1.045251,57.862401,76.556276,0.978252,1417.460382,15.356032,1.344313,0.381998,2.847957,1.190748,,0.139863,16.956518,0.499731,0.17778,1.238011


In [12]:
# Folder containing the raw monthly Parquet files
# (named green_tripdata_2024-MM.parquet, see extract_month)
folder_path = r'C:\Users\yin li\OneDrive\Desktop\taxi_project\green_tripdata_2024'
# Root folder for the cleaned output written by save_to_parquet
output_folder = r'C:\Users\yin li\OneDrive\Desktop\taxi_project\cleaned_data'

# RatecodeID values to keep; clean_data drops every row whose RatecodeID is not
# in this set. Floats are used because the RatecodeID column is float64.
ratecodeid_set = {1.0, 2.0, 3.0, 4.0, 5.0}

# Function to calculate new trip_distance
def calculate_trip_distance(fare_amount, base_fare=2.5, per_distance_rate=2.5):
    """Estimate a trip distance from its fare by inverting a linear fare model.

    The model assumed is ``fare = base_fare + per_distance_rate * distance``,
    so the estimate is ``(fare_amount - base_fare) / per_distance_rate``.

    Parameters
    ----------
    fare_amount : float
        Fare charged for the trip.
    base_fare : float, default 2.5
        Fixed part of the fare that is charged regardless of distance.
    per_distance_rate : float, default 2.5
        Fare charged per unit of distance.

    Returns
    -------
    float
        Estimated distance. The value is negative when ``fare_amount`` is
        below ``base_fare``; no clamping is applied.

    Notes
    -----
    NOTE(review): confirm that the defaults match the fare schedule in force
    for the data being cleaned.
    """
    return (fare_amount - base_fare) / per_distance_rate

# Function to extract the month from the filename
def extract_month(filename):
    """Return the month number encoded in a 2024 green-taxi file name.

    The name (or full path) must contain ``green_tripdata_2024-MM`` where
    ``MM`` is a two-digit month.

    Parameters
    ----------
    filename : str
        File name or path to inspect.

    Returns
    -------
    int
        Month number in the range 1..12.

    Raises
    ------
    ValueError
        If the pattern is not present, or if the two digits do not form a
        valid calendar month.
    """
    match = re.search(r'green_tripdata_2024-(\d{2})', filename)
    if match is None:
        raise ValueError(f"Month not found in filename: {filename}")

    month = int(match.group(1))
    # "\d{2}" also matches 00 and 13-99; reject those here rather than letting
    # an impossible month reach the date filtering downstream.
    if not 1 <= month <= 12:
        raise ValueError(f"Invalid month {month:02d} in filename: {filename}")
    return month

# Function to clean the data
def clean_data(df, month, year=2024):
    """Return a cleaned copy of one month of green-taxi trip records.

    Steps, in order:

    1. Parse the pickup/dropoff columns as datetimes and add
       ``duration_minutes`` (dropoff minus pickup, in minutes).
    2. Keep only rows whose pickup falls inside the given calendar month.
    3. Keep only rows whose ``RatecodeID`` is in the module-level
       ``ratecodeid_set``.
    4. Drop the ``ehail_fee`` column if it is present.
    5. Replace every ``trip_distance`` greater than 100 with the value
       returned by ``calculate_trip_distance`` for that row's ``fare_amount``.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip records. Must contain ``lpep_pickup_datetime``,
        ``lpep_dropoff_datetime``, ``RatecodeID``, ``trip_distance`` and
        ``fare_amount``.
    month : int
        Calendar month (1..12) the records are expected to belong to.
    year : int, default 2024
        Calendar year the records are expected to belong to.

    Returns
    -------
    pandas.DataFrame
        The cleaned records, keeping the original index labels of the rows
        that survive the filters.
    """
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(
        lpep_pickup_datetime=pd.to_datetime(df['lpep_pickup_datetime']),
        lpep_dropoff_datetime=pd.to_datetime(df['lpep_dropoff_datetime']),
    )

    # Trip duration in minutes
    df['duration_minutes'] = (
        df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    ).dt.total_seconds() / 60

    # Half-open interval [first day of month, first day of next month);
    # DateOffset handles the December -> January rollover.
    start_date = pd.Timestamp(year=year, month=month, day=1)
    end_date = start_date + pd.DateOffset(months=1)

    in_month = (df['lpep_pickup_datetime'] >= start_date) & (df['lpep_pickup_datetime'] < end_date)
    known_ratecode = df['RatecodeID'].isin(ratecodeid_set)

    # errors='ignore' keeps the function usable on frames where ehail_fee has
    # already been removed.
    df = df.loc[in_month & known_ratecode].drop(columns=['ehail_fee'], errors='ignore')

    # Distances above 100 are treated as recording errors and re-estimated
    # from the fare.
    mask = df['trip_distance'] > 100
    df.loc[mask, 'trip_distance'] = df.loc[mask, 'fare_amount'].apply(calculate_trip_distance)

    return df

# List all Parquet files in the folder. os.listdir returns entries in arbitrary
# order, so sort them to make the load order (and therefore the row order of
# the combined frame) reproducible.
all_files = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.parquet')
)

# Print all file paths
print("All files:", all_files)

# Fail early with a clear message; pd.concat on an empty list would otherwise
# raise a generic "No objects to concatenate" error further down.
if not all_files:
    raise FileNotFoundError(f"No .parquet files found in {folder_path}")

# Read and clean each Parquet file; the month used for filtering comes from
# the file name.
dfs = []
for file in all_files:
    month = extract_month(file)
    df = pd.read_parquet(file)
    cleaned_df = clean_data(df, month)
    dfs.append(cleaned_df)

# Combine all cleaned DataFrames under a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)

# Print the combined DataFrame
print("Combined DataFrame:")
print(combined_df)

# Calculate skewness for numeric columns (stored in `skewness`, not displayed here)
numeric_columns = combined_df.select_dtypes(include=['number']).columns
skewness = combined_df[numeric_columns].skew()



# Function to save DataFrame to Parquet file with structured path
def save_to_parquet(df, year, month, output_folder):
    """Write ``df`` to a year/month partitioned location under ``output_folder``.

    The target is
    ``<output_folder>/year=<year>/month=<MM>/data_<year>_<MM>.parquet`` where
    ``<MM>`` is ``month`` zero-padded to two digits. Missing directories are
    created, and the final path is printed once the file has been written.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    year : int
        Year used in the directory and file names.
    month : int
        Month used in the directory and file names.
    output_folder : str
        Root folder of the partitioned output.
    """
    month_str = '{:02d}'.format(month)
    partition_dir = os.path.join(output_folder, f'year={year}', f'month={month_str}')
    file_path = os.path.join(partition_dir, f'data_{year}_{month_str}.parquet')

    os.makedirs(partition_dir, exist_ok=True)
    df.to_parquet(file_path)
    print(f"Saved to {file_path}")

# Write one Parquet file per pickup month; months with no rows are skipped
pickup_month = combined_df['lpep_pickup_datetime'].dt.month
for month in range(1, 13):
    monthly_df = combined_df[pickup_month == month]
    if monthly_df.empty:
        continue
    save_to_parquet(monthly_df, 2024, month, output_folder)

All files: ['C:\\Users\\yin li\\OneDrive\\Desktop\\taxi_project\\green_tripdata_2024\\green_tripdata_2024-01.parquet', 'C:\\Users\\yin li\\OneDrive\\Desktop\\taxi_project\\green_tripdata_2024\\green_tripdata_2024-02.parquet', 'C:\\Users\\yin li\\OneDrive\\Desktop\\taxi_project\\green_tripdata_2024\\green_tripdata_2024-03.parquet', 'C:\\Users\\yin li\\OneDrive\\Desktop\\taxi_project\\green_tripdata_2024\\green_tripdata_2024-04.parquet']
Combined DataFrame:
        VendorID lpep_pickup_datetime lpep_dropoff_datetime  \
0              2  2024-01-01 00:46:55   2024-01-01 00:58:25   
1              2  2024-01-01 00:31:42   2024-01-01 00:52:34   
2              2  2024-01-01 00:30:21   2024-01-01 00:49:23   
3              1  2024-01-01 00:30:20   2024-01-01 00:42:12   
4              2  2024-01-01 00:32:38   2024-01-01 00:43:37   
...          ...                  ...                   ...   
213578         2  2024-04-30 20:53:29   2024-04-30 21:07:07   
213579         2  2024-04-30 23:44:5

In [16]:


# Root folder of the cleaned, year/month partitioned output
base_dir = r'C:\Users\yin li\OneDrive\Desktop\taxi_project\cleaned_data'

# Path of the cleaned January 2024 file inside that folder
file_path = base_dir + r'\year=2024\month=01\data_2024_01.parquet'

# Read the cleaned file back to check what was written
df2401 = pd.read_parquet(file_path)

# Display the first few rows of the DataFrame
df2401.head()



Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration_minutes
0,2,2024-01-01 00:46:55,2024-01-01 00:58:25,N,1.0,236,239,1.0,1.98,12.8,1.0,0.5,3.61,0.0,1.0,21.66,1.0,1.0,2.75,11.5
1,2,2024-01-01 00:31:42,2024-01-01 00:52:34,N,1.0,65,170,5.0,6.54,30.3,1.0,0.5,7.11,0.0,1.0,42.66,1.0,1.0,2.75,20.866667
2,2,2024-01-01 00:30:21,2024-01-01 00:49:23,N,1.0,74,262,1.0,3.08,19.8,1.0,0.5,3.0,0.0,1.0,28.05,1.0,1.0,2.75,19.033333
3,1,2024-01-01 00:30:20,2024-01-01 00:42:12,N,1.0,74,116,1.0,2.4,14.2,1.0,1.5,0.0,0.0,1.0,16.7,2.0,1.0,0.0,11.866667
4,2,2024-01-01 00:32:38,2024-01-01 00:43:37,N,1.0,74,243,1.0,5.14,22.6,1.0,0.5,6.28,0.0,1.0,31.38,1.0,1.0,0.0,10.983333
