In [10]:
import duckdb
import os

# Merge the .parquet files under the current directory into merged_data.parquet
current_dir = os.getcwd()
merged_file_name = 'merged_data.parquet'

# List the inputs explicitly rather than globbing '*.parquet': the merged file is
# written into this same directory, so on a re-run a glob would also match the
# previous output and feed it back into the merge.
parquet_files = sorted(
    os.path.join(current_dir, name)
    for name in os.listdir(current_dir)
    if name.endswith('.parquet') and name != merged_file_name
)
if not parquet_files:
    raise FileNotFoundError(f"No .parquet files to merge in {current_dir}")

# Render the paths as a SQL list literal, doubling any single quote so that a
# path containing ' cannot terminate its string literal early.
file_list_sql = ", ".join(
    "'" + path.replace("'", "''") + "'" for path in parquet_files
)

# Bridge connection with duckdb
con = duckdb.connect()

# Merge all the input files into one named merged_data.parquet
query = f"""
    COPY (SELECT * FROM read_parquet([{file_list_sql}]))
    TO '{merged_file_name}' (FORMAT PARQUET)
"""
con.execute(query)

<duckdb.duckdb.DuckDBPyConnection at 0x13fcfa730>

In [1]:
import pandas as pd

# Load the merged parquet file into a pandas DataFrame.
merged_path = 'merged_data.parquet'
df = pd.read_parquet(merged_path)

In [3]:
# Remove the 'ehail_fee' column, then every row that still contains a missing value.
# Rebinding `df` instead of mutating it in place, together with errors='ignore',
# keeps this cell re-runnable: a second execution no longer raises KeyError for
# the already-removed column.
# NOTE(review): presumably 'ehail_fee' is dropped first because it is mostly
# empty and would otherwise make dropna() discard nearly every row — confirm.
df = df.drop(columns=['ehail_fee'], errors='ignore').dropna()
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-12-18 15:52:30,2019-12-18 15:54:39,N,1.0,264,264,5.0,0.00,3.5,0.50,0.5,0.01,0.0,0.3,4.81,1.0,1.0,0.00
1,2,2020-01-01 00:45:58,2020-01-01 00:56:39,N,5.0,66,65,2.0,1.28,20.0,0.00,0.0,4.06,0.0,0.3,24.36,1.0,2.0,0.00
2,2,2020-01-01 00:41:38,2020-01-01 00:52:49,N,1.0,181,228,1.0,2.47,10.5,0.50,0.5,3.54,0.0,0.3,15.34,1.0,1.0,0.00
3,1,2020-01-01 00:52:46,2020-01-01 01:14:21,N,1.0,129,263,2.0,6.30,21.0,3.25,0.5,0.00,0.0,0.3,25.05,2.0,1.0,2.75
4,1,2020-01-01 00:19:57,2020-01-01 00:30:56,N,1.0,210,150,1.0,2.30,10.0,0.50,0.5,0.00,0.0,0.3,11.30,1.0,1.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4982745,2,2024-10-31 22:39:22,2024-10-31 23:55:41,N,5.0,130,205,2.0,0.00,2.0,0.00,0.0,0.40,0.0,0.0,2.40,1.0,2.0,0.00
4982746,2,2024-10-31 23:14:51,2024-10-31 23:23:29,N,1.0,41,238,1.0,1.34,10.0,1.00,0.5,3.05,0.0,1.0,18.30,1.0,1.0,2.75
4982747,2,2024-10-31 23:27:01,2024-10-31 23:47:22,N,1.0,196,215,1.0,6.35,29.6,1.00,0.5,0.00,0.0,1.0,32.10,2.0,1.0,0.00
4982748,2,2024-10-31 23:21:53,2024-10-31 23:33:24,N,1.0,74,244,1.0,3.25,15.6,1.00,0.5,3.62,0.0,1.0,21.72,1.0,1.0,0.00


In [4]:
# Restrict the statistics to the numeric columns of df.
numerical_df = df.select_dtypes(include='number')

# Per-column mean and median.
mean_values, median_values = numerical_df.mean(), numerical_df.median()
# mode() returns one row per tied mode; iloc[0] keeps only the first row.
mode_values = numerical_df.mode().iloc[0]

# Place the three statistics side by side, one row per numeric column.
summary_df = pd.concat(
    [mean_values, median_values, mode_values],
    axis=1,
    keys=['Mean', 'Median', 'Mode'],
)
print(summary_df)


                             Mean  Median  Mode
VendorID                 1.845679    2.00   2.0
RatecodeID               1.167289    1.00   1.0
PULocationID            99.305639   75.00  74.0
DOLocationID           135.669466  135.00  74.0
passenger_count          1.291475    1.00   1.0
trip_distance            3.265491    1.83   0.0
fare_amount             15.221848   11.00   6.5
extra                    0.573812    0.00   0.0
mta_tax                  0.519278    0.50   0.5
tip_amount               1.744677    1.00   0.0
tolls_amount             0.224583    0.00   0.0
improvement_surcharge    0.522669    0.30   0.3
total_amount            19.334775   14.95   7.3
payment_type             1.392982    1.00   1.0
trip_type                1.036631    1.00   1.0
congestion_surcharge     0.653107    0.00   0.0


In [5]:
def find_outliers_iqr(column, k=1.5):
    """Flag the values of ``column`` that fall outside its IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to screen.
    k : float, default 1.5
        Fence multiplier. Larger values widen the fences and flag fewer values.

    Returns
    -------
    pandas.Series
        Boolean mask with the same index as ``column``; True marks a value
        strictly below the lower fence or strictly above the upper fence.

    Notes
    -----
    When Q1 equals Q3 the IQR is 0 and both fences collapse onto that single
    value, so every value different from it is flagged.
    """
    Q1 = column.quantile(0.25)  # First quartile (25th percentile)
    Q3 = column.quantile(0.75)  # Third quartile (75th percentile)
    IQR = Q3 - Q1
    lower_bound = Q1 - k * IQR
    upper_bound = Q3 + k * IQR
    return (column < lower_bound) | (column > upper_bound)

# Columns to screen: every numeric column of df except DOLocationID.
# NOTE(review): VendorID, RatecodeID, PULocationID, payment_type and trip_type
# look like categorical codes — confirm that IQR screening is intended for them.
numeric_cols = [
    name
    for name in df.select_dtypes(include=['number'])
    if name != 'DOLocationID'
]

# Compute one boolean outlier mask per screened column.
column_masks = {name: find_outliers_iqr(df[name]) for name in numeric_cols}

# Per-column counts of flagged values.
outlier_summary = [
    {"Column Name": name, "Outlier Count": mask.sum()}
    for name, mask in column_masks.items()
]

# Sum of the per-column counts: a row flagged in several columns contributes
# once per flagged column, so this is not the number of rows removed below.
total_outliers = sum(entry["Outlier Count"] for entry in outlier_summary)

# A row is marked when it is flagged in at least one screened column.
outlier_mask = pd.Series(False, index=df.index)
for mask in column_masks.values():
    outlier_mask = outlier_mask | mask

outlier_summary_df = pd.DataFrame(outlier_summary)
print("\nSummary of Outliers in Numerical Columns:\n")
print(outlier_summary_df.to_string(index=False))
print(f"\nTotal Number of Outliers: {total_outliers}")

# Keep only the rows without any flagged value and renumber them from 0.
no_outliers_df = df.loc[~outlier_mask].reset_index(drop=True)
print(f"\nShape of the new dataset without outliers: {no_outliers_df.shape}")



Summary of Outliers in Numerical Columns:

          Column Name  Outlier Count
             VendorID         598286
           RatecodeID         171845
         PULocationID         321269
      passenger_count         555639
        trip_distance         347300
          fare_amount         289396
                extra         165288
              mta_tax         337842
           tip_amount         143485
         tolls_amount         131366
improvement_surcharge           4075
         total_amount         260034
         payment_type           7483
            trip_type         142015
 congestion_surcharge         921579

Total Number of Outliers: 4396902

Shape of the new dataset without outliers: (1673170, 19)
