In [1]:
# Third-party dependencies for this cleaning notebook.
# NOTE(review): torch, dask.dataframe and matplotlib are not referenced by any
# cell visible in this section — confirm they are needed further on.
import pandas as pd
import torch
# Record the installed torch version in the cell output.
print(torch.__version__)
import dask.dataframe as dd
import matplotlib.pyplot as plt

2.3.0+cpu


Reading the parquet file into a pandas dataframe

In [2]:
# Location of the merged yellow-taxi trip records.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
file_path = r'c:/A5/yellow_tripdata_20222_merged.parquet'

# Load the whole parquet file into memory as a pandas DataFrame.
df = pd.read_parquet(path=file_path)

# Preview the first five rows.
print(df.head(5))

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2022-07-01 00:20:06   2022-07-01 00:39:13              1.0   
1         2  2022-07-01 00:29:11   2022-07-01 00:38:00              1.0   
2         1  2022-07-01 00:03:56   2022-07-01 00:11:49              1.0   
3         1  2022-07-01 00:18:36   2022-07-01 00:52:44              1.0   
4         1  2022-07-01 00:15:50   2022-07-01 00:22:21              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0          10.10         1.0                  N            70            33   
1           1.67         1.0                  N           162            48   
2           0.90         1.0                  N            48           142   
3          14.80         1.0                  N            70           265   
4           1.20         1.0                  N           161           234   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


Looking at the info inside the dataframe and at some of its entries

In [3]:
# Summary of the frame (row count, columns, dtypes), written straight to stdout.
df.info()
# Last five rows of the frame (five is the default for tail()).
print(df.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19838515 entries, 0 to 19838514
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee         

Checking for duplicates in the dataset

# Rows that repeat an earlier row in every column (first occurrences are not flagged).
duplicate_rows = df.loc[df.duplicated(keep='first')]
print(duplicate_rows)

Dropping the duplicates

# Keep the first occurrence of each fully-identical row and report how many rows remain.
df = df.drop_duplicates(keep='first')
print(df.shape[0])

In [4]:
# Number of distinct non-null values in each column.
cardinality = df.nunique()
print("Cardinality of Columns:", cardinality, sep="\n")

Cardinality of Columns:
VendorID                        4
tpep_pickup_datetime     10010406
tpep_dropoff_datetime    10006217
passenger_count                10
trip_distance                7219
RatecodeID                      7
store_and_fwd_flag              2
PULocationID                  262
DOLocationID                  262
payment_type                    6
fare_amount                 11787
extra                         134
mta_tax                        31
tip_amount                   5873
tolls_amount                 1722
improvement_surcharge           5
total_amount                22600
congestion_surcharge            9
airport_fee                     4
dtype: int64


Checking for rows where the fare amount is greater than the total amount, and dropping them.

In [5]:
# Count the rows whose fare is larger than the total charged.
print(int((df['fare_amount'] > df['total_amount']).sum()))

# Keep only rows where the fare does not exceed the total. A row with a missing
# value in either column fails this comparison and is therefore dropped as well.
df = df.loc[df['fare_amount'] <= df['total_amount']]

# Rows remaining after the filter.
print(df.shape[0])


142931
19695584


Checking for trips whose pickup time is after, or the same as, the drop-off time, and dropping them

In [6]:
# Trips that end before they start.
invalidtimes_rows = df[df['tpep_pickup_datetime'] > df['tpep_dropoff_datetime']]
print(len(invalidtimes_rows))


# Trips with zero duration (pickup and drop-off timestamps identical).
same_times_rows = df[df['tpep_pickup_datetime'] == df['tpep_dropoff_datetime']]
print(len(same_times_rows))

# Keep only trips whose pickup is strictly before the drop-off. A single strict
# comparison removes both groups counted above in one pass; filtering with `<=`
# and then `!=` selects the same rows but copies the whole frame twice.
df = df[df['tpep_pickup_datetime'] < df['tpep_dropoff_datetime']]
print(len(df))


4773
9676
19681135


Checking columns for negative values 

In [7]:

# Numeric columns in which a negative value is treated as invalid.
columns_to_check = [
    'passenger_count',
    'trip_distance',
    'fare_amount',
    'extra',
    'mta_tax',
    'tip_amount',
    'tolls_amount',
    'improvement_surcharge',
    'total_amount',
    'congestion_surcharge',
]

# Map each checked column to the number of rows holding a value below zero.
negative_counts = {
    column: int((df[column] < 0).sum())
    for column in columns_to_check
}

# Report the counts in the same order as columns_to_check.
for column, count in negative_counts.items():
    print(f"Number of negative values in {column}: {count}")

Number of negative values in passenger_count: 0
Number of negative values in trip_distance: 0
Number of negative values in fare_amount: 1347
Number of negative values in extra: 271
Number of negative values in mta_tax: 667
Number of negative values in tip_amount: 0
Number of negative values in tolls_amount: 200
Number of negative values in improvement_surcharge: 914
Number of negative values in total_amount: 1299
Number of negative values in congestion_surcharge: 657


Dropping the rows with negative values from the dataset

In [8]:
# Keep only rows where every checked column is non-negative.
#
# The per-column conditions are accumulated into one boolean mask so the frame
# is filtered (and copied) once, instead of once per column. The surviving rows
# are the same as with successive per-column filtering: a row is kept only if
# it satisfies `>= 0` in all checked columns.
#
# A missing value (NaN) compares False against `>= 0`, so rows with a missing
# value in any checked column are dropped here too.
non_negative = pd.Series(True, index=df.index)
for column in columns_to_check:
    non_negative &= df[column] >= 0
df = df[non_negative]

Counting the trips whose distance is zero and dropping them

In [9]:
# Number of trips recorded with a distance of exactly zero.
print(int(df['trip_distance'].eq(0).sum()))
# Discard those trips and report how many rows remain.
df = df.loc[df['trip_distance'].ne(0)]
print(df.shape[0])

244740
18743665


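Checking for missing values. The missing-value counts have gone to zero after dropping the rows with negative values, because the `>= 0` filter also removes rows where the checked value is missing.
Airport Fee may still contain negative values, but since it is plausible that such a fee is simply not recorded, there is no need to drop those rows: the column itself will be dropped later.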

In [10]:

# Per-column count of missing (NaN/NaT/None) entries.
missing_values = df.isna().sum()
print(missing_values)

VendorID                 0
tpep_pickup_datetime     0
tpep_dropoff_datetime    0
passenger_count          0
trip_distance            0
RatecodeID               0
store_and_fwd_flag       0
PULocationID             0
DOLocationID             0
payment_type             0
fare_amount              0
extra                    0
mta_tax                  0
tip_amount               0
tolls_amount             0
improvement_surcharge    0
total_amount             0
congestion_surcharge     0
airport_fee              0
dtype: int64


Dropping trips whose passenger count is more than the maximum number of passengers stated for yellow taxis on the website

In [11]:
# Number of trips reporting more than five passengers.
print(len(df[df['passenger_count'] > 5]))
# Keep trips with at most five passengers. The filter is the exact complement
# of the count above: passenger_count is a float column, so a `< 6` filter
# would keep any value strictly between 5 and 6 that the count reports as
# removed. Rows with a missing passenger count fail the comparison and are
# dropped, as before.
df = df[df['passenger_count'] <= 5]

211317


Checking for trips where there are no passengers on the journey, and dropping them

In [12]:
# Number of trips recorded with zero passengers.
print(int(df['passenger_count'].eq(0).sum()))
# Discard those trips and report how many rows remain.
df = df.loc[df['passenger_count'].ne(0)]
print(df.shape[0])

351620
18180728


In the taxi zone lookup table there are two codes, 264 and 265, that are not in the specified boroughs we need. Location ID 1 is also dropped by the code below.

In [13]:
# Report how many trips were picked up in each location that is about to be excluded.
for location_id in (264, 265, 1):
    print(int((df['PULocationID'] == location_id).sum()))
# Drop every trip whose pickup location is one of those IDs.
df = df.loc[~df['PULocationID'].isin([264, 265, 1])]

252432
4259
1244


In [14]:
# Report how many trips were dropped off in each location that is about to be excluded.
for location_id in (264, 265, 1):
    print(int((df['DOLocationID'] == location_id).sum()))
# Drop every trip whose drop-off location is one of those IDs.
df = df.loc[~df['DOLocationID'].isin([264, 265, 1])]

21171
58574
48753


Finding the rows where the payment was disputed, which is rate code 99, and dropping them

In [15]:
# Number of trips carrying RatecodeID 99.
# NOTE(review): the accompanying text describes code 99 as a disputed payment;
# confirm against the TLC data dictionary that this is what the code denotes.
print(int(df['RatecodeID'].eq(99).sum()))
# Discard those trips and report how many rows remain.
df = df.loc[df['RatecodeID'].ne(99)]
print(df.shape[0])

57046
17737249


Dropping all columns that are not needed as per data quality plan.

In [16]:
# The only columns retained for the cleaned dataset.
columns_to_keep = [
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'passenger_count',
    'PULocationID',
    'DOLocationID',
]

# Project the frame onto those columns; the row count is unchanged.
df = df.loc[:, columns_to_keep]
print(df.shape[0])
print(df.head(5))

17737249
  tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  PULocationID  \
0  2022-07-01 00:20:06   2022-07-01 00:39:13              1.0            70   
1  2022-07-01 00:29:11   2022-07-01 00:38:00              1.0           162   
2  2022-07-01 00:03:56   2022-07-01 00:11:49              1.0            48   
4  2022-07-01 00:15:50   2022-07-01 00:22:21              1.0           161   
6  2022-07-01 00:53:53   2022-07-01 01:08:10              1.0           162   

   DOLocationID  
0            33  
1            48  
2           142  
4           234  
6           148  


Checking the current memory use of the dataframe

In [17]:
# Total bytes used by the frame, index included; deep=True also measures the
# contents of object-dtype columns.
memory_usage = df.memory_usage(deep=True).sum()
# Convert bytes to mebibytes (1 MB taken as 1024 * 1024 bytes).
bytes_per_mb = 1024 ** 2
memory_usage_MB = memory_usage / bytes_per_mb

print(f"Memory usage: {memory_usage_MB:.2f} MB")

Memory usage: 811.95 MB


Changing to category types for Locations and passenger counts

In [18]:
# Store the low-cardinality columns as categoricals to reduce memory use.
# A single astype() mapping converts all three columns and returns a new frame.
df = df.astype({
    'passenger_count': 'category',
    'PULocationID': 'category',
    'DOLocationID': 'category',
})

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17737249 entries, 0 to 19712051
Data columns (total 5 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   tpep_pickup_datetime   datetime64[us]
 1   tpep_dropoff_datetime  datetime64[us]
 2   passenger_count        category      
 3   PULocationID           category      
 4   DOLocationID           category      
dtypes: category(3), datetime64[us](2)
memory usage: 490.6 MB
None


In [19]:
# Write the cleaned frame to a parquet file in the working directory.
# index=False leaves the (now non-contiguous) row index out of the file.
output_path = 'cleaned_taxi2_2022.parquet'
df.to_parquet(output_path, index=False)

print("DataFrame saved")


DataFrame saved
