# TLC Data Preprocessing

- The TLC data information page as well as Taxi Zone Data which needs to be downloaded prior can be found via this link: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
- Data dictionary can be downloaded via this link: https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf
- Trip Record user guide can be download via this link: https://www.nyc.gov/assets/tlc/downloads/pdf/trip_record_user_guide.pdf

In [1]:
import pandas as pd
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col

In [2]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("TLC Preprocess")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/26 22:40:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/08/26 22:40:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/08/26 22:40:17 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/08/26 22:40:17 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [3]:
SAMPLE_SIZE = 0.05

In [4]:
sdf_jul = spark.read.parquet('../data/landing/tlc_data/2023-07.parquet')
df = sdf_jul.sample(SAMPLE_SIZE, seed =0).toPandas()

df.to_csv('../data/sample_data/sample_data.csv', index=False)
df.to_parquet('../data/sample_data/sample_data.parquet')

24/08/26 22:40:31 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [5]:
sdf_jul = spark.read.parquet('../data/landing/tlc_data/2023-07.parquet')
sdf_aug = spark.read.parquet('../data/landing/tlc_data/2023-08.parquet')
sdf_sep = spark.read.parquet('../data/landing/tlc_data/2023-09.parquet')
sdf_oct = spark.read.parquet('../data/landing/tlc_data/2023-10.parquet')
sdf_nov = spark.read.parquet('../data/landing/tlc_data/2023-11.parquet')
sdf_dec = spark.read.parquet('../data/landing/tlc_data/2023-12.parquet')

In [6]:
sdf_jul.limit(5)

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
HV0003,B03404,B03404,2023-07-01 00:04:21,2023-07-01 00:07:59,2023-07-01 00:08:30,2023-07-01 00:33:33,72,26,4.79,1503,22.34,0.0,0.61,1.98,0.0,0.0,0.0,20.42,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:40:25,2023-07-01 00:40:35,2023-07-01 00:42:10,2023-07-01 01:08:06,26,37,6.4,1556,25.83,0.0,0.71,2.29,0.0,0.0,0.0,23.03,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:20:31,2023-07-01 00:24:05,2023-07-01 00:25:00,2023-07-01 00:42:38,263,232,5.71,1058,26.51,0.0,0.73,2.35,2.75,0.0,6.46,18.61,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:42:50,2023-07-01 00:47:37,2023-07-01 00:48:57,2023-07-01 00:57:04,79,233,1.75,487,16.32,0.0,0.45,1.45,2.75,0.0,0.0,8.13,N,N,,N,N
HV0005,B03406,,2023-07-01 00:00:17,,2023-07-01 00:05:35,2023-07-01 00:36:07,88,237,7.218,1832,83.97,0.0,2.31,7.45,2.75,0.0,0.0,59.05,N,N,N,N,N


In [7]:
sdf_jul.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_

In [8]:
sdf_aug.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_

In [9]:
sdf_sep.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_

In [10]:
# now, we want to ensure everything has consistent casing to make our lives easier
consistent_col_casing = [F.col(col_name).alias(col_name.lower()) for col_name in sdf_jul.columns]
sdf_jul = sdf_jul.select(*consistent_col_casing)
sdf_aug = sdf_aug.select(*consistent_col_casing)
sdf_sep = sdf_sep.select(*consistent_col_casing)
sdf_oct = sdf_oct.select(*consistent_col_casing)
sdf_nov = sdf_nov.select(*consistent_col_casing)
sdf_dec = sdf_dec.select(*consistent_col_casing)

sdf_jul.printSchema()
sdf_schema = sdf_jul.schema


root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- pulocationid: integer (nullable = true)
 |-- dolocationid: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_

In [12]:
sdf_malformed = spark.read.parquet('../data/landing/tlc_data//2023-07.parquet')

# select all columns from the existing malformed dataframe and cast it to the required schema
sdf_malformed = sdf_malformed \
    .select([F.col(c).cast(sdf_schema[i].dataType) for i, c in enumerate(sdf_malformed.columns)])

In [15]:
for month in range(7, 13):
    input_path = f'../data/landing/tlc_data/2023-{str(month).zfill(2)}.parquet'
    output_path = f'../data/landing/tlc_data/raw/2023-{str(month).zfill(2)}.parquet'
    
    sdf = spark.read.schema(sdf_schema).parquet(input_path).coalesce(1)

    sdf.write.mode('overwrite').parquet(output_path)

                                                                                

In [16]:
sdf = spark.read.schema(sdf_schema).parquet('../data/landing/tlc_data/*')
#sdf.show(1, vertical=True, truncate=100)
sdf.limit(10)
sdf.count()


117277281

In [17]:
sdf.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- pulocationid: integer (nullable = true)
 |-- dolocationid: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_

In [18]:
# Filter based on the minimum possible values for numerical features
sdf1 = sdf.where((F.col('trip_miles') > 0.5) &
               (F.col('trip_time') > 60) &
               (F.col('trip_time') < 18000) &
               (F.col('trip_time') > 0) &
               (F.col('driver_pay') > 0) &
               (F.col('base_passenger_fare') >= 0) &
               (F.col('tolls') >= 0) &
               (F.col('bcf') >= 0) &
               (F.col('sales_tax') >= 0) &
               (F.col('congestion_surcharge') >= 0) &
               (F.col('airport_fee') >= 0) &
               (F.col('tips') >= 0) &
               (F.col('pulocationid') >= 1) &
               (F.col('pulocationid') <= 263))

sdf1.limit(10)


hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,pulocationid,dolocationid,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,shared_request_flag,shared_match_flag,access_a_ride_flag,wav_request_flag,wav_match_flag
HV0003,B03404,B03404,2023-07-01 00:04:21,2023-07-01 00:07:59,2023-07-01 00:08:30,2023-07-01 00:33:33,72,26,4.79,1503,22.34,0.0,0.61,1.98,0.0,0.0,0.0,20.42,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:40:25,2023-07-01 00:40:35,2023-07-01 00:42:10,2023-07-01 01:08:06,26,37,6.4,1556,25.83,0.0,0.71,2.29,0.0,0.0,0.0,23.03,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:20:31,2023-07-01 00:24:05,2023-07-01 00:25:00,2023-07-01 00:42:38,263,232,5.71,1058,26.51,0.0,0.73,2.35,2.75,0.0,6.46,18.61,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:42:50,2023-07-01 00:47:37,2023-07-01 00:48:57,2023-07-01 00:57:04,79,233,1.75,487,16.32,0.0,0.45,1.45,2.75,0.0,0.0,8.13,N,N,,N,N
HV0005,B03406,,2023-07-01 00:00:17,,2023-07-01 00:05:35,2023-07-01 00:36:07,88,237,7.218,1832,83.97,0.0,2.31,7.45,2.75,0.0,0.0,59.05,N,N,N,N,N
HV0003,B03404,B03404,2023-06-30 23:59:34,2023-07-01 00:03:57,2023-07-01 00:05:57,2023-07-01 00:22:42,233,148,2.08,1005,18.45,0.0,0.51,1.64,2.75,0.0,0.0,12.18,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:25:50,2023-07-01 00:32:45,2023-07-01 00:34:47,2023-07-01 00:41:31,114,79,0.6,404,7.29,0.0,0.2,0.65,2.75,0.0,0.0,6.03,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:40:40,2023-07-01 00:44:53,2023-07-01 00:45:03,2023-07-01 01:03:54,113,262,5.38,1131,26.19,0.0,0.72,2.32,2.75,0.0,0.0,17.7,N,N,,N,N
HV0003,B03404,B03404,2023-07-01 00:19:27,2023-07-01 00:25:09,2023-07-01 00:30:12,2023-07-01 00:48:24,87,148,2.19,1092,25.93,0.0,0.71,2.3,2.75,0.0,0.0,18.21,N,N,,N,Y
HV0003,B03404,B03404,2023-07-01 00:41:14,2023-07-01 00:54:37,2023-07-01 00:54:46,2023-07-01 01:10:18,148,164,2.3,932,23.48,0.0,0.65,2.08,2.75,0.0,2.89,16.44,N,N,,N,Y


In [19]:
# Filter the columns to cover 99.99 percentile of the data for numerical features

sdf1 = sdf1.where((F.col('trip_miles') <= sdf1.selectExpr('percentile(trip_miles, 0.9999)').collect()[0][0]) &
                (F.col('trip_time') <= sdf1.selectExpr('percentile(trip_time, 0.9999)').collect()[0][0]) &
                (F.col('driver_pay') <= sdf1.selectExpr('percentile(driver_pay, 0.9999)').collect()[0][0]) &
                (F.col('base_passenger_fare') <= sdf1.selectExpr('percentile(base_passenger_fare, 0.9999)').collect()[0][0]) &
                (F.col('tolls') <= sdf1.selectExpr('percentile(tolls, 0.9999)').collect()[0][0]) &
                (F.col('bcf') <= sdf1.selectExpr('percentile(bcf, 0.9999)').collect()[0][0]) &
                (F.col('sales_tax') <= sdf1.selectExpr('percentile(sales_tax, 0.9999)').collect()[0][0]) &
                (F.col('tips') <= sdf1.selectExpr('percentile(tips, 0.9999)').collect()[0][0]))



                                                                                

In [20]:
# check the descriptive statistics of all numerical columns

sdf1.select(['trip_miles',
             'trip_time',
             'driver_pay',
             'base_passenger_fare',
             'tolls',
             'bcf',
             'sales_tax',
             'congestion_surcharge',
             'airport_fee',
             'tips',
             'pulocationid']).describe()

24/08/26 22:52:02 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

summary,trip_miles,trip_time,driver_pay,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,pulocationid
count,115853423.0,115853423.0,115853423.0,115853423.0,115853423.0,115853423.0,115853423.0,115853423.0,115853423.0,115853423.0,115853423.0
mean,5.158834254004041,1228.724071251654,20.11342025837233,25.45252662393041,1.175423290608885,0.7344419268493326,2.135880328445123,1.0825833777910905,0.2236606819981486,1.201207227860866,138.77059743845462
stddev,5.776672446901602,870.156396822642,16.570533327477843,21.04386075636584,3.953851073971956,0.6480690832890761,1.7431651702132864,1.3391951000752085,0.7169882941720426,3.3396878500136147,74.60562972353408
min,0.501,61.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,100.428,10037.0,263.22,349.7,44.08,10.15,25.62,11.0,10.0,54.67,263.0


In [21]:

sdf1 = sdf1.drop("shared_request_flag", "shared_match_flag", "access_a_ride_flag", "wav_request_flag", "wav_match_flag")

# Extract the date as a standalone feature
sdf1 = sdf1.withColumn("pickup_date", 
                   F.to_date(F.col("pickup_datetime")))

sdf1 = sdf1.withColumn("dropoff_date",
                   F.to_date(F.col("dropoff_datetime")))
sdf1.limit(5)

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,pulocationid,dolocationid,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,pickup_date,dropoff_date
HV0003,B03404,B03404,2023-07-01 00:04:21,2023-07-01 00:07:59,2023-07-01 00:08:30,2023-07-01 00:33:33,72,26,4.79,1503,22.34,0.0,0.61,1.98,0.0,0.0,0.0,20.42,2023-07-01,2023-07-01
HV0003,B03404,B03404,2023-07-01 00:40:25,2023-07-01 00:40:35,2023-07-01 00:42:10,2023-07-01 01:08:06,26,37,6.4,1556,25.83,0.0,0.71,2.29,0.0,0.0,0.0,23.03,2023-07-01,2023-07-01
HV0003,B03404,B03404,2023-07-01 00:20:31,2023-07-01 00:24:05,2023-07-01 00:25:00,2023-07-01 00:42:38,263,232,5.71,1058,26.51,0.0,0.73,2.35,2.75,0.0,6.46,18.61,2023-07-01,2023-07-01
HV0003,B03404,B03404,2023-07-01 00:42:50,2023-07-01 00:47:37,2023-07-01 00:48:57,2023-07-01 00:57:04,79,233,1.75,487,16.32,0.0,0.45,1.45,2.75,0.0,0.0,8.13,2023-07-01,2023-07-01
HV0005,B03406,,2023-07-01 00:00:17,,2023-07-01 00:05:35,2023-07-01 00:36:07,88,237,7.218,1832,83.97,0.0,2.31,7.45,2.75,0.0,0.0,59.05,2023-07-01,2023-07-01


In [22]:
# Check earliest and latest of datetime variables
print("pickup_datetime:\n\tMin:", sdf1.agg({"pickup_datetime": "min"}).collect()[0][0], 
    "\n\tMax:", sdf1.agg({"pickup_datetime": "max"}).collect()[0][0]),

print("\ndropoff_datetime:\n\tMin:", sdf1.agg({"dropoff_datetime": "min"}).collect()[0][0],
     "\n\tMax:", sdf1.agg({"dropoff_datetime": "max"}).collect()[0][0], '\n')


                                                                                

pickup_datetime:
	Min: 2023-07-01 00:00:00 
	Max: 2023-12-31 23:59:59





dropoff_datetime:
	Min: 2023-07-01 00:03:09 
	Max: 2024-01-01 02:18:05 



                                                                                

In [23]:
# Check other variables
sdf1.select(['trip_miles', 
           'trip_time', 
           'base_passenger_fare', 
           'tolls', 
           'bcf', 
           'sales_tax', 
           'congestion_surcharge',
           'airport_fee',
           'tips',
           'driver_pay',]).describe().show(vertical=True)




-RECORD 0-----------------------------------
 summary              | count               
 trip_miles           | 115853423           
 trip_time            | 115853423           
 base_passenger_fare  | 115853423           
 tolls                | 115853423           
 bcf                  | 115853423           
 sales_tax            | 115853423           
 congestion_surcharge | 115853423           
 airport_fee          | 115853423           
 tips                 | 115853423           
 driver_pay           | 115853423           
-RECORD 1-----------------------------------
 summary              | mean                
 trip_miles           | 5.158834254004041   
 trip_time            | 1228.7240712516539  
 base_passenger_fare  | 25.45252662393041   
 tolls                | 1.175423290608885   
 bcf                  | 0.7344419268493326  
 sales_tax            | 2.135880328445123   
 congestion_surcharge | 1.0825833777910905  
 airport_fee          | 0.22366068199814862 
 tips     

                                                                                

In [24]:
sdf1
sdf1.count()

                                                                                

115853423

In [25]:
sdf_count = sdf.count()
sdf1_count = sdf1.count()

# total records removed
diff= sdf_count-sdf1_count
print(diff)
percentage_removed = (diff/sdf_count)*100
print(percentage_removed)
print(sdf1.count())

                                                                                

1423858
1.2140953370158711




115853423


                                                                                

# Feature Engineering

#### Extract hourly pickup and dropoff demand in each location id

In [26]:
# Create columns for date and hour
sdf1 = (sdf1
  .withColumn("pickup_date", F.col("pickup_datetime").cast("date"))
  .withColumn("pickup_hour", F.hour(F.col("pickup_datetime")))
  .withColumn("dropoff_date", F.col("dropoff_datetime").cast("date"))
  .withColumn("dropoff_hour", F.hour(F.col("dropoff_datetime")))
)
sdf1

hvfhs_license_num,dispatching_base_num,originating_base_num,request_datetime,on_scene_datetime,pickup_datetime,dropoff_datetime,pulocationid,dolocationid,trip_miles,trip_time,base_passenger_fare,tolls,bcf,sales_tax,congestion_surcharge,airport_fee,tips,driver_pay,pickup_date,dropoff_date,pickup_hour,dropoff_hour
HV0003,B03404,B03404,2023-07-01 00:04:21,2023-07-01 00:07:59,2023-07-01 00:08:30,2023-07-01 00:33:33,72,26,4.79,1503,22.34,0.0,0.61,1.98,0.0,0.0,0.0,20.42,2023-07-01,2023-07-01,0,0
HV0003,B03404,B03404,2023-07-01 00:40:25,2023-07-01 00:40:35,2023-07-01 00:42:10,2023-07-01 01:08:06,26,37,6.4,1556,25.83,0.0,0.71,2.29,0.0,0.0,0.0,23.03,2023-07-01,2023-07-01,0,1
HV0003,B03404,B03404,2023-07-01 00:20:31,2023-07-01 00:24:05,2023-07-01 00:25:00,2023-07-01 00:42:38,263,232,5.71,1058,26.51,0.0,0.73,2.35,2.75,0.0,6.46,18.61,2023-07-01,2023-07-01,0,0
HV0003,B03404,B03404,2023-07-01 00:42:50,2023-07-01 00:47:37,2023-07-01 00:48:57,2023-07-01 00:57:04,79,233,1.75,487,16.32,0.0,0.45,1.45,2.75,0.0,0.0,8.13,2023-07-01,2023-07-01,0,0
HV0005,B03406,,2023-07-01 00:00:17,,2023-07-01 00:05:35,2023-07-01 00:36:07,88,237,7.218,1832,83.97,0.0,2.31,7.45,2.75,0.0,0.0,59.05,2023-07-01,2023-07-01,0,0
HV0003,B03404,B03404,2023-06-30 23:59:34,2023-07-01 00:03:57,2023-07-01 00:05:57,2023-07-01 00:22:42,233,148,2.08,1005,18.45,0.0,0.51,1.64,2.75,0.0,0.0,12.18,2023-07-01,2023-07-01,0,0
HV0003,B03404,B03404,2023-07-01 00:25:50,2023-07-01 00:32:45,2023-07-01 00:34:47,2023-07-01 00:41:31,114,79,0.6,404,7.29,0.0,0.2,0.65,2.75,0.0,0.0,6.03,2023-07-01,2023-07-01,0,0
HV0003,B03404,B03404,2023-07-01 00:40:40,2023-07-01 00:44:53,2023-07-01 00:45:03,2023-07-01 01:03:54,113,262,5.38,1131,26.19,0.0,0.72,2.32,2.75,0.0,0.0,17.7,2023-07-01,2023-07-01,0,1
HV0003,B03404,B03404,2023-07-01 00:19:27,2023-07-01 00:25:09,2023-07-01 00:30:12,2023-07-01 00:48:24,87,148,2.19,1092,25.93,0.0,0.71,2.3,2.75,0.0,0.0,18.21,2023-07-01,2023-07-01,0,0
HV0003,B03404,B03404,2023-07-01 00:41:14,2023-07-01 00:54:37,2023-07-01 00:54:46,2023-07-01 01:10:18,148,164,2.3,932,23.48,0.0,0.65,2.08,2.75,0.0,2.89,16.44,2023-07-01,2023-07-01,0,1


In [27]:
# aggregate and count number of hourly instances in each location id
pickup_hourly_demand_location = sdf1.groupBy("pulocationid", "pickup_date", "pickup_hour").count().orderBy(["pulocationid"])
pickup_hourly_demand = sdf1.groupBy("pickup_date", "pickup_hour").count().orderBy(["pickup_date", "pickup_hour"])
dropoff_hourly_demand = sdf1.groupBy("dolocationid", "dropoff_date", "dropoff_hour").count()

pickup_hourly_demand.show()
pickup_hourly_demand.count()

                                                                                

+-----------+-----------+-----+
|pickup_date|pickup_hour|count|
+-----------+-----------+-----+
| 2023-07-01|          0|33809|
| 2023-07-01|          1|26914|
| 2023-07-01|          2|21115|
| 2023-07-01|          3|17051|
| 2023-07-01|          4|14159|
| 2023-07-01|          5|11827|
| 2023-07-01|          6|13210|
| 2023-07-01|          7|15708|
| 2023-07-01|          8|19051|
| 2023-07-01|          9|22786|
| 2023-07-01|         10|24856|
| 2023-07-01|         11|27103|
| 2023-07-01|         12|28176|
| 2023-07-01|         13|28834|
| 2023-07-01|         14|30289|
| 2023-07-01|         15|31437|
| 2023-07-01|         16|31861|
| 2023-07-01|         17|33489|
| 2023-07-01|         18|35206|
| 2023-07-01|         19|35360|
+-----------+-----------+-----+
only showing top 20 rows



                                                                                

4416

In [28]:
pickup_hourly_demand_location.show()



+------------+-----------+-----------+-----+
|pulocationid|pickup_date|pickup_hour|count|
+------------+-----------+-----------+-----+
|           1| 2023-09-04|         17|    1|
|           1| 2023-09-01|         10|    1|
|           1| 2023-07-01|         10|    1|
|           1| 2023-12-21|          5|    1|
|           1| 2023-09-29|         17|    1|
|           1| 2023-07-23|          3|    1|
|           2| 2023-07-21|         11|    1|
|           2| 2023-10-13|         21|    1|
|           2| 2023-10-15|         15|    1|
|           2| 2023-07-15|         21|    1|
|           2| 2023-07-11|         20|    1|
|           2| 2023-07-07|         12|    1|
|           2| 2023-09-05|         11|    1|
|           2| 2023-07-04|         14|    1|
|           2| 2023-09-07|          0|    1|
|           2| 2023-07-04|         10|    1|
|           2| 2023-07-10|         17|    1|
|           2| 2023-11-11|         15|    1|
|           2| 2023-09-16|         16|    1|
|         

                                                                                

In [30]:
pickup_hourly_demand.write.csv("../data/curated/pickup_hourly_demand.csv", header=True, mode='overwrite')

                                                                                

In [31]:
pickup_hourly_demand_location.write.csv("../data/curated/pickup_hourly_demand_location.csv", header=True, mode='overwrite')

                                                                                