In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, when, expr, format_number, dayofweek

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("CuratedData")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '4g')
    .config('spark.executor.memory', '2g')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/18 18:54:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import sys
sys.path.insert(0, '../scripts')
from utils import dropNull, filteringOnCondition, featureExtracting, extractingDateAndTime, reFormat, joiningWeatherData, joiningBoroughs


In [3]:
raw_data = spark.read.parquet('../data/raw/*')


In [4]:
null_counts = raw_data.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in raw_data.columns])
null_counts.show()

                                                                                

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|vendorid|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|ratecodeid|store_and_fwd_flag|pulocationid|dolocationid|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       0|                   0|                    0|         780804|            0|    780804|            780804|           0|           0|           0|          0|    0|      0|         

In [5]:
curated_data = dropNull(raw_data)
curated_data.count()

                                                                                

18035802

In [6]:
curated_data = filteringOnCondition(curated_data)
curated_data.count()
curated_data.show(10, truncate=100)




+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+---------------+
|vendorid|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|ratecodeid|store_and_fwd_flag|pulocationid|dolocationid|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|is_valid_record|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+---------------+
|       1| 2023-07-01 00:29:59|  2023-07-01 00:40:15|              1|          1.8|         1|                 N|         140|         263|

                                                                                

In [7]:
curated_data = featureExtracting(curated_data)
curated_data = extractingDateAndTime(curated_data)
curated_data.show(20, truncate=100)


+--------------------+--------+------+-------+---------------------+---------------+-------------+------------+-----------+-----+----------+------------+
|tpep_pickup_datetime|pu_month|pu_day|pu_hour|tpep_dropoff_datetime|passenger_count|trip_distance|pulocationid|fare_amount|extra|tip_amount|total_amount|
+--------------------+--------+------+-------+---------------------+---------------+-------------+------------+-----------+-----+----------+------------+
| 2023-07-01 00:29:59|       7|     1|      0|  2023-07-01 00:40:15|              1|          1.8|         140|       12.1|  3.5|       5.1|        22.2|
| 2023-07-01 00:38:29|       7|     1|      0|  2023-07-01 00:48:53|              1|         2.36|         142|       13.5|  1.0|       3.7|        22.2|
| 2023-07-01 00:14:16|       7|     1|      0|  2023-07-01 00:29:13|              1|         4.36|          68|       19.8|  1.0|      4.96|       29.76|
| 2023-07-01 00:29:32|       7|     1|      0|  2023-07-01 00:54:14|        

In [8]:

curated_data = curated_data.withColumn(
    "time_travel_minutes",
    expr("""
        (CAST((unix_timestamp(tpep_dropoff_datetime) - unix_timestamp(tpep_pickup_datetime)) AS DOUBLE) / 60)
    """)   
)
curated_data = curated_data.withColumn(
    'is_weekend',
    F.when(F.dayofweek(F.col('tpep_pickup_datetime')).isin([6, 7]), 1).otherwise(0)
)

In [9]:
taxi_zones_df = spark.read.csv('../data/taxi_zones/taxi+_zone_lookup.csv', header=True, inferSchema=True)
taxi_zones_df = taxi_zones_df.withColumnRenamed('LocationID', 'pulocationid')
curated_data = joiningBoroughs(curated_data,taxi_zones_df)
curated_data.show()


+------------+--------------------+--------+------+-------+---------------------+---------------+-------------+-----------+-----+----------+------------+-------------------+----------+-----------------+
|pulocationid|tpep_pickup_datetime|pu_month|pu_day|pu_hour|tpep_dropoff_datetime|passenger_count|trip_distance|fare_amount|extra|tip_amount|total_amount|time_travel_minutes|is_weekend|      pu_location|
+------------+--------------------+--------+------+-------+---------------------+---------------+-------------+-----------+-----+----------+------------+-------------------+----------+-----------------+
|         140| 2023-07-01 00:29:59|       7|     1|      0|  2023-07-01 00:40:15|              1|          1.8|       12.1|  3.5|       5.1|        22.2| 10.266666666666667|         1|        Manhattan|
|         142| 2023-07-01 00:38:29|       7|     1|      0|  2023-07-01 00:48:53|              1|         2.36|       13.5|  1.0|       3.7|        22.2|               10.4|         1|    

In [10]:
# columns = curated_data.columns

# # Count nulls in all columns
# null_counts = {column: curated_data.filter(col(column).isNull()).count() for column in columns}

# # Print the results
# for column, count in null_counts.items():
#     print(f"Number of null values in '{column}': {count}")


In [11]:
curated_data = curated_data.withColumn("tpep_pickup_datetime", F.date_format("tpep_pickup_datetime", "yyyy-MM-dd"))
weather_df = spark.read.csv("../data/hourly_weather_2023.csv", header=True, inferSchema=True)
curated_data = joiningWeatherData(curated_data, weather_df)
curated_data = dropNull(curated_data)

In [12]:
curated_data.count()

                                                                                

13838582

In [13]:
curated_data = reFormat(curated_data)
curated_data

pu_location,pu_month,pu_day,pu_hour,is_weekend,time_travel_minutes,passenger_count,trip_distance,fare_amount,extra,tip_amount,total_amount,wind_speed,dew_point,atmospheric_pressure,temperature
Manhattan,7,1,0,1,10.2667,1,1.8,12.1,3.5,5.1,22.2,0.0,13.3,1017.1,23.9
Manhattan,7,1,0,1,10.4,1,2.36,13.5,1.0,3.7,22.2,0.0,13.3,1017.1,23.9
Manhattan,7,1,0,1,14.95,1,4.36,19.8,1.0,4.96,29.76,0.0,13.3,1017.1,23.9
LaGuardia Airport,7,1,0,1,24.7,1,8.67,38.7,6.0,7.0,64.0,0.0,13.3,1017.1,23.9
LaGuardia Airport,7,1,0,1,23.1,1,9.11,39.4,6.0,11.19,68.89,0.0,13.3,1017.1,23.9
Manhattan,7,1,0,1,10.0333,1,3.08,14.9,1.0,3.48,20.88,0.0,13.3,1017.1,23.9
Manhattan,7,1,0,1,16.7833,3,3.88,19.8,1.0,1.0,25.8,0.0,13.3,1017.1,23.9
Queens,7,1,0,1,16.8667,1,8.1,33.8,1.0,5.0,41.3,0.0,13.3,1017.1,23.9
Manhattan,7,1,0,1,22.3167,1,5.33,25.4,1.0,6.08,36.48,0.0,13.3,1017.1,23.9
Manhattan,7,1,0,1,10.6833,1,2.4,13.5,3.5,3.7,22.2,0.0,13.3,1017.1,23.9


24/08/18 18:54:50 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [22]:
curated_data.count()

                                                                                

13838582

In [14]:
distinct_locations = curated_data.select("pu_location").distinct().collect()

# Extract and print the distinct pu_location values
locations = [row["pu_location"] for row in distinct_locations]
locations

                                                                                

['Newark Airport',
 'Queens',
 'LaGuardia Airport',
 'Brooklyn',
 'Staten Island',
 'Manhattan',
 'JFK Airport',
 'Bronx']

In [16]:

# Directory to save the files
save_dir = "../data/curated/"

# Loop over each location, filter the DataFrame, and save the result
for location in locations:
    # Filter DataFrame for the current location
    filtered_df = curated_data.filter(curated_data.pu_location == location)
    
    # Define the file path, removing spaces for simplicity
    file_path = f"{save_dir}{location.replace(' ', '_').lower()}.parquet"
    
    # Save the filtered DataFrame
    filtered_df.write.mode("overwrite").parquet(file_path)

    print(f"Data for {location} saved to {file_path}")

                                                                                

Data for Newark Airport saved to ../data/curated/newark_airport.parquet


                                                                                

Data for Queens saved to ../data/curated/queens.parquet


                                                                                

Data for LaGuardia Airport saved to ../data/curated/laguardia_airport.parquet


                                                                                

Data for Brooklyn saved to ../data/curated/brooklyn.parquet


                                                                                

Data for Staten Island saved to ../data/curated/staten_island.parquet


                                                                                

Data for Manhattan saved to ../data/curated/manhattan.parquet


                                                                                

Data for JFK Airport saved to ../data/curated/jfk_airport.parquet




Data for Bronx saved to ../data/curated/bronx.parquet


                                                                                

In [21]:
locations = ["bronx", "brooklyn", "manhattan", "staten_island", "queens", 
             "laguardia_airport", "jfk_airport", "newark_airport"]

# Directory where the files were saved
save_dir = "../data/curated/"

# Initialize a variable to keep the sum of all counts
total_count = 0

# Loop over each location, read the corresponding file, and accumulate the total length
for location in locations:
    # Define the file path
    file_path = f"{save_dir}{location}.parquet"
    
    # Read the Parquet file into a DataFrame
    df = spark.read.parquet(file_path)
    
    # Count the number of rows in the DataFrame
    count = df.count()
    
    # Add the count to the total_count
    total_count += count
    
    # Print the count for the current location
    print(f"Total rows in {location.replace('_', ' ').title()}: {count}")

# Print the sum of all counts
print(f"\nTotal rows across all locations: {total_count}")


Total rows in Bronx: 1785
Total rows in Brooklyn: 42173
Total rows in Manhattan: 12503022
Total rows in Staten Island: 37
Total rows in Queens: 99144
Total rows in Laguardia Airport: 528527
Total rows in Jfk Airport: 663839
Total rows in Newark Airport: 55

Total rows across all locations: 13838582


In [24]:
file_path = '../data/curated/manhattan.parquet'
df = spark.read.parquet(file_path)
df

pu_location,pu_month,pu_day,pu_hour,is_weekend,time_travel_minutes,passenger_count,trip_distance,fare_amount,extra,tip_amount,total_amount,wind_speed,dew_point,atmospheric_pressure,temperature
Manhattan,12,1,0,1,13.6333,2,2.2,13.5,3.5,3.0,21.5,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,7.2667,2,2.2,11.4,1.0,2.0,18.4,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,16.3667,1,5.33,24.7,1.0,3.0,32.7,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,3.35,1,0.76,5.8,1.0,1.0,11.8,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,15.5333,1,3.33,17.7,1.0,3.4,26.1,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,7.65,1,2.1,12.1,3.5,3.42,20.52,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,5.9333,2,0.5,7.2,3.5,2.4,14.6,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,15.8167,1,2.6,14.2,3.5,3.85,23.05,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,14.3333,1,2.7,16.3,3.5,5.3,26.6,4.1,-2.8,1020.1,9.4
Manhattan,12,1,0,1,2.7333,1,0.8,5.8,3.5,2.15,12.95,4.1,-2.8,1020.1,9.4
