In [1]:
from pyspark.sql import SparkSession

# Create (or reuse, if one is already running) a Spark session on the local
# machine; "local[*]" asks for one worker thread per available core.
# NOTE(review): with a local master the work presumably runs inside the driver
# JVM, so "spark.executor.memory" may have no effect here and
# "spark.driver.memory" may be the setting that matters -- confirm.
session_builder = (
    SparkSession.builder
    .appName("default")
    .master("local[*]")
    .config("spark.executor.memory", "2048mb")
)
spark = session_builder.getOrCreate()

23/02/27 07:40:21 WARN Utils: Your hostname, vuvuzella-ThinkPad-X1-Extreme resolves to a loopback address: 127.0.1.1; using 192.168.20.19 instead (on interface wlp0s20f3)
23/02/27 07:40:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/27 07:40:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [20]:
# Directory holding the raw downloads, and the name of the June 2021 trip-data
# file (a gzip-compressed CSV, going by its extension).
data_dir = "../data/raw"
filename = "fvhv_tripdata_2021-06.csv.gz"

In [21]:
# Load the raw trip CSV, treating its first line as the column names.
raw_csv_path = f"{data_dir}/fvhv/2021/{filename}"
df_raw_fvhv = (
    spark.read
    .option("header", "true")
    .csv(raw_csv_path)
)

In [22]:
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    DoubleType,
    StringType,
    TimestampType,
)
# Explicit schema for the trip data: (column name, Spark type) pairs, in file
# order. Every column is declared nullable.
# NOTE(review): the captured `show()` output in this notebook prints null for
# every dispatching_base_num while Affiliated_base_number holds values such as
# "B02872"; that suggests base numbers are strings and IntegerType may be the
# wrong type for the first column -- confirm against how the parquet files
# were written before changing it.
fvhv_field_types = [
    ("dispatching_base_num", IntegerType()),
    ("pickup_datetime", TimestampType()),
    ("dropoff_datetime", TimestampType()),
    ("PULocationID", IntegerType()),
    ("DOLocationID", IntegerType()),
    ("SR_Flag", StringType()),
    ("Affiliated_base_number", StringType()),
]
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in fvhv_field_types]
)

In [23]:
# Read every parquet file under the 2021 folder using the explicit schema.
# The parquet directory gets its own name so that `data_dir` (the raw-CSV
# directory used by the CSV-loading cell) is not overwritten; rebinding it here
# would make a re-run of that earlier cell look for the CSV under the parquet
# folder.
parquet_dir = "../data/pq/fvhv/2021"
df_spark_parquet = spark.read.schema(schema).parquet(f"{parquet_dir}/*")

In [24]:
# Print the column names, types and nullability of the parquet-backed DataFrame.
df_spark_parquet.printSchema()

root
 |-- dispatching_base_num: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)
 |-- Affiliated_base_number: string (nullable = true)



In [25]:
# Keep the list of column names (not referenced again in the cells shown here).
fvhv_columns = df_spark_parquet.columns
# Register the DataFrame as a temporary view named "fvhv_2021" so it can be
# referred to by that name in Spark SQL queries.
df_spark_parquet.createOrReplaceTempView("fvhv_2021")

In [26]:
from pyspark.sql import functions as f

In [29]:
# How many trips were picked up on 15 June 2021?
# Derive a date-only column from the pickup timestamp, keep that single day,
# and count the remaining rows (the count is the cell's displayed value).
trips_with_pickup_date = df_spark_parquet.withColumn(
    "pickup_date",
    f.to_date(f.col("pickup_datetime")),
)
trips_with_pickup_date.filter("pickup_date == '2021-06-15'").count()

                                                                                

452470

In [36]:
# Show the longest trips first.
# Casting a timestamp to long gives whole seconds, so the difference between
# drop-off and pickup is the trip length in seconds; dividing by 3600 turns
# that into hours.
pickup_seconds = f.col("pickup_datetime").cast("long")
dropoff_seconds = f.col("dropoff_datetime").cast("long")
trip_hours = (dropoff_seconds - pickup_seconds) / 3600

(
    df_spark_parquet
    .withColumn("duration_hours", trip_hours)
    .orderBy(f.col("duration_hours").desc())
    .show()
)



+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|    duration_hours|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+------------------+
|                null|2021-06-25 13:55:41|2021-06-28 08:48:25|          98|         265|      N|                B02872|  66.8788888888889|
|                null|2021-06-22 12:09:45|2021-06-23 13:42:44|         188|         198|      N|                B02765|25.549722222222222|
|                null|2021-06-27 10:32:29|2021-06-28 06:31:20|          78|         169|      N|                B02879|19.980833333333333|
|                null|2021-06-26 22:37:11|2021-06-27 16:49:01|         263|          36|      N|                  null|18.197222222222223|
|                null|2021-

                                                                                

In [42]:
import requests
import os
# Download the taxi-zone lookup CSV and store it under the raw-data folder.
zone_data_url = "https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv"
dest_path = "../data/raw/fvhv_zone"
fn = "taxi_zone_lookup.csv"

# A timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being saved as if it
# were the CSV.
result = requests.get(zone_data_url, timeout=30)
result.raise_for_status()

# exist_ok=True keeps the cell re-runnable: without it the second run fails
# with FileExistsError because the directory was created by the first run.
os.makedirs(dest_path, exist_ok=True)

# The context manager guarantees the file is flushed and closed before the
# next cell reads it.
with open(f"{dest_path}/{fn}", "wb") as zone_file:
    zone_file.write(result.content)

12322

In [43]:
# Load the zone lookup CSV, using its first line as the column names.
# No schema is supplied or inferred here, so the columns (LocationID included)
# are presumably read as strings -- confirm if a numeric LocationID is needed.
fvhv_zone_data = "../data/raw/fvhv_zone/taxi_zone_lookup.csv"
zone_reader = spark.read.option("header", "true")
df_spark_zone = zone_reader.csv(fvhv_zone_data)

In [46]:
# Preview the zone lookup table.
df_spark_zone.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [60]:
# Number of trips per pickup zone.
# Trips are matched to the lookup table on pickup location; the inner join
# keeps only trips whose PULocationID has a matching LocationID.
pickup_in_zone = df_spark_parquet.PULocationID == df_spark_zone.LocationID
df_zone_count = (
    df_spark_parquet
    .join(df_spark_zone, pickup_in_zone, "inner")
    .groupBy(f.col("Zone"))
    .count()
    # This alias names the resulting DataFrame, not the aggregate column: the
    # column is still called "count", which is what the sort below relies on.
    .alias("zone_count")
)

# Busiest pickup zones first, with zone names printed in full.
df_zone_count.sort(f.col("count").desc()).show(truncate=False)

[Stage 34:>                                                       (0 + 12) / 12]

+-------------------------+------+
|Zone                     |count |
+-------------------------+------+
|Crown Heights North      |231279|
|East Village             |221244|
|JFK Airport              |188867|
|Bushwick South           |187929|
|East New York            |186780|
|TriBeCa/Civic Center     |164344|
|LaGuardia Airport        |161596|
|Union Sq                 |158937|
|West Village             |154698|
|Astoria                  |152493|
|Lower East Side          |151020|
|East Chelsea             |147673|
|Central Harlem North     |146402|
|Williamsburg (North Side)|143683|
|Park Slope               |143594|
|Stuyvesant Heights       |141427|
|Clinton East             |139611|
|West Chelsea/Hudson Yards|139431|
|Bedford                  |138428|
|Murray Hill              |137879|
+-------------------------+------+
only showing top 20 rows



                                                                                