## PySpark

Run PySpark locally: `docker compose up`

In [1]:
import pyspark

import pyspark.sql.types as T
import pyspark.sql.functions as F

from pyspark.sql import SparkSession

### Question 1

In [2]:
# Version of the installed PySpark Python package.
pyspark.__version__

'3.3.2'

In [3]:
# Start (or reuse) a session that runs Spark in local mode; "local[*]" asks for
# as many worker threads as there are logical cores on this machine.
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Keep a handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [4]:
# Version reported by the running Spark session.
spark.version

'3.3.2'

### Question 2

In [5]:
from IPython.display import clear_output

!mkdir -p data

!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz \
    -O data/fhvhv_tripdata_2021-06.csv.gz

--2023-03-05 13:59:28--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-06.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/4564ad9e-a6da-4923-ad6f-35ff02446a51?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230305%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230305T135929Z&X-Amz-Expires=300&X-Amz-Signature=77d733be90d6db7716272e16ef9688d131cb731eb5fc82fc23994023f1cd1eeb&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dfhvhv_tripdata_2021-06.csv.gz&response-content-type=application%2Foctet-stream [following]
--2023-03-05 13:59:28--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/4564ad9e

In [6]:
# Peek at the header and first rows of the compressed CSV without unpacking it to disk.
# head exits after its default 10 lines, which is why gzip prints "Broken pipe"
# at the end of the output; the message is harmless here.
!zcat data/fhvhv_tripdata_2021-06.csv.gz | head

dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag,Affiliated_base_number
B02764,2021-06-01 00:02:41,2021-06-01 00:07:46,174,18,N,B02764
B02764,2021-06-01 00:16:16,2021-06-01 00:21:14,32,254,N,B02764
B02764,2021-06-01 00:27:01,2021-06-01 00:42:11,240,127,N,B02764
B02764,2021-06-01 00:46:08,2021-06-01 00:53:45,127,235,N,B02764
B02510,2021-06-01 00:45:42,2021-06-01 01:03:33,144,146,N,
B02510,2021-06-01 00:18:15,2021-06-01 00:25:47,49,17,N,
B02510,2021-06-01 00:33:06,2021-06-01 00:42:46,49,225,N,
B02510,2021-06-01 00:46:27,2021-06-01 00:56:50,225,177,N,
B02764,2021-06-01 00:48:06,2021-06-01 01:04:10,209,45,N,B02764

gzip: stdout: Broken pipe


In [7]:
# Explicit schema for the FHVHV trip CSV, one entry per column in file order.
# Every column is declared nullable (third argument).
schema = (
    T.StructType()
        .add('dispatching_base_num', T.StringType(), True)
        .add('pickup_datetime', T.TimestampType(), True)
        .add('dropoff_datetime', T.TimestampType(), True)
        .add('PULocationID', T.IntegerType(), True)
        .add('DOLocationID', T.IntegerType(), True)
        .add('SR_Flag', T.StringType(), True)
        .add('Affiliated_base_number', T.StringType(), True)
)

In [8]:
# Load the gzipped trip CSV using the schema declared above; the first line is the header.
taxi_data = (
    spark.read
        .format("csv")
        .options(header="true", compression="gzip")
        .schema(schema)
        .load("data/fhvhv_tripdata_2021-06.csv.gz")
)

# Cache the frame so the queries that follow reuse it.
taxi_data.persist()

# Preview the first five rows.
taxi_data.show(5)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B02764|2021-06-01 00:02:41|2021-06-01 00:07:46|         174|          18|      N|                B02764|
|              B02764|2021-06-01 00:16:16|2021-06-01 00:21:14|          32|         254|      N|                B02764|
|              B02764|2021-06-01 00:27:01|2021-06-01 00:42:11|         240|         127|      N|                B02764|
|              B02764|2021-06-01 00:46:08|2021-06-01 00:53:45|         127|         235|      N|                B02764|
|              B02510|2021-06-01 00:45:42|2021-06-01 01:03:33|         144|         146|      N|                  null|
+--------------------+------------------

In [9]:
# Rewrite the trips as Parquet in 12 partitions; existing output at that path is replaced.
(
    taxi_data
        .repartition(12)
        .write
        .mode('overwrite')
        .parquet('data/fhvhv_tripdata_2021-06.parquet')
)

In [10]:
# Disk size of each file inside the Parquet output directory.
!du -sh data/fhvhv_tripdata_2021-06.parquet/*

24M	data/fhvhv_tripdata_2021-06.parquet/part-00000-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021-06.parquet/part-00001-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021-06.parquet/part-00002-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021-06.parquet/part-00003-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021-06.parquet/part-00004-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021-06.parquet/part-00005-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021-06.parquet/part-00006-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021-06.parquet/part-00007-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021-06.parquet/part-00008-530bc5d1-f2ca-4a0a-97cc-e6210c1320cd-c000.snappy.parquet
24M	data/fhvhv_tripdata_2021

### Question 3

In [11]:
# Register the trips frame as the temporary view `fhvhv_2021_06` so it can be queried through spark.sql.

taxi_data.createOrReplaceTempView('fhvhv_2021_06')

In [12]:
# Number of trips whose pickup timestamp, truncated to the day, is 15 June 2021.
trips_on_june_15 = spark.sql("""
select count(*)
from fhvhv_2021_06 
where date_trunc('DD', pickup_datetime) = '2021-06-15'
""")

trips_on_june_15.show()

+--------+
|count(1)|
+--------+
|  452470|
+--------+



In [13]:
# Same count through the DataFrame API: keep rows whose pickup date is 2021-06-15.
picked_up_on_june_15 = F.col('pickup_datetime').cast('date') == '2021-06-15'
taxi_data.where(picked_up_on_june_15).count()

452470

### Question 4

In [14]:
# Longest trip in hours: the largest dropoff-minus-pickup gap in seconds, divided by 3600.
longest_trip_hours = spark.sql("""
select
    max(unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) / 3600 as diff_hour
from fhvhv_2021_06
""")

longest_trip_hours.show()

+----------------+
|       diff_hour|
+----------------+
|66.8788888888889|
+----------------+



In [15]:
# Longest trip through the DataFrame API: take the maximum of dropoff minus pickup
# over the whole frame and bring the single result row to the driver.
intervals = (
    taxi_data
        .agg(F.max(F.col('dropoff_datetime') - F.col('pickup_datetime')).alias('interval'))
        .collect()
)

# Express the collected interval in hours.
intervals[0].interval.total_seconds() / 3600

66.8788888888889

In [16]:
# Longest trip in hours, computed per pickup day first and then maximised over the days.
# The inner query is deliberately not sorted: the outer max() does not depend on row
# order, so an ORDER BY there would only add a sort step.
spark.sql("""
select max(diff_hour) from (
    select
        date_trunc('DD', pickup_datetime) as pickup_dt,
        max(unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) / 3600 as diff_hour
    from fhvhv_2021_06
    group by pickup_dt
)
""").show()

+----------------+
|  max(diff_hour)|
+----------------+
|66.8788888888889|
+----------------+



In [17]:
# Per pickup date, find the longest trip in hours, then list the dates from the
# longest such trip downwards and collect them on the driver.
intervals = (
    taxi_data
        .select(
            F.col('pickup_datetime').cast('date').alias('pickup_date'),
            ((F.col('dropoff_datetime') - F.col('pickup_datetime')).cast('long') / 3600).alias('diff_hour'),
        )
        .groupBy('pickup_date')
        .agg(F.max('diff_hour').alias('max_diff_hour'))
        .orderBy(F.desc('max_diff_hour'))
        .collect()
)

# The first row holds the overall maximum.
intervals[0].max_diff_hour

66.8788888888889

### Question 5

Spark UI (Jobs page): http://localhost:4040/jobs/

### Question 6

In [18]:
!wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv \
    -O data/taxi_zone_lookup.csv

--2023-03-05 14:04:14--  https://github.com/DataTalksClub/nyc-tlc-data/releases/download/misc/taxi_zone_lookup.csv
Resolving github.com (github.com)... 140.82.121.4
Connecting to github.com (github.com)|140.82.121.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6ea97ed0e6a?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230305%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230305T140414Z&X-Amz-Expires=300&X-Amz-Signature=b1aefb3ad94ce1363eaa3043898931d5538f716a49db6771dd966a9e8b804b47&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=513814948&response-content-disposition=attachment%3B%20filename%3Dtaxi_zone_lookup.csv&response-content-type=application%2Foctet-stream [following]
--2023-03-05 14:04:14--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/513814948/5a2cc2f5-b4cd-4584-9c62-a6e

In [20]:
# Load the zone lookup CSV; the first line is the header and column types are inferred.
taxi_zone_lookup = spark.read.csv(
    'data/taxi_zone_lookup.csv',
    header=True,
    inferSchema=True,
)

# Cache the lookup table so the join below reuses it.
taxi_zone_lookup.persist()

# Preview the first five rows.
taxi_zone_lookup.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [21]:
# Column names and the types produced by schema inference.
taxi_zone_lookup.dtypes

[('LocationID', 'int'),
 ('Borough', 'string'),
 ('Zone', 'string'),
 ('service_zone', 'string')]

In [22]:
# Register the lookup frame as the temporary view `zones` so it can be joined in spark.sql queries.

taxi_zone_lookup.createOrReplaceTempView('zones')

In [23]:
# Five busiest pickup zones: count trips per PULocationID, attach the matching
# row from the zones view, and keep the five largest counts.
top_pickup_zones = spark.sql("""
select
    *
from (
    select PULocationID, count(*) as cnt
    from fhvhv_2021_06
    group by PULocationID
) as db
join zones on db.PULocationID = zones.LocationID
order by cnt desc
limit 5
""")

top_pickup_zones.show()

+------------+------+----------+---------+-------------------+------------+
|PULocationID|   cnt|LocationID|  Borough|               Zone|service_zone|
+------------+------+----------+---------+-------------------+------------+
|          61|231279|        61| Brooklyn|Crown Heights North|   Boro Zone|
|          79|221244|        79|Manhattan|       East Village| Yellow Zone|
|         132|188867|       132|   Queens|        JFK Airport|    Airports|
|          37|187929|        37| Brooklyn|     Bushwick South|   Boro Zone|
|          76|186780|        76| Brooklyn|      East New York|   Boro Zone|
+------------+------+----------+---------+-------------------+------------+



In [24]:
# Same ranking through the DataFrame API: rename the pickup id so it matches the
# lookup key, count trips per id, join the zone details, and take the top five.
(
    taxi_data
        .select(F.col('PULocationID').alias('LocationID'))
        .groupBy('LocationID')
        .count()
        .join(taxi_zone_lookup, on='LocationID')
        .orderBy(F.desc('count'))
        .take(5)
)

[Row(LocationID=61, count=231279, Borough='Brooklyn', Zone='Crown Heights North', service_zone='Boro Zone'),
 Row(LocationID=79, count=221244, Borough='Manhattan', Zone='East Village', service_zone='Yellow Zone'),
 Row(LocationID=132, count=188867, Borough='Queens', Zone='JFK Airport', service_zone='Airports'),
 Row(LocationID=37, count=187929, Borough='Brooklyn', Zone='Bushwick South', service_zone='Boro Zone'),
 Row(LocationID=76, count=186780, Borough='Brooklyn', Zone='East New York', service_zone='Boro Zone')]