In [1]:
# Export SPARK_HOME into the kernel's environment (IPython %env magic).
%env SPARK_HOME=/usr/lib/spark

env: SPARK_HOME=/usr/lib/spark


In [2]:
from dotenv import load_dotenv

# Load variables from a .env file into os.environ. The Spark session setup
# further down reads AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the
# environment — presumably they are supplied by this file (confirm).
load_dotenv()

True

In [3]:
import findspark

# Make the Spark installation importable from this kernel. With no argument,
# findspark falls back to the SPARK_HOME environment variable, which the first
# cell sets via %env, so the install location is configured in one place only.
findspark.init()

## PySpark

Run PySpark locally: `docker compose up`

In [4]:
import os

import pyspark

import pyspark.sql.types as T
import pyspark.sql.functions as F

from pyspark.sql import SparkSession

### Question 1

In [5]:
# Version of the PySpark Python package imported by this kernel.
pyspark.__version__

'3.0.3'

In [6]:
# All session settings in one mapping, applied to the builder in this order.
spark_settings = {
    # Extra jar made available to the session.
    "spark.jars": "/usr/lib/iam-s3-credentials/iam-s3-credentials.jar",
    # Executor sizing.
    "spark.executor.cores": 2,
    "spark.executor.instances": 4,
    "spark.executor.memory": "2G",
    # S3A access: static key/secret credentials taken from the environment,
    # plus a per-bucket endpoint for the `dataproc-examples` bucket.
    "fs.s3a.signing-algorithm": "",
    "fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    "fs.s3a.bucket.dataproc-examples.endpoint": "storage.yandexcloud.net",
    "fs.s3a.bucket.dataproc-examples.access.key": os.environ['AWS_ACCESS_KEY_ID'],
    "fs.s3a.bucket.dataproc-examples.secret.key": os.environ['AWS_SECRET_ACCESS_KEY'],
}

session_builder = SparkSession.builder
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Keep a handle on the SparkContext and display it as the cell output.
sc = spark.sparkContext
sc

In [7]:
# Spark version reported by the running session.
spark.version

'3.0.3'

### Question 2

In [8]:
# Columns of the FHVHV trip CSV as (name, Spark type) pairs, in the order the
# fields are declared. Every column is nullable.
_fhvhv_columns = [
    ('dispatching_base_num',   T.StringType()),
    ('pickup_datetime',        T.TimestampType()),
    ('dropoff_datetime',       T.TimestampType()),
    ('PULocationID',           T.IntegerType()),
    ('DOLocationID',           T.IntegerType()),
    ('SR_Flag',                T.StringType()),
    ('Affiliated_base_number', T.StringType()),
]

schema = T.StructType([
    T.StructField(column_name, column_type, True)
    for column_name, column_type in _fhvhv_columns
])

In [9]:
"""
This block of code can run extremely slowly. If the cluster has many weak
workers, a single weak worker has to read the whole file (180 MB) from S3 on
its own, and only then can the data be partitioned between the other workers.
In my case it takes up to 5 minutes.
"""

# Reader configured for a gzip-compressed CSV with a header row and the
# explicit trip schema (no type inference).
fhvhv_reader = (
    spark.read
        .option("header", "true")
        .option("compression", "gzip")
        .schema(schema)
)

taxi_data = fhvhv_reader.csv("s3a://de-bucket-dev/data/fhvhv_tripdata_2021-06.csv.gz")

# Mark the DataFrame for caching so the following cells can reuse it.
taxi_data.persist()

# Preview the first five rows.
taxi_data.show(5)

+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|Affiliated_base_number|
+--------------------+-------------------+-------------------+------------+------------+-------+----------------------+
|              B02764|2021-06-01 00:02:41|2021-06-01 00:07:46|         174|          18|      N|                B02764|
|              B02764|2021-06-01 00:16:16|2021-06-01 00:21:14|          32|         254|      N|                B02764|
|              B02764|2021-06-01 00:27:01|2021-06-01 00:42:11|         240|         127|      N|                B02764|
|              B02764|2021-06-01 00:46:08|2021-06-01 00:53:45|         127|         235|      N|                B02764|
|              B02510|2021-06-01 00:45:42|2021-06-01 01:03:33|         144|         146|      N|                  null|
+--------------------+------------------

In [11]:
# Disabled step: write the trips back to S3 as Parquet, repartitioned into 12
# partitions and overwriting any previous output. Uncomment to run.
# taxi_data.repartition(12).write.parquet('s3a://de-bucket-dev/output/fhvhv_tripdata_2021-06.parquet', mode='overwrite')

In [12]:
# Disabled step: list the objects under the Parquet output path with s3cmd.
# NOTE(review): s3cmd usually takes s3:// URIs rather than s3a:// — confirm before re-enabling.
# !s3cmd ls -s -H s3a://de-bucket-dev/output/fhvhv_tripdata_2021-06.parquet/*

### Question 3

In [13]:
# Register the trips DataFrame as the temp view `fhvhv_2021_06` so that it can
# be queried through spark.sql().

taxi_data.createOrReplaceTempView('fhvhv_2021_06')

In [14]:
# Question 3 (SQL): count the trips whose pickup timestamp, truncated to the
# day, equals 2021-06-15.
trips_on_june_15_query = """
select count(*)
from fhvhv_2021_06 
where date_trunc('DD', pickup_datetime) = '2021-06-15'
"""

trips_on_june_15 = spark.sql(trips_on_june_15_query)
trips_on_june_15.show()

+--------+
|count(1)|
+--------+
|  452470|
+--------+



In [15]:
# Question 3 (DataFrame API): same count, comparing the pickup date to 2021-06-15.
pickup_date = F.col('pickup_datetime').cast('date')
taxi_data.filter(pickup_date == '2021-06-15').count()

452470

### Question 4

In [16]:
# Question 4 (SQL): longest trip in hours, i.e. the largest dropoff - pickup
# difference in seconds divided by 3600.
longest_trip_query = """
select
    max(unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) / 3600 as diff_hour
from fhvhv_2021_06
"""

longest_trip = spark.sql(longest_trip_query)
longest_trip.show()

+----------------+
|       diff_hour|
+----------------+
|66.8788888888889|
+----------------+



In [17]:
# Question 4 (DataFrame API): trip duration in hours as a column expression,
# then its maximum over all trips.
trip_duration_hours = (
    F.unix_timestamp('dropoff_datetime') - F.unix_timestamp('pickup_datetime')
) / 3600

intervals = (
    taxi_data
    .select(F.max(trip_duration_hours).alias('interval'))
    .collect()
)

# The aggregate yields a single row; show its value.
intervals[0].interval

66.8788888888889

In [18]:
# Question 4 (SQL cross-check): longest trip per pickup day, then the maximum
# over all days. The inner query deliberately has no ORDER BY: the outer max()
# does not depend on row order, so sorting the per-day rows would only add work.
spark.sql("""
select max(diff_hour) from (
    select
        date_trunc('DD', pickup_datetime) as pickup_dt,
        max(unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) / 3600 as diff_hour
    from fhvhv_2021_06
    group by pickup_dt
)
""").show()

+----------------+
|  max(diff_hour)|
+----------------+
|66.8788888888889|
+----------------+



In [19]:
# Question 4 (DataFrame cross-check): longest trip per pickup date, sorted so
# that the date with the longest trip comes first.
pickup_date_col = F.col('pickup_datetime').cast('date').alias('pickup_date')
diff_hour_col = (
    (F.unix_timestamp('dropoff_datetime') - F.unix_timestamp('pickup_datetime')) / 3600
).alias('diff_hour')

longest_trip_per_day = (
    taxi_data
    .select(pickup_date_col, diff_hour_col)
    .groupby('pickup_date')
    .agg(F.max('diff_hour').alias('max_diff_hour'))
)

intervals = (
    longest_trip_per_day
    .sort(F.col('max_diff_hour').desc())
    .collect()
)

# First row after the descending sort holds the overall maximum.
intervals[0].max_diff_hour

66.8788888888889

### Question 5

Spark UI (jobs page): http://localhost:4040/jobs/

### Question 6

In [20]:
# Question 6: zone lookup table.
# The schema is declared explicitly, as is done for the trip data, instead of
# using inferSchema: inference needs an extra pass over the file and makes the
# column types depend on the data. The declared types are the ones inference
# produced for this file (int, string, string, string).
zone_schema = T.StructType([
    T.StructField('LocationID',   T.IntegerType(), True),
    T.StructField('Borough',      T.StringType(),  True),
    T.StructField('Zone',         T.StringType(),  True),
    T.StructField('service_zone', T.StringType(),  True),
])

taxi_zone_lookup = (
    spark.read
        .option("header", "true")
        .schema(zone_schema)
        .csv("s3a://de-bucket-dev/data/taxi_zone_lookup.csv")
)

taxi_zone_lookup.persist()  # cache dataframe to speed up all operations

# Preview the first five zones.
taxi_zone_lookup.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [21]:
# Column names and Spark types of the zone lookup DataFrame.
taxi_zone_lookup.dtypes

[('LocationID', 'int'),
 ('Borough', 'string'),
 ('Zone', 'string'),
 ('service_zone', 'string')]

In [22]:
# Register the zone lookup DataFrame as the temp view `zones` so that it can be
# joined against in spark.sql() queries.

taxi_zone_lookup.createOrReplaceTempView('zones')

In [23]:
# Question 6 (SQL): count trips per pickup location, attach the zone details,
# and keep the five locations with the most pickups.
top_pickup_zones_query = """
select
    *
from (
    select PULocationID, count(*) as cnt
    from fhvhv_2021_06
    group by PULocationID
) as db
join zones on db.PULocationID = zones.LocationID
order by cnt desc
limit 5
"""

top_pickup_zones = spark.sql(top_pickup_zones_query)
top_pickup_zones.show()

+------------+------+----------+---------+-------------------+------------+
|PULocationID|   cnt|LocationID|  Borough|               Zone|service_zone|
+------------+------+----------+---------+-------------------+------------+
|          61|231279|        61| Brooklyn|Crown Heights North|   Boro Zone|
|          79|221244|        79|Manhattan|       East Village| Yellow Zone|
|         132|188867|       132|   Queens|        JFK Airport|    Airports|
|          37|187929|        37| Brooklyn|     Bushwick South|   Boro Zone|
|          76|186780|        76| Brooklyn|      East New York|   Boro Zone|
+------------+------+----------+---------+-------------------+------------+



In [24]:
# Question 6 (DataFrame API): pickups per location, with the pickup column
# renamed so that it joins to the lookup table on `LocationID`.
pickups_per_zone = (
    taxi_data
    .select(F.col('PULocationID').alias('LocationID'))
    .groupby('LocationID')
    .count()
)

# Attach zone details and order by pickup count, busiest first.
zones_by_pickups = (
    pickups_per_zone
    .join(taxi_zone_lookup, on='LocationID')
    .sort(F.col('count').desc())
)

zones_by_pickups.head(5)

[Row(LocationID=61, count=231279, Borough='Brooklyn', Zone='Crown Heights North', service_zone='Boro Zone'),
 Row(LocationID=79, count=221244, Borough='Manhattan', Zone='East Village', service_zone='Yellow Zone'),
 Row(LocationID=132, count=188867, Borough='Queens', Zone='JFK Airport', service_zone='Airports'),
 Row(LocationID=37, count=187929, Borough='Brooklyn', Zone='Bushwick South', service_zone='Boro Zone'),
 Row(LocationID=76, count=186780, Borough='Brooklyn', Zone='East New York', service_zone='Boro Zone')]