In [1]:
# Environment variables for the Spark tooling used in this notebook.
# SPARK_HOME is the same installation path that is passed to findspark.init() below.
%env SPARK_HOME=/usr/lib/spark
%env SPARK_KAFKA_VERSION=0.10

env: SPARK_HOME=/usr/lib/spark
env: SPARK_KAFKA_VERSION=0.10


In [2]:
from dotenv import load_dotenv

# Load environment variables (presumably from a local .env file); read_from_kafka()
# below reads KAFKA_USER and KAFKA_PASS from os.environ.
load_dotenv()

True

In [3]:
import findspark

# Point findspark at the Spark installation before pyspark is imported further down.
findspark.init('/usr/lib/spark/')

In [4]:
import sys
import logging

# strftime pattern for log timestamps: year-month-day hour:minute:second.
# The day field must be %d; "%y" (two-digit year) in that position printed
# e.g. "2023-03-23" on every day of March 2023.
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

# Send INFO-and-above records to stderr, each prefixed with a timestamp.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt=LOG_DATE_FORMAT,
)

# Logger named "py4j"; the message below is a smoke test of the configuration.
logger = logging.getLogger("py4j")
logger.info("Hello world!")

2023-03-23 00:54:48 Hello world!


## PySpark

You can find the updated instructions on how to run Data Proc with Spark in the [`week_5_batch_processing`](https://github.com/vbugaevskii/data-engineering-zoomcamp-cohort2023/blob/main/cohorts/2023/week_5_batch_processing/README.md) directory.

In [5]:
import os

import pyspark

import pyspark.sql.types as T
import pyspark.sql.functions as F

from pyspark.sql import SparkSession

from pathlib import Path

In [6]:
# Show the installed PySpark version; the notes in the cells below target 3.0.3.
pyspark.__version__

'3.0.3'

In [7]:
# NOTE: This works properly for pyspark==3.0.3

from IPython.display import clear_output

# Recreate the local `jars` directory from scratch and let Maven copy the
# dependencies declared in pom.xml into it.
# `|| true` keeps the cell going when `jars` does not exist yet.
!rm -r jars || true
!mkdir -p jars
!./apache-maven-3.9.0/bin/mvn dependency:copy-dependencies -DoutputDirectory=jars

# Uncomment to hide the Maven log once the download has succeeded.
# clear_output()

[[1;34mINFO[m] Scanning for projects...
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m------------------< [0;36mcom.dataclub.zoomcamp.de:pyspark[0;1m >------------------[m
[[1;34mINFO[m] [1mBuilding pyspark 2.0[m
[[1;34mINFO[m]   from pom.xml
[[1;34mINFO[m] [1m--------------------------------[ jar ]---------------------------------[m
[[1;34mINFO[m] 
[[1;34mINFO[m] [1m--- [0;32mdependency:2.8:copy-dependencies[m [1m(default-cli)[m @ [36mpyspark[0;1m ---[m
[[1;34mINFO[m] Copying spark-sql-kafka-0-10_2.12-3.0.3.jar to /home/vbugaevskii/pyspark/jars/spark-sql-kafka-0-10_2.12-3.0.3.jar
[[1;34mINFO[m] Copying unused-1.0.0.jar to /home/vbugaevskii/pyspark/jars/unused-1.0.0.jar
[[1;34mINFO[m] Copying spark-token-provider-kafka-0-10_2.12-3.0.3.jar to /home/vbugaevskii/pyspark/jars/spark-token-provider-kafka-0-10_2.12-3.0.3.jar
[[1;34mINFO[m] Copying kafka-clients-2.4.1.jar to /home/vbugaevskii/pyspark/jars/kafka-clients-2.4.1.jar
[[1;34mINFO[m] Copying zstd-jni-1.4

#### Logging PySpark

As you can see, there is little output in the Jupyter Notebook. The reason is that all output is printed to the console. Unfortunately, I failed to redirect the console output to the notebook.

In [8]:
# NOTE: jar_packages works properly for spark==3.0.3
jar_packages = ["org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.3"]

# Settings applied to the session builder, in order.
# Alternatives that are intentionally left disabled:
#   .master("yarn")
#   .config("spark.jars", ','.join(map(str, Path("jars").glob("*.jar"))))
spark_settings = (
    ("spark.jars.packages", ','.join(jar_packages)),
    ("spark.executor.cores", 2),
    ("spark.executor.instances", 4),
    ("spark.executor.memory", "2G"),
)

session_builder = SparkSession.builder
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# Display the SparkContext as the cell output.
sc = spark.sparkContext
sc

In [9]:
def read_from_kafka(topic: str, servers=None, starting_offsets: str = "earliest") -> pyspark.sql.DataFrame:
    """Open a streaming DataFrame over a Kafka topic using SASL/SCRAM authentication.

    Args:
        topic: Name of the Kafka topic to subscribe to.
        servers: Optional list of "host:port" bootstrap servers. Defaults to the
            managed Kafka broker used throughout this notebook.
        starting_offsets: Value for the source's `startingOffsets` option.
            Defaults to "earliest", as before.

    Returns:
        The streaming DataFrame produced by `spark.readStream...load()`.

    Raises:
        KeyError: If KAFKA_USER or KAFKA_PASS is missing from the environment.
    """
    if servers is None:
        servers = [
            "rc1a-q38mpgujip0pbjir.mdb.yandexcloud.net:9092",
        ]

    # Credentials come from the environment, never from the notebook itself.
    jaas_config = (
        "org.apache.kafka.common.security.scram.ScramLoginModule required "
        f"username=\"{os.environ['KAFKA_USER']}\" "
        f"password=\"{os.environ['KAFKA_PASS']}\";"
    )

    # NOTE: no "checkpointLocation" here. It is a writeStream option; on the
    # reader it had no effect and only suggested the source kept its own checkpoint.
    stream = (
        spark.readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", ",".join(servers))
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")
            .option("kafka.security.protocol", "SASL_PLAINTEXT")
            .option("kafka.sasl.jaas.config", jaas_config)
            .option("subscribe", topic)
            .option("startingOffsets", starting_offsets)
            .load()
    )

    return stream

### Process Green Taxi

In [10]:
# Fields of a green-taxi ride record, as (field name, Spark type) pairs.
# Every field is declared nullable when the schema is built below.
_GREEN_RIDE_FIELDS = [
    ('VendorID',              T.IntegerType()),
    ('lpep_pickup_datetime',  T.TimestampType()),
    ('lpep_dropoff_datetime', T.TimestampType()),
    ('store_and_fwd_flag',    T.StringType()),
    ('RatecodeID',            T.IntegerType()),
    ('PULocationID',          T.IntegerType()),
    ('DOLocationID',          T.IntegerType()),
    ('passenger_count',       T.IntegerType()),
    ('trip_distance',         T.FloatType()),
    ('fare_amount',           T.FloatType()),
    ('extra',                 T.FloatType()),
    ('mta_tax',               T.FloatType()),
    ('tip_amount',            T.FloatType()),
    ('tolls_amount',          T.FloatType()),
    ('ehail_fee',             T.FloatType()),
    ('improvement_surcharge', T.FloatType()),
    ('total_amount',          T.FloatType()),
    ('payment_type',          T.IntegerType()),
    ('trip_type',             T.IntegerType()),
    ('congestion_surcharge',  T.FloatType()),
]

# Schema used to decode the JSON payload of green-taxi Kafka messages.
SCHEMA_GREEN = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in _GREEN_RIDE_FIELDS
])


def parse_green_ride_from_kafka_message(df_raw):
    """Decode green-taxi Kafka messages into the common ride columns.

    The message `value` is cast to a string, parsed as JSON with SCHEMA_GREEN,
    and reduced to VendorID, pickup_datetime, dropoff_datetime, PULocationID
    and DOLocationID.

    Args:
        df_raw: Streaming DataFrame with a `value` column.

    Returns:
        A DataFrame with the five columns listed above.

    Raises:
        AssertionError: If `df_raw` is not a streaming DataFrame.
    """
    assert df_raw.isStreaming is True, "DataFrame doesn't receive streaming data"

    json_payload = F.col("value").cast("string")
    rides = df_raw.select(F.from_json(json_payload, SCHEMA_GREEN).alias("value"))

    # The lpep_* timestamps are renamed to the names shared with the FHV stream.
    projected_columns = [
        'value.VendorID',
        'value.lpep_pickup_datetime as pickup_datetime',
        'value.lpep_dropoff_datetime as dropoff_datetime',
        'value.PULocationID',
        'value.DOLocationID',
    ]
    return rides.selectExpr(*projected_columns)

In [11]:
# Open the raw Kafka stream for the green-taxi topic and inspect its columns.
df_taxi_green_raw = read_from_kafka("rides_green")
df_taxi_green_raw.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [12]:
# Decode the JSON payload and keep only the columns shared with the FHV stream.
df_taxi_green = parse_green_ride_from_kafka_message(df_taxi_green_raw)
df_taxi_green.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)



In [13]:
def sink_console(df, output_mode: str = 'complete', processing_time: str = '5 seconds'):
    """Start a streaming query that writes `df` to the console sink.

    Args:
        df: Streaming DataFrame to write.
        output_mode: Output mode passed to the stream writer.
        processing_time: Trigger interval passed as `processingTime`.

    Returns:
        The object returned by the writer's `start()` call.
    """
    writer = df.writeStream
    writer = writer.outputMode(output_mode)
    writer = writer.trigger(processingTime=processing_time)
    writer = writer.format("console")
    # The "truncate" option is switched off for the console output.
    writer = writer.option("truncate", False)
    return writer.start()

In [14]:
# 'append' is passed explicitly instead of sink_console's 'complete' default
# (this stream carries no aggregation, which is presumably why 'complete' is not used).
sink_console(df_taxi_green, output_mode='append')

<pyspark.sql.streaming.StreamingQuery at 0x7fda746aeeb0>

### Process FHV Taxi

In [15]:
# Fields of an FHV ride record, as (field name, Spark type) pairs.
# Every field is declared nullable when the schema is built below.
_FHV_RIDE_FIELDS = [
    ('dispatching_base_num',   T.StringType()),
    ('pickup_datetime',        T.TimestampType()),
    ('dropOff_datetime',       T.TimestampType()),
    ('PUlocationID',           T.IntegerType()),
    ('DOlocationID',           T.IntegerType()),
    ('SR_Flag',                T.StringType()),
    ('Affiliated_base_number', T.StringType()),
]

# Schema used to decode the JSON payload of FHV Kafka messages.
SCHEMA_FHV = T.StructType([
    T.StructField(field_name, field_type, True)
    for field_name, field_type in _FHV_RIDE_FIELDS
])


def parse_fhv_ride_from_kafka_message(df_raw):
    """Decode FHV Kafka messages into the common ride columns.

    The message `value` is cast to a string, parsed as JSON with SCHEMA_FHV,
    and projected onto the same column names the green-taxi parser produces:
    VendorID, pickup_datetime, dropoff_datetime, PULocationID, DOLocationID.
    `dispatching_base_num` is exposed as `VendorID`.

    Args:
        df_raw: Streaming DataFrame with a `value` column.

    Returns:
        A DataFrame with the five columns listed above.

    Raises:
        AssertionError: If `df_raw` is not a streaming DataFrame.
    """
    assert df_raw.isStreaming is True, "DataFrame doesn't receive streaming data"

    df = df_raw.select(F.from_json(F.col("value").cast("string"), SCHEMA_FHV).alias("value"))

    # Field references use the exact casing declared in SCHEMA_FHV
    # (`dropOff_datetime`, `PUlocationID`, `DOlocationID`), so the projection
    # does not depend on Spark's case-insensitive column resolution.
    df = df.selectExpr(
        'value.dispatching_base_num as VendorID',
        'value.pickup_datetime as pickup_datetime',
        'value.dropOff_datetime as dropoff_datetime',
        'value.PUlocationID as PULocationID',
        'value.DOlocationID as DOLocationID',
    )

    return df

In [16]:
# Open the raw Kafka stream for the FHV topic and inspect its columns.
df_taxi_fhv_raw = read_from_kafka("rides_fhv")
df_taxi_fhv_raw.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [17]:
# Decode the JSON payload and rename the columns to match the green-taxi stream.
df_taxi_fhv = parse_fhv_ride_from_kafka_message(df_taxi_fhv_raw)
df_taxi_fhv.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)



In [18]:
# Print the parsed FHV rides to the console, using 'append' mode as for the green stream.
sink_console(df_taxi_fhv, output_mode='append')

<pyspark.sql.streaming.StreamingQuery at 0x7fda74102ee0>

### Calculate Statistics

In [19]:
# Merge both ride streams and drop rides that have no pickup location.
# NOTE(review): union() appears to match columns by position rather than by name;
# both parsers select the same five columns in the same order - confirm if either changes.
df_taxi = df_taxi_green.union(df_taxi_fhv).filter(F.col('PULocationID').isNotNull())
df_taxi

DataFrame[VendorID: string, pickup_datetime: timestamp, dropoff_datetime: timestamp, PULocationID: int, DOLocationID: int]

In [20]:
def sink_memory(df, query_name, query_template):
    """Start an in-memory streaming sink and build a SQL query over it.

    Args:
        df: Streaming DataFrame to write.
        query_name: Name given to the streaming query; also substituted for
            `{table_name}` in `query_template`.
        query_template: SQL text containing a `{table_name}` placeholder.

    Returns:
        A tuple `(write_query, query_results)`: the object returned by the
        writer's `start()` and the DataFrame returned by `spark.sql(...)`.
    """
    memory_writer = df.writeStream.queryName(query_name).format('memory')
    write_query = memory_writer.start()

    # The SQL is built only after the sink has been started.
    query_results = spark.sql(query_template.format(table_name=query_name))

    return write_query, query_results

In [21]:
# Top-5 pickup locations by number of rides; sink_memory fills {table_name}
# with the query name.
sql_query = """
select PUlocationID, count(*) as cnt
from {table_name}
group by PUlocationID
order by cnt desc
limit 5
"""

# Start the in-memory sink for the merged stream and build the aggregate over it.
df_taxi_write, df_taxi_agg = sink_memory(df_taxi, "taxi_merged", sql_query)

In [22]:
# Inspect the state of the streaming query started by sink_memory.
df_taxi_write.status

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [24]:
# Evaluate the aggregate query against whatever the memory sink holds at this moment.
df_taxi_agg.show()

+------------+----+
|PUlocationID| cnt|
+------------+----+
|          74|9903|
|          75|8215|
|          41|7937|
|           7|6343|
|          82|5596|
+------------+----+



In [25]:
# Stop the in-memory streaming query now that the statistics have been displayed.
df_taxi_write.stop()

### Write to Kafka

In [26]:
def prepare_dataframe_to_kafka_sink(df):
    """Reshape `df` into the `key` / `value` pair written to Kafka.

    Args:
        df: DataFrame containing a `PULocationID` column.

    Returns:
        A DataFrame with two columns: `key`, the pickup location cast to a
        string, and `value`, all columns of `df` serialized as one JSON object.
    """
    message_key = F.col('PULocationID').cast('string').alias('key')

    # Every column of the input row goes into the JSON payload.
    payload_columns = [F.col(column_name) for column_name in df.columns]
    message_value = F.to_json(F.struct(payload_columns)).alias("value")

    return df.select(message_key, message_value)

In [27]:
# NOTE(review): `df_taxi_write` previously held the streaming query returned by
# sink_memory (stopped above); here the name is rebound to a key/value DataFrame.
# Consider a distinct name so re-running earlier cells out of order cannot confuse the two.
df_taxi_write = prepare_dataframe_to_kafka_sink(df_taxi)
df_taxi_write

DataFrame[key: string, value: string]

In [28]:
def sink_kafka(df, topic, output_mode='append', checkpoint_location='checkpoint'):
    """Start a streaming query that writes `df` to a Kafka topic.

    The writer authenticates with the same SASL/SCRAM settings that
    read_from_kafka uses; without them the producer cannot connect to the
    SASL-protected broker.

    Args:
        df: Streaming DataFrame to write (expected to carry `key` / `value`
            columns, as produced by prepare_dataframe_to_kafka_sink).
        topic: Destination Kafka topic.
        output_mode: Output mode passed to the stream writer.
        checkpoint_location: Directory for the query checkpoint. Defaults to
            "checkpoint", as before; pass a distinct path per query when more
            than one Kafka sink is started.

    Returns:
        The object returned by the writer's `start()` call.

    Raises:
        KeyError: If KAFKA_USER or KAFKA_PASS is missing from the environment.
    """
    servers = [
        "rc1a-q38mpgujip0pbjir.mdb.yandexcloud.net:9092",
    ]

    # Credentials come from the environment, never from the notebook itself.
    jaas_config = (
        "org.apache.kafka.common.security.scram.ScramLoginModule required "
        f"username=\"{os.environ['KAFKA_USER']}\" "
        f"password=\"{os.environ['KAFKA_PASS']}\";"
    )

    write_query = (
        df.writeStream
            .format("kafka")
            .option("kafka.bootstrap.servers", ",".join(servers))
            .option("kafka.sasl.mechanism", "SCRAM-SHA-512")
            .option("kafka.security.protocol", "SASL_PLAINTEXT")
            .option("kafka.sasl.jaas.config", jaas_config)
            .outputMode(output_mode)
            .option("topic", topic)
            .option("checkpointLocation", checkpoint_location)
            .start()
    )

    return write_query

In [29]:
# Publish the merged rides (key/value form) to the 'rides_all' topic.
sink_kafka(df_taxi_write, 'rides_all')

<pyspark.sql.streaming.StreamingQuery at 0x7fda74132700>