In [74]:
from pyspark.sql import SparkSession

In [75]:
# Obtain the SparkSession for this notebook (reusing an existing one if
# present), registered under the application name 'kafka'.
spark = (
    SparkSession.builder
    .appName('kafka')
    .getOrCreate()
)

In [76]:
# Streaming DataFrame over the "submissions" Kafka topic, served by the
# broker at broker:29092 and read from the earliest available offsets.
stream_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("subscribe", "submissions")
    .option("startingOffsets", "earliest")
    .load()
)

In [77]:
# Print the column names and types of the raw Kafka stream.
stream_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [78]:
# Start a streaming query that writes the raw Kafka records to the memory
# sink; the rows are queried through Spark SQL as `raw_submission_view`.
raw_stream = (
    stream_df.writeStream
    .queryName("raw_submission_view")
    .format("memory")
    .start()
)

In [79]:
from IPython.display import display, clear_output
import time

In [80]:
# Clear this cell's existing output; wait=True defers the clear until new
# output is produced.
clear_output(wait=True)

In [81]:
# Show the first 20 raw records collected so far by the memory sink.
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
clear_output(wait=True)
spark.sql('SELECT * FROM raw_submission_view').show(20)
time.sleep(1)  # pause for one second after rendering

+-------------------+--------------------+-----------+---------+------+--------------------+-------------+
|                key|               value|      topic|partition|offset|           timestamp|timestampType|
+-------------------+--------------------+-----------+---------+------+--------------------+-------------+
|[6E 6E 75 76 75 66]|[7B 22 61 75 74 6...|submissions|        0|     0|2021-05-30 11:27:...|            0|
|[6E 6E 76 32 38 67]|[7B 22 61 75 74 6...|submissions|        0|     1|2021-05-30 11:27:...|            0|
|[6E 6E 77 33 33 78]|[7B 22 61 75 74 6...|submissions|        0|     2|2021-05-30 11:27:...|            0|
|[6E 6E 77 6B 68 7A]|[7B 22 61 75 74 6...|submissions|        0|     3|2021-05-30 11:27:...|            0|
|[6E 6E 77 71 74 31]|[7B 22 61 75 74 6...|submissions|        0|     4|2021-05-30 11:27:...|            0|
|[6E 6E 77 78 64 31]|[7B 22 61 75 74 6...|submissions|        0|     5|2021-05-30 11:27:...|            0|
|[6E 6E 78 30 76 35]|[7B 22 61 75 74 

None

In [82]:
# Stop the streaming query feeding `raw_submission_view`.
raw_stream.stop()

In [83]:
from pyspark.sql.types import StringType

In [84]:
# The Kafka source exposes `key` and `value` as binary columns; cast both to
# strings so the payload becomes readable text.
string_stream_df = (
    stream_df
    .withColumn("key", stream_df.key.cast(StringType()))
    .withColumn("value", stream_df.value.cast(StringType()))
)

In [85]:
# Print the schema after casting `key` and `value` to strings.
string_stream_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [86]:
# Start a streaming query that writes the string-cast records to the memory
# sink; the rows are queried through Spark SQL as `string_submission_view`.
string_stream = (
    string_stream_df.writeStream
    .queryName("string_submission_view")
    .format("memory")
    .start()
)

In [87]:
# Show up to 20 untruncated payloads for records that have a key.
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
clear_output(wait=True)
spark.sql(
    'SELECT value FROM string_submission_view WHERE key IS NOT NULL'
).show(20, truncate=False)
time.sleep(1)  # pause for one second after rendering

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                            

None

In [88]:
# Stop the streaming query feeding `string_submission_view`.
string_stream.stop()

In [89]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, BooleanType, IntegerType, DoubleType, StringType

In [90]:
# Schema applied to the JSON payload carried in the Kafka `value` column.
# Every field is declared nullable.
schema_inventory = (
    StructType()
    .add("author_fullname", StringType(), True)
    .add("title", StringType(), True)
    .add("subreddit_name_prefixed", StringType(), True)
    .add("name", StringType(), True)
    .add("upvote_ratio", DoubleType(), True)
    .add("ups", IntegerType(), True)
    .add("score", IntegerType(), True)
    .add("author_premium", BooleanType(), True)
    .add("created", IntegerType(), True)
    .add("domain", StringType(), True)
    .add("url_overridden_by_dest", StringType(), True)
    .add("over_18", BooleanType(), True)
    .add("subreddit_id", StringType(), True)
    .add("permalink", StringType(), True)
    .add("parent_whitelist_status", StringType(), True)
    .add("url", StringType(), True)
    .add("created_utc", IntegerType(), True)
)


In [91]:
# Replace the string `value` column with a struct parsed from its JSON text
# according to `schema_inventory`.
json_stream_df = string_stream_df.withColumn(
    "value",
    F.from_json(F.col("value"), schema_inventory),
)

In [92]:
# Print the schema after parsing `value` into a struct.
json_stream_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- author_fullname: string (nullable = true)
 |    |-- title: string (nullable = true)
 |    |-- subreddit_name_prefixed: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- upvote_ratio: double (nullable = true)
 |    |-- ups: integer (nullable = true)
 |    |-- score: integer (nullable = true)
 |    |-- author_premium: boolean (nullable = true)
 |    |-- created: integer (nullable = true)
 |    |-- domain: string (nullable = true)
 |    |-- url_overridden_by_dest: string (nullable = true)
 |    |-- over_18: boolean (nullable = true)
 |    |-- subreddit_id: string (nullable = true)
 |    |-- permalink: string (nullable = true)
 |    |-- parent_whitelist_status: string (nullable = true)
 |    |-- url: string (nullable = true)
 |    |-- created_utc: integer (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable 

In [93]:
# Start a streaming query that writes the parsed records to the memory sink;
# the rows are queried through Spark SQL as `extract_submissions`.
json_stream = (
    json_stream_df.writeStream
    .queryName("extract_submissions")
    .format("memory")
    .start()
)

In [102]:
# Show the first 20 parsed records collected so far by the memory sink.
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
clear_output(wait=True)
spark.sql('SELECT * FROM extract_submissions').show(20)
time.sleep(1)  # pause for one second after rendering

+------+--------------------+-----------+---------+------+--------------------+-------------+
|   key|               value|      topic|partition|offset|           timestamp|timestampType|
+------+--------------------+-----------+---------+------+--------------------+-------------+
|nnuvuf|{t2_7ummq, Syria:...|submissions|        0|     0|2021-05-30 11:27:...|            0|
|nnv28g|{t2_3t8c9oj4, Ben...|submissions|        0|     1|2021-05-30 11:27:...|            0|
|nnw33x|{t2_b8bk0, Turkis...|submissions|        0|     2|2021-05-30 11:27:...|            0|
|nnwkhz|{t2_c1tlg, Havert...|submissions|        0|     3|2021-05-30 11:27:...|            0|
|nnwqt1|{t2_a2wzwjox, Ind...|submissions|        0|     4|2021-05-30 11:27:...|            0|
|nnwxd1|{t2_7ummq, U.S. t...|submissions|        0|     5|2021-05-30 11:27:...|            0|
|nnx0v5|{t2_bpurbs2s, Col...|submissions|        0|     6|2021-05-30 11:27:...|            0|
|nnx58m|{t2_8yk3uzd, Viet...|submissions|        0|     7|20

None

In [95]:
# Stop the streaming query feeding `extract_submissions`.
json_stream.stop()

In [96]:
# Flatten the records: keep the Kafka key, topic and timestamp under
# `event_*` names and promote each field of the parsed `value` struct to a
# top-level column.
submission_stream_df = json_stream_df.select(
    # Kafka record metadata
    F.col("key").alias("event_key"),
    F.col("topic").alias("event_topic"),
    F.col("timestamp").alias("event_timestamp"),
    # Fields of the parsed JSON payload
    "value.author_fullname",
    "value.title",
    "value.subreddit_name_prefixed",
    "value.name",
    "value.upvote_ratio",
    "value.ups",
    "value.score",
    "value.author_premium",
    "value.created",
    "value.domain",
    "value.url_overridden_by_dest",
    "value.over_18",
    "value.subreddit_id",
    "value.permalink",
    "value.parent_whitelist_status",
    "value.url",
    "value.created_utc",
)

In [97]:
# Print the schema of the flattened submission stream.
submission_stream_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- title: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- name: string (nullable = true)
 |-- upvote_ratio: double (nullable = true)
 |-- ups: integer (nullable = true)
 |-- score: integer (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- created: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- url_overridden_by_dest: string (nullable = true)
 |-- over_18: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- parent_whitelist_status: string (nullable = true)
 |-- url: string (nullable = true)
 |-- created_utc: integer (nullable = true)



In [98]:
# Start a streaming query that writes the flattened records to the memory
# sink; the rows are queried through Spark SQL as `submission_view`.
submission_stream = (
    submission_stream_df.writeStream
    .queryName("submission_view")
    .format("memory")
    .start()
)

In [103]:
# Show the first 20 flattened submissions collected so far by the memory sink.
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
clear_output(wait=True)
spark.sql('SELECT * FROM submission_view').show(20)
time.sleep(1)  # pause for one second after rendering

+---------+-----------+--------------------+---------------+--------------------+-----------------------+---------+------------+----+-----+--------------+-------+--------------------+----------------------+-------+------------+--------------------+-----------------------+--------------------+-----------+
|event_key|event_topic|     event_timestamp|author_fullname|               title|subreddit_name_prefixed|     name|upvote_ratio| ups|score|author_premium|created|              domain|url_overridden_by_dest|over_18|subreddit_id|           permalink|parent_whitelist_status|                 url|created_utc|
+---------+-----------+--------------------+---------------+--------------------+-----------------------+---------+------------+----+-----+--------------+-------+--------------------+----------------------+-------+------------+--------------------+-----------------------+--------------------+-----------+
|   nnuvuf|submissions|2021-05-30 11:27:...|       t2_7ummq|Syria: 2 killed, ...| 

None

In [104]:
# Show the five domains with the most submissions collected so far.
# DataFrame.show() prints the table itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
clear_output(wait=True)
spark.sql(
    'SELECT domain, COUNT(1) AS count FROM submission_view GROUP BY domain ORDER BY count DESC'
).show(5)
time.sleep(1)  # pause for one second after rendering

+---------------+-----+
|         domain|count|
+---------------+-----+
|    reuters.com|   13|
|        bbc.com|    6|
|   france24.com|    5|
|theguardian.com|    4|
|dailymail.co.uk|    3|
+---------------+-----+
only showing top 5 rows



None

In [101]:
# Stop the streaming query feeding `submission_view`.
submission_stream.stop()

In [105]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()