In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
        .appName('kafka') \
        .getOrCreate()

In [3]:
spark.version

'3.1.1'

In [4]:
spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()

'3.2.0'

In [5]:
import configparser
import praw
from confluent_kafka import Producer
import socket
import json

In [6]:
df = spark \
  .readStream \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "broker:29092") \
  .option("startingOffsets", "earliest") \
  .option("subscribe", "comments") \
  .load()

In [7]:
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [8]:
from IPython.display import display, clear_output
import time

In [9]:
clear_output(wait=True)

In [10]:
raw_stream = df \
    .writeStream \
    .format("memory") \
    .queryName("comment_view") \
    .start()

In [11]:
display(spark.sql('SELECT * FROM comment_view').show(20))

+---+-----+-----+---------+------+---------+-------------+
|key|value|topic|partition|offset|timestamp|timestampType|
+---+-----+-----+---------+------+---------+-------------+
+---+-----+-----+---------+------+---------+-------------+



None

In [12]:
time.sleep(1)

In [13]:
clear_output(wait=True)
display(spark.sql('SELECT * FROM comment_view').show(20))
time.sleep(1)

+--------------------+--------------------+--------+---------+------+--------------------+-------------+
|                 key|               value|   topic|partition|offset|           timestamp|timestampType|
+--------------------+--------------------+--------+---------+------+--------------------+-------------+
|[68 31 34 78 68 7...|[7B 22 69 64 22 3...|comments|        0|     0|2021-06-09 10:35:...|            0|
|[68 31 34 78 69 6...|[7B 22 69 64 22 3...|comments|        0|     1|2021-06-09 10:35:...|            0|
|[68 31 34 78 6A 3...|[7B 22 69 64 22 3...|comments|        0|     2|2021-06-09 10:35:...|            0|
|[68 31 34 78 6A 6...|[7B 22 69 64 22 3...|comments|        0|     3|2021-06-09 10:35:...|            0|
|[68 31 34 78 6A 7...|[7B 22 69 64 22 3...|comments|        0|     4|2021-06-09 10:35:...|            0|
|[68 31 34 78 6A 7...|[7B 22 69 64 22 3...|comments|        0|     5|2021-06-09 10:35:...|            0|
|[68 31 34 78 6B 6...|[7B 22 69 64 22 3...|comments|   

None

In [14]:
raw_stream.stop()

In [15]:
from pyspark.sql.types import StringType

In [16]:
string_stream_df = df \
    .withColumn("key", df["key"].cast(StringType())) \
    .withColumn("value", df["value"].cast(StringType()))

In [17]:
string_stream_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [18]:
string_stream = string_stream_df \
    .writeStream \
    .format("memory") \
    .queryName("string_comments_view") \
    .start()

In [19]:
clear_output(wait=True)
display(spark.sql('SELECT value FROM string_comments_view WHERE key IS NOT NULL').show(20, False))
time.sleep(1)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

None

In [20]:
string_stream.stop()

In [21]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, BooleanType, IntegerType, DoubleType, StringType

In [22]:
schema_inventory = StructType([
    StructField("body", StringType(),  True),
    StructField("link_title", StringType(),  True),
    StructField("name", StringType(), True),
    StructField("link_author", StringType(),  True),
    StructField("ups", IntegerType(), True),
    StructField("created", IntegerType(), True),
    StructField("subreddit_id", StringType(),  True),
    StructField("permalink", StringType(),  True),
    StructField("link_url", StringType(),  True)
])

In [23]:
json_stream_df = string_stream_df\
    .withColumn("value", F.from_json("value", schema_inventory))

In [24]:
json_stream_df.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- body: string (nullable = true)
 |    |-- link_title: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- link_author: string (nullable = true)
 |    |-- ups: integer (nullable = true)
 |    |-- created: integer (nullable = true)
 |    |-- subreddit_id: string (nullable = true)
 |    |-- permalink: string (nullable = true)
 |    |-- link_url: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [25]:
json_stream = json_stream_df \
    .writeStream \
    .format("memory") \
    .queryName("extract_comments") \
    .start()

In [26]:
clear_output(wait=True)
display(spark.sql('SELECT * FROM extract_comments').show(20))
time.sleep(1)

+-------+--------------------+--------+---------+------+--------------------+-------------+
|    key|               value|   topic|partition|offset|           timestamp|timestampType|
+-------+--------------------+--------+---------+------+--------------------+-------------+
|h14xhwj|{Honestly, some p...|comments|        0|     0|2021-06-09 10:35:...|            0|
|h14xidi|{This was the mom...|comments|        0|     1|2021-06-09 10:35:...|            0|
|h14xj0b|{Mate, even if th...|comments|        0|     2|2021-06-09 10:35:...|            0|
|h14xjc7|{Hasbara, An unpu...|comments|        0|     3|2021-06-09 10:35:...|            0|
|h14xjpu|{I share your sen...|comments|        0|     4|2021-06-09 10:35:...|            0|
|h14xjzh|{He's got like "o...|comments|        0|     5|2021-06-09 10:35:...|            0|
|h14xkey|{Same guy. Differ...|comments|        0|     6|2021-06-09 10:35:...|            0|
|h14xkfb|{Because this art...|comments|        0|     7|2021-06-09 10:35:...|   

None

In [27]:
json_stream.stop()

In [28]:
comments_stream_df = json_stream_df \
    .select( \
        F.col("key").alias("event_key"), \
        F.col("topic").alias("event_topic"), \
        F.col("timestamp").alias("event_timestamp"), \
            "value.body", \
            "value.link_title", \
            "value.name", \
            "value.link_author", \
            "value.ups", \
            "value.created", \
            "value.subreddit_id", \
            "value.permalink", \
            "value.link_url"
           )

In [29]:
comments_stream_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- body: string (nullable = true)
 |-- link_title: string (nullable = true)
 |-- name: string (nullable = true)
 |-- link_author: string (nullable = true)
 |-- ups: integer (nullable = true)
 |-- created: integer (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- link_url: string (nullable = true)



In [75]:
comments_stream = comments_stream_df \
    .writeStream \
    .format("memory") \
    .queryName("all_comments_view") \
    .start()

In [1]:
clear_output(wait=True)
display(spark.sql('SELECT * FROM all_comments_view ORDER BY ups desc').show(20))
time.sleep(1)

NameError: name 'clear_output' is not defined

In [74]:
comments_stream.stop()

# Watermarked stream

In [48]:
window_duration = '60 seconds'
slide_duration = '10 seconds'

In [77]:
windowed_count_df = comments_stream_df \
    .withWatermark("event_timestamp", "1 minutes") \
    .groupBy(F.window(comments_stream_df.event_timestamp, window_duration, slide_duration), comments_stream_df.link_title) \
    .count()

In [78]:
count_stream = windowed_count_df \
    .writeStream \
    .format("memory") \
    .outputMode("Complete") \
    .queryName("count_view") \
    .start()

In [82]:
clear_output(wait=True)
display(spark.sql('SELECT * FROM count_view order by count desc').show(20))
time.sleep(1)

+--------------------+--------------------+-----+
|              window|          link_title|count|
+--------------------+--------------------+-----+
|{2021-06-09 10:35...|Twitter has block...|   20|
|{2021-06-09 10:35...|Twitter has block...|   20|
|{2021-06-09 10:35...|Twitter has block...|   20|
|{2021-06-09 10:35...|Twitter has block...|   19|
|{2021-06-09 10:35...|Twitter has block...|   19|
|{2021-06-09 10:35...|Twitter has block...|   19|
|{2021-06-09 10:35...|Uncovered atrocit...|   13|
|{2021-06-09 13:39...|Twitter has block...|   13|
|{2021-06-09 13:40...|Twitter has block...|   12|
|{2021-06-09 10:35...|Uncovered atrocit...|   11|
|{2021-06-09 12:15...|Twitter has block...|   11|
|{2021-06-09 10:35...|Uncovered atrocit...|   11|
|{2021-06-09 13:47...|Twitter has block...|   11|
|{2021-06-09 10:35...|Uncovered atrocit...|   11|
|{2021-06-09 10:35...|Uncovered atrocit...|   11|
|{2021-06-09 12:15...|Twitter has block...|   11|
|{2021-06-09 13:39...|Twitter has block...|   11|


None

In [72]:
count_stream.stop()

# Sink to Parquet 

In [83]:
comments_parquet = comments_stream_df \
    .writeStream \
    .format("parquet") \
    .option("checkpointLocation", "path/to/checkpoint/dir") \
    .option("path", "path/to/destination/dir") \
    .start()

In [84]:
comments_parquet.stop()

# Wordcloud

In [52]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [85]:
plt.figure(figsize=(20,9))
wordcloud = WordCloud(max_font_size=50, max_words=1000,
                      background_color="white").generate(comments_stream)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")

AttributeError: 'StreamingQuery' object has no attribute 'body'