In [None]:
from time import sleep

from IPython.display import display, clear_output
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.utils import AnalysisException
import pandas as pd

# Pandas display options for the frames rendered through .toPandas()
pd.options.display.max_columns = None  # None = never truncate columns
pd.options.display.max_rows = 30
pd.options.display.max_colwidth = 150

# SETTINGS
IN_PATH = "/home/jovyan/data-sets/twitter/"
OUT_PATH = ""
# Datetime pattern used to parse the tweets' `created_at` column.
# NOTE(review): support for text symbols such as `EEE` and for `zzzz` when
# *parsing* differs between Spark versions - confirm against the Spark
# version this notebook runs on.
timestampformat = "EEE MMM dd HH:mm:ss zzzz yyyy"

spark = SparkSession.builder.appName("StructuredStreamingExample").getOrCreate()

# Infer the JSON schema once with a batch read so that both readers below share
# the same explicit schema. The schema is derived when the JSON source is read,
# so chaining .limit(10) before .schema neither changes the result nor reduces
# the amount of data scanned - it is therefore omitted.
schema = spark.read.json(IN_PATH).schema

# regular (batch) spark reader
static_spark_reader = spark.read.schema(schema)

# streaming spark reader
stream_spark_reader = spark.readStream.schema(schema)

In [None]:
# Pick the execution mode for the rest of the notebook by flipping this flag
# (instead of commenting lines in and out):
#   False -> batch mode, reads through static_spark_reader
#   True  -> Structured Streaming mode, reads through stream_spark_reader
USE_STREAMING = False
spark_reader = stream_spark_reader if USE_STREAMING else static_spark_reader

In [None]:
# Column expressions for the tweet projection:
# - a proper timestamp parsed from the raw created_at string
# - the author's screen name pulled out of the nested user struct
tweet_timestamp = f.to_timestamp(f.col("created_at"), timestampformat).alias("timestamp")
tweet_author = f.col("user.screen_name").alias("user")

# The same projection is used for both readers; whether `df` ends up as a
# batch or a streaming DataFrame depends only on the chosen spark_reader.
df = (
    spark_reader.json(IN_PATH)
    .select("id", tweet_timestamp, tweet_author, "text")
    .coalesce(1)
)

# Approximate number of distinct users, together with the evaluation time
distinct_user_count = df.select(f.approx_count_distinct("user"), f.current_timestamp())

if df.isStreaming:
    print("We are streaming!")
    # Creating a DataStreamWriter and StreamingQuery
    # ===
    # .writeStream on a DataFrame returns a DataStreamWriter, which is configured here:
    # - queryName: the name under which the result is queried later via spark.sql
    # - trigger(once=True): process the stream a single time - great for debugging
    #   (use processingTime="5 seconds" instead for recurring processing)
    # - outputMode("complete") / format("memory"): write the full result to memory
    stream_writer = (
        distinct_user_count.writeStream
        .queryName("distinct_user_count")
        .trigger(once=True)
        .outputMode("complete")
        .format("memory")
    )
    # .start() on a DataStreamWriter returns a StreamingQuery
    query = stream_writer.start()
else:
    print("Plain old, basic DataFrame - meh!")
    # Actions such as show and toPandas only work on non-streaming DataFrames
    distinct_user_count.show()
    display(df.limit(25).toPandas())

In [None]:
# .isStreaming tells whether a DataFrame is a streaming DataFrame or a regular
# (batch) one. The value depends on which spark_reader was selected earlier in
# this notebook.
df.isStreaming

In [None]:
# .isActive shows if the query is actively running or not.
# `query` is only created when the notebook runs in streaming mode, so guard on
# the mode to keep the notebook runnable top-to-bottom in batch mode as well.
if df.isStreaming:
    display(query.isActive)
else:
    print("Batch mode - no StreamingQuery to inspect.")

In [None]:
# .start() on a DataStreamWriter launches the query and returns a StreamingQuery.
# Only (re)start when running in streaming mode (`query` and `stream_writer` do
# not exist in batch mode) and when the previous run is no longer active:
# starting a query while another one with the same name is still active raises
# an IllegalArgumentException
# -> 'Cannot start query with name {StreamingQuery.name} as a query with that name is already active'
if df.isStreaming and not query.isActive:
    query = stream_writer.start()

In [None]:
# .stop() stops the query.
# `query` only exists in streaming mode; in batch mode there is nothing to stop.
if df.isStreaming:
    query.stop()
else:
    print("Batch mode - no StreamingQuery to stop.")

In [None]:
# .lastProgress shows information on the last processed batch.
# `query` only exists in streaming mode, hence the guard.
if df.isStreaming:
    display(query.lastProgress)
else:
    print("Batch mode - no StreamingQuery progress to show.")

In [None]:
# The streaming query writes to the memory sink under its query name, so its
# current result can be read with plain SQL using query.name as the table name.
# `query` only exists in streaming mode, hence the guard.
if df.isStreaming:
    display(spark.sql(f"SELECT * from {query.name}").toPandas())
else:
    print("Batch mode - no streaming result table to query.")

In [None]:
# Show live results for roughly LIVE_VIEW_SECONDS, refreshed every REFRESH_SECONDS
LIVE_VIEW_SECONDS = 120
REFRESH_SECONDS = 1

# `query` only exists in streaming mode, hence the guard.
if df.isStreaming:
    for _ in range(LIVE_VIEW_SECONDS // REFRESH_SECONDS):
        # Read the current content of the query's in-memory result table
        display(spark.sql(f"SELECT * from {query.name}").toPandas())
        sleep(REFRESH_SECONDS)
        # wait=True defers clearing until the next output arrives (less flicker)
        clear_output(wait=True)
    print("Live view ended...")
else:
    print("Batch mode - no streaming result to watch.")

In [None]:
# .show() raises an error on streaming DataFrames (it works in batch mode).
# The error is the point of this cell, so catch and print it instead of letting
# it abort a "Run All" before the Spark session is stopped.
try:
    distinct_user_count.show()
except AnalysisException as exc:
    print(f"{type(exc).__name__}: {exc}")

In [None]:
# Stop the SparkSession created in the setup cell; run this cell last.
spark.stop()