# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, trim, lower

class StreamingWordCount:
    """Streaming word-count pipeline: text files in, a Delta table of counts out.

    The pipeline is split into small steps (read, clean, aggregate, write) so
    each one can be exercised on its own; ``wordCount`` chains them together.

    Attributes:
        base_data_dir: Root directory under which the input text
            (``data/text``) and the streaming checkpoint
            (``checkpoint/word_count``) live.
        spark: The Spark session used to create the streaming reader.
    """

    # Location used when the caller does not supply one, so that
    # ``StreamingWordCount(spark)`` keeps behaving exactly as before.
    DEFAULT_BASE_DATA_DIR = "/Volumes/workspace/default/spark_streaming"

    def __init__(self, spark, base_data_dir=DEFAULT_BASE_DATA_DIR):
        """Store the session and the root directory of the pipeline.

        Args:
            spark: Spark session used to build the streaming source.
            base_data_dir: Root directory for input data and checkpoints.
                Defaults to ``DEFAULT_BASE_DATA_DIR``.
        """
        # Strip trailing slashes so the paths built below never contain "//".
        self.base_data_dir = base_data_dir.rstrip("/")
        self.spark = spark

    def getRawData(self):
        """Read the text source as a stream and return one row per token.

        Returns:
            A streaming DataFrame with a single ``word`` column.
        """
        # "." is used as the record separator, so each record is the text
        # between two periods rather than a physical line.
        lines = (
            self.spark.readStream
            .format("text")
            .option("lineSep", ".")
            .load(f"{self.base_data_dir}/data/text")
        )

        # Splitting on a single space can yield empty or punctuation-only
        # tokens; those are removed later by getQualityData.
        return lines.select(explode(split(lines.value, " ")).alias("word"))

    def getQualityData(self, rawDF):
        """Normalise tokens and drop the ones that are not word-like.

        Args:
            rawDF: DataFrame with a ``word`` column, as produced by
                ``getRawData``.

        Returns:
            A DataFrame with a trimmed, lower-cased ``word`` column holding
            only non-null values that contain at least one letter a-z.
        """
        normalised = rawDF.select(lower(trim(rawDF.word)).alias("word"))
        # The pattern is unanchored: a token is kept as soon as it contains
        # one lowercase letter, so attached punctuation or digits survive.
        return (
            normalised
            .where("word is not null")
            .where("word rlike '[a-z]'")
        )

    def getWordCounts(self, qualityWordsDF):
        """Count occurrences of each word, most frequent first.

        Args:
            qualityWordsDF: DataFrame with a ``word`` column.

        Returns:
            A DataFrame with ``word`` and ``count`` columns ordered by
            ``count`` descending.
        """
        counts = qualityWordsDF.groupBy("word").count()
        return counts.orderBy("count", ascending=False)

    def writeWordCounts(self, wordCountsDF):
        """Start the streaming write of the counts into a Delta table.

        Args:
            wordCountsDF: Streaming DataFrame of word counts.

        Returns:
            The query handle returned by ``toTable`` for the started stream.
        """
        writer = (
            wordCountsDF.writeStream
            .format("delta")
            .option("checkpointLocation", f"{self.base_data_dir}/checkpoint/word_count")
            # NOTE(review): trigger(once=True) is reported as deprecated in
            # recent Spark releases in favour of availableNow=True; kept as-is
            # to preserve behaviour - confirm against the target runtime.
            .trigger(once=True)
            # "complete" rewrites the full aggregate on every trigger.
            .outputMode("complete")
        )
        return writer.toTable("word_counts_streaming")

    def wordCount(self):
        """Build the whole pipeline and start it.

        Returns:
            The query handle produced by ``writeWordCounts``.
        """
        print("Starting streaming word counts ...")
        rawDF = self.getRawData()
        qualityWordsDF = self.getQualityData(rawDF)
        wordCountsDF = self.getWordCounts(qualityWordsDF)
        streaming_query = self.writeWordCounts(wordCountsDF)
        # This method never waits on the query, so "Done!" marks the point at
        # which the query was handed back, not the end of processing; callers
        # that need the results should wait on the returned handle.
        print("Done!")

        return streaming_query
