In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, trim, lower

class StreamingWordCount:
    """Batch word count over the text files under ``<base_data_dir>/data/text``.

    The pipeline is: read raw words -> normalise/filter them -> count
    occurrences per word -> overwrite the result table.
    """

    def __init__(
        self,
        spark,
        base_data_dir="/Volumes/workspace/default/spark_streaming",
        table_name="word_counts",
    ):
        """Store the session and the job's input/output locations.

        Args:
            spark: Session object used for reading; only ``spark.read`` is
                used by this class.
            base_data_dir: Root directory; input is loaded from the
                ``data/text`` sub-directory beneath it.
            table_name: Name of the table that ``writeWordCounts``
                overwrites.
        """
        self.base_data_dir = base_data_dir
        self.table_name = table_name
        self.spark = spark

    def getRawData(self):
        """Load the input text and return a DataFrame with one ``word`` per row."""
        lines = (
            self.spark.read
            .format("text")
            # Records are delimited by "." instead of newlines, so a single
            # record may span several physical lines of the input file.
            .option("lineSep", ".")
            .load(f"{self.base_data_dir}/data/text")
        )

        # Split on any run of whitespace, not just a single space: because
        # records can contain embedded newlines/tabs, splitting on " " alone
        # would leave tokens such as "end\nstart" fused into one "word".
        # Empty tokens produced at record edges are dropped by getQualityData.
        return lines.select(explode(split(lines.value, r"\s+")).alias("word"))

    def getQualityData(self, rawDF):
        """Normalise words and drop unusable ones.

        Args:
            rawDF: DataFrame with a ``word`` column.

        Returns:
            DataFrame with a single ``word`` column, trimmed and lower-cased,
            keeping only non-null values that contain at least one character
            in ``a``-``z``.
        """
        normalised = rawDF.select(lower(trim(rawDF.word)).alias("word"))
        return (
            normalised
            .where("word is not null")
            # Unanchored pattern: one lowercase letter anywhere is enough,
            # so tokens with attached punctuation or digits are still kept.
            .where("word rlike '[a-z]'")
        )

    def getWordCounts(self, qualityWordsDF):
        """Return ``(word, count)`` rows ordered by descending count.

        Args:
            qualityWordsDF: DataFrame with a ``word`` column.
        """
        counts = qualityWordsDF.groupBy("word").count()
        return counts.orderBy("count", ascending=False)

    def writeWordCounts(self, wordCountsDF):
        """Overwrite the result table with ``wordCountsDF``.

        NOTE(review): the DataFrame is sorted before it is written, but
        readers of the table should presumably still apply their own
        ``ORDER BY`` rather than rely on stored row order -- confirm.
        """
        wordCountsDF.write.mode("overwrite").saveAsTable(self.table_name)

    def wordCount(self):
        """Run the full pipeline end to end, printing progress messages."""
        print("Calculating word counts ...")
        rawDF = self.getRawData()
        qualityWordsDF = self.getQualityData(rawDF)
        wordCountsDF = self.getWordCounts(qualityWordsDF)
        self.writeWordCounts(wordCountsDF)
        print("Done!")
