# Stream predictions
Using the random forest model trained on historical data, stream predictions on new data.

In [162]:
# Import required packages
import json
import pyspark
from pyspark import broadcast, SparkContext
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover, OneHotEncoder, StringIndexer, VectorAssembler, VectorIndexer, Bucketizer
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA, LocalLDAModel
from pyspark.ml.functions import vector_to_array
from pyspark.ml.pipeline import PipelineModel
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [163]:
# Notebook-wide parameters
# Kafka bootstrap server address, used by both the source and the sink
broker = "broker:29092"
# Number of per-topic columns extracted from the LDA topic distribution
num_topics = 20
# NOTE(review): presumably the categorical feature columns; not referenced
# in the cells visible here - confirm against the training notebook
cat_cols = ["domain", "hour", "day"]

In [164]:
# Create the Spark session for this notebook under the app name "kafka",
# reusing an existing session if one is already available
spark = (
    SparkSession.builder
    .appName("kafka")
    .getOrCreate()
)

## Load the raw stream and convert to dataframe object for processing
Consume data from the `submissions` topic, prepare it for the ML model, and use the model to predict the number of comments on submissions as they are received. Predictions are fed back as a topic named **comment_predictions**.

In [165]:
# Open the raw "submissions" Kafka topic as a streaming DataFrame, starting
# from the earliest offset.
# NOTE: the comments stream is not loaded here because the model is
# pre-trained and only applied to submission data.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", broker)
    .option("startingOffsets", "earliest")
    .option("subscribe", "submissions")
    .load()
)

In [166]:
# Cast the Kafka key and value columns to strings; the value is parsed as
# JSON in a later step.
key_as_text = df["key"].cast(StringType())
value_as_text = df["value"].cast(StringType())
submission_stream_df = (
    df
    .withColumn("key", key_as_text)
    .withColumn("value", value_as_text)
)

In [167]:
# Structure of the JSON document carried in each message's value.
# Listed as (field name, Spark type); every field is declared nullable.
submission_fields = [
    ("id", StringType()),
    ("author_fullname", StringType()),
    ("title", StringType()),
    ("subreddit_name_prefixed", StringType()),
    ("name", StringType()),
    ("upvote_ratio", DoubleType()),
    ("ups", IntegerType()),
    ("created", IntegerType()),
    ("domain", StringType()),
    ("url_overridden_by_dest", StringType()),
    ("over_18", StringType()),
    ("subreddit_id", StringType()),
    ("permalink", StringType()),
    ("parent_whitelist_status", StringType()),
    ("url", StringType()),
    ("created_utc", IntegerType()),
]
schema_submissions = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in submission_fields
    ]
)

In [168]:
# Parse the JSON string in "value" into a struct so the individual message
# fields can be split out
parsed_value = F.from_json(F.col("value"), schema_submissions)
json_stream_df = submission_stream_df.withColumn("value", parsed_value)

In [169]:
# Tokenizer stage: reads the "title" column and writes a "words" column
tokenizer = Tokenizer(inputCol="title", outputCol="words")

# Stop words to remove before prediction: the default English list
# extended with 'a' and 'i'
stop_words = StopWordsRemover.loadDefaultStopWords("english") + ['a', 'i']
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stop_words,
)

# Previously saved models, loaded from disk:
# - the count vectorizer fitted on the original training data, which the
#   LDA model (and therefore the random forest model) relies on
cvmodel = CountVectorizerModel.load('count_vectorizer_model')
# - the LDA model trained on the original training data
lda_model = LocalLDAModel.load('lda_distributed_model')
# - the pre-trained random forest pipeline
pipeline_model = PipelineModel.load('pipeline_model')

In [170]:
# Build the flat submissions DataFrame that predictions are later added to:
# the Kafka key, topic and timestamp are kept under event_* names and every
# parsed submission field is lifted out of the "value" struct.
value_fields = [
    "id",
    "author_fullname",
    "title",
    "subreddit_name_prefixed",
    "name",
    "upvote_ratio",
    "ups",
    "created",
    "domain",
    "url_overridden_by_dest",
    "over_18",
    "subreddit_id",
    "permalink",
    "parent_whitelist_status",
    "url",
    "created_utc",
]
submissions_stream_df = json_stream_df.select(
    F.col("key").alias("event_key"),
    F.col("topic").alias("event_topic"),
    F.col("timestamp").alias("event_timestamp"),
    *[f"value.{field_name}" for field_name in value_fields],
)

# Keep only the columns used from here on, strip every character that is
# not a letter or whitespace from the title, and derive the hour of day and
# day of week (both as strings) from the Kafka event timestamp.
submissions_stream_df = (
    submissions_stream_df
    .select("id", "title", "domain", "subreddit_id", "event_timestamp")
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    .withColumn("title", F.regexp_replace(F.col("title"), r"[^\sa-zA-Z]", ""))
    .withColumn("hour", F.hour(F.col("event_timestamp")).cast(StringType()))
    .withColumn("day", F.dayofweek(F.col("event_timestamp")).cast(StringType()))
)

# Run the stream through the stages in order: tokenizer, stop-word remover,
# count vectorizer, then the LDA model (which yields the topicDistribution
# column consumed in the next step).
for stage in (tokenizer, remover, cvmodel, lda_model):
    submissions_stream_df = stage.transform(submissions_stream_df)

# Expand the LDA topic distribution vector into one column per topic, named
# T_1 .. T_<num_topics>. The names are assigned with alias() and driven by
# num_topics, so they neither depend on Spark's auto-generated "T_[i]"
# column names nor need editing when the number of topics changes.
topic_columns = [
    F.col("T_")[i].alias(f"T_{i + 1}") for i in range(num_topics)
]
submissions_stream_df = (
    submissions_stream_df
    .withColumn("T_", vector_to_array("topicDistribution"))
    # The select keeps only the model inputs, so the intermediate vector
    # columns do not need to be dropped explicitly.
    .select("id", "domain", "hour", "day", *topic_columns)
)

# Apply model to new submissions
submissions_stream_df = pipeline_model.transform(submissions_stream_df)

In [171]:
# Start a streaming query that writes the predictions to the in-memory
# sink, registered under the query name "submissions_view"
submissions_stream = (
    submissions_stream_df.writeStream
    .format("memory")
    .queryName("submissions_view")
    .start()
)

In [None]:
# Inspect what has reached the in-memory view so far using Spark SQL:
# print the current row count, then show the first five rows
subsmissions_data = spark.sql('SELECT * FROM submissions_view')
row_count = subsmissions_data.count()
print(row_count)
subsmissions_data.show(n=5)

In [175]:
# Publish the predictions as a new Kafka topic, "comment_predictions":
# the submission id becomes the message key and the whole row, serialised
# to JSON, becomes the message value.
# NOTE(review): the checkpoint location looks like a placeholder path -
# confirm the intended directory before relying on this query.
(
    submissions_stream_df
    .selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value")
    .writeStream
    .format("kafka")
    .outputMode("append")
    .option("kafka.bootstrap.servers", broker)
    .option("topic", "comment_predictions")
    .option("checkpointLocation", "path/to/HDFS/dir")
    .start()
)

<pyspark.sql.streaming.StreamingQuery at 0x7f280616ab80>

In [176]:
# Shut down the Spark session created at the top of the notebook.
# NOTE(review): presumably this also ends the streaming queries started
# above - confirm before running this cell.
spark.stop()