In [1]:
! pip install praw
! pip install gensim



In [2]:
import configparser
from confluent_kafka import Producer
import json
import logging
from multiprocessing import Process
import os
import praw
import socket
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import re
import sys
import scipy
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora



In [32]:
import pyspark
import pyspark.sql.functions as F
from pyspark import broadcast, SparkContext
from pyspark.sql import SQLContext
from pyspark.mllib.util import MLUtils
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover, OneHotEncoder, StringIndexer, VectorAssembler, VectorIndexer, Bucketizer
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA, LocalLDAModel
from pyspark.ml.functions import vector_to_array
from pyspark.ml.pipeline import PipelineModel

In [4]:
# Read passwords and secrets from the config file.
# The cell ends on read(), so the notebook displays the list of files that were parsed
# (here: ['configuration/config.cfg']).
config_parser = configparser.ConfigParser()
config_parser.read("configuration/config.cfg")

['configuration/config.cfg']

In [5]:
from pyspark.sql.functions import month, year, mean, count, dayofweek, hour, col, min, max, avg, sum, when, lit, desc, unix_timestamp, from_unixtime, udf, regexp_replace

In [6]:
# Create (or reuse) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName("kafka")
    .getOrCreate()
)

## Load the raw stream and convert to dataframe objects for processing

In [7]:
# Open the raw "submissions" Kafka topic as a streaming DataFrame, reading from the
# earliest available offset.
## NOTE - the comments stream is not loaded: the model is pre-trained and only applied to submission data

df_s = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:29092")
    .option("startingOffsets", "earliest")
    .option("subscribe", "submissions")
    .load()
)

In [8]:
# Inspect the raw Kafka record layout (key/value arrive as binary)
df_s.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [9]:
# Kafka hands over key and value as binary; cast both to strings so the JSON payload
# in "value" can be parsed.
string_stream_df_s = (
    df_s
    .withColumn("key", df_s["key"].cast(StringType()))
    .withColumn("value", df_s["value"].cast(StringType()))
)

In [10]:
# Structure of the JSON document carried in each message value.
# Fields are listed as (name, type); every field is declared nullable.
_submission_fields = [
    ("id", StringType()),
    ("author_fullname", StringType()),
    ("title", StringType()),
    ("subreddit_name_prefixed", StringType()),
    ("name", StringType()),
    ("upvote_ratio", DoubleType()),
    ("ups", IntegerType()),
    ("created", IntegerType()),
    ("domain", StringType()),
    ("url_overridden_by_dest", StringType()),
    ("over_18", StringType()),
    ("subreddit_id", StringType()),
    ("permalink", StringType()),
    ("parent_whitelist_status", StringType()),
    ("url", StringType()),
    ("created_utc", IntegerType()),
]
schema_inventory_s = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _submission_fields]
)

In [11]:
# Parse the JSON string in "value" into a struct to split out the values in each message
parsed_value = F.from_json(F.col("value"), schema_inventory_s)
json_stream_df_s = string_stream_df_s.withColumn("value", parsed_value)
json_stream_df_s.printSchema()

root
 |-- key: string (nullable = true)
 |-- value: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- author_fullname: string (nullable = true)
 |    |-- title: string (nullable = true)
 |    |-- subreddit_name_prefixed: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- upvote_ratio: double (nullable = true)
 |    |-- ups: integer (nullable = true)
 |    |-- created: integer (nullable = true)
 |    |-- domain: string (nullable = true)
 |    |-- url_overridden_by_dest: string (nullable = true)
 |    |-- over_18: string (nullable = true)
 |    |-- subreddit_id: string (nullable = true)
 |    |-- permalink: string (nullable = true)
 |    |-- parent_whitelist_status: string (nullable = true)
 |    |-- url: string (nullable = true)
 |    |-- created_utc: integer (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- 

In [12]:
# Create the final usable df for submissions: the Kafka metadata columns are renamed
# with an "event_" prefix and each parsed field is lifted out of the "value" struct.
event_columns = [
    F.col("key").alias("event_key"),
    F.col("topic").alias("event_topic"),
    F.col("timestamp").alias("event_timestamp"),
]
value_fields = [
    "id",
    "author_fullname",
    "title",
    "subreddit_name_prefixed",
    "name",
    "upvote_ratio",
    "ups",
    "created",
    "domain",
    "url_overridden_by_dest",
    "over_18",
    "subreddit_id",
    "permalink",
    "parent_whitelist_status",
    "url",
    "created_utc",
]
submissions_stream_df = json_stream_df_s.select(
    *event_columns,
    *[f"value.{field}" for field in value_fields]
)

submissions_stream_df.printSchema()

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- id: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- title: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- name: string (nullable = true)
 |-- upvote_ratio: double (nullable = true)
 |-- ups: integer (nullable = true)
 |-- created: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- url_overridden_by_dest: string (nullable = true)
 |-- over_18: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- parent_whitelist_status: string (nullable = true)
 |-- url: string (nullable = true)
 |-- created_utc: integer (nullable = true)



In [13]:
# Write the submissions stream into an in-memory table that can be queried through
# spark.sql under the name "all_submissions_view".
submissions_stream = (
    submissions_stream_df
    .writeStream
    .format("memory")
    .queryName("all_submissions_view")
    .start()
)

# start() returns immediately, so on a top-to-bottom run the next cell could query the
# table before any micro-batch has been written. Block until the data currently
# available in the topic has been processed and committed to the sink.
# NOTE(review): the Spark docs warn this call may block indefinitely when data arrives
# continually - confirm that is acceptable for the producer feeding this topic.
submissions_stream.processAllAvailable()

In [14]:
# Snapshot the in-memory submissions table, newest submissions first
subs_data = spark.sql('SELECT * FROM all_submissions_view ORDER BY created desc')
# printSchema() prints the schema itself and returns None, so it is called directly;
# wrapping it in print() emitted a stray "None" after the schema.
subs_data.printSchema()
print(subs_data.count())
subs_data.show(5, truncate = 40)

root
 |-- event_key: string (nullable = true)
 |-- event_topic: string (nullable = true)
 |-- event_timestamp: timestamp (nullable = true)
 |-- id: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- title: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- name: string (nullable = true)
 |-- upvote_ratio: double (nullable = true)
 |-- ups: integer (nullable = true)
 |-- created: integer (nullable = true)
 |-- domain: string (nullable = true)
 |-- url_overridden_by_dest: string (nullable = true)
 |-- over_18: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- parent_whitelist_status: string (nullable = true)
 |-- url: string (nullable = true)
 |-- created_utc: integer (nullable = true)

None
101
+---------+-----------+----------------------+------+---------------+----------------------------------------+-----------------------+---------+------------+---+-------+---

## Process new titles data ready for modelling and prediction

In [15]:
# Submissions: keep only the needed columns and remove duplicate rows
needed_columns = ['id', 'title', 'domain', 'subreddit_id', 'event_timestamp']
subs_data = subs_data.select(*needed_columns).distinct()
subs_data.count()

101

In [16]:
# Remove punctuation and numbers from titles: anything that is not a letter or whitespace
# is deleted. The pattern is a raw string because "\s" is not a valid Python string
# escape - the non-raw literal triggers an invalid-escape warning on newer Python
# versions. The regex passed to Spark is unchanged.
subs_data = subs_data.withColumn("title", regexp_replace(col("title"), r'[^\sa-zA-Z]', ''))
# Tokenize the cleaned titles into a "words" column
tokenizer = Tokenizer(inputCol="title", outputCol="words")
wordsDataFrame = tokenizer.transform(subs_data)
# Stop words: Spark's default English list plus two single-letter additions
stop_words = StopWordsRemover.loadDefaultStopWords("english")
stop_words = stop_words + ['a','i']
# Apply the stop words remover; the surviving tokens go into "filtered"
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords = stop_words)
wordsDataFrame = remover.transform(wordsDataFrame)

In [17]:
# Load the vectorizer trained on the original training data. It was used for the LDA
# model training, and hence by the rf model relying on it.
cvmodel = CountVectorizerModel.load("count_vectorizer_model")
# Vectorize the new titles, then keep only the vectors and the submission id
df_vect = cvmodel.transform(wordsDataFrame)
basics = df_vect.select("vectors", "id")
basics.show()

+--------------------+------+
|             vectors|    id|
+--------------------+------+
|(68283,[829,869,2...|nxsltb|
|(68283,[126,239,4...|ny1upx|
|(68283,[0,49,335,...|ny5gqw|
|(68283,[0,37,122,...|ny1q1d|
|(68283,[2,58,103,...|nxrys7|
|(68283,[73,121,13...|nxwm2p|
|(68283,[6,35,47,1...|ny4fgk|
|(68283,[239,452,5...|ny72f0|
|(68283,[0,73,353,...|ny51k5|
|(68283,[524,547,6...|ny5f33|
|(68283,[0,3,6,7,2...|nxvavt|
|(68283,[4,17,366,...|nxv3dn|
|(68283,[0,1,21,14...|ny1bag|
|(68283,[101,153,2...|ny799w|
|(68283,[1187,2324...|nxv3yl|
|(68283,[68,89,774...|ny1gsq|
|(68283,[47,63,126...|ny5njo|
|(68283,[598,1430,...|nxyzq5|
|(68283,[66,189,24...|ny60on|
|(68283,[163,200,3...|nxxicm|
+--------------------+------+
only showing top 20 rows



In [25]:
# Load the LDA model trained on the original training data
lda_model = LocalLDAModel.load("lda_distributed_model")
# Apply it to the new title vectors; the result is read back below through its
# "topicDistribution" column
indiv = lda_model.transform(basics)

## Build other features into dataset

In [27]:
# Derive hour-of-day and day-of-week from the Kafka event timestamp and store both as
# strings (they are listed among the categorical columns further down).
event_ts = col("event_timestamp")
subs_data = (
    subs_data
    .withColumn("hour", hour(event_ts).cast(StringType()))
    .withColumn("day", dayofweek(event_ts).cast(StringType()))
)

In [28]:
# The training model used this number of topics, so the same count is needed here to
# unpack the topic-distribution vector.
num_topics = 20
# Output column names: the submission id followed by T_1 .. T_<num_topics>
X_titles = ['id'] + [f'T_{i}' for i in range(1, num_topics + 1)]
# Expand the topic distribution vector into one column per topic. The element count is
# driven by num_topics instead of a second hard-coded 20, so the two cannot drift apart.
# The select keeps only id and the topic columns, so no further drop is needed.
temp = (
    indiv
    .withColumn("T_", vector_to_array("topicDistribution"))
    .select(["id"] + [col("T_")[i] for i in range(num_topics)])
    .toDF(*X_titles)
)
temp.show(truncate =40)

+------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+---------------------+
|    id|                  T_1|                  T_2|                  T_3|                  T_4|                  T_5|                  T_6|                  T_7|                  T_8|                  T_9|                 T_10|                T_11|                 T_12|                 T_13|                 T_14|                 T_15|                 T_16|                 T_17|                 T_18|                 T_19|                 T_20|
+------+---------------------+---------------------+---------------------+---------------------+--------

In [29]:
# Pull together the topic features and the other submission features (left join on id),
# then drop the columns listed below from the joined frame
dropped_cols = ['id','subreddit_id','title','event_timestamp','created_utc','link_id']
joined = temp.join(subs_data, on = 'id', how= 'left')
rf_full = joined.drop(*dropped_cols)

In [33]:
# Load the pre-trained random forest pipeline
pipeline_model = PipelineModel.load("pipeline_model")

# Column bookkeeping for the pipeline. These names are defined here but are not
# referenced by the transform call below.
cat_cols = ['domain', 'hour', 'day']  # categorical columns
# Names of the one hot encoded categoricals created in the pipeline
cat_cols_ohe = [name + "_ohe" for name in cat_cols]
# Every remaining feature column is treated as numerical
features_cols = rf_full.columns
num_cols = [name for name in features_cols if name not in cat_cols]

# Apply the pipeline to the feature dataframe for new submissions
predicted_comments = pipeline_model.transform(rf_full)

In [34]:
# Keep only the model output column
preds = predicted_comments.select("prediction")

In [35]:
# Preview the first 10 predictions
preds.show(10)

+----------+
|prediction|
+----------+
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
+----------+
only showing top 10 rows



In [36]:
# Stop any streaming query that is still running (the in-memory submissions sink is
# never stopped elsewhere) before shutting the session down, so queries are not left
# running against a session that is being torn down.
for active_query in spark.streams.active:
    active_query.stop()
spark.stop()