In [5]:
# Import relevant functions
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
import pyspark
from pyspark import broadcast, SparkContext
from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover, OneHotEncoder, StringIndexer, VectorAssembler, VectorIndexer, Bucketizer
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml.clustering import LDA
from pyspark.ml.functions import vector_to_array
from pyspark.sql import SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.mllib.util import MLUtils
import re
import scipy
import string
import sys

In [4]:
# Build the spark session
spark = SparkSession.builder \
        .appName('kafka') \
        .getOrCreate()

In [6]:
spark.version

'3.1.1'

In [7]:
sc = spark.sparkContext

## Load data from csv, strip to required size then save to parquet

In [8]:
# Load all submission csv files
data = spark.read.format('csv').options(header ='true').load(f"./data/submissions/*.csv")

In [9]:
print(data.printSchema())
data.count()

root
 |-- _c0: string (nullable = true)
 |-- id: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- title: string (nullable = true)
 |-- score: string (nullable = true)
 |-- author_premium: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- over_18: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- parent_whitelist_status: string (nullable = true)
 |-- url: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- num_comments: string (nullable = true)
 |-- upvote_ratio: string (nullable = true)

None


127600

In [10]:
data.head(5)

[Row(_c0='0', id='l8zd9o', author_fullname='t2_a30zb78q', title='موقع إخباري متنوع.. تعرف علي آخر الأخبار', score='1', author_premium='False', domain='nabakham.com', over_18='False', subreddit_id='t5_2qh13', permalink='/r/worldnews/comments/l8zd9o/موقع_إخباري_متنوع_تعرف_علي_آخر_الأخبار/', parent_whitelist_status='all_ads', url='https://nabakham.com/', created_utc='1612051145', num_comments='1', upvote_ratio='1.0'),
 Row(_c0='1', id='l8zd2a', author_fullname='t2_9xscy03j', title='New clinical trials raise fears the coronavirus is learning how to resist vaccines', score='1', author_premium='True', domain='google.co.uk', over_18='False', subreddit_id='t5_2qh13', permalink='/r/worldnews/comments/l8zd2a/new_clinical_trials_raise_fears_the_coronavirus/', parent_whitelist_status='all_ads', url='https://www.google.co.uk/amp/s/news.yahoo.com/amphtml/clinical-trials-raise-fears-coronavirus-040855671.html', created_utc='1612051128', num_comments='0', upvote_ratio='1.0'),
 Row(_c0='2', id='l8zaej'

In [11]:
# Only keep the needed columns and remove duplicate rows
df = data.select('id','title','domain','subreddit_id','created_utc','num_comments')
df = df.distinct()
df.count()

127600

In [12]:
# Fill nulls in the num_comments field with zero
df = df.na.fill(value=0,subset=["num_comments"])

In [13]:
df.show(5)

+------+--------------------+--------------------+------------+-----------+------------+
|    id|               title|              domain|subreddit_id|created_utc|num_comments|
+------+--------------------+--------------------+------------+-----------+------------+
|l8w5qa|Jessica Simpson T...|           bluzz.org|    t5_2qh13| 1612042018|           0|
|l8ni30|Pornhub Now Accep...|  cryptobriefing.com|    t5_2qh13| 1612019085|           0|
|l8n6ta|India Mulls Law T...|            ndtv.com|    t5_2qh13| 1612018166|           2|
|l8j19t|tìm hiểu drama là...|indexlink93447958...|    t5_2qh13| 1612003085|           0|
|l8co25|House Democrats p...|           bluzz.org|    t5_2qh13| 1611977317|           0|
+------+--------------------+--------------------+------------+-----------+------------+
only showing top 5 rows



In [16]:
# Remove punctuation and numbers from titles
df = df.withColumn("title", F.regexp_replace(F.col("title"), '[^\sa-zA-Z]', ''))
df.show(5)

+------+--------------------+--------------------+------------+-----------+------------+
|    id|               title|              domain|subreddit_id|created_utc|num_comments|
+------+--------------------+--------------------+------------+-----------+------------+
|l8w5qa|Jessica Simpson T...|           bluzz.org|    t5_2qh13| 1612042018|           0|
|l8ni30|Pornhub Now Accep...|  cryptobriefing.com|    t5_2qh13| 1612019085|           0|
|l8n6ta|India Mulls Law T...|            ndtv.com|    t5_2qh13| 1612018166|           2|
|l8j19t|tm hiu drama l g ...|indexlink93447958...|    t5_2qh13| 1612003085|           0|
|l8co25|House Democrats p...|           bluzz.org|    t5_2qh13| 1611977317|           0|
+------+--------------------+--------------------+------------+-----------+------------+
only showing top 5 rows



## Prepare titles and do topics modelling

In [17]:
tokenizer = Tokenizer(inputCol="title", outputCol="words")
wordsDataFrame = tokenizer.transform(df)

In [18]:
wordsDataFrame.show()

+------+--------------------+--------------------+------------+-----------+------------+--------------------+
|    id|               title|              domain|subreddit_id|created_utc|num_comments|               words|
+------+--------------------+--------------------+------------+-----------+------------+--------------------+
|l8w5qa|Jessica Simpson T...|           bluzz.org|    t5_2qh13| 1612042018|           0|[jessica, simpson...|
|l8ni30|Pornhub Now Accep...|  cryptobriefing.com|    t5_2qh13| 1612019085|           0|[pornhub, now, ac...|
|l8n6ta|India Mulls Law T...|            ndtv.com|    t5_2qh13| 1612018166|           2|[india, mulls, la...|
|l8j19t|tm hiu drama l g ...|indexlink93447958...|    t5_2qh13| 1612003085|           0|[tm, hiu, drama, ...|
|l8co25|House Democrats p...|           bluzz.org|    t5_2qh13| 1611977317|           0|[house, democrats...|
|l7y8yl|Congos prime mini...|      abcnews.go.com|    t5_2qh13| 1611940865|           1|[congos, prime, m...|
|l7y6fc|  

In [19]:
stop_words =StopWordsRemover.loadDefaultStopWords("english")
stop_words = stop_words + ['a','i']

In [20]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords = stop_words)
wordsDataFrame = remover.transform(wordsDataFrame)

In [21]:
wordsDataFrame.show(truncate = 50)

+------+--------------------------------------------------+-------------------------------+------------+-----------+------------+--------------------------------------------------+--------------------------------------------------+
|    id|                                             title|                         domain|subreddit_id|created_utc|num_comments|                                             words|                                          filtered|
+------+--------------------------------------------------+-------------------------------+------------+-----------+------------+--------------------------------------------------+--------------------------------------------------+
|l8w5qa|Jessica Simpson Twins With Daughter Birdie in A...|                      bluzz.org|    t5_2qh13| 1612042018|           0|[jessica, simpson, twins, with, daughter, birdi...|[jessica, simpson, twins, daughter, birdie, ado...|
|l8ni30|             Pornhub Now Accepts Dogecoin Payments|             

In [22]:
cv = CountVectorizer(inputCol="filtered", outputCol="vectors")
cvmodel = cv.fit(wordsDataFrame)

#save the countvectorizer to apply to new texts
cvmodel.write().overwrite().save('count_vectorizer_model')
# When need to relaod this model elsewhere use : loadedModel = CountVectorizerModel.load('count_vectorizer_model')

df_vect = cvmodel.transform(wordsDataFrame)
basics = df_vect.select('vectors', 'id')
basics.show()

+--------------------+------+
|             vectors|    id|
+--------------------+------+
|(68283,[464,1644,...|l8w5qa|
|(68283,[2208,2387...|l8ni30|
|(68283,[8,129,224...|l8n6ta|
|(68283,[0,47,65,9...|l8j19t|
|(68283,[88,712,84...|l8co25|
|(68283,[99,477,51...|l7y8yl|
|(68283,[1495,9609...|l7y6fc|
|(68283,[6,226,378...|l7qrh2|
|(68283,[768,895,1...|l7g5r4|
|(68283,[4,21,33,1...|l77aqb|
|(68283,[6,97,157,...|l6wpsu|
|(68283,[866,1391,...|l6vfrx|
|(68283,[7,27,246,...|l686x9|
|(68283,[1,191,456...|l5g6ya|
|(68283,[240,323,3...|l5cr2f|
|(68283,[2,5,183,2...|l51rbl|
|(68283,[117,208,2...|l51nju|
|(68283,[18,26,35,...|l4znqi|
|(68283,[38,43,190...|l4vc0j|
|(68283,[6388],[1.0])|l4sjox|
+--------------------+------+
only showing top 20 rows



In [23]:
basics.write.mode("overwrite").parquet('lda_basics')

In [24]:
basics= spark.read.parquet('lda_basics')

In [25]:
# Create the LDA model and fit it
num_topics = 20
lda = LDA(featuresCol='vectors',k=num_topics, seed=42)
#Train the LDA model
lda_model = lda.fit(basics)

# Save teh model to be applied later in stream pipeline
lda_model.write().overwrite().save('lda_distributed_model')
# When need to relaod this model elsewhere use : sameModel = DistributedLDAModel.load('lda_distributed_model')

In [26]:
# See the results of modelling
ldatopics = lda_model.describeTopics(10)
ldatopics.show(truncate = 50)

+-----+--------------------------------------------------+--------------------------------------------------+
|topic|                                       termIndices|                                       termWeights|
+-----+--------------------------------------------------+--------------------------------------------------+
|    0|[1527, 873, 615, 273, 0, 213, 221, 1, 27260, 27...|[7.055094649167458E-4, 6.077309513852824E-4, 6....|
|    1|[5475, 7414, 11627, 7057, 13477, 17586, 16497, ...|[3.0670203550234326E-4, 2.563892080466726E-4, 1...|
|    2|    [48, 12, 2, 81, 85, 919, 0, 8590, 11714, 4192]|[0.011627328242460442, 0.011284175883587764, 0....|
|    3|[0, 6, 7358, 1097, 9752, 9238, 9906, 9853, 9011...|[2.3048720922918541E-4, 2.2139110730733357E-4, ...|
|    4|[591, 2036, 88, 2537, 1637, 573, 2776, 0, 3525,...|[9.1564175254082E-4, 9.14738990840352E-4, 7.321...|
|    5|[3590, 0, 4277, 5088, 4685, 6321, 5360, 1866, 4...|[6.478889532414813E-4, 3.888782755041842E-4, 3....|
|    6|   

In [27]:
# For mapping words to the model term indices, first collect the words used in the input vectors
vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab) # saves the words to a broadcast variable for use in mapping
print("Number of words in word vectors: " + str(len(vocab)))
print("Example last words: ")
vocab[-6:-1]

Number of words in word vectors: 68283
Example last words: 


['leadup', 'batangas', 'intergenerational', 'rpartis', 'dundrennan']

In [29]:
# Now match input words to terms in model output topics
def map_termID_to_Word(termIndices):
    words = []
    for termID in termIndices:
        words.append(vocab_broadcast.value[termID])

    return words

udf_map_termID_to_Word = F.udf(map_termID_to_Word , ArrayType(StringType()))

ldatopics_mapped = ldatopics.withColumn("topic_desc", udf_map_termID_to_Word(ldatopics.termIndices))

In [30]:
ldatopics_mapped.show(truncate=False)

+-----+-------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------+
|topic|termIndices                                                        |termWeights                                                                                                                                                                                                                          |topic_desc                                                                                                |
+-----+-------------------------------------------------------------------+---------------------------------------------------------------------------------------------------

In [31]:
# Use the model to allocate topic weightings for each title
indiv = lda_model.transform(basics)

In [32]:
indiv.show()

+--------------------+------+--------------------+
|             vectors|    id|   topicDistribution|
+--------------------+------+--------------------+
|(68283,[18,26,83,...|l8xslp|[0.00580927295834...|
|(68283,[214,267,2...|l8vqvn|[0.00421945021569...|
|       (68283,[],[])|l8usg7|[0.0,0.0,0.0,0.0,...|
|(68283,[0,66,121,...|l8t0zg|[0.00386671573789...|
|(68283,[0,10,20,2...|l8o2wz|[0.00464299987876...|
|(68283,[12,71,372...|l8k2ic|[0.00580927296207...|
|(68283,[1,815,236...|l8ip36|[0.00464299988399...|
|(68283,[5,11,478,...|l8bylm|[0.00464299990936...|
|(68283,[1,4,43,18...|l89f20|[0.00464299987924...|
|(68283,[1,5,63,21...|l88ld9|[0.00580927295893...|
|(68283,[3,4,126,1...|l7sxn2|[0.00516106957905...|
|(68283,[145,416,4...|l7si03|[0.00664368512307...|
|(68283,[11,218,34...|l7rn9f|[0.00421945022387...|
|(68283,[0,2,45,48...|l7pbej|[0.00464299987897...|
|   (68283,[0],[1.0])|l7ddrt|[0.02357359743736...|
|(68283,[678,895,1...|l763bt|[0.00464299987939...|
|(68283,[1120,2626...|l6vj09|[0

## Build features and target dataset

In [33]:
# convert epoch time into datetime, but keep epoch time as id
df= df.withColumn('timestamp', F.from_unixtime(df.created_utc,'yyyy-MM-dd HH:mm:ss')) 

In [36]:
df = df.withColumn("hour", F.hour(F.col("timestamp"))).withColumn("day", F.dayofweek(F.col("timestamp")))
df = df.withColumn("hour", F.col("hour").astype(StringType())).withColumn("day", F.col("day").astype(StringType()))

In [37]:
df.show(truncate=60)

+------+------------------------------------------------------------+-------------------------------+------------+-----------+------------+-------------------+----+---+
|    id|                                                       title|                         domain|subreddit_id|created_utc|num_comments|          timestamp|hour|day|
+------+------------------------------------------------------------+-------------------------------+------------+-----------+------------+-------------------+----+---+
|l8w5qa|  Jessica Simpson Twins With Daughter Birdie in Adorable Pic|                      bluzz.org|    t5_2qh13| 1612042018|           0|2021-01-30 21:26:58|  21|  7|
|l8ni30|                       Pornhub Now Accepts Dogecoin Payments|             cryptobriefing.com|    t5_2qh13| 1612019085|           0|2021-01-30 15:04:45|  15|  7|
|l8n6ta|India Mulls Law To Ban Cryptocurrencies Create Official D...|                       ndtv.com|    t5_2qh13| 1612018166|           2|2021-01-30 14:49

In [38]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- num_comments: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- day: string (nullable = true)



In [39]:
# Create features df by combining topic distributions for each title, with the other features of domain, hour and day
temp = indiv.select('topicDistribution')
# And a list of sequential column titles
X_titles = [f'T_{i}' for i in range(1, num_topics +1)]
X_titles = ['id'] + X_titles # adding the id at start

In [41]:
temp = indiv.withColumn("T_", vector_to_array("topicDistribution")).select(["id"] + [F.col("T_")[i] for i in range(num_topics)]).drop('vectors', 'topicDistribution')
temp = temp.toDF(*X_titles)
temp.show()

+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    id|                 T_1|                 T_2|                 T_3|                 T_4|                 T_5|                 T_6|                 T_7|                 T_8|                 T_9|                T_10|                T_11|                T_12|                T_13|                T_14|                T_15|                T_16|                T_17|                T_18|                T_19|                T_20|
+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------

In [42]:
rf_full = temp.join(df, on = 'id', how= 'left').drop('id','subreddit_id','title','timestamp','created_utc')

In [43]:
rf_full.printSchema()

root
 |-- T_1: double (nullable = true)
 |-- T_2: double (nullable = true)
 |-- T_3: double (nullable = true)
 |-- T_4: double (nullable = true)
 |-- T_5: double (nullable = true)
 |-- T_6: double (nullable = true)
 |-- T_7: double (nullable = true)
 |-- T_8: double (nullable = true)
 |-- T_9: double (nullable = true)
 |-- T_10: double (nullable = true)
 |-- T_11: double (nullable = true)
 |-- T_12: double (nullable = true)
 |-- T_13: double (nullable = true)
 |-- T_14: double (nullable = true)
 |-- T_15: double (nullable = true)
 |-- T_16: double (nullable = true)
 |-- T_17: double (nullable = true)
 |-- T_18: double (nullable = true)
 |-- T_19: double (nullable = true)
 |-- T_20: double (nullable = true)
 |-- domain: string (nullable = true)
 |-- num_comments: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- day: string (nullable = true)



In [44]:
rf_full.count()

127600

In [45]:
# Convert the number of comments into numerical and create a grouped categorical
rf_full = rf_full.withColumn("num_comments", F.col("num_comments").astype(IntegerType()))
bucketizer = Bucketizer(splitsArray=[[0,5,25,float("inf")]],inputCols=["num_comments"], outputCols=["group_comments"])
rf_full = bucketizer.setHandleInvalid("keep").transform(rf_full)
rf_full = rf_full.drop('num_comments')
rf_full.show(20)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----+---+--------------+
|                 T_1|                 T_2|                 T_3|                 T_4|                 T_5|                 T_6|                 T_7|                 T_8|                 T_9|                T_10|                T_11|                T_12|                T_13|                T_14|                T_15|                T_16|                T_17|                T_18|                T_19|                T_20|              domain|hour|day|group_comments|
+--------------------+--------------------+--------------------+--

In [46]:
rf_full.printSchema()

root
 |-- T_1: double (nullable = true)
 |-- T_2: double (nullable = true)
 |-- T_3: double (nullable = true)
 |-- T_4: double (nullable = true)
 |-- T_5: double (nullable = true)
 |-- T_6: double (nullable = true)
 |-- T_7: double (nullable = true)
 |-- T_8: double (nullable = true)
 |-- T_9: double (nullable = true)
 |-- T_10: double (nullable = true)
 |-- T_11: double (nullable = true)
 |-- T_12: double (nullable = true)
 |-- T_13: double (nullable = true)
 |-- T_14: double (nullable = true)
 |-- T_15: double (nullable = true)
 |-- T_16: double (nullable = true)
 |-- T_17: double (nullable = true)
 |-- T_18: double (nullable = true)
 |-- T_19: double (nullable = true)
 |-- T_20: double (nullable = true)
 |-- domain: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- day: string (nullable = true)
 |-- group_comments: double (nullable = true)



In [47]:
# Drop rows where there are null values, to avaoid probelms with modelling
rf_full = rf_full.na.drop("any")

In [50]:
# Check all gone
print(rf_full.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in rf_full.columns]).show()) 

+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+------+----+---+--------------+
|T_1|T_2|T_3|T_4|T_5|T_6|T_7|T_8|T_9|T_10|T_11|T_12|T_13|T_14|T_15|T_16|T_17|T_18|T_19|T_20|domain|hour|day|group_comments|
+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+------+----+---+--------------+
|  0|  0|  0|  0|  0|  0|  0|  0|  0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|     0|   0|  0|             0|
+---+---+---+---+---+---+---+---+---+----+----+----+----+----+----+----+----+----+----+----+------+----+---+--------------+

None


In [52]:
rf_full.write.mode("overwrite").parquet('rf_full')

In [53]:
rf_full = spark.read.parquet('rf_full')

## Now build the ML model

In [54]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [55]:
# Need to one-hot-encode the domain, hour and day columns as they are categorical
#Create variable for all categorical columns
cat_cols = ['domain','hour','day']
#Create and empty list and put the string indexed and one-hot-encoded transformations into it
stages = []
for cat_col in cat_cols:
    col_indexer = StringIndexer(inputCol=cat_col, outputCol=f"{cat_col}_ind",handleInvalid='skip')
    col_encoder = OneHotEncoder(inputCols=[f"{cat_col}_ind"], outputCols=[f"{cat_col}_ohe"])
    stages += [col_indexer, col_encoder]

In [56]:
# Split data into train and test sets
train, test = rf_full.randomSplit([0.8, 0.2], seed=42)

# Seperate out the features from the data
X = rf_full.drop('group_comments')
y= rf_full.select('group_comments')

# Make list of numeric columns (here that is all columns in features not categorical)
features_cols = X.columns
num_cols = [x for x in features_cols if x not in cat_cols]

In [57]:
# Change title of one not encoded categoricals
cat_cols_ohe = [f"{cat_col}_ohe" for cat_col in cat_cols]

In [58]:
# Build all features into one vetor
assembler = VectorAssembler(inputCols=cat_cols_ohe + num_cols, outputCol="features")
stages += [assembler]

In [59]:
# Specify the model core and target variable
rf = RandomForestClassifier(labelCol="group_comments",featuresCol="features")
stages += [rf]

In [60]:
# Populate the pipeline
pipeline = Pipeline(stages=stages)

In [61]:
# Fit the model to training set via pipeline
from datetime import datetime 
start_time = datetime.now() 

pipeline_model = pipeline.fit(train)

print('Time elapsed (hh:mm:ss.ms) {}'.format(datetime.now() - start_time))

Time elapsed (hh:mm:ss.ms) 0:01:28.924435


In [62]:
# Save the model for later use
pipeline_model.write().overwrite().save('pipeline_model')

In [63]:
# Test the model
test_preds = pipeline_model.transform(test)

In [64]:
evaluator = MulticlassClassificationEvaluator(labelCol=rf.getLabelCol(), predictionCol=rf.getPredictionCol(), metricName="accuracy")
accuracy = evaluator.evaluate(test_preds)
print("Test Error = %g" % (1.0 - accuracy))

Test Error = 0.174157


In [65]:
test_preds.select('prediction').show(5)

+----------+
|prediction|
+----------+
|       0.0|
|       0.0|
|       0.0|
|       0.0|
|       0.0|
+----------+
only showing top 5 rows



In [66]:
preds_and_labels = test_preds.select(['prediction','group_comments']).withColumn('label', F.col('group_comments').cast(FloatType())).orderBy('prediction')

In [67]:
# Select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction','label'])

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())

[[20196.     0.     0.]
 [ 2628.     0.     0.]
 [ 1631.     0.     0.]]


In [68]:
spark.stop()