In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
sc = pyspark.SparkContext()

from collections import Counter
from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
sc.setCheckpointDir('checkpoint/')

In [2]:
df = spark.read.parquet("data/stemstem.parquet")
rdd = df.rdd.zipWithIndex().map(lambda r: (r[1], r[0]["Words"]))

In [3]:
rdd = sc.parallelize(rdd.take(20000))

In [4]:
def count_words(rdd):
    def word_occurrences(l):
        counter = dict(Counter(l))
        return list(zip(counter.keys(), counter.values()))
    
    return rdd.mapValues(word_occurrences)

In [5]:
def wordify(rdd):
    lower_case_set = lambda s: set(map(lambda x: x.lower(), s))
    words = list(rdd.map(lambda p: set(p[1])).reduce(lambda x, y: lower_case_set(x) | lower_case_set(y)))
    return dict([(words[i], i) for i in range(len(words))])

In [6]:
def LDAify(rdd, words):
    return rdd.mapValues(lambda l: [(words[w.lower()], occurrences) for (w, occurrences) in l]).mapValues(lambda l: Vectors.sparse(len(l), dict(l))).map(lambda p: list(p))

words = wordify(rdd)

In [7]:
X = LDAify(count_words(rdd), words).persist()

In [8]:
model = LDA.train(X, k = 10, maxIterations = 500, checkpointInterval = 50)

In [10]:
words_list = list(words)
for w_indices, distribution in model.describeTopics():
    print("Topic :")
    for w_index, percentage in zip(w_indices, distribution):
        print("    " + words_list[w_index] + ": " + str(100*percentage) + "%")

Topic :
    kindl: 2.1175615225345483%
    inform: 1.7890527605247972%
    worth: 1.442970583590072%
    easi: 1.2845071261650427%
    mani: 1.2012422479011475%
    chapter: 1.1601126451879025%
    writer: 1.0988170309709078%
    recip: 0.8309617505719659%
    understand: 0.7516686393860216%
    edit: 0.7489933418825905%
    simpl: 0.7336425329819877%
    purchas: 0.7297252126795548%
    practic: 0.6749992806524435%
    includ: 0.668311002498433%
    price: 0.6520075505056812%
    provid: 0.6377582925959221%
    excel: 0.5764872570367678%
    refer: 0.5737211275856253%
    exampl: 0.5534777885026622%
    error: 0.5387547065138039%
    content: 0.5365687433924355%
    guid: 0.5069752766029406%
    titl: 0.5063829172900813%
    version: 0.5038386620128412%
    publish: 0.503375905318023%
    inspir: 0.49313486951648966%
Topic :
    stori: 28.24407402129553%
    veri: 10.956233937724699%
    charact: 10.200835269645586%
    enjoy: 9.31109676543948%
    author: 8.013717569546904%
    nice: