In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
# Reuse the already-running SparkContext when there is one: constructing a
# second context in the same kernel (e.g. when this cell is re-run) raises
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

from collections import Counter
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

import pickle as pkl

# SparkSession used by get_rdd for DataFrame reads.
spark = SparkSession.builder.getOrCreate()
# Checkpoint directory for RDDs, given as a relative path.
sc.setCheckpointDir('checkpoint/')

# Tag inserted into the vocabulary-pickle and saved-model paths under data/.
STEMMIZER = "porter"
# Parquet file name (resolved under data/ by get_rdd) holding the documents.
RDD_FILENAME = "stemstem.parquet"

In [2]:
def get_rdd(filename, limit=50000):
    """Load a parquet file from ``data/`` as an RDD of ``(index, words)`` pairs.

    Each row is paired with its position (via ``zipWithIndex``) and reduced to
    the value of its ``"Words"`` field.

    Args:
        filename: Name of the parquet file, relative to ``data/``.
        limit: Maximum number of leading records to keep. The kept records are
            collected on the driver with ``take`` and redistributed with
            ``sc.parallelize``. Pass ``None`` to keep every record and return
            the distributed RDD directly, without the driver round-trip.

    Returns:
        An RDD of ``(index, words)`` tuples.
    """
    df = spark.read.parquet("data/" + filename)
    indexed = df.rdd.zipWithIndex().map(lambda r: (r[1], r[0]["Words"]))
    if limit is None:
        return indexed
    # take() materialises the sample on the driver before it is re-parallelized.
    return sc.parallelize(indexed.take(limit))

In [3]:
# Load the (index, words) pair RDD from the configured parquet file.
rdd = get_rdd(RDD_FILENAME)

In [4]:
def count_words(rdd):
    """Replace each record's word list with a list of ``(word, count)`` tuples.

    Args:
        rdd: Pair RDD whose values are iterables of words.

    Returns:
        The result of ``rdd.mapValues`` where every value is a list of
        ``(word, occurrences)`` tuples, one per distinct word.
    """
    def tally(tokens):
        # Counter groups equal tokens; items() yields the (token, count) pairs.
        return [(token, count) for token, count in Counter(tokens).items()]

    return rdd.mapValues(tally)

In [5]:
def wordify(rdd):
    """Build the vocabulary: a dict mapping each distinct lower-cased word to an index.

    Words are lower-cased per record *before* the sets are merged, so the
    result is fully lower-cased even when the RDD holds a single record (a
    reducer is never invoked in that case). Indices are assigned in sorted
    word order, which makes the mapping reproducible across runs instead of
    depending on set iteration order.

    Args:
        rdd: Pair RDD whose values (``p[1]``) are iterables of word strings.

    Returns:
        ``{word: index}`` with contiguous indices ``0 .. len - 1``; an empty
        dict when the RDD has no records.
    """
    vocabulary = (
        rdd.map(lambda p: {w.lower() for w in p[1]})
           # fold (unlike reduce) has a zero value, so an empty RDD gives set().
           .fold(set(), lambda left, right: left | right)
    )
    return {word: index for index, word in enumerate(sorted(vocabulary))}

In [6]:
def LDAify(rdd, words):
    """Convert per-document word counts into ``[doc_id, sparse_vector]`` records.

    Every vector is created with the same dimension (the vocabulary size)
    because the indices stored in it are vocabulary indices, not positions
    within the document. Counts of words that share a vocabulary index (for
    example case variants such as ``"Word"`` and ``"word"``) are summed.

    Args:
        rdd: Pair RDD whose values are lists of ``(word, occurrences)`` tuples,
            as produced by ``count_words``.
        words: Dict mapping lower-cased word to its integer vocabulary index.

    Returns:
        An RDD of two-element lists ``[doc_id, sparse_vector]``.

    Raises:
        KeyError: (when the RDD is evaluated) if a word is not in ``words``.
    """
    # Highest index + 1 so that every vocabulary index is a valid position.
    vocab_size = max(words.values(), default=-1) + 1

    def to_sparse_vector(word_counts):
        index_counts = {}
        for word, occurrences in word_counts:
            index = words[word.lower()]
            # Accumulate rather than overwrite when two spellings collide.
            index_counts[index] = index_counts.get(index, 0) + occurrences
        return Vectors.sparse(vocab_size, index_counts)

    return rdd.mapValues(to_sparse_vector).map(list)

In [7]:
# The word -> index vocabulary is loaded from a pickle rather than rebuilt.
# To rebuild it from the loaded data instead, use: words = wordify(rdd)
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles from a trusted source.
words = {}
with open('data/words_' + STEMMIZER + '_lda.pickle', 'rb') as handle:
    words = pkl.load(handle)

In [8]:
# Count the words of every document, then convert the counts to the
# [id, sparse vector] records produced by LDAify.
X = LDAify(count_words(rdd), words)

In [9]:
# The model is loaded from disk instead of being trained in this notebook.
# Training call kept for reference (presumably how the saved model was
# produced - confirm):
# model = LDA.train(X, k = 10, maxIterations = 500, checkpointInterval = 50)
model = LDAModel.load(sc, path = "data/" + STEMMIZER + "_kindle_lda")

In [18]:
def print_topics(words, model):
    """Print every topic of ``model`` as a list of words with their weights.

    The vocabulary is inverted explicitly (index -> word) so that the word
    printed for a term index is the one stored under that index, independent
    of the order in which the keys of ``words`` happen to be iterated.

    Args:
        words: Dict mapping word to its integer vocabulary index.
        model: Object whose ``describeTopics()`` returns, per topic, a pair
            ``(term_indices, term_weights)``.

    Raises:
        KeyError: If a term index returned by the model is not a value of
            ``words``.
    """
    index_to_word = {index: word for word, index in words.items()}
    for w_indices, distribution in model.describeTopics():
        print("Topic :")
        for w_index, percentage in zip(w_indices, distribution):
            print("    " + index_to_word[w_index] + ": " + str(100*percentage) + "%")

227