In [1]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext()

from collections import Counter
from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors

In [2]:
rdd = sc.parallelize([(0, ["this",  "is", "a", "test"]), (1, ["this", "is", "another", "one"])])

In [3]:
def count_words(rdd):
    def word_occurrences(l):
        counter = dict(Counter(l))
        return list(zip(counter.keys(), counter.values()))
    
    return rdd.mapValues(word_occurrences)

In [4]:
def wordify(rdd):
    words = list(rdd.map(lambda p: set(p[1])).reduce(lambda x, y: x | y))
    return dict([(words[i], i) for i in range(len(words))])

In [5]:
def LDAify(rdd, words):
    return rdd.mapValues(lambda l: [(words[w], occurrences) for (w, occurrences) in l]).mapValues(lambda l: Vectors.sparse(len(l), dict(l))).map(lambda p: list(p))

words = wordify(rdd)
X = LDAify(count_words(rdd), words).persist()

In [6]:
X.collect()

[[0, SparseVector(4, {0: 1.0, 1: 1.0, 3: 1.0, 4: 1.0})],
 [1, SparseVector(4, {2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0})]]

In [7]:
model = LDA.train(X, k = 1)

In [8]:
words_list = list(words)
for w_indices, distribution in model.describeTopics():
    print("Topic 1:")
    for w_index, percentage in zip(w_indices, distribution):
        print("    " + words_list[w_index] + ": " + str(100*percentage) + "%")

Topic 1:
    this: 25.0%
    is: 25.0%
    another: 12.5%
    one: 12.5%
