In [1]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext()

from collections import Counter
from pyspark.mllib.clustering import LDA
from pyspark.mllib.linalg import Vectors

In [2]:
rdd = sc.parallelize([(0, ["this",  "is", "a", "test"]), (1, ["this", "is", "another", "one"])])

In [3]:
def count_words(rdd):
    def word_occurrences(l):
        counter = dict(Counter(l))
        return list(zip(counter.keys(), counter.values()))
    
    return rdd.mapValues(word_occurrences)

In [4]:
def wordify(rdd):
    words = list(rdd.map(lambda p: set(p[1])).reduce(lambda x, y: x | y))
    return dict([(words[i], i) for i in range(len(words))])

In [5]:
def LDAify(rdd, words):
    return rdd.mapValues(lambda l: [(words[w], occurrences) for (w, occurrences) in l]).mapValues(lambda l: Vectors.sparse(len(l), dict(l))).map(lambda p: list(p))

X = LDAify(count_words(rdd), wordify(rdd)).persist()

In [6]:
X.collect()

[[0, SparseVector(4, {0: 1.0, 2: 1.0, 3: 1.0, 4: 1.0})],
 [1, SparseVector(4, {1: 1.0, 3: 1.0, 4: 1.0, 5: 1.0})]]

In [7]:
LDA.train(X).describeTopics()

[([4, 3, 2, 0],
  [0.2500228842220148,
   0.2499776121906838,
   0.1250000033432096,
   0.1249999858826956]),
 ([4, 3, 1, 5],
  [0.25002347568668487,
   0.24997542819187868,
   0.12500049593028015,
   0.125000411078366]),
 ([3, 4, 0, 2],
  [0.25000969039715715,
   0.24998775121523947,
   0.12500090411976925,
   0.12500080439400732]),
 ([4, 3, 5, 1],
  [0.25000733879242626,
   0.24999316832976062,
   0.12499996619794299,
   0.12499995673080885]),
 ([3, 4, 5, 1],
  [0.25000920884261024,
   0.24998575990139751,
   0.12500134107049438,
   0.1250013284282274]),
 ([3, 4, 0, 5],
  [0.2500230516595658,
   0.2499768956598373,
   0.12500009935494977,
   0.12500005071740197]),
 ([4, 3, 0, 2],
  [0.250029535541181,
   0.24997358415863746,
   0.1249993741897159,
   0.12499933939559393]),
 ([3, 4, 5, 1],
  [0.2500133453872092,
   0.24998511647202484,
   0.1250005353089084,
   0.12500039607248978]),
 ([3, 4, 1, 5],
  [0.25000919116826,
   0.24999015417223672,
   0.12500038978046016,
   0.125000250755