In [1]:
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import Tokenizer, CountVectorizer, StopWordsRemover

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession \
        .builder \
        .appName('Metal Lyrics LDA') \
        .getOrCreate()

In [3]:
import string
from pyspark.sql.functions import monotonically_increasing_id as mid
from nltk.corpus import stopwords

documents = sc.wholeTextFiles('data_test/')
df = spark.createDataFrame(documents,['doc_name','doc_text'])

# This will return a new DF with all the columns + id
df = df.withColumn("id", mid())
tokenizer = Tokenizer(inputCol="doc_text", outputCol="words")
df = tokenizer.transform(df)

stop_words = StopWordsRemover.loadDefaultStopWords('english')
stop_words += StopWordsRemover.loadDefaultStopWords('german')
stop_words += StopWordsRemover.loadDefaultStopWords('spanish')
stop_words += StopWordsRemover.loadDefaultStopWords('french')
stop_words += ["i'm",' ','','-',"don't","you're","i'll","can't","it'",
              "we'll","it's","ne ","i've","you'll","let","there's","oh"]

remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stop_words)
removed = remover.transform(df)

cv = CountVectorizer(inputCol="filtered", outputCol="vectors")
model = cv.fit(removed)
df_vec = model.transform(removed)
# df_vec.show()

In [4]:
from pyspark.mllib.linalg import DenseVector
corpus = df_vec.select("id","vectors").rdd.map(lambda (x, y): [x,DenseVector(y.toArray())]).cache()

In [5]:
# Cluster the documents into five topics using LDA
NUM_TOPICS = 5
ldaModel = LDA.train(corpus, k=NUM_TOPICS)

In [7]:
NUM_WORDS = 20
topics = ldaModel.describeTopics(NUM_WORDS)
print "{} words in vocabulary".format(len(model.vocabulary))
print ""

for i, t in enumerate(topics):
    print "Top words for topic {}".format(i)
    word_indices, weights = t
    result = []
    for idx in range(len(word_indices)):
        #print "{} : {}".format(model.vocabulary[word_indices[idx]].encode('utf-8'), 
        #                       weights[idx])
        result.append(model.vocabulary[word_indices[idx]].encode('utf-8'))
    print ', '.join(result)
    print ""

36415 words in vocabulary

Top words for topic 0
eyes, life, time, like, back, never, one, go, know, see, wake, blood, hell, way, come, cry, head, could, take, feel

Top words for topic 1
life, never, light, us, one, time, world, day, come, give, death, left, right, last, blood, black, know, new, way, nothing

Top words for topic 2
know, see, go, time, like, never, come, love, one, forever, take, world, heart, still, away, me,, make, night, gonna, made

Top words for topic 3
like, see, time, feel, one, life, never, end, want, take, ready, know, got, way, world, get, make, mind, come, things

Top words for topic 4
night, see, one, time, know, never, take, dead, like, stand, inside, got, death, life, us, find, end, flesh, feel, break

