# LDA

In [22]:
from collections import defaultdict
from pyspark import SparkContext
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.sql import SQLContext
import re

num_of_stop_words = 50      # Number of most common words to remove, trying to eliminate stop words
num_topics = 10	            # Number of topics we are looking for
num_words_per_topic = 10    # Number of words to display for each topic
max_iterations = 35         # Max number of times to iterate before finishing

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [30]:
readJSON = spark.read.json('s3://bda-project-updated/electronics-result-new/electronics/')
data = readJSON.rdd.map(lambda x: x['reviewtext'])

# Create list of stop words
text = spark.read.text('s3://bda-project-updated/english.txt')
stop_words = [row[0] for row in text.select('value').collect()]

tokens = data \
    .map( lambda document: document.strip().lower()) \
    .map( lambda document: re.split("[\s;,#]", document)) \
    .map( lambda word: [x for x in word if x.isalpha()]) \
    .map( lambda word: [x for x in word if len(x) > 3] ) \
    .map( lambda word: [x for x in word if not x in stop_words] )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [31]:
# Get our vocabulary
# 1. Flat map the tokens -> Put all the words in one giant list instead of a list per document
# 2. Map each word to a tuple containing the word, and the number 1, signifying a count of 1 for that word
# 3. Reduce the tuples by key, i.e.: Merge all the tuples together by the word, summing up the counts
# 4. Reverse the tuple so that the count is first...
# 5. ...which will allow us to sort by the word count

termCounts = tokens \
    .flatMap(lambda document: document) \
    .map(lambda word: (word, 1)) \
    .reduceByKey( lambda x,y: x + y) \
    .map(lambda tuple: (tuple[1], tuple[0])) \
    .sortByKey(False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [32]:
# Identify a threshold to remove the top words, in an effort to remove stop words
threshold_value = termCounts.take(num_of_stop_words)[num_of_stop_words - 1][0]

# Only keep words with a count less than the threshold identified above, 
# and then index each one and collect them into a map
vocabulary = termCounts \
    .map(lambda x: x[1]) \
    .zipWithIndex() \
    .collectAsMap()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [33]:
# Convert the given document into a vector of word counts
def document_vector(document):
    id = document[1]
    counts = defaultdict(int)
    for token in document[0]:
        if token in vocabulary:
            token_id = vocabulary[token]
            counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))

# Process all of the documents into word vectors using the 
# `document_vector` function defined previously
documents = tokens.zipWithIndex().map(document_vector).map(list)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
# Get an inverted vocabulary, so we can look up the word by it's index value
inv_voc = {value: key for (key, value) in vocabulary.items()}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [36]:
lda_model = LDA.train(documents, k=num_topics, maxIterations=max_iterations)
topic_indices = lda_model.describeTopics(maxTermsPerTopic=num_words_per_topic)

topic_list = []
word_list = []

# Print topics, showing the top-weighted 10 terms for each topic
for i in range(len(topic_indices)):
    print("Topic #{0}\n".format(i + 1))
    for j in range(len(topic_indices[i][0])):
        print("{0}\t{1}\n".format(inv_voc[topic_indices[i][0][j]].encode('utf-8'),
                                  topic_indices[i][1][j]))
        word_list.append(inv_voc[topic_indices[i][0][j]].encode('utf-8'))
    
    topic_list.append(word_list)
    word_list= []

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topic #1

b'drive'	0.02499142276430294

b'hard'	0.010542078501038587

b'time'	0.008502292670472802

b'unit'	0.008061507454537166

b'device'	0.007469092608352024

b'software'	0.006988397184021807

b'drives'	0.0063151674370618095

b'support'	0.006085957959205513

b'windows'	0.0058144892238416145

b'power'	0.005754585715728389

Topic #2

b'screen'	0.01793049076766526

b'keyboard'	0.01083160084474015

b'ipad'	0.009811850731221764

b'product'	0.009474236785574124

b'time'	0.007837898710240184

b'amazon'	0.007479651958124913

b'bought'	0.007450787673531195

b'mouse'	0.006826974030082504

b'cover'	0.005303458707456901

b'return'	0.005245568220509486

Topic #3

b'screen'	0.01116329978845032

b'ipad'	0.009551318388251955

b'cover'	0.009048056577971876

b'time'	0.006074960768553372

b'keyboard'	0.005998472881332877

b'bought'	0.0050184233251603675

b'product'	0.0044218598439526155

b'battery'	0.004322861333363934

b'nice'	0.004293579899552189

b'laptop'	0.00423316187745898

Topic #4

b'player'	0

In [60]:
# convert to string
string_topic_list = []
string_word_list = []

for x in topic_list:
    for y in x:
        word = y.decode()
        string_word_list.append(word)
    
    string_topic_list.append(string_word_list)
    string_word_list=[]

string_topic_list

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[['drive', 'hard', 'time', 'unit', 'device', 'software', 'drives', 'support', 'windows', 'power'], ['screen', 'keyboard', 'ipad', 'product', 'time', 'amazon', 'bought', 'mouse', 'cover', 'return'], ['screen', 'ipad', 'cover', 'time', 'keyboard', 'bought', 'product', 'battery', 'nice', 'laptop'], ['player', 'unit', 'time', 'power', 'cable', 'ipod', 'product', 'bought', 'device', 'play'], ['windows', 'drive', 'time', 'software', 'device', 'support', 'unit', 'tablet', 'apps', 'screen'], ['windows', 'router', 'laptop', 'card', 'support', 'wireless', 'software', 'product', 'bought', 'asus'], ['sound', 'headphones', 'music', 'speakers', 'speaker', 'volume', 'bass', 'quality', 'audio', 'noise'], ['time', 'bought', 'product', 'amazon', 'reviews', 'battery', 'price', 'unit', 'purchased', 'return'], ['time', 'battery', 'bought', 'unit', 'product', 'remote', 'player', 'amazon', 'return', 'hdmi'], ['camera', 'lens', 'quality', 'pictures', 'video', 'canon', 'light', 'image', 'nikon', 'picture']]

In [68]:
from pyspark.sql import Row

R = Row("0", "1", "2", '3', '4', '5', '6', '7', '8', '9')
lda_result_df = sc.parallelize([R(*r) for r in zip(*string_topic_list)]).toDF()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Save result to cloud

In [74]:
lda_result_df.write.format("csv").save('s3://bda-project-updated/electronics-result-new/electronics-lda/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…