# LDA

In [1]:
from collections import defaultdict
from pyspark import SparkContext
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.sql import SQLContext
import re

num_of_stop_words = 50      # Number of most common words to remove, trying to eliminate stop words
num_topics = 10	            # Number of topics we are looking for
num_words_per_topic = 10    # Number of words to display for each topic
max_iterations = 35         # Max number of times to iterate before finishing

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,application_1619886977903_0001,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [17]:
# read in data
readJSON = spark.read.json('s3://bda-project-updated/electronics-result-new/electronics/')
data = readJSON.rdd.map(lambda x: x['reviewText'])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Create list of stop words
text = spark.read.text('s3://bda-project-updated/english.txt')
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't",
              'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't",
              'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down',
              'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't",
              'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his',
              'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's",
              'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or',
              'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd",
              "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their',
              'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we',
              "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would',
              "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
# Get our vocabulary
# 1. Flat map the tokens -> Put all the words in one giant list instead of a list per document
# 2. Map each word to a tuple containing the word, and the number 1, signifying a count of 1 for that word
# 3. Reduce the tuples by key, i.e.: Merge all the tuples together by the word, summing up the counts
# 4. Reverse the tuple so that the count is first...
# 5. ...which will allow us to sort by the word count

tokens = data \
    .map( lambda document: document.strip().lower()) \
    .map( lambda document: re.split("[\s;,#]", document)) \
    .map( lambda word: [x for x in word if x.isalpha()]) \
    .map( lambda word: [x for x in word if len(x) > 3] ) \
    .map( lambda word: [x for x in word if not x in stop_words] )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
termCounts = tokens \
    .flatMap(lambda document: document) \
    .map(lambda word: (word, 1)) \
    .reduceByKey( lambda x,y: x + y) \
    .map(lambda tuple: (tuple[1], tuple[0])) \
    .sortByKey(False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
# Identify a threshold to remove the top words, in an effort to remove stop words
threshold_value = termCounts.take(num_of_stop_words)[num_of_stop_words - 1][0]

# Only keep words with a count less than the threshold identified above, 
# and then index each one and collect them into a map
vocabulary = termCounts \
    .map(lambda x: x[1]) \
    .zipWithIndex() \
    .collectAsMap()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
# Convert the given document into a vector of word counts
def document_vector(document):
    id = document[1]
    counts = defaultdict(int)
    for token in document[0]:
        if token in vocabulary:
            token_id = vocabulary[token]
            counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))

# Process all of the documents into word vectors using the 
# `document_vector` function defined previously
documents = tokens.zipWithIndex().map(document_vector).map(list)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [27]:
# Get an inverted vocabulary, so we can look up the word by it's index value
inv_voc = {value: key for (key, value) in vocabulary.items()}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [28]:
lda_model = LDA.train(documents, k=num_topics, maxIterations=max_iterations)
topic_indices = lda_model.describeTopics(maxTermsPerTopic=num_words_per_topic)

topic_list = []
word_list = []

# Print topics, showing the top-weighted 10 terms for each topic
for i in range(len(topic_indices)):
    print("Topic #{0}\n".format(i + 1))
    for j in range(len(topic_indices[i][0])):
        print("{0}\t{1}\n".format(inv_voc[topic_indices[i][0][j]].encode('utf-8'),
                                  topic_indices[i][1][j]))
        word_list.append(inv_voc[topic_indices[i][0][j]].encode('utf-8'))
    
    topic_list.append(word_list)
    word_list= []

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topic #1

b'like'	0.013832615758785137

b'unit'	0.01122279260796246

b'just'	0.010866719203352328

b'battery'	0.009727674089980461

b'good'	0.008719155284869443

b'great'	0.007466567936413694

b'also'	0.007188605271700628

b'really'	0.007139989083681824

b'easy'	0.0071309798007279795

b'even'	0.0069180301447734115

Topic #2

b'case'	0.04129960425997552

b'screen'	0.02184986788928766

b'cover'	0.01350622662595383

b'ipad'	0.012512423926108608

b'kindle'	0.012184034993807017

b'tablet'	0.011689993968689928

b'keyboard'	0.011502342815185286

b'like'	0.007729298433483138

b'back'	0.0065908061695951325

b'feel'	0.0064481064535695555

Topic #3

b'remote'	0.013256270777872567

b'batteries'	0.012015870304933429

b'just'	0.011849003881373596

b'device'	0.010868037598907324

b'will'	0.009209552696079131

b'battery'	0.00867423623302386

b'good'	0.006502043339898024

b'like'	0.006240017490926285

b'ipod'	0.005999394500770744

b'volume'	0.0057411784398774

Topic #4

b'drive'	0.02044425997761021

b'

In [29]:
# convert to string
string_topic_list = []
string_word_list = []

for x in topic_list:
    for y in x:
        word = y.decode()
        string_word_list.append(word)
    
    string_topic_list.append(string_word_list)
    string_word_list=[]

string_topic_list

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[['like', 'unit', 'just', 'battery', 'good', 'great', 'also', 'really', 'easy', 'even'], ['case', 'screen', 'cover', 'ipad', 'kindle', 'tablet', 'keyboard', 'like', 'back', 'feel'], ['remote', 'batteries', 'just', 'device', 'will', 'battery', 'good', 'like', 'ipod', 'volume'], ['drive', 'hard', 'phone', 'windows', 'will', 'works', 'card', 'just', 'computer', 'laptop'], ['will', 'light', 'good', 'great', 'just', 'radio', 'flash', 'works', 'know', 'time'], ['great', 'just', 'good', 'headphones', 'like', 'sound', 'product', 'bought', 'price', 'works'], ['sound', 'speakers', 'music', 'cable', 'good', 'great', 'system', 'player', 'audio', 'quality'], ['great', 'just', 'bought', 'well', 'like', 'good', 'really', 'will', 'quality', 'product'], ['mouse', 'just', 'like', 'product', 'will', 'great', 'work', 'good', 'quality', 'bought'], ['camera', 'lens', 'also', 'will', 'good', 'great', 'pictures', 'take', 'small', 'canon']]

In [30]:
# transform output for QuickSight ingestion
from pyspark.sql import Row

R = Row("0", "1", "2", '3', '4', '5', '6', '7', '8', '9')
lda_result_df = sc.parallelize([R(*r) for r in zip(*string_topic_list)]).toDF()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [37]:
lda_result_df.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+--------+---------+--------+-----+----------+--------+-------+-------+--------+
|      0|       1|        2|       3|    4|         5|       6|      7|      8|       9|
+-------+--------+---------+--------+-----+----------+--------+-------+-------+--------+
|   like|    case|   remote|   drive| will|     great|   sound|  great|  mouse|  camera|
|   unit|  screen|batteries|    hard|light|      just|speakers|   just|   just|    lens|
|   just|   cover|     just|   phone| good|      good|   music| bought|   like|    also|
|battery|    ipad|   device| windows|great|headphones|   cable|   well|product|    will|
|   good|  kindle|     will|    will| just|      like|    good|   like|   will|    good|
|  great|  tablet|  battery|   works|radio|     sound|   great|   good|  great|   great|
|   also|keyboard|     good|    card|flash|   product|  system| really|   work|pictures|
| really|    like|     like|    just|works|    bought|  player|   will|   good|    take|
|   easy|    back|   

# Save result to cloud

In [74]:
# lda_result_df.write.format("csv").save('s3://bda-project-updated/electronics-result-new/electronics-lda/')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…