# Read in Data

In [1]:
# start spark
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,application_1619746266595_0002,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7fdf1955e510>

In [2]:
# read in data from S3
df = spark.read.json('s3://sagemaker-bda-project/part/part.json')
df.printSchema()
df.show(n=3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BehaviouralFeatureResult: struct (nullable = true)
 |    |-- AW: double (nullable = true)
 |    |-- CS: double (nullable = true)
 |    |-- ER: double (nullable = true)
 |    |-- FR: double (nullable = true)
 |    |-- MNR: double (nullable = true)
 |    |-- NR: double (nullable = true)
 |    |-- PC: double (nullable = true)
 |    |-- PR: double (nullable = true)
 |    |-- RB: double (nullable = true)
 |    |-- RC: double (nullable = true)
 |    |-- RD: double (nullable = true)
 |    |-- RL: double (nullable = true)
 |    |-- RSP: double (nullable = true)
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: double (nullable = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullabl

In [32]:
# Clean the complex fields
from pyspark.sql import Row
from pyspark.sql.functions import from_unixtime
from pyspark.sql.functions import col, expr, when

temp = df.rdd.map(lambda x: Row(
    prod_id = x['_id'][0],
    prod_name = 'prod_'+str(x['_id'][0][:5]),
    asin = x['asin'],
    category = x['category'],
    helpful_0 = x['helpful'][0],
    helpful_1 = x['helpful'][1],
    overall = x['overall'],
    reviewText = str(x['reviewText']),
    unixreviewTime = x['unixReviewTime'],
    summary = str(x['summary']),
    label=x['label']
))
data = spark.createDataFrame(temp)
data = data.withColumn('reviewtime', from_unixtime(data.unixreviewTime))

data.show(n=3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 asin           | B000809PCG                                                                                                                                                                                                                                                                                                                                                                                                                   
 category       | Toys_and_Games                                                                                                        

In [33]:
data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- helpful_0: double (nullable = true)
 |-- helpful_1: double (nullable = true)
 |-- label: double (nullable = true)
 |-- overall: double (nullable = true)
 |-- prod_id: string (nullable = true)
 |-- prod_name: string (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixreviewTime: long (nullable = true)
 |-- reviewtime: string (nullable = true)

# Inspect the data

In [34]:
## Not able to use matplot lib.... lol

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [35]:
data.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['asin', 'category', 'helpful_0', 'helpful_1', 'label', 'overall', 'prod_id', 'prod_name', 'reviewText', 'summary', 'unixreviewTime', 'reviewtime']

In [36]:
data.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

99939

In [37]:
data = data.dropna()
data.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

19749

# Build the model to identify spam

In [38]:
# split training testing data
train, test = data.randomSplit([0.7, 0.3])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [39]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

tokenizer_text = Tokenizer(inputCol="reviewText", outputCol="words")
hashingTF_text = HashingTF(inputCol="words", outputCol="rawFeaturesText", numFeatures=50)
idf_text = IDF(inputCol="rawFeaturesText", outputCol="featuresText")

tokenizer_summary = Tokenizer(inputCol="summary", outputCol="words_summary")
hashingTF_summary = HashingTF(inputCol="words_summary", outputCol="rawFeaturesSummary", numFeatures=10)
idf_summary = IDF(inputCol="rawFeaturesSummary", outputCol="featuresSummary")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [40]:
from pyspark.ml.feature import VectorAssembler
va = VectorAssembler(inputCols=['helpful_0', 'helpful_1', 'overall', 'featuresText',"featuresSummary"],
                     outputCol="features")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [41]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [42]:
from pyspark.ml import Pipeline

pipeline_gbt = Pipeline(stages= [tokenizer_text, hashingTF_text, idf_text, tokenizer_summary, hashingTF_summary, idf_summary,va, gbt])

gbt_fitted = pipeline_gbt.fit(train)
results_cv = gbt_fitted.transform(data)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [43]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

def get_performance(pred_col, label_col, df):
    roc = BinaryClassificationEvaluator(rawPredictionCol=pred_col, labelCol=label_col, metricName='areaUnderROC').evaluate(df)
    f1 = MulticlassClassificationEvaluator(predictionCol=pred_col, labelCol=label_col, metricName='f1').evaluate(df)
    accuracy = MulticlassClassificationEvaluator(predictionCol=pred_col, labelCol=label_col, metricName='accuracy').evaluate(df)
    performance = [roc, f1, accuracy]
    return "ROC: {0}, F1: {1}, Accuracy: {2}".format(roc, f1, accuracy)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [44]:
get_performance(pred_col = 'prediction', label_col = 'label', df = results_cv)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'ROC: 0.0, F1: 1.0, Accuracy: 1.0'

In [45]:
results_cv.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

19749

In [46]:
# final_result = results_cv.filter("prediction == 0")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [47]:
final_result = results_cv.withColumn('label_new', expr("IF(prediction==0, 'True', 'Fake')"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [48]:
final_result.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

19749

# LDA

In [49]:
from collections import defaultdict
from pyspark import SparkContext
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.sql import SQLContext
import re

num_of_stop_words = 50      # Number of most common words to remove, trying to eliminate stop words
num_topics = 10	            # Number of topics we are looking for
num_words_per_topic = 10    # Number of words to display for each topic
max_iterations = 35         # Max number of times to iterate before finishing

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [50]:
# readJSON = spark.read.json('s3://msba6330team2bucket/lda/part.json')
readJSON = final_result
data = readJSON.rdd.map(lambda x: x[8])

# Create list of stop words
text = spark.read.text('s3://sagemaker-bda-project/part/english.txt')
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any', 'are', "aren't",
              'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', "can't",
              'cannot', 'could', "couldn't", 'did', "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down',
              'during', 'each', 'few', 'for', 'from', 'further', 'had', "hadn't", 'has', "hasn't", 'have', "haven't",
              'having', 'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his',
              'how', "how's", 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it', "it's", 'its', 'itself', "let's",
              'me', 'more', 'most', "mustn't", 'my', 'myself', 'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or',
              'other', 'ought', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'same', "shan't", 'she', "she'd",
              "she'll", "she's", 'should', "shouldn't", 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their',
              'theirs', 'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't", 'we',
              "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would',
              "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']

tokens = data \
    .map( lambda document: document.strip().lower()) \
    .map( lambda document: re.split("[\s;,#]", document)) \
    .map( lambda word: [x for x in word if x.isalpha()]) \
    .map( lambda word: [x for x in word if len(x) > 3] ) \
    .map( lambda word: [x for x in word if not x in stop_words] )

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [51]:
# Get our vocabulary
# 1. Flat map the tokens -> Put all the words in one giant list instead of a list per document
# 2. Map each word to a tuple containing the word, and the number 1, signifying a count of 1 for that word
# 3. Reduce the tuples by key, i.e.: Merge all the tuples together by the word, summing up the counts
# 4. Reverse the tuple so that the count is first...
# 5. ...which will allow us to sort by the word count

termCounts = tokens \
    .flatMap(lambda document: document) \
    .map(lambda word: (word, 1)) \
    .reduceByKey( lambda x,y: x + y) \
    .map(lambda tuple: (tuple[1], tuple[0])) \
    .sortByKey(False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [52]:
# Identify a threshold to remove the top words, in an effort to remove stop words
threshold_value = termCounts.take(num_of_stop_words)[num_of_stop_words - 1][0]

# Only keep words with a count less than the threshold identified above, 
# and then index each one and collect them into a map
vocabulary = termCounts \
    .map(lambda x: x[1]) \
    .zipWithIndex() \
    .collectAsMap()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [53]:
# Convert the given document into a vector of word counts
def document_vector(document):
    id = document[1]
    counts = defaultdict(int)
    for token in document[0]:
        if token in vocabulary:
            token_id = vocabulary[token]
            counts[token_id] += 1
    counts = sorted(counts.items())
    keys = [x[0] for x in counts]
    values = [x[1] for x in counts]
    return (id, Vectors.sparse(len(vocabulary), keys, values))

# Process all of the documents into word vectors using the 
# `document_vector` function defined previously
documents = tokens.zipWithIndex().map(document_vector).map(list)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [54]:
# Get an inverted vocabulary, so we can look up the word by it's index value
inv_voc = {value: key for (key, value) in vocabulary.items()}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [55]:
lda_model = LDA.train(documents, k=num_topics, maxIterations=max_iterations)
topic_indices = lda_model.describeTopics(maxTermsPerTopic=num_words_per_topic)

topic_list = []
word_list = []

# Print topics, showing the top-weighted 10 terms for each topic
for i in range(len(topic_indices)):
    print("Topic #{0}\n".format(i + 1))
    for j in range(len(topic_indices[i][0])):
        print("{0}\t{1}\n".format(inv_voc[topic_indices[i][0][j]].encode('utf-8'),
                                  topic_indices[i][1][j]))
        word_list.append(inv_voc[topic_indices[i][0][j]].encode('utf-8'))
    
    topic_list.append(word_list)
    word_list= []

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Topic #1

b'just'	0.010232412696959585

b'like'	0.010120335144294995

b'great'	0.007551290244187774

b'battery'	0.00736359496671758

b'will'	0.006489463914475238

b'good'	0.006407127120640705

b'really'	0.006343456563536506

b'also'	0.0062429558407392964

b'phone'	0.005938900931748181

b'product'	0.00573910925056264

Topic #2

b'game'	0.011325756160695886

b'like'	0.009632072055486433

b'good'	0.007026026382004509

b'just'	0.006970813955687122

b'watch'	0.006968989987609269

b'will'	0.006897290271370446

b'play'	0.006472204279802451

b'time'	0.005899957930879026

b'even'	0.005246304888609052

b'really'	0.005206705130564308

Topic #3

b'case'	0.013882539939537681

b'will'	0.013582527112271606

b'like'	0.010747578833759106

b'just'	0.01051370896142451

b'good'	0.008009494243880198

b'great'	0.007103084098830147

b'little'	0.006767127974975612

b'bought'	0.006223536175797846

b'really'	0.0058280893843373925

b'even'	0.005414878250772931

Topic #4

b'just'	0.011247061400496767

b'coffee'	0

In [56]:
# convert to string
string_topic_list = []
string_word_list = []

for x in topic_list:
    for y in x:
        word = y.decode()
        string_word_list.append(word)
    
    string_topic_list.append(string_word_list)
    string_word_list=[]

string_topic_list

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[['just', 'like', 'great', 'battery', 'will', 'good', 'really', 'also', 'phone', 'product'], ['game', 'like', 'good', 'just', 'watch', 'will', 'play', 'time', 'even', 'really'], ['case', 'will', 'like', 'just', 'good', 'great', 'little', 'bought', 'really', 'even'], ['just', 'coffee', 'will', 'like', 'water', 'great', 'good', 'bought', 'product', 'much'], ['like', 'just', 'good', 'will', 'great', 'well', 'really', 'little', 'still', 'bought'], ['like', 'just', 'good', 'really', 'great', 'will', 'size', 'bought', 'quality', 'little'], ['will', 'amazon', 'just', 'back', 'time', 'item', 'product', 'customer', 'service', 'told'], ['just', 'will', 'like', 'really', 'great', 'water', 'good', 'even', 'bought', 'much'], ['phone', 'just', 'like', 'great', 'will', 'case', 'good', 'bought', 'product', 'quality'], ['like', 'just', 'will', 'little', 'case', 'great', 'even', 'good', 'time', 'screen']]

In [57]:
from pyspark.sql import Row

R = Row("0", "1", "2", '3', '4', '5', '6', '7', '8', '9')
lda_result_df = sc.parallelize([R(*r) for r in zip(*string_topic_list)]).toDF()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Save result to cloud

In [60]:
final_result.write.format("json").save('s3://bda-project-updated/part-result/part')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [61]:
lda_result_df.write.format("csv").save('s3://bda-project-updated/part-result//part-lda')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…