# Read in Data

In [1]:
# Start Spark: evaluating `spark` is the first statement run in the session; the output
# below shows the Spark application starting and then the SparkSession object itself.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
5,application_1619806697997_0006,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7fea94d08410>

In [32]:
# Load the raw Electronics review records (JSON) from S3
electronics_path = 's3://bda-project-updated/electronics/Electronics.json'
df = spark.read.json(electronics_path)

# Print the inferred schema, then the first three records (vertical layout, untruncated)
df.printSchema()
df.show(n=3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _id: struct (nullable = true)
 |    |-- $oid: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- category: string (nullable = true)
 |-- class: double (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [3]:
# Clean the complex fields
from pyspark.sql import Row
from pyspark.sql.functions import col, concat, expr, from_unixtime, lit, substring, when

# Flatten the complex fields with native DataFrame expressions rather than a Python
# RDD map. Missing reviewText/summary values stay NULL (wrapping them in str() would
# turn them into the literal text 'None' and hide them from a later dropna()), and a
# missing _id or short `helpful` array yields NULL instead of raising inside a lambda.
# Columns are listed alphabetically, matching the schema printed in the next cell.
data = df.select(
    col('helpful').getItem(0).alias('helpful_0'),
    col('helpful').getItem(1).alias('helpful_1'),
    col('class').alias('label'),
    col('overall'),
    # 'prod_' followed by the first five characters of the `_id.$oid` string
    concat(lit('prod_'), substring(col('_id').getField('$oid'), 1, 5)).alias('prod_name'),
    col('reviewText'),
    col('summary'),
    col('unixReviewTime').alias('unixreviewTime'),
)
# Add a timestamp string derived from the Unix time value
data = data.withColumn('reviewtime', from_unixtime(data.unixreviewTime))

data.show(n=3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 helpful_0      | 1                                                                                                                                                                                                         

In [4]:
# Confirm the column names and types of the cleaned DataFrame
data.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- helpful_0: long (nullable = true)
 |-- helpful_1: long (nullable = true)
 |-- label: double (nullable = true)
 |-- overall: double (nullable = true)
 |-- prod_name: string (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixreviewTime: long (nullable = true)
 |-- reviewtime: string (nullable = true)

# Inspect the data

In [5]:
# NOTE: matplotlib could not be used in this session, so this section contains no plots.

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# List the column names of the cleaned DataFrame
data.columns

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

['helpful_0', 'helpful_1', 'label', 'overall', 'prod_name', 'reviewText', 'summary', 'unixreviewTime', 'reviewtime']

In [7]:
# Row count before rows containing nulls are dropped in the next cell
data.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

7574169

In [8]:
# Discard every row that has a null in at least one column, then re-count the rows
data = data.na.drop()
data.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

7574169

# Build the model to identify spam

In [9]:
# Split into training (70%) and testing (30%) data.
# A fixed seed is passed so the split does not change from one run to the next.
train, test = data.randomSplit([0.7, 0.3], seed=42)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Review body: tokenize -> hashed term frequencies (30 buckets) -> IDF weighting
# NOTE(review): 30 and 5 hash buckets are very small feature spaces; confirm this is intentional.
tokenizer_text = Tokenizer(
    inputCol="reviewText",
    outputCol="words",
)
hashingTF_text = HashingTF(
    inputCol="words",
    outputCol="rawFeaturesText",
    numFeatures=30,
)
idf_text = IDF(
    inputCol="rawFeaturesText",
    outputCol="featuresText",
)

# Review summary: same three steps, hashed into 5 buckets
tokenizer_summary = Tokenizer(
    inputCol="summary",
    outputCol="words_summary",
)
hashingTF_summary = HashingTF(
    inputCol="words_summary",
    outputCol="rawFeaturesSummary",
    numFeatures=5,
)
idf_summary = IDF(
    inputCol="rawFeaturesSummary",
    outputCol="featuresSummary",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
from pyspark.ml.feature import VectorAssembler
# Gather the numeric columns and both TF-IDF vectors into one "features" vector column
va = VectorAssembler(
    inputCols=[
        'helpful_0',
        'helpful_1',
        'overall',
        'featuresText',
        "featuresSummary",
    ],
    outputCol="features",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted tree classifier. The column names are written out for readability;
# they are the same values the classifier uses by default.
gbt = GBTClassifier(featuresCol="features", labelCol="label")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
from pyspark.ml import Pipeline

# Stage order: each text column is tokenized, hashed and IDF-weighted, then the
# assembler builds the feature vector that the classifier consumes.
pipeline_stages = [
    tokenizer_text,
    hashingTF_text,
    idf_text,
    tokenizer_summary,
    hashingTF_summary,
    idf_summary,
    va,
    gbt,
]
pipeline_gbt = Pipeline(stages=pipeline_stages)

# Fit every stage on the training split only
gbt_fitted = pipeline_gbt.fit(train)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
# Score every row of `data` with the fitted pipeline.
# NOTE(review): `data` includes the rows that went into `train`, so any metric computed on
# `results_cv` is not a held-out estimate; use `test` for that.
results_cv = gbt_fitted.transform(data)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

def get_performance(pred_col, label_col, df, raw_pred_col=None):
    """Summarise classifier quality on a scored DataFrame.

    Args:
        pred_col: Name of the column holding the predicted class.
        label_col: Name of the column holding the true class.
        df: DataFrame containing the prediction and label columns.
        raw_pred_col: Optional name of a raw-score/probability column to use for
            the ROC metric. When omitted, ``pred_col`` is used for ROC as well,
            which keeps the behaviour of calls that pass only three arguments.

    Returns:
        A string of the form ``"ROC: <roc>, F1: <f1>, Accuracy: <accuracy>"``.
    """
    # Fall back to the hard-prediction column unless a raw-score column is given.
    roc_col = pred_col if raw_pred_col is None else raw_pred_col
    roc = BinaryClassificationEvaluator(
        rawPredictionCol=roc_col, labelCol=label_col, metricName='areaUnderROC'
    ).evaluate(df)

    def multiclass_metric(metric_name):
        # Both multiclass metrics share the same prediction/label columns.
        return MulticlassClassificationEvaluator(
            predictionCol=pred_col, labelCol=label_col, metricName=metric_name
        ).evaluate(df)

    f1 = multiclass_metric('f1')
    accuracy = multiclass_metric('accuracy')
    return "ROC: {0}, F1: {1}, Accuracy: {2}".format(roc, f1, accuracy)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
#get_performance(pred_col = 'prediction', label_col = 'label', df = results_cv)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Row count of the scored DataFrame
results_cv.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

7574169

In [29]:
# Keep the review fields plus the model output, and add a readable verdict column:
# prediction equal to 0 -> 'Real', anything else -> 'Fake'.
output_columns = [
    "prod_name",
    "overall",
    "reviewText",
    "reviewtime",
    "summary",
    "label",
    "prediction",
]
final_result = (
    results_cv
    .select(*output_columns)
    .withColumn('label_new', when(col('prediction') == 0, 'Real').otherwise('Fake'))
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [30]:
# Preview the first three output rows, untruncated, one field per line
final_result.show(n=3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 prod_name  | prod_5a132                                                                                                                                                                                                        

In [41]:
# Row count of the final output DataFrame
final_result.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

7574169

In [31]:
# Persist the labelled reviews to S3 as JSON. No save mode is set, so the writer's
# default applies and this call fails if the target path already exists.
final_result.write.json('s3://bda-project-updated/electronics-result/electronics')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…