# Read in Data

In [3]:
# Evaluate the `spark` session object so the notebook displays it.
# NOTE(review): presumably the first statement sent to the cluster also triggers
# session start-up in this kernel — confirm.
spark

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f0f84f07510>

In [4]:
# Load the Electronics JSON file from S3 into a DataFrame.
electronics_path = 's3://bda-project-updated/electronics/Electronics.json'
df = spark.read.format('json').load(electronics_path)
# Optional inspection helpers (left disabled):
#df.printSchema()
#df.show(n=3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
from pyspark.sql import Row
from pyspark.sql.functions import from_unixtime
from pyspark.sql.functions import col, expr, when

# Retrieve the columns needed
def _to_review_row(record):
    """Project one raw record onto the fields used by the rest of the notebook.

    Missing ``reviewText`` / ``summary`` values are kept as None. Calling
    ``str()`` on them would produce the literal text 'None', which a later
    ``dropna()`` cannot recognise as missing.
    """
    def _optional_text(value):
        # Preserve None; stringify everything else.
        return None if value is None else str(value)

    raw_id = record['_id'][0]   # first field of the `_id` value
    helpful = record['helpful']
    return Row(
        prod_id=str(raw_id),
        prod_name='prod_' + str(raw_id[:5]),
        helpful_0=helpful[0],
        helpful_1=helpful[1],
        overall=record['overall'],
        reviewText=_optional_text(record['reviewText']),
        unixreviewTime=record['unixReviewTime'],
        summary=_optional_text(record['summary']),
        label=record['class'],
    )


temp = df.rdd.map(_to_review_row)
data = spark.createDataFrame(temp)
# Add a timestamp-formatted column derived from the unix epoch seconds.
data = data.withColumn('reviewtime', from_unixtime(data.unixreviewTime))

#data.show(n=3, truncate=False, vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
# Remove every row that has a null in any column.
data = data.na.drop(how='any')
#data.count()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Build the model to identify spam

In [7]:
# Split into 70% training / 30% testing data.
# A fixed seed makes the split repeatable between runs (for the same input
# partitioning) instead of changing on every execution.
train, test = data.randomSplit([0.7, 0.3], seed=42)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# TF-IDF stages for the review body and for the summary.
# The hash-bucket counts below are tunable.
TEXT_HASH_BUCKETS = 30
SUMMARY_HASH_BUCKETS = 5

# Review text: tokens -> hashed term frequencies -> IDF-weighted vector.
tokenizer_text = Tokenizer(outputCol="words", inputCol="reviewText")
hashingTF_text = HashingTF(
    inputCol="words",
    outputCol="rawFeaturesText",
    numFeatures=TEXT_HASH_BUCKETS,
)
idf_text = IDF(outputCol="featuresText", inputCol="rawFeaturesText")

# Summary: same three steps with a smaller hash space.
tokenizer_summary = Tokenizer(outputCol="words_summary", inputCol="summary")
hashingTF_summary = HashingTF(
    inputCol="words_summary",
    outputCol="rawFeaturesSummary",
    numFeatures=SUMMARY_HASH_BUCKETS,
)
idf_summary = IDF(outputCol="featuresSummary", inputCol="rawFeaturesSummary")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
from pyspark.ml.feature import VectorAssembler

# Columns merged, in this order, into the single "features" vector column.
feature_inputs = ['helpful_0', 'helpful_1', 'overall', 'featuresText', 'featuresSummary']
va = VectorAssembler(outputCol="features", inputCols=feature_inputs)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
from pyspark.ml.classification import GBTClassifier

# Gradient-boosted tree classifier with library-default hyper-parameters.
# The column names are spelled out (they equal the defaults) to make the link
# to the assembled "features" vector and the "label" column visible.
gbt = GBTClassifier(featuresCol="features", labelCol="label")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
from pyspark.ml import Pipeline

# Assemble the stages in execution order, then fit on the training split.
gbt_stages = [
    tokenizer_text, hashingTF_text, idf_text,            # review-text TF-IDF
    tokenizer_summary, hashingTF_summary, idf_summary,   # summary TF-IDF
    va,                                                  # feature assembly
    gbt,                                                 # classifier
]
pipeline_gbt = Pipeline(stages=gbt_stages)
gbt_fitted = pipeline_gbt.fit(train)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Apply the fitted pipeline to obtain the prediction results.
# Note: `data` is the full frame that `train` was split from, so this output
# also contains the rows the model was fitted on.
results_cv = gbt_fitted.transform(data)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

def get_performance(pred_col, label_col, df, raw_pred_col=None):
    """Summarise binary-classification quality of a scored DataFrame.

    Args:
        pred_col: Name of the column holding the hard class prediction;
            used for the F1 and accuracy metrics.
        label_col: Name of the ground-truth label column.
        df: DataFrame containing the prediction and label columns.
        raw_pred_col: Name of the column passed to the ROC evaluator. When
            omitted, 'rawPrediction' is used if ``df`` has such a column,
            otherwise ``pred_col`` is used.

    Returns:
        A string of the form "ROC: <auc>, F1: <f1>, Accuracy: <accuracy>".
    """
    if raw_pred_col is None:
        # An AUC computed from hard 0/1 predictions has a single operating
        # point; prefer the classifier's continuous scores when available.
        raw_pred_col = 'rawPrediction' if 'rawPrediction' in df.columns else pred_col

    roc = BinaryClassificationEvaluator(
        rawPredictionCol=raw_pred_col, labelCol=label_col, metricName='areaUnderROC'
    ).evaluate(df)
    f1 = MulticlassClassificationEvaluator(
        predictionCol=pred_col, labelCol=label_col, metricName='f1'
    ).evaluate(df)
    accuracy = MulticlassClassificationEvaluator(
        predictionCol=pred_col, labelCol=label_col, metricName='accuracy'
    ).evaluate(df)
    return "ROC: {0}, F1: {1}, Accuracy: {2}".format(roc, f1, accuracy)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Score only the held-out split: `gbt_fitted` was fitted on `train`, so metrics
# computed on rows it has already seen would overstate model quality.
get_performance(pred_col='prediction', label_col='label', df=gbt_fitted.transform(test))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'ROC: 1.0, F1: 1.0, Accuracy: 1.0'

In [15]:
# Keep the reporting columns and add a readable tag for the prediction:
# prediction 0 -> 'Real', anything else -> 'Fake'.
report_columns = ["prod_name", "overall", "reviewText", "reviewtime", "summary", "label", "prediction"]
final_result = (
    results_cv
    .select(*report_columns)
    .withColumn('label_new', when(col('prediction') == 0, 'Real').otherwise('Fake'))
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# final_result.write.format("json").save('s3://bda-project-updated/electronics-result-new/electronics')