<a href="https://colab.research.google.com/github/susiexia/BigData_Amazon_reviews_ETL_Cloud/blob/master/Amazon_Reviews_NLP_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project PART 2
Perform ETL in the cloud AND analyze data using Natural Language Processing (NLP) pipeline including Machine Learning.
(part 1 in "Amazon_Reriews_ETL_process.ipynb")

In [0]:
# Install Java, Spark, Findspark and download a Postgresql driver
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark


# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
# Create a spark session, configured with Posetgres driver
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Amazon_Reviews_Analysis_NLP').getOrCreate()

In [67]:

# Read in data from S3 Bukets
from pyspark import SparkFiles

url= "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get('amazon_reviews_us_Beauty_v1_00.tsv.gz'), sep='\t', header=True, inferSchema = True)
df.show(n=5)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|    1797882|R3I2DHQBR577SS|B001ANOOOE|       2102612|The Naked Bee Vit...|          Beauty|          5|            0|          0|   N|                Y|          Five Stars|Love this, excell...|2015-08-31 00:00:00|
|         US|   18381298|R1QNE9NQFJC2Y4|B0016J22EQ|     106393691|Alba Botanica Sun...| 

# NLP pipeline

In [0]:
# import ml.feature functions
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer, VectorAssembler
from pyspark.sql.functions import length

In [0]:
# add a new column as a future feature
df = df.withColumn('length', length(df.review_body))
df.count()
df.filter(df.review_body.isNotNull()).count()

5114750

In [0]:
amazon_df = df.filter(df.review_body.isNotNull())

In [0]:

amazon_df = df.select('vine','review_body','length')
amazon_df = amazon_df.dropna()
amazon_df.count()


5114750

In [0]:
# create all the ml.features to df
strIndexed = StringIndexer(inputCol='vine', outputCol='label')
tokenizer = Tokenizer(inputCol='review_body', outputCol='tokened')
stopremover = StopWordsRemover(inputCol='tokened', outputCol='removed')
hashngTF = HashingTF(inputCol='removed', outputCol='hashed')
idf = IDF(inputCol='hashed', outputCol='idf_token')

In [0]:
from pyspark.ml.linalg import Vector
# create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [0]:
# create a pipeline and list all stages in the order of being executed
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages= [strIndexed, tokenizer, stopremover, hashngTF, idf, clean_up])

In [0]:
# fit model and transform the pipeline
cleaner = data_prep_pipeline.fit(amazon_df)    # produce a PipelineModel

# use PipelineModel to transform orginal df
cleaned = cleaner.transform(amazon_df)
cleaned.select('label','features').show(5)

+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                 

# Run ML model and Evaluate

In [0]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# break whole data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7,0.3])

#create a Naive Bayes Model 
nb = NaiveBayes() 
predictor = nb.fit(training)     # fit training df to nb model, predictor is NaiveBayes object

# transform the model with teasting data
test_results = predictor.transform(testing)
test_results.select('features', 'rawPrediction','probability','prediction').show(5)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [68]:
# use the Class Evaluator for a cleaner description
acc_eval = MulticlassClassificationEvaluator()

acc = acc_eval.evaluate(test_results)    # action (evaluate)

print("Accuracy of model at predicting reviews was : %f "% acc)

Accuracy of model at predicting reviews was : 0.959364 
