# Vine Review Analysis

### Dependencies

In [1]:
# Download a Postgres driver to allow Spark to interact with Postgres
!curl -O https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  979k  100  979k    0     0   807k      0  0:00:01  0:00:01 --:--:--  807k


In [2]:
# Locate Spark
import findspark
findspark.init()

# Dependencies
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from config import db_password

# Build (or reuse) the 'vine' Spark session with the Postgres JDBC jar
# placed on the driver classpath, then display the session object.
spark = (
    SparkSession.builder
    .appName('vine')
    .config('spark.driver.extraClassPath', 'postgresql-42.2.16.jar')
    .getOrCreate()
)
spark

### Data

1. [Download the data](https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Pet_Products_v1_00.tsv.gz)
2. Unzip the downloaded file, rename it to `pet_product_reviews.tsv` (the name the code below reads), and move it to the same directory as this notebook

Source: https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt

In [5]:
# Load the tab-separated reviews file: the first row supplies the column
# names and Spark infers each column's type from the data.
reviews_df = (
    spark.read
    .options(sep='\t', header=True, inferSchema=True)
    .csv('pet_product_reviews.tsv')
)

# Preview the first five rows
reviews_df.show(5)

2643619
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   28794885| REAKC26P07MDN|B00Q0K9604|     510387886|(8-Pack) EZwhelp ...|    Pet Products|          5|            0|          0|   N|                Y|A great purchase ...|Best belly bands ...| 2015-08-31|
|         US|   11488901|R3NU7OMZ4HQIEG|B00MBW5O9W|     912374672|Warren Eckstein's...|    Pet Products|        

In [7]:
# Schema and row count
# printSchema() prints every column with its type and nullability.
reviews_df.printSchema()
# count() is the cell's last expression, so the notebook displays the
# total number of rows in reviews_df as the cell result.
reviews_df.count()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



2643619