In [1]:
#Basic Spark Setup Stuff

import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Spark SQL basic example").master("local[*]").getOrCreate()

In [2]:
# Read in our datafile of Amazon reviews of musical instruments

df = spark.read.json("reviews_Musical_Instruments_5.json.gz")

In [3]:
# Sanity check our dataset size, should be the same as in Lab #2

df.count()

10261

In [4]:
# Compute a couple of derived features from the data, and augment the dataset

from pyspark.sql.functions import udf

# UDF is Spark's way of handling a "user defined function"
countWords = udf(lambda x:len(x.split()))

def _avgWordLength(string):
    words = string.split()
    return sum(len(word) for word in words) / len(words)
            
avgWordLength = udf(_avgWordLength)

# Add the length of the associated review (in words) to each data point
df = df.withColumn('reviewLen', countWords(df['reviewText']))
# Add the average length of words within the review to each data point
df = df.withColumn('reviewWordAvg', avgWordLength(df.reviewText))

In [6]:
# Check out a couple of examples of our new features, and sanity check them by inspection
df.take(2)

[Row(asin='1384719342', helpful=[0, 0], overall=5.0, reviewText="Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,", reviewTime='02 28, 2014', reviewerID='A2IBPI20UZIR0U', reviewerName='cassandra tu "Yeah, well, that\'s just like, u...', summary='good', unixReviewTime=1393545600, reviewLen='51', reviewWordAvg='4.2745098039215685'),
 Row(asin='1384719342', helpful=[13, 14], overall=5.0, reviewText="The product does exactly as it should and is quite affordable.I did not realized it was double screened until it arrived, so it was even better than I had expected.As an added bonus, one of the screens carries a small hint of the smell of an old grape candy I used to buy, so for reminiscent's sake, I cannot stop putting the pop filter next to my nose and smelling it after recor

In [None]:
#####################################################
# Challenge! 
#
# Try to come up with a couple of derived features
# of your own that might be useful. This is pretty
# open-ended, and there is no one right answer!
# So, be creative and compare with others to see what
# you can devise. These features may come in handy
# during later lab exercises...
#
#####################################################
