## Tutorial 2

### Spark Performance analysis on the Yelp dataset

In [3]:
# per-user workspace folder; every parquet path below is built underneath it
userid = 'nicolas.hohn@mbs.edu' ## <- CHANGE THIS TO YOUR OWN USERID
dataDirectory = '/Users/%s/data' % userid

# create data directory inside home folder
# NOTE(review): `dbutils` is not imported anywhere in this notebook — presumably provided by the Databricks runtime
dbutils.fs.mkdirs(dataDirectory)

# map each table name to the path of the parquet file that backs it
outputs = {'reviews':"%s/yelp_academic_dataset_review_json.parquet" % dataDirectory, 
          'business':"%s/yelp_academic_dataset_business_json.parquet" % dataDirectory}


In [4]:
# Load each parquet file into a DataFrame, expose it to SQL as a temp view
# named after its key in `outputs`, and keep the DataFrame in `data`.
data = {}
for name, parquet_path in outputs.items():
    frame = spark.read.parquet(parquet_path)
    frame.createOrReplaceTempView(name)
    data[name] = frame

In [5]:
%sql SELECT COUNT(*) FROM business

count(1)
188593


In [6]:
# print the schema of the 'business' DataFrame
data['business'].printSchema()

In [7]:
# print the schema of the 'reviews' DataFrame
data['reviews'].printSchema()

In [8]:
# partition reviews table by business_id
# (note: this is NOT the column the query below groups on)
nPartitions = 42
data['reviews'].repartition(nPartitions, "business_id").createOrReplaceTempView("reviews")
# time the "top 5 users by number of reviews" query against the re-registered view
%time spark.sql("select user_id, count(*) as number FROM reviews GROUP BY user_id ORDER BY number DESC ").take(5)

#click on the Spark job and identify the total amount of shuffling (read and write): ~700MB

In [9]:
# It is much more efficient to partition by user_id since the query does a group by on user_id

# re-register the "reviews" view, this time repartitioned on the GROUP BY column
data['reviews'].repartition(nPartitions, "user_id").createOrReplaceTempView("reviews")
# same query as before, so the two timings and shuffle sizes are directly comparable
%time spark.sql("select user_id, count(*) as number FROM reviews GROUP BY user_id ORDER BY number DESC ").take(5)
#click on the Spark job and identify the total amount of shuffling (read and write): ~80MB
# => shuffling has been reduced by ~9x

In [10]:
import timeit

def f(nPartitions, number=1):
    """Return the average time, in seconds, of the top-reviewers query.

    Re-registers the "reviews" temp view from ``data['reviews']`` repartitioned
    on ``user_id`` into ``nPartitions`` partitions, then runs the query
    ``number`` times and averages the elapsed time.

    Args:
        nPartitions: number of partitions passed to ``repartition``.
        number: how many times the query is executed; must be >= 1
            (0 raises ZeroDivisionError when averaging).

    Returns:
        Mean duration of one query execution, in seconds.
    """
    data['reviews'].repartition(nPartitions, "user_id").createOrReplaceTempView("reviews")

    def run_query():
        # take(5) forces the query to actually execute
        return spark.sql("select user_id, count(*) as number FROM reviews GROUP BY user_id ORDER BY number DESC ").take(5)

    # timeit accepts a callable, so there is no need to exec a source string
    # against globals(); the same statement is measured either way.
    return timeit.timeit(run_query, number=number) / number

# partition counts to benchmark: every value from 1 to 19, then a few larger ones
listPartitions = [*range(1, 20), 40, 80, 160]

# average query time over 3 runs for each partition count, in the same order
results = [f(partition_count, number=3) for partition_count in listPartitions]


In [11]:
# display results as (partition count, average time in seconds) pairs
list(zip(listPartitions, results))

In [12]:
# re-register the "reviews" view repartitioned on user_id into 50 partitions
nPartitions = 50
data['reviews'].repartition(nPartitions, "user_id").createOrReplaceTempView("reviews")

# time the top-5 reviewers query on this layout
%time spark.sql("select user_id, count(*) as number FROM reviews GROUP BY user_id ORDER BY number DESC ").take(5)

In [13]:
#cache table
# NOTE(review): spark.catalog.cacheTable is documented as lazy, so the timed run
# below presumably also pays for populating the cache; re-run the query to see
# the cached timing — confirm in the Spark UI
spark.catalog.cacheTable("reviews")
%time spark.sql("select user_id, count(*) as number FROM reviews GROUP BY user_id ORDER BY number DESC ").take(5)

In [14]:
# drop the cached copy of reviews (presumably so the join timings are not helped by the cache)
spark.sql("UNCACHE TABLE reviews")

# number of reviews per state: join business to reviews on business_id, count by state, largest first
query = """
SELECT business.state, count(*) as number 
FROM business JOIN reviews on business.business_id = reviews.business_id GROUP BY business.state ORDER BY number DESC
"""

# timing of the join without any hint
%timeit spark.sql(query).take(5)

In [15]:

broadcastQuery = """
SELECT /*+ BROADCAST(business) */ business.state, count(*) as number 
FROM business JOIN reviews on business.business_id = reviews.business_id GROUP BY business.state ORDER BY number DESC
"""

broacastQuery = query.replace("SELECT", """SELECT /*+ BROADCAST(business) */""")

%timeit spark.sql(query).take(5)

### Spark UDFs and UDAFs on the Yelp dataset

In [17]:
from pyspark.sql.functions import udf

# write a simple sentiment analysis
# word lists used for scoring: occurrences of `good` words count for the review,
# occurrences of `bad` words count against it
bad = ['terrible', 'worst', 'horrible', 'avoid', 'bland', 'miss', 'average', 'never']
good = [ 'wonderful', 'awesome', 'amazing', 'great', 'delicious', 'excellent', 'always', 'recommend']

import re
import numpy as np
from pyspark.sql.types import IntegerType
# compile regular expression
# matches every character that is neither an ASCII letter nor a space
# NOTE(review): re.MULTILINE only affects the ^ and $ anchors; this pattern has
# none (the ^ here negates the character class), so the flag looks like a no-op
r2 = re.compile(r'[^a-zA-Z ]', re.MULTILINE)

# actual UDF
@udf("int")
def sentiment_analysis(reviewString):
    """Score a review as (# good-word hits) - (# bad-word hits).

    The text is lower-cased and every character other than ASCII letters and
    spaces is removed before counting. Words are counted as substrings, so
    e.g. 'miss' also matches inside 'mission'.

    Args:
        reviewString: review text, or None for a NULL column value.

    Returns:
        The integer score, or None when the input is None so that a NULL
        review yields a NULL score instead of failing the Spark job.
    """
    if reviewString is None:
        return None
    # convert to lower caps and remove non alphabetical characters
    cleaned = r2.sub('', reviewString.lower())
    # plain built-in sums: no need to build numpy arrays for two short lists
    positive_hits = sum(cleaned.count(word) for word in good)
    negative_hits = sum(cleaned.count(word) for word in bad)
    return positive_hits - negative_hits

# UDF registration
# registers the function under the name "sentiment_analysis" so SQL text
# (spark.sql / %sql) can call it
spark.udf.register("sentiment_analysis", sentiment_analysis)

In [18]:
# test UDF on simple query
# scores a single review, selected by its review_id
# (note: this rebinds the notebook-level name `query` used by the earlier join cell)
query = "SELECT sentiment_analysis(text) FROM reviews where review_id='3KJjc-smPUeIGFjwOYqG3g'"
spark.sql(query).collect()

In [19]:
# cache reviews table
spark.sql("CACHE TABLE reviews")

# create a new table "scores" for further reuse (including debugging...)
# the review text is kept next to its score so surprising scores can be inspected
spark.sql("SELECT review_id, sentiment_analysis(text) as sentiment, text FROM reviews").createOrReplaceTempView("scores")

# cache the new "scores" table
spark.sql("CACHE TABLE scores")

In [20]:
%sql SELECT max(sentiment) FROM scores

max(sentiment)
173


In [21]:
%sql SELECT * from scores WHERE scores.sentiment = (SELECT max(sentiment) FROM scores)

review_id,sentiment,text
u11TE3zObqIwQ9N4D68W5g,173,"i would give no stars if i could. had to come back to this place three times not one. i got a double tongue piercing & the one closest to the back of my mouth ended up in the middle of my tongue making it impossible to take our either way. i went back in to get it taken care of by one of the night shift workers and was told, i wasn't supposed to get it done in the first place. (he actually was the one who did it for me, which he realized after his coworker told him.) i was told to come back another day because nothing could be done for me, so i went back the next day to see layla. i was sent to her. she couldn't do anything and began to get mad and yelled at me for being upset nothing could be done. she blamed this all on me and yelled and cussed me out as i was leaving the place. i was told ""to leave the fucking place and to never fucking come back along with other words"" honestly this was the most unprofessional experience ive ever received! i don't have any reasons to come back here or to ever refer anyone. would not recommend this place to anyone. fuck this shitty ass place that deserves to be shut down. i've never been treated so poorly as a customer. with how poorly i was treated and how poorly the service here was, i could only imagine how other customers were treated after coming back for problems. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. 
would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. 
would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend.would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend. would not recommend."


In [22]:
%sql SELECT * from scores WHERE scores.sentiment = (SELECT max(sentiment) FROM scores)

review_id,sentiment,text
8lDyH3gcsG1qj86myTkF8A,-1,This place never ceases to amaze me. Quality specialty coffee for less than $3 is insane and the vibes are really chill.
noAs1AkLO2rX8ok_SuXDyQ,-1,"I have decided to change my rating after my experience there tonight. I went at about 5:00pm and there was already a good crowd at the Bar so I sat at one of their booths. One of my biggest pet peeves is when you go to a restaurant and they ignore you, and that's just what happened. I sat and sat until I saw one of the guys at the bar who I saw order his food get his dinner and finished it. Honestly, I try to not judge a place off of one experience but after waiting for close to 25 min without even a hello, I doubt I will ever be back. It wasn't even like they were super busy, I watched the bar tender and the other waitress talking it up with all the people at the bar a looked at me but never came over. I ended up just leaving."
afWR4tX6mO_3F2g0FoKoZA,-1,"I have used Tiger Lily many times in the past and, although they are on the pricey side, I have usually been satisfied. That all ended today (Valentine's Day). I ordered flowers on 2/1/17 to be delivered today by 11a and they promised that they would accommodate that since I ordered 2 weeks prior to VD. They completely bonked and no showed. My wife never got the flowers and now I am the goat. The biggest day of the year for flowers means they should have has ""all hands on deck."" I blame the driver and management. I won't be using Tiger Lily any more. There are too many other flower shops that make sure customer service in a priority."
Vn7zmdNmeyUMDPxIadbVJA,-1,"Horrible customer service. They speak very rudely to customers, what makes it worse is they speak that way to customers who are attempting to buy items. Do NOT shop here. Try wig land, hair land, or mid k where the customer service is on par."
qAB4TyihPeZ6CWNCVRaC7w,-1,"Brilliant food and one of the better assortments of beers at a gastro pub in Toronto... Never been disappointed when I have been there... Except for Sat nights when they go to capacity, and I can't get in :-("
3tyIF5s2HXY11KtYIlSHrg,-1,"For the 1st time in around 5 years I tried Tommy's, I won't ever have it again. My food was delivered hot, well.... the chicken was anyway. I paid over $9.00 for a small dinner salad that I didn't get, the pizza was almost leathery in it's texture and to top it all off the delivery guy tried to beat me out of $5.00 when he gave me my change! I don't recommend that anyone order from Tommy's. I know that I never will again. I hate that I have to give them even 1 star, they are horrible!"
rOxSEL_6KOqR6dm0mNfaAA,-1,To the person who said this pales in comparison to filibertos obviously has never had the free colon cleanse that comes with filibertos. Or your friggin high!
Yvgx-lecPmjF-5LfWPTTrA,-1,"So its been nearly four months that I took my Ford edge in for collision repair, they exceeded the two week repair I was quoted and was there three weeks. I didn't mind so much as they were friendly hence I did have to initiate every call for status reviews. My issues are that the manager expressed that if I wrote a positive review via yelp I would receive a $75 cash voucher. I called and was told they needed to view my comment and send the voucher as next step. I never received a voucher nor a follow up call. Negative reviews are not good business clearly. People want a reputable company with positive feedback, hence customer service should set apart on how customers are treated. Integrity says a lot and goes along way. They don't pride themselves on honorable business practices."
XKssbKAfvVyWSYnXHjv4yw,-1,"Je ne peux pas baser ma critique seulement sur la dernière visite, par chance! Le service a été horrible et long, il manquait de personnel pour un si beau jeudi soir. Nous avons attendu 30 minutes pour recevoir notre facture et chaque fois que nous avions besoin de la serveuse, elle prenait du temps pour apparaître et revenait toujours quelques minutes plus tard, car elle avait oublié ce qu'on lui avait demandé. Heureusement, les repas commandé était bons. Le filet mignon est toujours apprécié par les amateurs de viande. Le tartare de saumon est correct, mais il manque d'assaisonnement. Par le passé, j'ai toujours eu un meilleur service et les repas de pâtes m'y ont toujours plu, c'est pourquoi je donne tout de même un 3 étoiles."
c0_nCIq10ymptY4w9t_bog,-1,"First I'd like to say that Dr. Gellar is in his profession for all the right reasons - he is compassionate and doesn't want people to suffer. Secondly his calm presence, caring and understanding rivals a registered nurse's (and I know because I am one). I think a lot of doctors would have become frustrated with my anxiety and fears, just wanting to get the procedure over and move on to the next patient. However he took the time to allow me to focus and relax all the while maintaining his calm composure, never rushing me. And once I was ready, he numbed my foot with the freezing spray while trying to hide the needle for the block from my view. Had I not known what was going to happen I probably wouldn't have even noticed. I felt only a bit of discomfort during the injection and when the procedure began, I felt nothing. He talked me through it in distraction and before I knew it, I was done! If I ever have to have another foot procedure again, I wouldn't hesitate who I would call and I, as a very nervous and scared patient, would urge everyone to seek his help."


In [23]:
def sentiment_analysis(reviewString):
    """Score a review as (# good-word hits) - (# bad-word hits).

    Plain-Python version of the scoring function, convenient for debugging
    outside Spark. The text is lower-cased and every character other than
    ASCII letters and spaces is removed before counting. Words are counted as
    substrings, so e.g. 'miss' also matches inside 'mission'.

    Args:
        reviewString: review text, or None.

    Returns:
        The integer score, or None when the input is None (instead of
        raising AttributeError).
    """
    if reviewString is None:
        return None
    # convert to lower caps and remove non alphabetical characters
    cleaned = r2.sub('', reviewString.lower())
    # plain built-in sums: no need to build numpy arrays for two short lists
    positive_hits = sum(cleaned.count(word) for word in good)
    negative_hits = sum(cleaned.count(word) for word in bad)
    return positive_hits - negative_hits


# quick sanity check: 'awesome' and 'amazing' are each found once and no bad word is present, so this evaluates to 2
sentiment_analysis("Awesome, amazing,  ")