In [1]:
import os
import sys
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()
import pyspark

In [2]:
number_cores = 6
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set("spark.driver.maxResultSize", "5g"))
sc = pyspark.SparkContext(conf=conf)

In [3]:
!dir ./data

yelp_academic_dataset_business.json.gz	yelp_academic_dataset_user.json.gz
yelp_academic_dataset_review.json.gz


In [4]:
!ls -lh ./data

total 4.2G
-rw-r--r-- 1 trush PDC-edu-Lab  25M Dec  1 07:48 yelp_academic_dataset_business.json.gz
-rw-r--r-- 1 trush PDC-edu-Lab 2.4G Dec  1 07:56 yelp_academic_dataset_review.json.gz
-rw-r--r-- 1 trush PDC-edu-Lab 1.8G Dec  1 07:55 yelp_academic_dataset_user.json.gz


# 1. Identify 100 users with highest number of ratings/fans.  

In [5]:
# Step 1: Load the data using SQL Context

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [6]:
# get SQL table
df_json = sqlContext.read.json("./data/yelp_academic_dataset_user.json.gz")

In [7]:
#df_json.printSchema()

In [8]:
df_json.registerTempTable("tbl_json")

  - PySpark way

In [9]:
df_data = sc.textFile("./data/yelp_academic_dataset_user.json.gz")
#print(df_data.count())
#df_data.take(1)

#### Step 2:
  - Get the information using SQL Statement

In [10]:
#df_json.printSchema()

In [11]:
# 100 users with highest number of ratings/fans.
highest_fan = sqlContext.sql("SELECT user_id, review_count, fans FROM tbl_json ORDER BY fans DESC LIMIT 100")

In [12]:
#highest_fan.show()

  - Get the information using PySpark Statement

In [13]:
#df_data.take(1)

In [14]:
import json

df_top_users1 = df_data.map(lambda x: json.loads(x)).map(lambda x: (x["user_id"], x["review_count"], x["fans"]))
#df_top_users1.take(1)

In [15]:
def extract_user(x):
    x_json = json.loads(x)
    return (x_json["user_id"], x_json["review_count"], x_json["fans"])

#tmp = df_data.take(1)
#extract_user(tmp[0])

In [16]:
df_top_users2 = df_data.map(extract_user).takeOrdered(100, lambda x: -x[2])
#df_top_users2

The top 100 users with the highest number of fans were found by mapping the user data with lists of each user_id and their respective review_count and fans then sorting them in descending order by fans. The top 100 users with the highest number of fans are:

In [17]:
df_top_users2

[('37cpUoM8hlkSQfReIEBd-Q', 1787, 11568),
 ('hizGc5W1tBHPghM5YKCAtg', 1629, 3315),
 ('eKUGKQRE-Ywi5dY55_zChg', 2430, 2916),
 ('Hi10sGSZNxQH3NLyWSZ1oA', 11112, 2718),
 ('j14WgRoU_-2ZE1aw1dXrJg', 3566, 2634),
 ('iLjMdZi0Tm7DQxX1C1_2dg', 2431, 2516),
 ('JjXuiru1_ONzDkYVrHN0aw', 1207, 2316),
 ('ITa3vh5ERI90G_WP4SmGUQ', 3079, 2280),
 ('UsXqCXRZwSCSw0AT7y1uBg', 2919, 2263),
 ('VHdY6oG2JPVNjihWhOooAQ', 1979, 2140),
 ('fgwI3rYHOv1ipfVfCSx7pg', 1847, 2113),
 ('nkN_do3fJ9xekchVC-v68A', 1363, 2046),
 ('m07sy7eLtOjVdZ8oN9JKag', 4913, 2034),
 ('peuxbSQwXed-81cSqL7Ykw', 1592, 1916),
 ('AHRrG3T1gJpHvtpZ-K0G_g', 2109, 1747),
 ('WG3w_73scm_JUWJ_3Lgn0Q', 5013, 1733),
 ('NfU0zDaTMEQ4-X9dbQWd9A', 1137, 1726),
 ('lt7bNHl-TXziny4FETu8nA', 802, 1708),
 ('3zxy3LVBV3ttxoYbY4rQ8A', 1894, 1696),
 ('ysCBsXWPB-LAiewVS3jZfQ', 3687, 1672),
 ('wEE-YMx5pmSuagLtNxMPKA', 1845, 1653),
 ('djxnI8Ux8ZYQJhiOQkrRhA', 2315, 1614),
 ('G9Vb6yQ047TC3O_-GG4WZA', 1540, 1574),
 ('JADdo9NEeO5Az9aOYbyvZA', 2382, 1464),
 ('58yXn5Y4409k

# 2. Extract the reviews of these users and combine it with the business information. 

## a. Are they continental, regional, or local eaters?  

In [18]:
df_reviews = sqlContext.read.json("./data/yelp_academic_dataset_review.json.gz")
df_business = sqlContext.read.json("./data/yelp_academic_dataset_business.json.gz")

In [19]:
df_reviews.registerTempTable("df_reviews")
df_business.registerTempTable("df_business")

In [20]:
highest_fan.registerTempTable("highest_fan")

In [21]:
business_locations = sqlContext.sql("SELECT df_reviews.user_id, latitude, longitude FROM df_reviews, df_business, highest_fan WHERE df_reviews.business_id=df_business.business_id AND highest_fan.user_id = df_reviews.user_id")

In [22]:
#df_business.printSchema()

In [23]:
#df_reviews.printSchema()

In [24]:
#business_locations.show()

__Distance between 2 points lat, lon__

d=2*asin(sqrt((sin((lat1-lat2)/2))^2 + cos(lat1)*cos(lat2)*(sin((lon1-lon2)/2))^2))

https://edwilliams.org/avform147.htm

In [25]:
import math
def distance(lat1, lon1, lat2, lon2):
    return 2*math.asin(math.sqrt((math.sin((lat1-lat2)/2))**2 + math.cos(lat1)*math.cos(lat2)*(math.sin((lon1-lon2)/2))**2))

In [26]:
distance(36.1128958062, -115.1776370528, 33.3483825, -111.8591895)

2.795102054155977

In [27]:
locations = business_locations.rdd

In [28]:
locations1 = locations.map(lambda x: (x[0],[x[1], x[2]]))

In [29]:
#locations1.take(20)

In [30]:
user_latlon = locations1.groupByKey().mapValues(list)

In [31]:
#user_latlon.take(1)

In [32]:
# Return triplet of number of local, regional, and continental reviews
def get_dist_classification(x):
    distances = [0, 0, 0]
    for i in range(len(x[1])):
        for j in range(len(x[1])):
            y = distance(x[1][i][0], x[1][i][1], x[1][j][0], x[1][j][1]) 
            if y >= 0 and y < 0.10:
                distances[0] = distances[0] + 1
            elif y >= 0.10 and y < 0.50:
                distances[1] = distances[1] + 1
            elif y >= 0.50:
                distances[2] = distances[2] + 1
    if distances[0] >= distances[1] and distances[0] >= distances[2]:
        return (x[0], "local");
    elif distances[1] >= distances[0] and distances[1] >= distances[2]:
        return (x[0], "regional");
    elif distances[2] >= distances[0] and distances[2] >= distances[1]:
        return (x[0], "continental");

In [33]:
user_dist_classification = user_latlon.map(lambda x: get_dist_classification(x))

The users are classified as local, regional, or continental eaters based on the locations of the businesses they have reviewed. The classifications for the top 100 users are:

In [34]:
user_dist_classification.collect()

[('U4INQZOPSUaj8hMjLlZ3KA', 'local'),
 ('CxDOIDnH8gp9KXzpBHJYXw', 'regional'),
 ('dIIKEfOgo0KqUfGQvGikPg', 'local'),
 ('NNL1zLTP2J_SOputgoPYeQ', 'continental'),
 ('peuxbSQwXed-81cSqL7Ykw', 'local'),
 ('NfU0zDaTMEQ4-X9dbQWd9A', 'local'),
 ('pou3BbKsIozfH50rxmnMew', 'continental'),
 ('ZIOCmdFaMIF56FR-nWr_2A', 'local'),
 ('IDVFG1pNSHIHoVuoLuZpcQ', 'local'),
 ('bLbSNkLggFnqwNNzzq-Ijw', 'local'),
 ('B7ecAeAIrXg7sgmabS38pg', 'continental'),
 ('Hi10sGSZNxQH3NLyWSZ1oA', 'continental'),
 ('vHc-UrI9yfL_pnnc6nJtyQ', 'continental'),
 ('8DEyKVyplnOcSKx39vatbg', 'local'),
 ('gjhzKWsqCIrpEd9pevbKZw', 'continental'),
 ('eKUGKQRE-Ywi5dY55_zChg', 'local'),
 ('WWnhqRnWWjutMqh-2SzEuQ', 'continental'),
 ('zFYs8gSUYDvXkb6O7YkRkw', 'local'),
 ('JADdo9NEeO5Az9aOYbyvZA', 'local'),
 ('wEE-YMx5pmSuagLtNxMPKA', 'continental'),
 ('i_DR3vdE73nVm2GlMbGpGA', 'local'),
 ('nmdkHL2JKFx55T3nq5VziA', 'local'),
 ('W7DHyQlY_kXls2iXt-_2Ag', 'local'),
 ('Ve0LUwcrzxL7w0RYgY4Aaw', 'local'),
 ('VHdY6oG2JPVNjihWhOooAQ', 'continen

Each review in of the top 100 users is mapped with a list of the latitude and longitude of the business reviewed (using SQL). Then, the latitude/longitude pairs are aggregated to a list and mapped with the associated user_id. The distance function is used to calculate the distance between two latitude/longitude pairs (i.e., the distance between two businesses reviewed by a given user). The distance between each business is mapped as a list with the user_id, then the get_dist_classification function determines whether a user is a local, regional, or continental reviewer. The distance between two businesses is considered local if it is between 0 and 0.10, regional between 0.10 and 0.50, and continental above 0.50 (these numbers were arbitrarily chosen, but finding the actual parameters would significantly increase the accuracy of this model). The classification with the greatest number of reviews is considered the user's distance classification. It appears that the top 100 reviewers are primarily local or continental with 1 regional user out of 100. This may indicate that active/popular reviewers are on both extremes of the spectrum either reviewing businesses nearby or extensively travelling but not in the middle.

## b. Is there a preference in restaurant/food style of their reviews? 

In [35]:
business_styles = sqlContext.sql("SELECT df_reviews.user_id, categories FROM df_reviews, df_business, highest_fan WHERE df_reviews.business_id=df_business.business_id AND highest_fan.user_id = df_reviews.user_id")

In [36]:
user_styles = business_styles.rdd.groupByKey().mapValues(list)

In [37]:
#user_styles.take(1)

In [38]:
def get_styles(x):
    styles = []
    for i in range(len(x[1])):
        if (x[1][i] == None):
            continue
        y = x[1][i].split(", ")
        for j in range(len(y)):
            styles.append(y[j])
    return (x[0], styles)

In [39]:
get_styles(('Lnwip57QIhwr81NhubGMgQ',
  ['Travel Services, Tours, Airport Shuttles, Transportation, Hotels & Travel']))

('Lnwip57QIhwr81NhubGMgQ',
 ['Travel Services',
  'Tours',
  'Airport Shuttles',
  'Transportation',
  'Hotels & Travel'])

In [40]:
user_styles1 = user_styles.map(lambda x: get_styles(x))

In [41]:
#user_styles1.take(1)

In [42]:
def get_counts(x):
    dict_count_occurences = {}

    for i in x[1]:
        if i in dict_count_occurences:
            dict_count_occurences[i] += 1
        else:
            dict_count_occurences[i] = 1
    y = max(dict_count_occurences, key=dict_count_occurences.get)
    return (x[0], y)

In [43]:
user_styles2 = user_styles1.map(lambda x: get_counts(x))

In [44]:
user_styles2.collect()

[('U4INQZOPSUaj8hMjLlZ3KA', 'Restaurants'),
 ('CxDOIDnH8gp9KXzpBHJYXw', 'Restaurants'),
 ('dIIKEfOgo0KqUfGQvGikPg', 'Restaurants'),
 ('NNL1zLTP2J_SOputgoPYeQ', 'Restaurants'),
 ('peuxbSQwXed-81cSqL7Ykw', 'Restaurants'),
 ('NfU0zDaTMEQ4-X9dbQWd9A', 'Restaurants'),
 ('pou3BbKsIozfH50rxmnMew', 'Restaurants'),
 ('ZIOCmdFaMIF56FR-nWr_2A', 'Restaurants'),
 ('IDVFG1pNSHIHoVuoLuZpcQ', 'Restaurants'),
 ('bLbSNkLggFnqwNNzzq-Ijw', 'Restaurants'),
 ('B7ecAeAIrXg7sgmabS38pg', 'Restaurants'),
 ('Hi10sGSZNxQH3NLyWSZ1oA', 'Restaurants'),
 ('vHc-UrI9yfL_pnnc6nJtyQ', 'Restaurants'),
 ('8DEyKVyplnOcSKx39vatbg', 'Restaurants'),
 ('gjhzKWsqCIrpEd9pevbKZw', 'Restaurants'),
 ('eKUGKQRE-Ywi5dY55_zChg', 'Restaurants'),
 ('WWnhqRnWWjutMqh-2SzEuQ', 'Restaurants'),
 ('zFYs8gSUYDvXkb6O7YkRkw', 'Restaurants'),
 ('JADdo9NEeO5Az9aOYbyvZA', 'Restaurants'),
 ('wEE-YMx5pmSuagLtNxMPKA', 'Restaurants'),
 ('i_DR3vdE73nVm2GlMbGpGA', 'Restaurants'),
 ('nmdkHL2JKFx55T3nq5VziA', 'Restaurants'),
 ('W7DHyQlY_kXls2iXt-_2Ag', 'Res

Each user's preference in business style is determined by aggregating the styles of every business they have reviewed and mapping the data as a list of strings (styles) for each user_id. Then, the lists of strings are aggregated into a single list by splitting the strings with multiple styles and appending them to a single list. The number of occurrences of each style is calculated in the get_counts function, and the user's preferred style is the style with the most number of occurrences.

The output shows that the majority of the top 100 users review restaurants as well as hotels/travel. This may indicate that the top 100 reviewers travel to restaurants in various locations and review the food and hospitality they experience.

## c. Can you infer the locations of these users?

In [45]:
import statistics
def get_avg_latlon(x):
    lat = []
    lon = []
    for i in range(len(x[1])):
        lat.append(x[1][i][0])
        lon.append(x[1][i][1]) 
    return (x[0], statistics.mean(lat), statistics.mean(lon))

In [46]:
inferred_loc = user_latlon.map(lambda x: get_avg_latlon(x))

In [47]:
inferred_loc.collect()

[('U4INQZOPSUaj8hMjLlZ3KA', 36.51283329476192, -112.13721542705822),
 ('CxDOIDnH8gp9KXzpBHJYXw', 43.85072735940806, -79.3867659523086),
 ('dIIKEfOgo0KqUfGQvGikPg', 33.8201266602196, -111.61873846308333),
 ('NNL1zLTP2J_SOputgoPYeQ', 40.82428099853519, -84.4194633651611),
 ('peuxbSQwXed-81cSqL7Ykw', 36.118188533676665, -115.16908479082666),
 ('NfU0zDaTMEQ4-X9dbQWd9A', 41.32055109933965, -82.6153455315933),
 ('pou3BbKsIozfH50rxmnMew', 38.39303621056666, -91.97555693439833),
 ('ZIOCmdFaMIF56FR-nWr_2A', 35.87446814155674, -113.54711028996454),
 ('IDVFG1pNSHIHoVuoLuZpcQ', 35.53689021071149, -114.51023736574253),
 ('bLbSNkLggFnqwNNzzq-Ijw', 36.09158875728172, -115.13965717917111),
 ('B7ecAeAIrXg7sgmabS38pg', 34.56928123779546, -113.30101414721364),
 ('Hi10sGSZNxQH3NLyWSZ1oA', 40.35675393178529, -81.67682894785214),
 ('vHc-UrI9yfL_pnnc6nJtyQ', 42.03307563417222, -113.79016276923889),
 ('8DEyKVyplnOcSKx39vatbg', 36.113911596501765, -115.19636836577276),
 ('gjhzKWsqCIrpEd9pevbKZw', 39.6321380970

I inferred the locations of the users by finding the average of the latitudes and longitudes of the reviewed businesses. However, it may be argued that this is an inaccurate approach because users may not be reviewing places that are an equidistant radius from their respective locations. A more accurate approach may be to consider the dates during which users reviewed businesses. A user may travel to review businesses if the dates of reviews concentrated in certain locations are further apart. The existing output may be further contextualized by identifying the city/state or even the address of the average latitude/longitude with a library like geopy.

# 3. Identify one of your favorite restaurants that is available on Yelp. Search for all reviews and reviewers for this restaurants.

## KFC

In [48]:
df_KFC = sqlContext.sql("SELECT df_reviews.user_id, df_business.business_id, df_reviews.stars, text FROM df_reviews, df_business WHERE df_reviews.business_id=df_business.business_id AND df_business.business_id='0SySGK7xWEHSo4ZLvD1C0A'")

In [49]:
df_KFC1 = df_KFC.rdd

In [50]:
business_locations1 = sqlContext.sql("SELECT df_reviews.user_id, latitude, longitude FROM df_reviews, df_business, highest_fan WHERE df_reviews.business_id=df_business.business_id")

In [51]:
locations2 = business_locations1.rdd

In [52]:
locations3 = locations2.map(lambda x: (x[0],[x[1], x[2]]))

In [53]:
user_latlon1 = locations3.groupByKey().mapValues(list)

In [54]:
user_dist_classification1 = user_latlon.map(lambda x: get_dist_classification(x))

In [55]:
#user_dist_classification1.take(10)

In [59]:
user_dist = user_dist_classification1.collect()

In [60]:
def get_nonlocal(x):
    for i in user_dist:
        if (x[0] == i[0]) and (i[1] != 'local'):
            return (x[0], i[1])
        elif (x[0] == i[0]) and (i[1] == 'local'):
            return ('-','-')
    return ('-', 'local')

In [61]:
df_KFC2 = df_KFC1.map(lambda x: get_nonlocal(x))

In [62]:
df_KFC2.collect()

[('-', 'local'),
 ('-', 'local'),
 ('-', 'local'),
 ('-', 'local'),
 ('-', 'local'),
 ('-', 'local'),
 ('-', 'local')]

## a. Is this restaurant frequented by non-local reviewers (how do you know)?

No, this location only has reviews from local reviewers. This is found by identifying the reviews associated with the location, then getting the distance classification of every user in the user dataset since the location may not have reviews from the already classified top 100. The resulting output supports the claim that the restaurant is primarily reviewed by local reviewers

In [63]:
df_KFC1.collect()

[Row(user_id='KCHbs_KET1F4bbTlrrt5bQ', business_id='0SySGK7xWEHSo4ZLvD1C0A', stars=1.0, text="I would seriously avoid this particular KFC. Last time I went I got a chicken sandwich (not sure what type exactly) but the chicken was tough and slightly grey colored. It tasted off. It's like it was recooked several times and was awful. Just absolutely nasty."),
 Row(user_id='H_l3Okb4P-ij2euaQZ1ikw', business_id='0SySGK7xWEHSo4ZLvD1C0A', stars=1.0, text="This is the worst fast food experiences ever at this location. I've been here several times over the years and I keep telling myself I'm a glutton for punishment. But, it is near park and ride & my son wanted some KFC. This time, I finally learned my lesson. Never again. \n\nIt was prime dinner time, after work. So it is busy but the drive thru was moving. I ordered two value meals. Two. Nothing outrageous like you will see at a KFC when someone will come through and order like 3 buckets of chicken. \n\nI pulled up and paid & got my two drin

## b. What are the positive things about this restaurant (study higher-rated reviews)

There are very few positive things in the reviews for this particular KFC location. There is one review with 4 stars that states the food was delicious, and the environment was clean.

## c. What are the negative things about this restaurant (study lower-rated reviews)

The negative things stated in the reviews for this location are that the food was very low-quality. Many reviewers cited undercooked or disfigured food, unprofessional employees, incorrect orders, and excessive wait times.