In [1]:
import os
import sys
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()
import pyspark

In [2]:
number_cores = 6
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set("spark.driver.maxResultSize", "5g"))
sc = pyspark.SparkContext(conf=conf)

In [3]:
!dir /users/trush/CSC496/Labs/Lab4

data  Lab4.ipynb


In [4]:
!ls -lh /users/trush/CSC496/Labs/Lab4/data

total 4.2G
-rw-r--r-- 1 trush PDC-edu-Lab  25M Nov 17 08:30 yelp_academic_dataset_business.json.gz
-rw-r--r-- 1 trush PDC-edu-Lab 2.4G Nov 17 08:36 yelp_academic_dataset_review.json.gz
-rw-r--r-- 1 trush PDC-edu-Lab 1.8G Nov 17 08:34 yelp_academic_dataset_user.json.gz


# 1. Identify 100 users with highest number of ratings/reviews/followers.  You can decide/justify this ranking and decide the importance of ratings/reviews/followers. 

In [5]:
# Step 1: Load the data using SQL Context

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [6]:
# Get SQL table
df_json = sqlContext.read.json("/users/trush/CSC496/Labs/Lab4/data/yelp_academic_dataset_user.json.gz")

In [7]:
df_json.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)



In [8]:
df_json.registerTempTable("tbl_json")

In [9]:
# Using Python
import json
def parseJSONReviews(x):
    res = json.loads(x)
    return (res['user_id'], int(res['review_count']), res['fans'])

In [10]:
df_data = sc.textFile("/users/trush/CSC496/Labs/Lab4/data/yelp_academic_dataset_user.json.gz")
print(df_data.count())
df_data.take(1)

1968703


['{"user_id":"ntlvfPzc8eglqvk92iDIAw","name":"Rafael","review_count":553,"yelping_since":"2007-07-06 03:27:11","useful":628,"funny":225,"cool":227,"elite":"","friends":"oeMvJh94PiGQnx_6GlndPQ, wm1z1PaJKvHgSDRKfwhfDg, IkRib6Xs91PPW7pon7VVig, A8Aq8f0-XvLBcyMk2GJdJQ, eEZM1kogR7eL4GOBZyPvBA, e1o1LN7ez5ckCpQeAab4iw, _HrJVzFaRFUhPva8cwBjpQ, pZeGZGzX-ROT_D5lam5uNg, 0S6EI51ej5J7dgYz3-O0lA, woDt8raW-AorxQM_tIE2eA, hWUnSE5gKXNe7bDc8uAG9A, c_3LDSO2RHwZ94_Q6j_O7w, -uv1wDiaplY6eXXS0VwQiA, QFjqxXn3acDC7hckFGUKMg, ErOqapICmHPTN8YobZIcfQ, mJLRvqLOKhqEdkgt9iEaCQ, VKX7jlScJSA-ja5hYRw12Q, ijIC9w5PRcj3dWVlanjZeg, CIZGlEw-Bp0rmkP8M6yQ9Q, OC6fT5WZ8EU7tEVJ3bzPBQ, UZSDGTDpycDzrlfUlyw2dQ, deL6e_z9xqZTIODKqnvRXQ, 5mG2ENw2PylIWElqHSMGqg, Uh5Kug2fvDd51RYmsNZkGg, 4dI4uoShugD9z84fYupelQ, EQpFHqGT9Tk6YSwORTtwpg, o4EGL2-ICGmRJzJ3GxB-vw, s8gK7sdVzJcYKcPv2dkZXw, vOYVZgb_GVe-kdtjQwSUHw, wBbjgHsrKr7BsPBrQwJf2w, p59u2EC_qcmCmLeX1jCi5Q, VSAZI1eHDrOPRWMK4Q2DIQ, efMfeI_dkhpeGykaRJqxfQ, x6qYcQ8_i0mMDzSLsFCbZg, K_zSmtNGw1fu-vm

In [11]:
df_highest = df_data.map(lambda x: parseJSONReviews(x)).takeOrdered(100, lambda x: -x[2])
df_highest

[('37cpUoM8hlkSQfReIEBd-Q', 1787, 11568),
 ('hizGc5W1tBHPghM5YKCAtg', 1629, 3315),
 ('eKUGKQRE-Ywi5dY55_zChg', 2430, 2916),
 ('Hi10sGSZNxQH3NLyWSZ1oA', 11112, 2718),
 ('j14WgRoU_-2ZE1aw1dXrJg', 3566, 2634),
 ('iLjMdZi0Tm7DQxX1C1_2dg', 2431, 2516),
 ('JjXuiru1_ONzDkYVrHN0aw', 1207, 2316),
 ('ITa3vh5ERI90G_WP4SmGUQ', 3079, 2280),
 ('UsXqCXRZwSCSw0AT7y1uBg', 2919, 2263),
 ('VHdY6oG2JPVNjihWhOooAQ', 1979, 2140),
 ('fgwI3rYHOv1ipfVfCSx7pg', 1847, 2113),
 ('nkN_do3fJ9xekchVC-v68A', 1363, 2046),
 ('m07sy7eLtOjVdZ8oN9JKag', 4913, 2034),
 ('peuxbSQwXed-81cSqL7Ykw', 1592, 1916),
 ('AHRrG3T1gJpHvtpZ-K0G_g', 2109, 1747),
 ('WG3w_73scm_JUWJ_3Lgn0Q', 5013, 1733),
 ('NfU0zDaTMEQ4-X9dbQWd9A', 1137, 1726),
 ('lt7bNHl-TXziny4FETu8nA', 802, 1708),
 ('3zxy3LVBV3ttxoYbY4rQ8A', 1894, 1696),
 ('ysCBsXWPB-LAiewVS3jZfQ', 3687, 1672),
 ('wEE-YMx5pmSuagLtNxMPKA', 1845, 1653),
 ('djxnI8Ux8ZYQJhiOQkrRhA', 2315, 1614),
 ('G9Vb6yQ047TC3O_-GG4WZA', 1540, 1574),
 ('JADdo9NEeO5Az9aOYbyvZA', 2382, 1464),
 ('58yXn5Y4409k

In [12]:
# Using SQL
highest_fan = sqlContext.sql("SELECT user_id, review_count, fans FROM tbl_json ORDER BY fans DESC LIMIT 100")

In [13]:
highest_fan.show()

+--------------------+------------+-----+
|             user_id|review_count| fans|
+--------------------+------------+-----+
|37cpUoM8hlkSQfReI...|        1787|11568|
|hizGc5W1tBHPghM5Y...|        1629| 3315|
|eKUGKQRE-Ywi5dY55...|        2430| 2916|
|Hi10sGSZNxQH3NLyW...|       11112| 2718|
|j14WgRoU_-2ZE1aw1...|        3566| 2634|
|iLjMdZi0Tm7DQxX1C...|        2431| 2516|
|JjXuiru1_ONzDkYVr...|        1207| 2316|
|ITa3vh5ERI90G_WP4...|        3079| 2280|
|UsXqCXRZwSCSw0AT7...|        2919| 2263|
|VHdY6oG2JPVNjihWh...|        1979| 2140|
|fgwI3rYHOv1ipfVfC...|        1847| 2113|
|nkN_do3fJ9xekchVC...|        1363| 2046|
|m07sy7eLtOjVdZ8oN...|        4913| 2034|
|peuxbSQwXed-81cSq...|        1592| 1916|
|AHRrG3T1gJpHvtpZ-...|        2109| 1747|
|WG3w_73scm_JUWJ_3...|        5013| 1733|
|NfU0zDaTMEQ4-X9db...|        1137| 1726|
|lt7bNHl-TXziny4FE...|         802| 1708|
|3zxy3LVBV3ttxoYbY...|        1894| 1696|
|ysCBsXWPB-LAiewVS...|        3687| 1672|
+--------------------+------------

# 2. Extract the reviews of these users and combine it with the business information.

### a. Are they continental, regional, or local eaters?
- Look at review data for business data. Look at business data for location.
- Variations in latitude and longitude of reviewed businesses (distance between furthest pair of restaurants, cluster into 3)

### b. Is there a preference in restaurant/food style of their reviews?
- Frequent itemsets
    - per user
        - set of items: categories
        - set of baskets: restaurants

### c. Can you infer the locations of these users?
- Locations of businesses most frequently visited (consider timestamp for users who go on trips to review restaurants)

#### How to prepare data to answer questions?
- List of users with highest fans (user_id, review_count, fans) - local
- Map users to reviews

In [14]:
#raw_review_df.filter(lambda x: json.loads(x)["user_id"] in df_top_users)

In [None]:
def extract_id(x):
    x_json = json.loads(x)
    return (x_json["user_id"])
tmp = df_data.take(1)
extract_id(tmp[0])

# 3. Identify one of your favorite restaurants that is available on Yelp. Search for all reviews and reviewers for this restaurants.  

### a. Is this restaurant frequented by non-local reviewers (how do you know)?
- Coach's

### b. What are the positive things about this restaurant (study higher-rated reviews)
- sentiment analysis

### c. What are the negative things about this restaurant (study lower-rated reviews)
- sentiment analysis

In [21]:
raw_review_df = sc.textFile("/users/trush/CSC496/Labs/Lab4/data/yelp_academic_dataset_review.json.gz")
print(raw_review_df)
raw_review_df.take(1)

/users/trush/CSC496/Labs/Lab4/data/yelp_academic_dataset_review.json.gz MapPartitionsRDD[30] at textFile at NativeMethodAccessorImpl.java:0


['{"review_id":"xQY8N_XvtGbearJ5X4QryQ","user_id":"OwjRMXRC0KyPrIlcjaXeFQ","business_id":"-MhfebM0QIsKt87iDN-FNw","stars":2.0,"useful":5,"funny":0,"cool":0,"text":"As someone who has worked with many museums, I was eager to visit this gallery on my most recent trip to Las Vegas. When I saw they would be showing infamous eggs of the House of Faberge from the Virginia Museum of Fine Arts (VMFA), I knew I had to go!\\n\\nTucked away near the gelateria and the garden, the Gallery is pretty much hidden from view. It\'s what real estate agents would call \\"cozy\\" or \\"charming\\" - basically any euphemism for small.\\n\\nThat being said, you can still see wonderful art at a gallery of any size, so why the two *s you ask? Let me tell you:\\n\\n* pricing for this, while relatively inexpensive for a Las Vegas attraction, is completely over the top. For the space and the amount of art you can fit in there, it is a bit much.\\n* it\'s not kid friendly at all. Seriously, don\'t bring them.\\n* 

In [22]:
user_reviews = df_user_review.map(lambda x: (x[1], 1))
user_reviews.take(10)

[('"', 1),
 ('"', 1),
 ('"', 1),
 ('"', 1),
 ('"', 1),
 ('"', 1),
 ('"', 1),
 ('"', 1),
 ('"', 1),
 ('"', 1)]

In [25]:
df_user_reviews = raw_review_df.filter(lambda x: json.loads(x)["user_id"] in df_data)
print(df_user_reviews.count())
df_user_reviews.take(10)

Traceback (most recent call last):
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/serializers.py", line 468, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/cloudpickle.py", line 1097, in dumps
    cp.dump(obj)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/cloudpickle.py", line 357, in dump
    return Pickler.dump(self, obj)
  File "/usr/lib/python3.6/pickle.py", line 409, in dump
    self.save(obj)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 751, in save_tuple
    save(element)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/cloudpickle.py", line 501, in save_function
    self.save_function_tuple(obj)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/pyspark/cloudpickle.py", li

PicklingError: Could not serialize object: Exception: It appears that you are attempting to broadcast an RDD or reference an RDD from an action or transformation. RDD transformations and actions can only be invoked by the driver, not inside of other transformations; for example, rdd1.map(lambda x: rdd2.values.count() * x) is invalid because the values transformation and count action cannot be performed inside of the rdd1.map transformation. For more information, see SPARK-5063.

In [26]:
df_test = raw_review_df.take(1)
json.loads(df_test)["user_id"]

TypeError: the JSON object must be str, bytes or bytearray, not 'list'