In [1]:
import os
import sys

spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()

import pyspark

In [2]:
number_cores = 8
memory_gb = 24
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [3]:
!ls -lh ../data/yelp/

ls: cannot access '../data/yelp/': No such file or directory


### Identify 100 users with highest number of ratings/fans.  

- User dataset
- You can decide/justify this ranking and decide the importance of ratings/fans. 
- Focusing on fans is better. 


#### Step 1: 
- Load the data:
  - We can use SQL: SQlContext

In [4]:
# import SQLContext and create a sqlContext
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [5]:
# get SQL table
#df_json = sqlContext.read.json("../data/yelp/yelp_academic_dataset_user.json.gz")

In [6]:
df_json.printSchema()

NameError: name 'df_json' is not defined

In [None]:
df_json.registerTempTable("tbl_json")

  - PySpark way

In [None]:
df_data = sc.textFile("../data/yelp/yelp_academic_dataset_user.json.gz")
print(df_data.count())
df_data.take(1)

#### Step 2:
  - Get the information using SQL Statement

In [None]:
df_json.printSchema()

In [None]:
# 100 users with highest number of ratings/fans.
highest_fan = sqlContext.sql("SELECT user_id, review_count, fans FROM tbl_json ORDER BY fans DESC LIMIT 100")

In [None]:
highest_fan.show()

  - Get the information using PySpark Statement

In [None]:
df_data.take(1)

In [None]:
import json

df_top_users1 = df_data.map(lambda x: json.loads(x)).map(lambda x: (x["user_id"], x["review_count"], x["fans"]))
df_top_users1.take(1)

In [None]:
def extract_user(x):
    x_json = json.loads(x)
    return (x_json["user_id"], x_json["review_count"], x_json["fans"])

tmp = df_data.take(1)
extract_user(tmp[0])

In [None]:
df_top_users2 = df_data.map(extract_user).takeOrdered(100, lambda x: -x[2])
df_top_users2

In [None]:
highest_fan.show()

### Extract the reviews of these users and combine it with the business information. 


- Are they continental, regional, or local eaters? 
    - Look at review data for business data, look at business data for location. 
    - A dataset exists, is posted on Discord. 
    - Second options: distance between furthest pair of restaurants, cluster into three. 
- Is there a preference in restaurant/food style of their reviews? 
    - How/Can we phrase this as a frequent items question?
        - per user
            - set of items: categories
            - set of baskets: restaurants
- Can you infer the locations of these users?
    - pay attention to timing patterns of review

#### How to prepare data to answer the questions?

- List of users with highest fans (user_id, review_count, fans) - local 
- We need to map users to reviews

In [None]:
# in the review.json, they have user_id, so we 
# conditional statement in the parsedJson .. 
raw_review_df.filter(lambda x: json.loads(x)["user_id"] in user_list)

In [None]:
raw_review_df = sc.textFile("../data/yelp/yelp_academic_dataset_review.json.gz")
print(raw_review_df.count())
raw_review_df.take(1)

In [None]:
review = raw_review_df.take(1)
json.loads(review[0])["user_id"] in df_top_users

In [None]:
df_test = raw_review_df.map(lambda x: json.loads(x)["user_id"])
df_test.take(1)

In [None]:
user_list = []
for item in df_top_users:
    user_list.append(item[0])
user_list

In [None]:
'JjXuiru1_ONzDkYVrHN0aw' in user_list

In [None]:
df_user_reviews = raw_review_df.filter(lambda x: json.loads(x)["user_id"] in user_list)
print(df_user_reviews.count())
df_user_reviews.take(1)

In [None]:
sum = 0
for item in df_top_users:
    sum = sum + int(item[1])
sum

At this point, what data do we have?

- user_id, reviews, fans
- review raw data for top users (complete)

What else do we need to answer:
- Are they continental, regional, or local eaters?

- business_id: get it from review data for top users .

In [None]:
raw_business_df = sc.textFile("../data/yelp/yelp_academic_dataset_business.json.gz")
print(raw_business_df.count())
raw_business_df.take(1)

In [None]:
df_business = df_user_reviews.map(lambda x: json.loads(x)["business_id"])
df_business.count()

In [None]:
df_business.distinct().count()

In [None]:
df_business.take(2)

In [None]:
df_unique_business = df_business.distinct().collect()

In [None]:
# start by copy/pasting old code and we will modify. 
# do not run this cell!!!!!

df_user_reviews = raw_review_df.filter(lambda x: json.loads(x)["user_id"] in user_list)
print(df_user_reviews.count())
df_user_reviews.take(1)

Two options:
1. Grab complete businesses data for businesses in the list, extract lat/long later. 
2. Write function to extract lat/long from businesses in the list only. 

Which one?

In [None]:
df_unique_business[:10]

In [None]:
# option 1:
df_user_businesses = raw_business_df.filter(lambda x: json.loads(x)["business_id"] in df_unique_business)

In [None]:
print(df_user_businesses.count())
df_user_businesses.take(1)

In [None]:
df_latlong = df_user_businesses.map(lambda x: (json.loads(x)["business_id"],json.loads(x)["latitude"],json.loads(x)["longitude"]))
print(df_latlong.count())
df_latlong.take(2)


On Spark cluster:
- user_id, reviews, fans: 100 data items
- review_id, user_id, business_id,...: 16937 items
- business_id, lat, long: 11863 items

How to bring them all together?
1. Convert everything to SparkSQL, do SQL things
2. Filter data down to smaller items, bring them all back to the notebook, do Python things

### Identify one of your favorite restaurants that is available on Yelp. Search for all reviews and reviewers for this restaurants. 

- Is this restaurant frequented by non-local reviewers (how do you know)?
- What are the positive things about this restaurant (study higher-rated reviews)
- What are the negative things about this restaurant (study lower-rated reviews)