In [1]:
import os
import sys
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()
import pyspark

In [2]:
number_cores = 6
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set("spark.driver.maxResultSize", "5g"))
sc = pyspark.SparkContext(conf=conf)

In [None]:
!dir /users/trush/CSC496/Labs/Lab4/data

In [None]:
!ls -lh /users/trush/CSC496/Labs/Lab4/data

# 1. Identify 100 users with highest number of ratings/fans.  

In [3]:
# Step 1: Load the data using SQL Context

from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [None]:
# get SQL table
df_json = sqlContext.read.json("/users/trush/CSC496/Labs/Lab4/data/yelp_academic_dataset_user.json.gz")

In [None]:
#df_json.printSchema()

In [None]:
df_json.registerTempTable("tbl_json")

  - PySpark way

In [None]:
df_data = sc.textFile("/users/trush/CSC496/Labs/Lab4/data/yelp_academic_dataset_user.json.gz")
print(df_data.count())
#df_data.take(1)

#### Step 2:
  - Get the information using SQL Statement

In [None]:
#df_json.printSchema()

In [None]:
# 100 users with highest number of ratings/fans.
highest_fan = sqlContext.sql("SELECT FROM tbl_json ORDER BY fans DESC LIMIT 100")

In [None]:
#highest_fan.show()

  - Get the information using PySpark Statement

In [None]:
#df_data.take(1)

In [None]:
import json

df_top_users1 = df_data.map(lambda x: json.loads(x)).map(lambda x: (x["user_id"], x["review_count"], x["fans"]))
#df_top_users1.take(1)

In [None]:
def extract_user(x):
    x_json = json.loads(x)
    return (x_json["user_id"], x_json["review_count"], x_json["fans"])

#tmp = df_data.take(1)
#extract_user(tmp[0])

In [None]:
df_top_users2 = df_data.map(extract_user).takeOrdered(100, lambda x: -x[2])
#df_top_users2

In [None]:
#highest_fan.show()

# 2. Extract the reviews of these users and combine it with the business information. 

## a. Are they continental, regional, or local eaters?  

In [4]:
df_reviews = sqlContext.read.json("/users/trush/CSC496/Labs/Lab4/data/yelp_academic_dataset_review.json.gz")
df_business = sqlContext.read.json("/users/trush/CSC496/Labs/Lab4/data/yelp_academic_dataset_business.json.gz")

In [5]:
df_reviews.registerTempTable("df_reviews")
df_business.registerTempTable("df_business")

In [None]:
business_locations = sqlContext.sql("SELECT user_id, latitude, longitude FROM df_reviews INNER JOIN df_business ON df_reviews.business_id=df_business.business_id")

In [None]:
#df_business.printSchema()

In [None]:
#df_reviews.printSchema()

In [None]:
#business_locations.show()

__Distance between 2 points lat, lon__

d=2*asin(sqrt((sin((lat1-lat2)/2))^2 + cos(lat1)*cos(lat2)*(sin((lon1-lon2)/2))^2))

https://edwilliams.org/avform147.htm

In [None]:
import math
def distance(lat1, lon1, lat2, lon2):
    return 2*math.asin(math.sqrt((math.sin((lat1-lat2)/2))**2 + math.cos(lat1)*math.cos(lat2)*(math.sin((lon1-lon2)/2))**2))

In [None]:
distance(36.1128958062, -115.1776370528, 33.3483825, -111.8591895)

In [None]:
locations = business_locations.rdd

In [None]:
locations1 = locations.map(lambda x: (x[0],[x[1], x[2]]))

In [None]:
#locations1.take(1)

In [None]:
user_latlon = locations1.groupByKey().mapValues(list)

In [None]:
#user_latlon.take(1)

In [None]:
def get_distances(x):
    distances = []
    for i in range(len(x[1])):
        for j in range(len(x[1])):
            distances.append(distance(x[1][i][0], x[1][i][1], x[1][j][0], x[1][j][1]))
    return (x[0], distances)

In [None]:
get_distances(('jbOJV077QCGzduE6zFYO_A',
  [[36.1023786, -115.1745465],
   [36.1351760705, -115.427117132],
   [36.1429464, -115.1749359],
   [36.0657173614, -115.4354310036],
   [36.1585998893, -115.2042446423],
   [36.1270336, -115.2098191],
   [33.42028, -111.8396342],
   [36.102917, -115.17],
   [36.1435299, -115.1469733],
   [36.117397, -115.1757521],
   [36.1182917, -115.1725932],
   [36.086209, -115.13706],
   [33.4154053, -111.8329174],
   [36.112503, -115.061928],
   [36.1564182, -115.2077663],
   [33.4509853, -111.7702695],
   [36.1019817648, -115.1738418651],
   [33.436137, -111.716897],
   [36.159706, -115.245078],
   [36.0966823101, -115.1758075725],
   [36.1740973156, -115.1451098718],
   [36.1201403, -115.1543617],
   [36.1213235673, -115.1749563217],
   [36.1584751, -115.1265495],
   [33.452261, -111.82834],
   [36.093883, -115.176202]]))

In [None]:
user_distances = user_latlon.map(lambda x: get_distances(x))

In [None]:
#user_distances.take(1)

In [None]:
import statistics
def get_avg_dist(x):
    distances = []
    for i in range(len(x[1])):
        for j in range(len(x[1])):
            distances.append(distance(x[1][i][0], x[1][i][1], x[1][j][0], x[1][j][1]))
    return (x[0], statistics.mean(distances))

In [None]:
def get_dist_classification(x):
    distances = []
    for i in range(len(x[1])):
        for j in range(len(x[1])):
            distances.append(distance(x[1][i][0], x[1][i][1], x[1][j][0], x[1][j][1]))
    max_dist = max(distances)
    if max_dist >= 0 and max_dist < 0.10:
        return (x[0], "local")
    elif max_dist >= 0.10 and max_dist < 0.50:
        return (x[0], "regional")
    elif max_dist >= 0.50:
        return (x[0], "continental")

In [None]:
get_avg_dist(('jbOJV077QCGzduE6zFYO_A',
  [[36.1023786, -115.1745465],
   [36.1351760705, -115.427117132],
   [36.1429464, -115.1749359],
   [36.0657173614, -115.4354310036],
   [36.1585998893, -115.2042446423],
   [36.1270336, -115.2098191],
   [33.42028, -111.8396342],
   [36.102917, -115.17],
   [36.1435299, -115.1469733],
   [36.117397, -115.1757521],
   [36.1182917, -115.1725932],
   [36.086209, -115.13706],
   [33.4154053, -111.8329174],
   [36.112503, -115.061928],
   [36.1564182, -115.2077663],
   [33.4509853, -111.7702695],
   [36.1019817648, -115.1738418651],
   [33.436137, -111.716897],
   [36.159706, -115.245078],
   [36.0966823101, -115.1758075725],
   [36.1740973156, -115.1451098718],
   [36.1201403, -115.1543617],
   [36.1213235673, -115.1749563217],
   [36.1584751, -115.1265495],
   [33.452261, -111.82834],
   [36.093883, -115.176202]]))

In [None]:
get_dist_classification(('jbOJV077QCGzduE6zFYO_A',
  [[36.1023786, -115.1745465],
   [36.1351760705, -115.427117132],
   [36.1429464, -115.1749359],
   [36.0657173614, -115.4354310036],
   [36.1585998893, -115.2042446423],
   [36.1270336, -115.2098191],
   [33.42028, -111.8396342],
   [36.102917, -115.17],
   [36.1435299, -115.1469733],
   [36.117397, -115.1757521],
   [36.1182917, -115.1725932],
   [36.086209, -115.13706],
   [33.4154053, -111.8329174],
   [36.112503, -115.061928],
   [36.1564182, -115.2077663],
   [33.4509853, -111.7702695],
   [36.1019817648, -115.1738418651],
   [33.436137, -111.716897],
   [36.159706, -115.245078],
   [36.0966823101, -115.1758075725],
   [36.1740973156, -115.1451098718],
   [36.1201403, -115.1543617],
   [36.1213235673, -115.1749563217],
   [36.1584751, -115.1265495],
   [33.452261, -111.82834],
   [36.093883, -115.176202]]))

In [None]:
user_avg_dist = user_latlon.map(lambda x: get_avg_dist(x))

In [None]:
#user_avg_dist.take(10)

In [None]:
user_dist_classification = user_latlon.map(lambda x: get_dist_classification(x))

In [None]:
#user_dist_classification.take(1)

## b. Is there a preference in restaurant/food style of their reviews? 

In [6]:
reviewed_businesses = sqlContext.sql("SELECT user_id, business_id FROM df_reviews")

In [7]:
reviewed_businesses.show()

+--------------------+--------------------+
|             user_id|         business_id|
+--------------------+--------------------+
|OwjRMXRC0KyPrIlcj...|-MhfebM0QIsKt87iD...|
|nIJD_7ZXHq-FX8byP...|lbrU8StCq3yDfr-QM...|
|V34qejxNsCbcgD8C0...|HQl28KMwrEKHqhFrr...|
|ofKDkJKXSKZXu5xJN...|5JxlZaqCnk1MnbgRi...|
|UgMW8bLE0QMJDCkQ1...|IS4cv902ykd8wj1TR...|
|5vD2kmE25YBrbayKh...|nlxHRv1zXGT0c0K51...|
|aq_ZxGHiri48TUXJl...|Pthe4qk5xh4n-ef-9...|
|dsd-KNYKMpx6ma_sR...|FNCJpSn0tL9iqoY3J...|
|P6apihD4ASf1vpPxH...|e_BiI4ej1CW1F0EyV...|
|jOERvhmK6_lo_XGUB...|Ws8V970-mQt2X9CwC...|
|s5j_CRBWDCCMDJ6r7...|PA61Rwk3AMwOEXHev...|
|HJECayULRM-6xh2GC...|l-nL4BmhzpZjcavoo...|
|1YIQGP-a534nyksaw...|Naa6E0YU0Wr7jCuCE...|
|qftVgPj_kRTildMDj...|Ns4tjgLfqR1qawGlN...|
|5lb0POg2t-AkMFx66...|ZlCSsWS07JulSBIQl...|
|TF4C-F5iqavACQgKT...|7Ka9Pd8X9SRHs1D5E...|
|2hRe26HSCAWbFRn5W...|d4qwVw4PcN-_2mK2o...|
|6sJN_HlM_uwpfLJ1p...|oVuZtlCFg_zF090Nh...|
|kMkWON2lmw0s-M-fw...|_iGvLfEsqDwPUxRUA...|
|QodunSzok4nIYFNrT...|poSV39UqEg

In [8]:
business_styles = sqlContext.sql("SELECT user_id, categories FROM df_reviews INNER JOIN df_business ON df_reviews.business_id=df_business.business_id")

In [9]:
business_styles.show()

+--------------------+--------------------+
|             user_id|          categories|
+--------------------+--------------------+
|OwjRMXRC0KyPrIlcj...|Shopping, Arts & ...|
|nIJD_7ZXHq-FX8byP...|Beauty & Spas, Ha...|
|V34qejxNsCbcgD8C0...|Restaurants, Gast...|
|ofKDkJKXSKZXu5xJN...|Restaurants, Mexican|
|UgMW8bLE0QMJDCkQ1...|Fast Food, Restau...|
|5vD2kmE25YBrbayKh...|Restaurants, Deli...|
|aq_ZxGHiri48TUXJl...|   Restaurants, Thai|
|dsd-KNYKMpx6ma_sR...|Doctors, Cosmetic...|
|P6apihD4ASf1vpPxH...|Restaurants, Italian|
|jOERvhmK6_lo_XGUB...|Home & Garden, Re...|
|s5j_CRBWDCCMDJ6r7...|Local Services, S...|
|HJECayULRM-6xh2GC...|Specialty Food, R...|
|1YIQGP-a534nyksaw...|Home Services, Fu...|
|qftVgPj_kRTildMDj...|Asian Fusion, Fas...|
|5lb0POg2t-AkMFx66...|Airport Shuttles,...|
|TF4C-F5iqavACQgKT...|Contractors, Pool...|
|2hRe26HSCAWbFRn5W...|Mexican, Restaurants|
|6sJN_HlM_uwpfLJ1p...|Restaurants, Indi...|
|kMkWON2lmw0s-M-fw...|Automotive, Car D...|
|QodunSzok4nIYFNrT...|Restaurant

In [10]:
user_reviewed_businesses = reviewed_businesses.rdd.groupByKey().mapValues(list)

In [11]:
#user_reviewed_businesses.take(1)

[('_xM0dU_rR4NLkCcS9Eu44A',
  ['LeuGZImVz_IG1e-SMYTSXw',
   'VsewHMsfj1Mgsl2i_hio7w',
   'w8zw4-3Ns2M4CQxeW9Bszw',
   'MpmFFw0GE_2iRFPdsRpJbA',
   'ECOkEVUodMLUxvI0PMI4gQ',
   'eoHdUeQDNgQ6WYEnP2aiRw',
   'lpcniW0cBbQMSmksTrD_2Q'])]

In [12]:
user_styles = business_styles.rdd.groupByKey().mapValues(list)

In [13]:
#user_styles.take(10)

[('Lnwip57QIhwr81NhubGMgQ',
  ['Travel Services, Tours, Airport Shuttles, Transportation, Hotels & Travel']),
 ('YHGrGROn8-MLnWXVrY9z-g',
  ['Performing Arts, Arts & Entertainment',
   'Italian, Restaurants, Food, Beer, Wine & Spirits',
   'Casinos, Magicians, Event Planning & Services, Performing Arts, Arts & Entertainment',
   'Lounges, Steakhouses, Bars, Nightlife, Restaurants, Cocktail Bars',
   'Performing Arts, Arts & Entertainment, Nightlife, Music Venues',
   'Arts & Entertainment, Casinos',
   'Home Services, Flooring, Grout Services, Home Cleaning, Tiling, Professional Services, Office Cleaning, Carpet Cleaning, Local Services',
   'Performing Arts, Arts & Entertainment']),
 ('spHbYiyiGKN9mEaz6e2yRQ',
  ['Restaurants, Spanish, Bars, Wine Bars, Nightlife, Tapas Bars',
   'Food, Canadian (New), Cocktail Bars, Specialty Food, Restaurants, Bars, Nightlife',
   'Restaurants, American (New)',
   'Restaurants, Nightlife, American (Traditional), Bars, Seafood, Steakhouses, Breakfast 

In [95]:
def get_styles(x):
    if x[1] != []:
        styles = []
        for i in range(len(x[1])):
            y = x[1][i].split(", ")
            for j in range(len(y)):
                styles.append(y[j])
        return (x[0], styles)
    return ('-', [])

In [96]:
get_styles(('Lnwip57QIhwr81NhubGMgQ',
  ['Travel Services, Tours, Airport Shuttles, Transportation, Hotels & Travel']))

('Lnwip57QIhwr81NhubGMgQ',
 ['Travel Services',
  'Tours',
  'Airport Shuttles',
  'Transportation',
  'Hotels & Travel'])

In [97]:
user_styles1 = user_styles.map(lambda x: get_styles(x))

In [98]:
user_styles1.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 81.0 failed 1 times, most recent failure: Lost task 0.0 in stage 81.0 (TID 44, pcvm602-6.emulab.net, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-97-3c3845bb19f2>", line 1, in <lambda>
  File "<ipython-input-95-3062fc6ffaaa>", line 5, in get_styles
AttributeError: 'NoneType' object has no attribute 'split'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1004)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2139)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2164)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:168)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-97-3c3845bb19f2>", line 1, in <lambda>
  File "<ipython-input-95-3062fc6ffaaa>", line 5, in get_styles
AttributeError: 'NoneType' object has no attribute 'split'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:941)
	at scala.collection.Iterator.foreach$(Iterator.scala:941)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:315)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:313)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:307)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:307)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:294)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:288)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1004)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2139)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [91]:
test1 = user_styles.map(lambda x: type(x[1]))

In [92]:
test1.take(10)

[str, str, str, str, str, str, str, str, str, str]

In [93]:
test1.filter(lambda x: x != list).collect()

[]

In [63]:
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession

spark = SparkSession(sc)
df_FP = spark.createDataFrame(user_styles1, ["user_id","styles"])
#df_FP.take(5)

In [36]:
minSupport = 120 / df_FP.count()
#print(minSupport)
fpGrowth = FPGrowth(itemsCol="styles", minSupport=minSupport, minConfidence=0.6)
model = fpGrowth.fit(df_FP)

# Display frequent itemsets.
model.freqItemsets.show()

Py4JJavaError: An error occurred while calling o304.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 34.0 failed 1 times, most recent failure: Lost task 0.0 in stage 34.0 (TID 20, pcvm602-6.emulab.net, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-33-3c3845bb19f2>", line 1, in <lambda>
  File "<ipython-input-31-488b48163037>", line 5, in get_styles
AttributeError: 'NoneType' object has no attribute 'split'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2008)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2007)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2007)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:973)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:973)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2239)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2188)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2177)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2164)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:385)
	at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:2981)
	at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:2980)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
	at org.apache.spark.sql.Dataset.count(Dataset.scala:2980)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 605, in main
    process()
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 597, in process
    serializer.dump_stream(out_iter, outfile)
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 271, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/spark-3.0.1-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/util.py", line 107, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-33-3c3845bb19f2>", line 1, in <lambda>
  File "<ipython-input-31-488b48163037>", line 5, in get_styles
AttributeError: 'NoneType' object has no attribute 'split'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:503)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:638)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:456)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithoutKey_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:127)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:446)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:449)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


## c. Can you infer the locations of these users?

# 3. Identify one of your favorite restaurants that is available on Yelp. Search for all reviews and reviewers for this restaurants. 

- Is this restaurant frequented by non-local reviewers (how do you know)?
- What are the positive things about this restaurant (study higher-rated reviews)
- What are the negative things about this restaurant (study lower-rated reviews)