In [12]:
import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, concat_ws
from pyspark.sql.window import Window


## Load data: photos, topic modeling, and text summarization

In [3]:
# Reuse the active Spark session if there is one; otherwise start a new one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

23/12/09 01:41:59 WARN Utils: Your hostname, CelinedeMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.100.25 instead (on interface en0)
23/12/09 01:41:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/09 01:41:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/09 01:42:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/12/09 01:42:00 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/12/09 01:42:00 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/12/09 01:42:00 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/12/09 01:42:00 WARN Utils: Service 'SparkUI' could not bind on port 4044. Atte

23/12/09 01:42:14 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
# Load the three inputs: photo metadata (photos.json), the topic-modeling
# dataset (tm.json) and the text-summarization dataset (CSV).
#
# DATA_DIR defaults to the folder the notebook was originally run from; set the
# NLP_DATA_DIR environment variable to read the same files from another location.
DATA_DIR = os.environ.get('NLP_DATA_DIR', '/Users/celine/Desktop/5430nlp/group')

photo = spark.read.json(os.path.join(DATA_DIR, 'photos.json'))
topic = spark.read.json(os.path.join(DATA_DIR, 'tm.json'))

# CSV options: first row is the header, quoted fields may span several lines,
# and a doubled quote ("") inside a quoted field is read as a literal quote.
summarization = (
    spark.read
    .option("header", "true")
    .option("multiLine", "true")
    .option("quote", "\"")
    .option("escape", "\"")
    .csv(os.path.join(DATA_DIR, 'summarization_data_clean.csv'))
)


                                                                                

In [5]:
# Preview the first 20 photo rows; long string values are truncated for display.
photo.show(n=20, truncate=True)

+--------------------+--------------------+-------+--------------------+
|         business_id|             caption|  label|            photo_id|
+--------------------+--------------------+-------+--------------------+
|Nk-SJhPlDBkAZvfsA...|Nice rock artwork...| inside|zsvj7vloL4L5jhYyP...|
|yVZtL5MmrpiivyCIr...|                    |outside|HCUdRJHHm_e0OCTlZ...|
|_ab50qdWOk0DdB6XO...|      oyster shooter|  drink|vkr8T0scuJmGVvN2H...|
|SZU9c8V2GuREDN5Kg...|       Shrimp scampi|   food|pve7D6NUrafHW3EAO...|
|Gzur0f0XMkrVxIwYJ...|                    |   food|H52Er-uBg6rNrHcRe...|
|jl38yx7zzMRbg-kOK...|                    |   food|wZ29mUm6nKz566j17...|
|-9NmUeTphyS9Lq1o9...|                    | inside|QRUgAISgYLQJ9SK2y...|
|RRCgIohWjaeGtlbpc...|                    | inside|bb7o8kXXXqc-8PWU6...|
|p2J__JQ_mN5lVd7iG...|                    | inside|mcjlyGuLFJ0t4vDix...|
|u9vhzYtXpfyvAOAMn...|    Inside reception| inside|3ROd5PAQ_0OkmoKWV...|
|4OLfgjPToHrYf4HKX...|Small portion of ...|outside|

In [9]:
# Preview the first 20 topic-modeling rows; long string values are truncated for display.
topic.show(n=20, truncate=True)

+--------------------+--------------------+------------+--------+--------------------+-----------+-----+--------------------+--------------------+
|             address|         business_id|        city| cluster|                name|postal_code|stars|                text|               topic|
+--------------------+--------------------+------------+--------+--------------------+-----------+-----+--------------------+--------------------+
|      2654 S 10th St|RI33oswGDkIsc0fuQ...|Philadelphia|   pizza|       Oregon Steaks|      19148|  3.5|ppl order place i...|[Beef, Cheese, Br...|
|         935 Race St|MTSW4McQd7CbVtyjq...|Philadelphia|  others|  St Honore Pastries|      19107|  4.0|bakery chinatown ...|  [Bagel, Bar, Beef]|
|    2634 Belgrade St|GmJDCmDhtKbjofeO3...|Philadelphia|  others|The Freshworks of...|      19125|  4.5|omg chance nothin...|[Beef, Cheese, Br...|
|      820 N Broad St|y1HgVnt4K5owmYXC5...|Philadelphia| chinese|Flambo Caribbean ...|      19130|  4.5|word excellent

In [10]:
# Preview the first 20 summarization rows; long string values are truncated for display.
summarization.show(n=20, truncate=True)

+--------------------+----------------------+---------------------+----------------------+------------+-------------------+
|         business_id|positive_summarization|neutral_summarization|negative_summarization|review_count|positive_percentage|
+--------------------+----------------------+---------------------+----------------------+------------+-------------------+
|6_T2xzR74JqGCTPef...|  This was my first...|  BUT. It's was good.|  It felt it took h...|         222|             76.58%|
|u0Bt7uvvj7LAjAdiM...|  Overall, go try l...| Spectacular servi...|  I don't know if i...|         149|             81.21%|
|05ev984NYfimRN0Ui...|  This was some of ...| They have a great...|  Overall the Resta...|         185|             77.84%|
|Jg9kkOl4sx2AXJVoh...|  If you haven't wa...| I was there like ...|  Overall, it wasn'...|         343|             86.88%|
|wUnLSg_GKfEIQ5CQQ...|  I love a good tas...| As a result, I wo...|  We had dinner las...|          85|             88.24%|
|AlxnbBd

## Photo

In [6]:
# Keep only the photos labelled as food or drink
photo_food = photo.filter(F.col('label').isin('food', 'drink'))


In [8]:
# Keep a single food/drink photo per business: the one whose photo_id sorts first.
# Rows are numbered within each business_id, ordered by photo_id.
windowSpec = Window.partitionBy('business_id').orderBy('photo_id')
photo_food_with_row_number = photo_food.withColumn(
    'row_number',
    F.row_number().over(windowSpec),
)

# Row number 1 is the chosen photo; the helper column is dropped afterwards.
photo_per_business = (
    photo_food_with_row_number
    .where(F.col('row_number') == 1)
    .drop('row_number')
)

# Number of businesses that have at least one food/drink photo
photo_per_business.count()


                                                                                

28237

## Topic modeling

In [13]:
# Add a constant 'state' column ("PA"), then build 'full_address' in the form
# "<address>, <city>, <state> <postal_code>".
topic = (
    topic
    .withColumn('state', F.lit('PA'))
    .withColumn(
        'full_address',
        F.concat_ws(
            ', ',
            'address',
            'city',
            F.concat_ws(' ', 'state', 'postal_code'),
        ),
    )
)

In [15]:
# Check the new 'state' and 'full_address' columns on the first 20 rows.
topic.show(n=20, truncate=True)

+--------------------+--------------------+------------+--------+--------------------+-----------+-----+--------------------+--------------------+-----+--------------------+
|             address|         business_id|        city| cluster|                name|postal_code|stars|                text|               topic|state|        full_address|
+--------------------+--------------------+------------+--------+--------------------+-----------+-----+--------------------+--------------------+-----+--------------------+
|      2654 S 10th St|RI33oswGDkIsc0fuQ...|Philadelphia|   pizza|       Oregon Steaks|      19148|  3.5|ppl order place i...|[Beef, Cheese, Br...|   PA|2654 S 10th St, P...|
|         935 Race St|MTSW4McQd7CbVtyjq...|Philadelphia|  others|  St Honore Pastries|      19107|  4.0|bakery chinatown ...|  [Bagel, Bar, Beef]|   PA|935 Race St, Phil...|
|    2634 Belgrade St|GmJDCmDhtKbjofeO3...|Philadelphia|  others|The Freshworks of...|      19125|  4.5|omg chance nothin...|[Beef

In [16]:
# Keep only the columns needed downstream.
# NOTE: after this projection `topic` no longer has address/city/postal_code,
# so the cell that builds 'full_address' cannot be re-run without reloading `topic`.
topic = topic.select(['business_id', 'name', 'stars', 'full_address', 'topic'])

In [18]:
# Attach the chosen photo of each business to the topic-modeling rows.
# The right join keeps every row of `topic`; photo_id is NULL for a business
# that has no food/drink photo.
photo_ids = photo_per_business.select('business_id', 'photo_id')
photo_food_tm = photo_ids.join(topic, on='business_id', how='right')
photo_food_tm.count()

2276

In [19]:
# Preview the joined photo + topic rows (first 20, long values truncated).
photo_food_tm.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+-----+--------------------+--------------------+
|         business_id|            photo_id|                name|stars|        full_address|               topic|
+--------------------+--------------------+--------------------+-----+--------------------+--------------------+
|RI33oswGDkIsc0fuQ...|EBrKYUF01H2nvHC1r...|       Oregon Steaks|  3.5|2654 S 10th St, P...|[Beef, Cheese, Br...|
|MTSW4McQd7CbVtyjq...|7urDyNpZZ3Z8RpKLr...|  St Honore Pastries|  4.0|935 Race St, Phil...|  [Bagel, Bar, Beef]|
|GmJDCmDhtKbjofeO3...|                NULL|The Freshworks of...|  4.5|2634 Belgrade St,...|[Beef, Cheese, Br...|
|y1HgVnt4K5owmYXC5...|ZZm7cTtBSkdxXKt_3...|Flambo Caribbean ...|  4.5|820 N Broad St, P...|[Brunch, Bartende...|
|DOl21AqtT74dUZ3kv...|7KDZ-oY2I_u42XE7L...|     Golden Triangle|  4.5|610 S 5th St, Phi...|[Brunch, Breakfas...|
|tqBkgu2wyCjYBWuTQ...|                NULL|Orchard Smoothie ...|  3.0|1200  S Federal S...|[Amaz

## Text summarization

In [21]:
# Combine the photo + topic rows with the text-summarization dataset.
# The inner join keeps only businesses present in both inputs.
topic_sum_photo = photo_food_tm.join(summarization, on='business_id', how='inner')
topic_sum_photo.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------------------+---------------------+----------------------+------------+-------------------+
|         business_id|            photo_id|                name|stars|        full_address|               topic|positive_summarization|neutral_summarization|negative_summarization|review_count|positive_percentage|
+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------------------+---------------------+----------------------+------------+-------------------+
|nBr6NgqcZz1GhtAng...|                NULL|          Forin Cafe|  4.5|2041 Coral St, St...|[Cake, Bagel, Bru...|  I was so excited ...| The cafe is also ...|  Sorry, this busin...|          12|             91.67%|
|VwZ5NDbIu3elGQI6M...|9qhTofis6GYRB2Ynb...|           Blue Corn|  4.5|940 S 9th St, Phi...|[Ambiance, Barten...|  A + + + !! I had ...| I didn't