In [8]:
import pyspark
from pyspark.sql import SparkSession

In [9]:
from pyspark.sql.functions import udf, struct, col
from pyspark.sql.types import *

In [10]:
spark= SparkSession.builder.appName("Data Analysis").config("spark.driver.memory", "4g").getOrCreate()

In [11]:
business_df = spark.read.json("C:\Supriyaa-spark-notes\yelp\yelp_academic_dataset_business.json")
user_df = spark.read.json("C:\Supriyaa-spark-notes\yelp\yelp_academic_dataset_user.json")
review_df = spark.read.json("C:\Supriyaa-spark-notes\yelp\yelp_academic_dataset_review.json")
checkin_df = spark.read.json("C:\Supriyaa-spark-notes\yelp\yelp_academic_dataset_checkin.json")
tip_df = spark.read.json("C:\Supriyaa-spark-notes\yelp\yelp_academic_dataset_tip.json")

In [5]:
business_df.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [6]:
user_df.printSchema()

root
 |-- average_stars: double (nullable = true)
 |-- compliment_cool: long (nullable = true)
 |-- compliment_cute: long (nullable = true)
 |-- compliment_funny: long (nullable = true)
 |-- compliment_hot: long (nullable = true)
 |-- compliment_list: long (nullable = true)
 |-- compliment_more: long (nullable = true)
 |-- compliment_note: long (nullable = true)
 |-- compliment_photos: long (nullable = true)
 |-- compliment_plain: long (nullable = true)
 |-- compliment_profile: long (nullable = true)
 |-- compliment_writer: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- elite: string (nullable = true)
 |-- fans: long (nullable = true)
 |-- friends: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- yelping_since: string (nullable = true)



In [7]:
review_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



In [8]:
checkin_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- date: string (nullable = true)



In [9]:
tip_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- compliment_count: long (nullable = true)
 |-- date: string (nullable = true)
 |-- text: string (nullable = true)
 |-- user_id: string (nullable = true)



In [10]:
# Count the distinct rows in each table
business_count = business_df.distinct().count()
user_count = user_df.distinct().count()
review_count = review_df.distinct().count()
checkin_count = checkin_df.distinct().count()
tips_count = tip_df.distinct().count()
# Print the total distinct rows count for each table
print("Distinct Rows Count:")
print("Business:", business_count)
print("Users:", user_count)
print("Review:", review_count)
print("Checkin:", checkin_count)
print("Tips:", tips_count)

Distinct Rows Count:
Business: 150346
Users: 1987897
Review: 6990280
Checkin: 131930
Tips: 908848


In [12]:
business_df.createOrReplaceTempView("business_table")
user_df.createOrReplaceTempView("user_table")
review_df.createOrReplaceTempView("review_table")
checkin_df.createOrReplaceTempView("checkin_table")
tip_df.createOrReplaceTempView("tip_table")

In [12]:
#How many businesses does each user review on average?
output = spark.sql(
'''
SELECT user_id, name, review_count / business_count AS average_business_count
FROM (
    SELECT u.user_id, u.name, COUNT(*) AS review_count,
        COUNT(DISTINCT b.business_id) AS business_count
    FROM user_table u, review_table r, business_table b
    WHERE u.user_id = r.user_id
        AND r.business_id = b.business_id
    GROUP BY u.user_id, u.name
) temp
ORDER BY average_business_count DESC
limit 10
'''
)
output.show(truncate=False)
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/average_business_count.csv")

+----------------------+------+----------------------+
|user_id               |name  |average_business_count|
+----------------------+------+----------------------+
|F8vHZipbIVclQZEMEyOl9w|Beth  |16.0                  |
|Sa6MxEeKd8euwZTsid4Tkg|Liz   |12.0                  |
|I6x-ZBHeCNlMnmtfUVf5lg|Jeff  |10.0                  |
|DI5CPhofkm-frITqdStCVw|Ken   |10.0                  |
|cyFYb0DSjZLlaVUp7dsD3w|L.S.  |9.0                   |
|Qix8WCSoQpwHoIt5YW6Vwg|Larrie|9.0                   |
|RlYJZ3t8MrLz4tjuTHz2HQ|John  |8.0                   |
|iwpEsTz2WPEuLVBiY_5bHw|Vishal|8.0                   |
|EhWmDaaj8PEULAsy19baVg|Edith |8.0                   |
|eedIxFP9WWpgFLQc77dwdg|Angela|7.0                   |
+----------------------+------+----------------------+



In [13]:
#businesses with the most active user base based on the highest average number of reviews per user
output = spark.sql(
'''
SELECT b.business_id, b.name, round(r.avg_reviews_per_user,2) as avg_reviews_per_user
FROM business_table b, 
(SELECT business_id, AVG(review_count) AS avg_reviews_per_user
    FROM (
        SELECT r.business_id, r.user_id, COUNT(*) AS review_count
        FROM review_table AS r, user_table AS u
        WHERE r.user_id = u.user_id
        GROUP BY r.business_id, r.user_id
    ) AS subquery
    GROUP BY business_id
) r
WHERE b.business_id = r.business_id
ORDER BY avg_reviews_per_user DESC
LIMIT 10
'''
)
output.show(truncate=False)
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/avg_reviews_per_user.csv")

+----------------------+------------------------------+--------------------+
|business_id           |name                          |avg_reviews_per_user|
+----------------------+------------------------------+--------------------+
|SCXkyyKWzS7qWl2jBiWQiQ|Governor Square Apartments    |3.71                |
|YOdGWNE5nl73AUoWpFvyIg|RTC 4th Street Bus Station    |3.22                |
|mgohMnfSufUDwDHSJTplVg|Vangard Lofts                 |3.0                 |
|e_VHdaIRyiWrKqcCRVq3Bw|Essence of Beauty             |3.0                 |
|TKJJu8cuKv4JkmgdC4lX4g|Ethan Allen Home Interiors    |2.83                |
|nDyvVBvJnLlVp7H_Pb4BYA|Boise Airport Cab             |2.67                |
|cuXUuoOSZ4VbV6jvh3BC0A|Subway                        |2.62                |
|ISFPX6auJZwFd1QQttcJew|Camelot At Cinnaminson Harbour|2.5                 |
|1vtI5aA07QE3c7f-g1Bppw|Las Brisas Apartment Homes    |2.4                 |
|xTSi5C1QkHfHZ7cdqZ5VfA|Abigail Court                 |2.4                 |

In [14]:
#user ratings vary across different business categories 
output = spark.sql(
'''
SELECT categories, AVG(stars) AS average_rating
FROM business_table
GROUP BY categories
ORDER BY average_rating DESC
'''
)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/average_rating.csv")

+--------------------+--------------+
|          categories|average_rating|
+--------------------+--------------+
|Salad, Restaurant...|           5.0|
|Veterinarians, Pe...|           5.0|
|Professional Serv...|           5.0|
|Physical Therapy,...|           5.0|
|Airlines, Transpo...|           5.0|
|Wedding Planning,...|           5.0|
|Eyelash Service, ...|           5.0|
|Windows Installat...|           5.0|
|Traditional Chine...|           5.0|
|Automotive, Home ...|           5.0|
|Yoga, Trainers, A...|           5.0|
|Men's Hair Salons...|           5.0|
|Bikes, Shopping, ...|           5.0|
|Wedding Planning,...|           5.0|
|Wedding Planning,...|           5.0|
|Health & Medical,...|           5.0|
|Acupuncture, Heal...|           5.0|
|Professional Serv...|           5.0|
|Office Cleaning, ...|           5.0|
|Massage Therapy, ...|           5.0|
+--------------------+--------------+
only showing top 20 rows



In [15]:
#top-rated businesses in each category
output = spark.sql(
'''
SELECT categories, business_id, name, stars
FROM (
    SELECT categories, business_id, name, stars,
        ROW_NUMBER() OVER (PARTITION BY categories ORDER BY stars DESC) AS rn
    FROM business_table
) AS subquery
WHERE rn = 1
'''
)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/top_rated_businesses_per_category.csv")

+--------------------+--------------------+--------------------+-----+
|          categories|         business_id|                name|stars|
+--------------------+--------------------+--------------------+-----+
|Acai Bowls, Juice...|DQ8qh7o8clZo5wZ2X...|   Rahvia Acai Bowls|  4.0|
|Acai Bowls, Organ...|RIN3ZEQmbqXh8ZYQ4...|    Good Vibes Juice|  4.5|
|Accessories, Fash...|1r3ESnezvx0V17Hhn...|    Collective at MX|  4.5|
|Accessories, Fash...|QohlZLW7Lt1DSCFZ3...|      Vineyard Vines|  3.5|
|Accessories, Jewe...|12ONhB1nammjZWPXU...|Haute Women's Bou...|  5.0|
|Accessories, Shop...|kgJ0N-a7ITVWQlEq9...|       Vamp Boutique|  3.5|
|Accessories, Wome...|B9oez-fvFRD76en7p...|            Dress Up|  4.0|
|Accessories, Wome...|7w-nj4YD5bhO_Jq4K...|United Apparel Li...|  4.0|
|Active Life, Amus...|fUTl2cZOu5UdBBwtq...|  Skate Country East|  4.0|
|Active Life, Arca...|vYglkvPqLy1Oa_sSs...|    Gateway Fun Park|  2.0|
|Active Life, Arts...|KouCHiR1PT00qiT-V...|          Clue Carré|  4.5|
|Activ

In [14]:
#Top 5 influential users based on their reviews, ratings, and interactions with businesses
output = spark.sql(
'''
SELECT r.user_id, u.name, COUNT(*) AS review_count, AVG(r.stars) AS average_rating, 
COUNT(DISTINCT r.business_id) AS business_count
FROM review_table r, user_table u
WHERE r.user_id = u.user_id
GROUP BY r.user_id, u.name
ORDER BY review_count DESC, average_rating DESC, business_count DESC
LIMIT 5
'''
)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/influencial_users.csv")

+--------------------+--------+------------+------------------+--------------+
|             user_id|    name|review_count|    average_rating|business_count|
+--------------------+--------+------------+------------------+--------------+
|_BcWyKQL16ndpBdgg...|   Karen|        3048|3.6377952755905514|          2199|
|Xw7ZjaGfr0WNVt6s_...|Marielle|        1840| 4.072826086956522|          1754|
|0Igx-a1wAstiBDerG...|     Jen|        1747|3.9902690326273613|          1471|
|-G7Zkl1wIWBBmD0KR...|  Gerald|        1682| 3.652794292508918|          1549|
|ET8n-r7glWYqZhuR6...|Michelle|        1653| 4.046581972171809|          1342|
+--------------------+--------+------------+------------------+--------------+



In [7]:
#Influential users who provide a high number of tips that receive "cool" votes
output = spark.sql(
'''
SELECT u.user_id, u.name, COUNT(*) AS tip_count, SUM(u.cool) AS cool_votes
FROM tip_table t, user_table u
WHERE t.user_id = u.user_id
GROUP BY u.user_id, u.name
ORDER BY tip_count DESC, cool_votes DESC
limit 10
'''
)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/influencial_users_cool_tips.csv")

+--------------------+-------+---------+----------+
|             user_id|   name|tip_count|cool_votes|
+--------------------+-------+---------+----------+
|fCvMnJU1Z-XhAjKg9...|Michael|     4071|   1950009|
|5Y5KbsI5buMcNh2hT...| Marcia|     1385|   4466625|
|lMY8NBPyzlPbbu-KB...|Christy|     1373|  48994132|
|qjfMBIZpQT9DDtw_B...| Rachel|     1230|  18724290|
|Rr4cLb6Go91FT134o...|    Gem|     1230|     19680|
|-G7Zkl1wIWBBmD0KR...| Gerald|     1035|  50018445|
|sraTrYU-7q_bQ0TxQ...|  Brian|      902|  12929268|
|I2XpWCHAom1JRyHXZ...|  Kathy|      821|   1117381|
|xWmYN57XXZbg0LOK8...|   John|      813|    477231|
|zYFGMy1_thjMnvQLX...|  Wanda|      782|   4719370|
+--------------------+-------+---------+----------+



In [12]:
#distribution of check-ins and tips across different states
output = spark.sql(
'''
SELECT b.state, COUNT(DISTINCT c.business_id) AS checkin_count, COUNT(DISTINCT t.business_id) AS tip_count
FROM business_table b, checkin_table c, tip_table t
WHERE b.business_id = c.business_id
  AND b.business_id = t.business_id
GROUP BY b.state
ORDER BY checkin_count DESC, tip_count DESC
limit 10
'''
)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/dist_tip_checkin_region.csv")

+-----+-------------+---------+
|state|checkin_count|tip_count|
+-----+-------------+---------+
|   PA|        22793|    22793|
|   FL|        17629|    17629|
|   TN|         8076|     8076|
|   IN|         8017|     8017|
|   MO|         7814|     7814|
|   LA|         7656|     7656|
|   AZ|         6453|     6453|
|   NJ|         5715|     5715|
|   NV|         5038|     5038|
|   AB|         3870|     3870|
+-----+-------------+---------+



In [7]:
#top-rated businesses in each category, considering both the average rating and the number of tips received
output = spark.sql(
'''
WITH business_ratings AS (
    SELECT b.business_id, b.categories, AVG(r.stars) AS average_rating, COUNT(t.business_id) AS tip_count
    FROM business_table b, review_table r, tip_table t
    WHERE b.business_id = r.business_id
        AND b.business_id = t.business_id
    GROUP BY b.business_id, b.categories
)

SELECT br.categories, br.business_id, AVG(br.average_rating) AS average_rating, SUM(br.tip_count) AS total_tips
FROM business_ratings br, (
    SELECT categories, MAX(average_rating) AS max_rating
    FROM business_ratings
    GROUP BY categories
) top_ratings
WHERE br.categories = top_ratings.categories AND br.average_rating = top_ratings.max_rating
GROUP BY br.categories, br.business_id
ORDER BY br.categories , average_rating DESC,total_tips desc
limit 10
'''
)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/top_bus_avgrat_notips.csv")

+--------------------+--------------------+------------------+----------+
|          categories|         business_id|    average_rating|total_tips|
+--------------------+--------------------+------------------+----------+
|ATV Rentals/Tours...|M-GMA5wbpjusPT04_...| 4.509803921568627|       204|
|Acai Bowls, Ameri...|erCGkXWcik3Kij95v...| 3.241830065359477|      1989|
|Acai Bowls, Asian...|x8AQIEjSqsW0z4vRy...|   4.1417004048583|      3952|
|Acai Bowls, Coffe...|J_w5J7dgRpyB4GFiB...|            4.1875|       912|
|Acai Bowls, Coffe...|4mYPqTpCjHqx8Ee4T...| 4.888888888888889|         9|
|    Acai Bowls, Food|g4wOwXihlqSMuTHgp...| 4.428571428571429|         7|
|Acai Bowls, Food ...|vdv_HaU7ZIID12KB9...| 3.557377049180328|      1098|
|Acai Bowls, Food,...|NhZ8lnDbq6cJYJJ0w...| 4.244186046511628|       258|
|Acai Bowls, Food,...|Hroul_wxzfjww2z1u...|3.9298245614035086|       684|
|Acai Bowls, Food,...|D_xrWpy2G0Gc33sxP...| 4.305555555555555|       216|
+--------------------+----------------

In [14]:
#patterns or trends in user ratings over time
output = spark.sql(
'''
SELECT SUM(average_stars) total_avg_5_star_rtngs,
sum(review_count) total_review_count,
date_format(yelping_since,'yyyy') as month
from user_table
where average_stars='5.0'
group by date_format(yelping_since,'yyyy')
order by total_avg_5_star_rtngs desc
'''
)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/trends_ratings_reviews_user.csv")

+----------------------+------------------+-----+
|total_avg_5_star_rtngs|total_review_count|month|
+----------------------+------------------+-----+
|              250480.0|            129701| 2016|
|              245150.0|            136606| 2015|
|              196535.0|            117880| 2014|
|              183670.0|             89264| 2017|
|              176655.0|             78359| 2018|
|              150300.0|             97000| 2013|
|              148305.0|             60967| 2019|
|              126110.0|             83020| 2012|
|               97745.0|             68566| 2011|
|               70065.0|             26754| 2020|
|               57440.0|             18810| 2021|
|               46450.0|             34059| 2010|
|               19855.0|             16626| 2009|
|                5540.0|              5235| 2008|
|                5025.0|              1149| 2022|
|                1845.0|              1730| 2007|
|                 580.0|               583| 2006|


In [40]:
#Top star rated business in each state
output = spark.sql(
'''
-- Calculate the average ratings per business and state
WITH average_ratings AS (
  SELECT b.name, b.state, AVG(b.stars) AS average_rating, AVG(review_count) AS average_review, b.latitude,b.longitude
  FROM business_table b
  GROUP BY b.name, b.state, b.latitude,b.longitude
),

-- Rank businesses within each state
ranked_businesses AS (
  SELECT name, state, average_rating, average_review,latitude,longitude,
         ROW_NUMBER() OVER (PARTITION BY state ORDER BY average_rating DESC, average_review DESC) AS ranking
  FROM average_ratings
)

-- Filter for top-rated businesses
SELECT name, state,latitude,longitude, average_rating, average_review
FROM ranked_businesses
WHERE  ranking=1
'''
)
output.show(30)
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/top_rated_business_per_state.csv")

+--------------------+-----+-------------+---------------+--------------+--------------+
|                name|state|     latitude|      longitude|average_rating|average_review|
+--------------------+-----+-------------+---------------+--------------+--------------+
|            Tumerico|   AZ|32.2277054884|-110.9347224981|           5.0|         705.0|
|  Free Tours By Foot|   LA|   29.9253533|    -90.0799876|           5.0|         769.0|
|     Gelato Dolceria|   NJ|39.8981422821|  -75.032111215|           5.0|         121.0|
|          Buena Onda|   CA|   34.4246414|    -119.686693|           5.0|         414.0|
|          Scalessa's|   DE|   39.7608359|    -75.5623992|           5.0|         176.0|
|A New Twist Ballo...|   ID|   43.6054854|     -116.20715|           5.0|         276.0|
|           Mr Brakes|   MI|   39.8031006|    -75.0505471|           2.5|           9.0|
|     Blues City Deli|   MO|    38.605024|    -90.2181096|           5.0|         991.0|
|   MudMan Food Truck