In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
from pyspark.sql.functions import udf, struct, col
from pyspark.sql.types import *

In [3]:
spark= SparkSession.builder.appName("Data Analysis on business").getOrCreate()

In [4]:
business_df = spark.read.json("C:\Supriyaa-spark-notes\yelp\yelp_academic_dataset_business.json")

In [5]:
business_df.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [6]:
business_df.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [7]:
nrow=business_df.count()
ncol=len(business_df.columns)
print("Dataset Shape:",'(', nrow,',',ncol,')')

Dataset Shape: ( 150346 , 14 )


In [8]:
business_df.dtypes

[('address', 'string'),
 ('attributes',
  'struct<AcceptsInsurance:string,AgesAllowed:string,Alcohol:string,Ambience:string,BYOB:string,BYOBCorkage:string,BestNights:string,BikeParking:string,BusinessAcceptsBitcoin:string,BusinessAcceptsCreditCards:string,BusinessParking:string,ByAppointmentOnly:string,Caters:string,CoatCheck:string,Corkage:string,DietaryRestrictions:string,DogsAllowed:string,DriveThru:string,GoodForDancing:string,GoodForKids:string,GoodForMeal:string,HairSpecializesIn:string,HappyHour:string,HasTV:string,Music:string,NoiseLevel:string,Open24Hours:string,OutdoorSeating:string,RestaurantsAttire:string,RestaurantsCounterService:string,RestaurantsDelivery:string,RestaurantsGoodForGroups:string,RestaurantsPriceRange2:string,RestaurantsReservations:string,RestaurantsTableService:string,RestaurantsTakeOut:string,Smoking:string,WheelchairAccessible:string,WiFi:string>'),
 ('business_id', 'string'),
 ('categories', 'string'),
 ('city', 'string'),
 ('hours',
  'struct<Friday:st

In [9]:
business_df.describe().show()

+-------+-----------------+--------------------+--------------------+-----------+-------------------+-----------------+------------------+--------------------+-----------------+------------------+------------------+------+
|summary|          address|         business_id|          categories|       city|            is_open|         latitude|         longitude|                name|      postal_code|      review_count|             stars| state|
+-------+-----------------+--------------------+--------------------+-----------+-------------------+-----------------+------------------+--------------------+-----------------+------------------+------------------+------+
|  count|           150346|              150346|              150243|     150346|             150346|           150346|            150346|              150346|           150346|            150346|            150346|150346|
|   mean|7369.333333333333|                null|                null|       null| 0.7961502135075094|36.6711

In [10]:
#print distinct values of each column
from pyspark.sql.functions import col

# Iterate over each column in the DataFrame
for column in business_df.columns:
    distinct_values = business_df.select(col(column)).distinct().count()
    print(f"Distinct values of {column}:",distinct_values)

Distinct values of address: 122844
Distinct values of attributes: 67213
Distinct values of business_id: 150346
Distinct values of categories: 83161
Distinct values of city: 1416
Distinct values of hours: 49823
Distinct values of is_open: 2
Distinct values of latitude: 135593
Distinct values of longitude: 131918
Distinct values of name: 114117
Distinct values of postal_code: 3362
Distinct values of review_count: 1158
Distinct values of stars: 9
Distinct values of state: 27


In [11]:
#Missing values
from pyspark.sql.functions import col, isnull

# Missing values
total_rows = business_df.count()
for i in range(len(business_df.columns)):
    column_name = business_df.columns[i]
    n = business_df.filter(isnull(col(column_name))).count()
    p = (n / total_rows) * 100
    print(f"Missing values in {column_name}: {p:.2f}%",' and total count: ',n)

Missing values in address: 0.00%  and total count:  0
Missing values in attributes: 9.14%  and total count:  13744
Missing values in business_id: 0.00%  and total count:  0
Missing values in categories: 0.07%  and total count:  103
Missing values in city: 0.00%  and total count:  0
Missing values in hours: 15.45%  and total count:  23223
Missing values in is_open: 0.00%  and total count:  0
Missing values in latitude: 0.00%  and total count:  0
Missing values in longitude: 0.00%  and total count:  0
Missing values in name: 0.00%  and total count:  0
Missing values in postal_code: 0.00%  and total count:  0
Missing values in review_count: 0.00%  and total count:  0
Missing values in stars: 0.00%  and total count:  0
Missing values in state: 0.00%  and total count:  0


In [12]:
# Filter the DataFrame to show rows with missing values in a specific column
column_with_missing = "categories"
business_df.filter(isnull(col(column_with_missing))).show(truncate=False)

+------------------------------+----------+----------------------+----------+----------------+-----+-------+----------+------------+---------------------------------------+-----------+------------+-----+-----+
|address                       |attributes|business_id           |categories|city            |hours|is_open|latitude  |longitude   |name                                   |postal_code|review_count|stars|state|
+------------------------------+----------+----------------------+----------+----------------+-----+-------+----------+------------+---------------------------------------+-----------+------------+-----+-----+
|60 W White Horse Pike         |null      |SMYXOLPyM95JvZ-oqnsWUA|null      |Berlin          |null |1      |39.8004163|-74.9371806 |A A Berlin Glass & Mirror Co           |08009      |5           |3.0  |NJ   |
|10204 Saint Charles Rck Rd    |null      |9ryVeDaaR-le3kiSayTGow|null      |Saint Ann       |null |1      |38.7260321|-90.3793227 |Pauline African Hair Braidin

In [13]:
column_to_check = "categories"
distinct_values = business_df.select(column_to_check).distinct().collect()
print(f"Distinct values in {column_to_check}:")
for row in distinct_values:
    print(row[0])

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [14]:
column_with_missing = "hours"
business_df.filter(isnull(col(column_with_missing))).select('business_id','categories','hours','attributes').show()

+--------------------+--------------------+-----+--------------------+
|         business_id|          categories|hours|          attributes|
+--------------------+--------------------+-----+--------------------+
|Pns2l4eNsfO8kk83d...|Doctors, Traditio...| null|{null, null, null...|
|k0hlBqXX-Bt0vf1op...|Pubs, Restaurants...| null|{null, null, u'fu...|
|4iRzR7OaS-QaSXuvY...|Hot Dogs, Restaur...| null|{null, null, u'be...|
|w_AMNoI1iG9eay7nc...|Event Planning & ...| null|{null, null, null...|
|2xVsWBNFwZOxIOdd9...|Restaurants, Burgers| null|{null, null, u'fu...|
|bCBPXIVfVzBZBEpFu...|Shipping Centers,...| null|{null, null, null...|
|REwfwz-_-CMQ7Np5U...|Real Estate Agent...| null|                null|
|0qNpTGTcqPwOLi2hA...|Food, Grocery, Co...| null|{null, null, null...|
|xM6LoUcnpDpMBzXs_...|Hotels, Hotels & ...| null|{null, null, null...|
|L_TT0BFmFwORAMaA8...|Gastropubs, Cockt...| null|{null, null, u'no...|
|mjWvAkNqoA6Y5P03s...|  Uniforms, Shopping| null|{null, null, null...|
|qfWJm

In [15]:
business_df.filter((isnull(col('hours')))&(isnull(col('categories')))&(isnull(col('attributes')))).select('business_id','categories','hours','attributes').show()

+--------------------+----------+-----+----------+
|         business_id|categories|hours|attributes|
+--------------------+----------+-----+----------+
|SMYXOLPyM95JvZ-oq...|      null| null|      null|
|9ryVeDaaR-le3kiSa...|      null| null|      null|
|xT3J-SP5g49g2FjQf...|      null| null|      null|
|_obl2-rphXvtzP3y_...|      null| null|      null|
|mKxCNYEoKt6d_1rXm...|      null| null|      null|
|9QoKKDZB_YuDeS5Tx...|      null| null|      null|
|lxaSo0sBK36BNDRL6...|      null| null|      null|
|ZERQMWb1PFzCfbfkn...|      null| null|      null|
|cs7i8-NtrT2P4dMYa...|      null| null|      null|
|tfQEd3kakCQdbjfdp...|      null| null|      null|
|NpQowTAUYGeylRCAs...|      null| null|      null|
|7cEbbI3wjuGSsJUIG...|      null| null|      null|
|6aVfpb46kY1FN7nFc...|      null| null|      null|
|_LoMSJiz4dLYnqwRN...|      null| null|      null|
|Dngea1LMy4JhJiC5m...|      null| null|      null|
|2M_l_vsJx2T_Ihu2X...|      null| null|      null|
|Ap9LaMbl1hJX0c7q3...|      nul

In [16]:
c=business_df.filter((isnull(col('hours')))&(isnull(col('categories')))&(isnull(col('attributes')))).select('business_id','categories','hours','attributes').count()
print("Total rows where all 3 columns are Null: ", c)

Total rows where all 3 columns are Null:  102


In [17]:
c=business_df.filter((isnull(col('hours')))&~(isnull(col('categories')))&~(isnull(col('attributes')))).select('business_id','categories','hours','attributes').count()
print("Total rows where hour is Null but other two columns are not null: ", c)

Total rows where hour is Null but other two columns are not null:  18983


In [18]:
business_df.filter((isnull(col('hours')))&~(isnull(col('categories')))&~(isnull(col('attributes')))).select('business_id','categories','hours','attributes').show()

+--------------------+--------------------+-----+--------------------+
|         business_id|          categories|hours|          attributes|
+--------------------+--------------------+-----+--------------------+
|Pns2l4eNsfO8kk83d...|Doctors, Traditio...| null|{null, null, null...|
|k0hlBqXX-Bt0vf1op...|Pubs, Restaurants...| null|{null, null, u'fu...|
|4iRzR7OaS-QaSXuvY...|Hot Dogs, Restaur...| null|{null, null, u'be...|
|w_AMNoI1iG9eay7nc...|Event Planning & ...| null|{null, null, null...|
|2xVsWBNFwZOxIOdd9...|Restaurants, Burgers| null|{null, null, u'fu...|
|bCBPXIVfVzBZBEpFu...|Shipping Centers,...| null|{null, null, null...|
|0qNpTGTcqPwOLi2hA...|Food, Grocery, Co...| null|{null, null, null...|
|xM6LoUcnpDpMBzXs_...|Hotels, Hotels & ...| null|{null, null, null...|
|L_TT0BFmFwORAMaA8...|Gastropubs, Cockt...| null|{null, null, u'no...|
|mjWvAkNqoA6Y5P03s...|  Uniforms, Shopping| null|{null, null, null...|
|qfWJmJ0g96eM_fWma...|Seafood, Restaura...| null|{null, null, u'fu...|
|KWA2q

In [5]:
# Register the DataFrame as a temporary table
business_df.createOrReplaceTempView("business_table")

In [19]:
result = spark.sql("SELECT count(*) as total_count FROM business_table")
result.show()

+-----------+
|total_count|
+-----------+
|     150346|
+-----------+



In [20]:
result = spark.sql("SELECT count(*) as total_count FROM (select distinct * from business_table)")
result.show()

+-----------+
|total_count|
+-----------+
|     150346|
+-----------+



In [21]:
result=spark.sql('''select * from 
(select b.business_id,
row_number() over (partition by address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,
name,postal_code,review_count,state,stars
order by address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,
name,postal_code,review_count,state,stars) as rown
from business_table b)
where rown>2
''')
result.show()

+-----------+----+
|business_id|rown|
+-----------+----+
+-----------+----+



In [22]:
from pyspark.sql.functions import monotonically_increasing_id
df_with_rowid = business_df.withColumn("rowid", monotonically_increasing_id())
df_with_rowid.createOrReplaceTempView("business_table1")
result=spark.sql('''SELECT * FROM business_table1 b1
WHERE rowid > (
  SELECT MIN(rowid) FROM business_table1 b2  
  WHERE 1=1
  and b1.address = b2.address
  and b1.attributes = b2.attributes
  and b1.business_id = b2.business_id
  and b1.categories = b2.categories
  and b1.city = b2.city
  and b1.hours = b2.hours
  and b1.is_open = b2.is_open
  and b1.latitude = b2.latitude
  and b1.longitude = b2.longitude
  and b1.name = b2.name
  and b1.postal_code = b2.postal_code
  and b1.review_count = b2.review_count
  and b1.state = b2.state
  and b1.stars = b2.stars
)''')
result.show()

+-------+----------+-----------+----------+----+-----+-------+--------+---------+----+-----------+------------+-----+-----+-----+
|address|attributes|business_id|categories|city|hours|is_open|latitude|longitude|name|postal_code|review_count|stars|state|rowid|
+-------+----------+-----------+----------+----+-----+-------+--------+---------+----+-----------+------------+-----+-----+-----+
+-------+----------+-----------+----------+----+-----+-------+--------+---------+----+-----------+------------+-----+-----+-----+



In [23]:
#To check if business_id is unique
result=spark.sql("select count(distinct business_id) from business_table")
result.show()

+---------------------------+
|count(DISTINCT business_id)|
+---------------------------+
|                     150346|
+---------------------------+



In [20]:
output =spark.sql("""select * from (SELECT categories, COUNT(*)  as count
FROM business_table
GROUP BY categories) order by count desc
                    """)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/categories_count.csv")

+--------------------+-----+
|          categories|count|
+--------------------+-----+
|Beauty & Spas, Na...| 1012|
|  Restaurants, Pizza|  935|
|Nail Salons, Beau...|  934|
|  Pizza, Restaurants|  823|
|Restaurants, Mexican|  728|
|Restaurants, Chinese|  708|
|Mexican, Restaurants|  672|
|Chinese, Restaurants|  651|
|  Food, Coffee & Tea|  508|
|Beauty & Spas, Ha...|  493|
|Hair Salons, Beau...|  480|
|  Coffee & Tea, Food|  473|
|Automotive, Auto ...|  361|
|Auto Repair, Auto...|  351|
|       Grocery, Food|  329|
|Italian, Restaurants|  328|
|       Food, Grocery|  306|
| Veterinarians, Pets|  300|
|Ice Cream & Froze...|  298|
| Pets, Veterinarians|  285|
+--------------------+-----+
only showing top 20 rows



In [25]:
output =spark.sql("""select * from (SELECT stars, COUNT(*)  as count
FROM business_table
GROUP BY stars)
order by count desc
                    """)
output.show()

+-----+-----+
|stars|count|
+-----+-----+
|  4.0|31125|
|  4.5|27181|
|  3.5|26519|
|  3.0|18453|
|  5.0|16307|
|  2.5|14316|
|  2.0| 9527|
|  1.5| 4932|
|  1.0| 1986|
+-----+-----+



In [21]:
output =spark.sql("""select * from (SELECT city, COUNT(*)  as count
FROM business_table
GROUP BY city)
order by count desc
                    """)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/city_count.csv")

+----------------+-----+
|            city|count|
+----------------+-----+
|    Philadelphia|14569|
|          Tucson| 9250|
|           Tampa| 9050|
|    Indianapolis| 7540|
|       Nashville| 6971|
|     New Orleans| 6209|
|            Reno| 5935|
|        Edmonton| 5054|
|     Saint Louis| 4827|
|   Santa Barbara| 3829|
|           Boise| 2937|
|      Clearwater| 2221|
|Saint Petersburg| 1663|
|        Metairie| 1643|
|          Sparks| 1624|
|      Wilmington| 1446|
|        Franklin| 1414|
|       St. Louis| 1255|
|  St. Petersburg| 1185|
|        Meridian| 1043|
+----------------+-----+
only showing top 20 rows



In [22]:
output =spark.sql("""select * from (SELECT state, COUNT(*)  as count
FROM business_table
GROUP BY state)
order by count desc
                    """)
output.show(40)
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/state_count.csv")

+-----+-----+
|state|count|
+-----+-----+
|   PA|34039|
|   FL|26330|
|   TN|12056|
|   IN|11247|
|   MO|10913|
|   LA| 9924|
|   AZ| 9912|
|   NJ| 8536|
|   NV| 7715|
|   AB| 5573|
|   CA| 5203|
|   ID| 4467|
|   DE| 2265|
|   IL| 2145|
|   TX|    4|
|   CO|    3|
|   MA|    2|
|   HI|    2|
|   WA|    2|
|   NC|    1|
|   UT|    1|
|   SD|    1|
|   MI|    1|
|  XMS|    1|
|   MT|    1|
|   VI|    1|
|   VT|    1|
+-----+-----+



In [28]:
output =spark.sql("""select * from (SELECT review_count, COUNT(*)  as count
FROM business_table
GROUP BY review_count)
order by count desc
                    """)
output.show()

+------------+-----+
|review_count|count|
+------------+-----+
|           5|14921|
|           6|11673|
|           7| 9594|
|           8| 8040|
|           9| 6875|
|          10| 5921|
|          11| 5087|
|          12| 4676|
|          13| 4194|
|          14| 3635|
|          15| 3258|
|          16| 3007|
|          17| 2756|
|          18| 2443|
|          19| 2347|
|          20| 2220|
|          21| 1972|
|          22| 1857|
|          23| 1789|
|          24| 1702|
+------------+-----+
only showing top 20 rows



In [23]:
output =spark.sql("""select * from (SELECT is_open, COUNT(*)  as count
FROM business_table
GROUP BY is_open)
order by count desc
                    """)
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/bus_open_count.csv")

+-------+------+
|is_open| count|
+-------+------+
|      1|119698|
|      0| 30648|
+-------+------+



In [30]:
output=spark.sql('''select attributes from business_table where attributes is not null''')
output.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|attributes                                                                                                                                                                                                                                                                                                                                                                                                              

In [31]:
# Extract JSON values using Spark SQL
spark.sql("""
    CREATE OR REPLACE TEMPORARY VIEW attributes_table AS
    SELECT
    business_id,
        attributes.AcceptsInsurance,
        attributes.AgesAllowed,
        attributes.Alcohol,
       attributes.Ambience,
 attributes.BYOB,
  attributes.BYOBCorkage,
  attributes.BestNights,
 attributes.BikeParking,
 attributes.BusinessAcceptsBitcoin,
 attributes.BusinessAcceptsCreditCards,
 attributes.BusinessParking,
 attributes.ByAppointmentOnly,
 attributes.Caters,
 attributes.CoatCheck,
 attributes.Corkage,
  attributes.DietaryRestrictions,
  attributes.DogsAllowed,
 attributes.DriveThru,
  attributes.GoodForDancing,
  attributes.GoodForKids,
  attributes.GoodForMeal,
  attributes.HairSpecializesIn,
  attributes.HappyHour,
  attributes.HasTV,
  attributes.Music,
  attributes.NoiseLevel,
  attributes.Open24Hours,
  attributes.OutdoorSeating,
  attributes.RestaurantsAttire,
  attributes.RestaurantsCounterService,
  attributes.RestaurantsDelivery,
  attributes.RestaurantsGoodForGroups,
  attributes.RestaurantsPriceRange2,
  attributes.RestaurantsReservations,
  attributes.RestaurantsTableService,
  attributes.RestaurantsTakeOut,
  attributes.Smoking,
 attributes.WheelchairAccessible,
 attributes.WiFi
    FROM
        business_table
""")

# Query the new table
spark.sql("SELECT * FROM attributes_table").show()



+--------------------+----------------+-----------+-----------+--------------------+----+-----------+----------+-----------+----------------------+--------------------------+--------------------+-----------------+------+---------+-------+-------------------+-----------+---------+--------------+-----------+--------------------+-----------------+---------+-----+-----+----------+-----------+--------------+-----------------+-------------------------+-------------------+------------------------+----------------------+-----------------------+-----------------------+------------------+-------+--------------------+-------+
|         business_id|AcceptsInsurance|AgesAllowed|    Alcohol|            Ambience|BYOB|BYOBCorkage|BestNights|BikeParking|BusinessAcceptsBitcoin|BusinessAcceptsCreditCards|     BusinessParking|ByAppointmentOnly|Caters|CoatCheck|Corkage|DietaryRestrictions|DogsAllowed|DriveThru|GoodForDancing|GoodForKids|         GoodForMeal|HairSpecializesIn|HappyHour|HasTV|Music|NoiseLeve

In [32]:
spark.sql('''select count(*) from attributes_table''').show()

+--------+
|count(1)|
+--------+
|  150346|
+--------+



In [33]:
attributes_df=spark.table("attributes_table")

In [34]:
attributes_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- AcceptsInsurance: string (nullable = true)
 |-- AgesAllowed: string (nullable = true)
 |-- Alcohol: string (nullable = true)
 |-- Ambience: string (nullable = true)
 |-- BYOB: string (nullable = true)
 |-- BYOBCorkage: string (nullable = true)
 |-- BestNights: string (nullable = true)
 |-- BikeParking: string (nullable = true)
 |-- BusinessAcceptsBitcoin: string (nullable = true)
 |-- BusinessAcceptsCreditCards: string (nullable = true)
 |-- BusinessParking: string (nullable = true)
 |-- ByAppointmentOnly: string (nullable = true)
 |-- Caters: string (nullable = true)
 |-- CoatCheck: string (nullable = true)
 |-- Corkage: string (nullable = true)
 |-- DietaryRestrictions: string (nullable = true)
 |-- DogsAllowed: string (nullable = true)
 |-- DriveThru: string (nullable = true)
 |-- GoodForDancing: string (nullable = true)
 |-- GoodForKids: string (nullable = true)
 |-- GoodForMeal: string (nullable = true)
 |-- HairSpecializesIn: s

In [35]:
#Missing values
from pyspark.sql.functions import col, isnull

# Missing values
total_rows = attributes_df.count()
for i in range(len(attributes_df.columns)):
    column_name = attributes_df.columns[i]
    n = attributes_df.filter(isnull(col(column_name))).count()
    p = (n / total_rows) * 100
    print(f"Missing values in {column_name}: {p:.2f}%",' and total count: ',n)

Missing values in business_id: 0.00%  and total count:  0
Missing values in AcceptsInsurance: 96.20%  and total count:  144633
Missing values in AgesAllowed: 99.91%  and total count:  150217
Missing values in Alcohol: 71.27%  and total count:  107157
Missing values in Ambience: 70.55%  and total count:  106067
Missing values in BYOB: 97.04%  and total count:  145895
Missing values in BYOBCorkage: 99.04%  and total count:  148902
Missing values in BestNights: 96.21%  and total count:  144652
Missing values in BikeParking: 51.69%  and total count:  77708
Missing values in BusinessAcceptsBitcoin: 88.41%  and total count:  132916
Missing values in BusinessAcceptsCreditCards: 20.34%  and total count:  30581
Missing values in BusinessParking: 39.42%  and total count:  59261
Missing values in ByAppointmentOnly: 71.84%  and total count:  108007
Missing values in Caters: 73.31%  and total count:  110219
Missing values in CoatCheck: 96.29%  and total count:  144762
Missing values in Corkage: 97.

In [36]:
spark.sql('''select b.business_id,
a.BusinessAcceptsBitcoin,
a.BusinessAcceptsCreditCards,
b.categories,
b.city,
b.state,
b.name
from business_table b,
attributes_table a
where b.business_id=a.business_id
and b.business_id='0bPLkL0QhhPO5kt1_EXmNQ'
''').show(truncate=False)

+----------------------+----------------------+--------------------------+-------------------------------------------+-----+-----+--------------------+
|business_id           |BusinessAcceptsBitcoin|BusinessAcceptsCreditCards|categories                                 |city |state|name                |
+----------------------+----------------------+--------------------------+-------------------------------------------+-----+-----+--------------------+
|0bPLkL0QhhPO5kt1_EXmNQ|False                 |True                      |Food, Delis, Italian, Bakeries, Restaurants|Largo|FL   |Zio's Italian Market|
+----------------------+----------------------+--------------------------+-------------------------------------------+-----+-----+--------------------+



In [37]:
spark.sql('''
select min(stars) as min_stars, max(stars) as max_stars, round(avg(stars),2) as avg_stars
from business_table
''').show()

+---------+---------+---------+
|min_stars|max_stars|avg_stars|
+---------+---------+---------+
|      1.0|      5.0|      3.6|
+---------+---------+---------+



In [38]:
#Find category with more number of 5 stars
output=spark.sql(
'''SELECT categories
FROM business_table
WHERE stars = 5
GROUP BY categories
HAVING COUNT(*) = (
    SELECT MAX(category_count)
    FROM (
        SELECT COUNT(*) AS category_count
        FROM business_table
        WHERE stars = 5
        GROUP BY categories
    )
)
'''
)
output.show(truncate=False)

+-----------------------+
|categories             |
+-----------------------+
|Auto Repair, Automotive|
+-----------------------+



In [24]:
output=spark.sql('''
select * from (select categories, stars, count(stars) as total_count
from business_table
group by categories,stars
)
where categories='Auto Repair, Automotive'
order by total_count desc
''')
output.show(truncate=False)
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/category_more_5stars_stardistribution.csv")

+-----------------------+-----+-----------+
|categories             |stars|total_count|
+-----------------------+-----+-----------+
|Auto Repair, Automotive|5.0  |128        |
|Auto Repair, Automotive|4.5  |100        |
|Auto Repair, Automotive|4.0  |50         |
|Auto Repair, Automotive|3.5  |36         |
|Auto Repair, Automotive|3.0  |14         |
|Auto Repair, Automotive|2.5  |13         |
|Auto Repair, Automotive|1.0  |4          |
|Auto Repair, Automotive|2.0  |4          |
|Auto Repair, Automotive|1.5  |2          |
+-----------------------+-----+-----------+



In [25]:
output=spark.sql('''
select * from (select categories, stars,city,state, count(stars) as total_count
from business_table
group by categories,stars,city,state
order by stars desc,total_count desc
)
where categories='Auto Repair, Automotive'
''')
output.show(truncate=False)
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/categories_more_5stars_state_dist.csv")

+-----------------------+-----+----------------+-----+-----------+
|categories             |stars|city            |state|total_count|
+-----------------------+-----+----------------+-----+-----------+
|Auto Repair, Automotive|5.0  |Philadelphia    |PA   |13         |
|Auto Repair, Automotive|5.0  |Tucson          |AZ   |9          |
|Auto Repair, Automotive|5.0  |Nashville       |TN   |7          |
|Auto Repair, Automotive|5.0  |Saint Louis     |MO   |6          |
|Auto Repair, Automotive|5.0  |Tampa           |FL   |6          |
|Auto Repair, Automotive|5.0  |Santa Barbara   |CA   |5          |
|Auto Repair, Automotive|5.0  |Wilmington      |DE   |5          |
|Auto Repair, Automotive|5.0  |Goleta          |CA   |4          |
|Auto Repair, Automotive|5.0  |Sparks          |NV   |4          |
|Auto Repair, Automotive|5.0  |Safety Harbor   |FL   |3          |
|Auto Repair, Automotive|5.0  |Pinellas Park   |FL   |3          |
|Auto Repair, Automotive|5.0  |Levittown       |PA   |2       

In [41]:
#Find category with least number of 5 stars
spark.sql(
'''SELECT categories
FROM business_table
WHERE stars = 5
GROUP BY categories
HAVING COUNT(*) = (
    SELECT MIN(category_count)
    FROM (
        SELECT COUNT(*) AS category_count
        FROM business_table
        WHERE stars = 5
        GROUP BY categories
    )
)
'''
).show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|categories                                                                                                                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Dog Walkers, Pet Services, Health & Medical, Reiki, Pet Sitting, Pets                                                                                                                          |
|Community Service/Non-Profit, Local Services, Active Life, Botanical Gardens, Arts & Entertainment, Local Flavor, Parks                                                                        |
|Home Services, Contractors, L

In [42]:
spark.sql('''
select * from (select categories, stars, count(stars) as total_count
from business_table
group by categories,stars
)
where categories='Contractors, Painters, Home Services'
order by stars desc
''').show(truncate=False)

+------------------------------------+-----+-----------+
|categories                          |stars|total_count|
+------------------------------------+-----+-----------+
|Contractors, Painters, Home Services|5.0  |1          |
|Contractors, Painters, Home Services|3.5  |2          |
|Contractors, Painters, Home Services|3.0  |2          |
|Contractors, Painters, Home Services|2.5  |1          |
+------------------------------------+-----+-----------+



In [43]:
#Find business with more number of 5 stars
spark.sql(
'''SELECT name
FROM business_table
WHERE stars = 5
GROUP BY name
HAVING COUNT(*) = (
    SELECT MAX(name_count)
    FROM (
        SELECT COUNT(*) AS name_count
        FROM business_table
        WHERE stars = 5
        GROUP BY name
    )
)
'''
).show(truncate=False)

+---------------------+
|name                 |
+---------------------+
|Painting with a Twist|
+---------------------+



In [44]:
#Find business_id with more number of 5 stars
spark.sql(
'''SELECT business_id
FROM business_table
WHERE stars = 5
GROUP BY business_id
HAVING COUNT(*) = (
    SELECT MAX(id_count)
    FROM (
        SELECT COUNT(*) AS id_count
        FROM business_table
        WHERE stars = 5
        GROUP BY business_id
    )
)
'''
).show(truncate=False)

+----------------------+
|business_id           |
+----------------------+
|1dSKEitDDgIkaApe6UNMSA|
|adATTqggIQX5xxLDISkFTw|
|4grfwtBDji5tZ7AxYCXCzQ|
|iipnazeY9eoANJ37l5fKwA|
|sk2lZI4zmuGAccd3DLCnBw|
|SPhcwEy32VV8dsaRStj-qg|
|sEBL8OtHzw2ctY69D1aoaA|
|6xXgtiKNqni1UdMacNGTFw|
|PPJL9EbIM6B6r86Gf-H0nw|
|uft8PlHObqze95tve1yQVA|
|efou6IrwQhHAXZ0e0v0Gkw|
|bi3SICnYEuFC7Cihyqz8Nw|
|kKb00Oj96n5YYqMSDzx1IA|
|GL6QZaCH8EGv1Rn3vDNIVg|
|mxrl1SSM04F4DYP8dUwL2A|
|JujePuViptUIgGNiaWblkA|
|FxAnPjRkh-3OY7qdZ7hqFg|
|qmxmUc4xzNt3ogaZLH8Eiw|
|nDeDis86-fkO_2m2N0EELw|
|HtplZd3pECmiewb9Lj88yQ|
+----------------------+
only showing top 20 rows



In [45]:
spark.sql('''
select * from (select name, stars,city,state, count(stars) as total_count
from business_table
group by name,stars,city,state
order by stars desc,total_count desc
)
where name='Painting with a Twist'
''').show(truncate=False)

+---------------------+-----+------------+-----+-----------+
|name                 |stars|city        |state|total_count|
+---------------------+-----+------------+-----+-----------+
|Painting with a Twist|5.0  |Philadelphia|PA   |3          |
|Painting with a Twist|5.0  |Ballwin     |MO   |1          |
|Painting with a Twist|5.0  |Nashville   |TN   |1          |
|Painting with a Twist|5.0  |Trinity     |FL   |1          |
|Painting with a Twist|5.0  |Bensalem    |PA   |1          |
|Painting with a Twist|5.0  |Brandon     |FL   |1          |
|Painting with a Twist|5.0  |Mount Laurel|NJ   |1          |
|Painting with a Twist|5.0  |Haddonfield |NJ   |1          |
|Painting with a Twist|5.0  |Gretna      |LA   |1          |
|Painting with a Twist|5.0  |Skippack    |PA   |1          |
|Painting with a Twist|5.0  |Clearwater  |FL   |1          |
|Painting with a Twist|5.0  |Media       |PA   |1          |
|Painting with a Twist|4.5  |Tampa       |FL   |3          |
|Painting with a Twist|4

In [46]:
spark.sql('''
select * from (select name, stars,categories,city,state, count(stars) as total_count
from business_table
group by name,stars,city,state,categories
order by stars desc,total_count desc
)
where name='Painting with a Twist'
and city='Philadelphia'
''').show()

+--------------------+-----+--------------------+------------+-----+-----------+
|                name|stars|          categories|        city|state|total_count|
+--------------------+-----+--------------------+------------+-----+-----------+
|Painting with a T...|  5.0|Arts & Entertainm...|Philadelphia|   PA|          1|
|Painting with a T...|  5.0|Education, Arts &...|Philadelphia|   PA|          1|
|Painting with a T...|  5.0|Paint & Sip, Art ...|Philadelphia|   PA|          1|
|Painting with a T...|  4.5|Paint & Sip, Art ...|Philadelphia|   PA|          1|
+--------------------+-----+--------------------+------------+-----+-----------+



In [14]:
hours_df = business_df.select(
    "business_id",
    "hours.Monday",
    "hours.Tuesday",
    "hours.Wednesday",
    "hours.Thursday",
    "hours.Friday",
    "hours.Saturday",
    "hours.Sunday",
)

# Create a temporary view for the hours DataFrame
hours_df.createOrReplaceTempView("hours_table")

# Perform any desired operations on the hours table
result = spark.sql("SELECT * FROM hours_table")

# Show the result
result.show(truncate=False)

+----------------------+-----------+-----------+-----------+-----------+-----------+-----------+----------+
|business_id           |Monday     |Tuesday    |Wednesday  |Thursday   |Friday     |Saturday   |Sunday    |
+----------------------+-----------+-----------+-----------+-----------+-----------+-----------+----------+
|Pns2l4eNsfO8kk83dixA6A|null       |null       |null       |null       |null       |null       |null      |
|mpf3x-BjTdTEA3yCZrAYPw|0:0-0:0    |8:0-18:30  |8:0-18:30  |8:0-18:30  |8:0-18:30  |8:0-14:0   |null      |
|tUFrWirKiKi_TAnsVWINQQ|8:0-22:0   |8:0-22:0   |8:0-22:0   |8:0-22:0   |8:0-23:0   |8:0-23:0   |8:0-22:0  |
|MTSW4McQd7CbVtyjqoe9mw|7:0-20:0   |7:0-20:0   |7:0-20:0   |7:0-20:0   |7:0-21:0   |7:0-21:0   |7:0-21:0  |
|mWMc6_wTdE0EUBKIGXDVfA|null       |null       |14:0-22:0  |16:0-22:0  |12:0-22:0  |12:0-22:0  |12:0-18:0 |
|CF33F8-E6oudUQ46HnavjQ|0:0-0:0    |6:0-22:0   |6:0-22:0   |6:0-22:0   |9:0-0:0    |9:0-22:0   |8:0-22:0  |
|n_0UpQx1hsNbnPUSlodU8w|0:0-

In [48]:
spark.sql('''select b.name,h.* 
from business_table b,hours_table h 
where b.business_id=h.business_id''').show()

+--------------------+--------------------+-----------+-----------+-----------+-----------+-----------+-----------+----------+
|                name|         business_id|     Monday|    Tuesday|  Wednesday|   Thursday|     Friday|   Saturday|    Sunday|
+--------------------+--------------------+-----------+-----------+-----------+-----------+-----------+-----------+----------+
|Abby Rappoport, L...|Pns2l4eNsfO8kk83d...|       null|       null|       null|       null|       null|       null|      null|
|       The UPS Store|mpf3x-BjTdTEA3yCZ...|    0:0-0:0|  8:0-18:30|  8:0-18:30|  8:0-18:30|  8:0-18:30|   8:0-14:0|      null|
|              Target|tUFrWirKiKi_TAnsV...|   8:0-22:0|   8:0-22:0|   8:0-22:0|   8:0-22:0|   8:0-23:0|   8:0-23:0|  8:0-22:0|
|  St Honore Pastries|MTSW4McQd7CbVtyjq...|   7:0-20:0|   7:0-20:0|   7:0-20:0|   7:0-20:0|   7:0-21:0|   7:0-21:0|  7:0-21:0|
|Perkiomen Valley ...|mWMc6_wTdE0EUBKIG...|       null|       null|  14:0-22:0|  16:0-22:0|  12:0-22:0|  12:0-2

In [49]:
spark.sql('''select b.name,b.categories,b.city,b.state,h.* 
from business_table b,hours_table h 
where b.business_id=h.business_id
and b.name='Painting with a Twist'
and b.business_id = 'N2vUq6LOvvRzKB5zj4v0Rw'
''').show(truncate=False)

+---------------------+---------------------------------------------------------+-------+-----+----------------------+------+-------+---------+--------+------+--------+------+
|name                 |categories                                               |city   |state|business_id           |Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|
+---------------------+---------------------------------------------------------+-------+-----+----------------------+------+-------+---------+--------+------+--------+------+
|Painting with a Twist|Paint & Sip, Education, Art Classes, Arts & Entertainment|Ballwin|MO   |N2vUq6LOvvRzKB5zj4v0Rw|null  |null   |null     |null    |null  |null    |null  |
+---------------------+---------------------------------------------------------+-------+-----+----------------------+------+-------+---------+--------+------+--------+------+



In [26]:
#top 10 cities with most reviews
output=spark.sql('''
SELECT city, SUM(review_count) AS total_reviews
FROM business_table
GROUP BY city
ORDER BY total_reviews DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_cities_reviews.csv")

+-------------+-------------+
|         city|total_reviews|
+-------------+-------------+
| Philadelphia|       936240|
|  New Orleans|       621361|
|    Nashville|       441053|
|        Tampa|       439506|
|       Tucson|       387254|
| Indianapolis|       349228|
|         Reno|       334610|
|Santa Barbara|       262853|
|  Saint Louis|       244360|
|        Boise|       101893|
+-------------+-------------+



In [27]:
#top 10 cities with more number of business_id
output=spark.sql('''
SELECT city, count(business_id) AS total_bus
FROM business_table
GROUP BY city
ORDER BY total_bus DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_cities_ids.csv")

+-------------+---------+
|         city|total_bus|
+-------------+---------+
| Philadelphia|    14569|
|       Tucson|     9250|
|        Tampa|     9050|
| Indianapolis|     7540|
|    Nashville|     6971|
|  New Orleans|     6209|
|         Reno|     5935|
|     Edmonton|     5054|
|  Saint Louis|     4827|
|Santa Barbara|     3829|
+-------------+---------+



In [52]:
#top 10 cities with more number of name
spark.sql('''
SELECT city, count(name) AS total_name
FROM business_table
GROUP BY city
ORDER BY total_name DESC
LIMIT 10
''').show()

+-------------+----------+
|         city|total_name|
+-------------+----------+
| Philadelphia|     14569|
|       Tucson|      9250|
|        Tampa|      9050|
| Indianapolis|      7540|
|    Nashville|      6971|
|  New Orleans|      6209|
|         Reno|      5935|
|     Edmonton|      5054|
|  Saint Louis|      4827|
|Santa Barbara|      3829|
+-------------+----------+



In [28]:
#cities with more stars
output=spark.sql('''
SELECT city, count(stars) AS total_stars
FROM business_table
GROUP BY city
ORDER BY total_stars DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_cities_stars.csv")

+-------------+-----------+
|         city|total_stars|
+-------------+-----------+
| Philadelphia|      14569|
|       Tucson|       9250|
|        Tampa|       9050|
| Indianapolis|       7540|
|    Nashville|       6971|
|  New Orleans|       6209|
|         Reno|       5935|
|     Edmonton|       5054|
|  Saint Louis|       4827|
|Santa Barbara|       3829|
+-------------+-----------+



In [29]:
#top 10 states with more number of businesses
output=spark.sql('''
SELECT state, count(business_id) AS total_bus
FROM business_table
GROUP BY state
ORDER BY total_bus DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_states_business.csv")

+-----+---------+
|state|total_bus|
+-----+---------+
|   PA|    34039|
|   FL|    26330|
|   TN|    12056|
|   IN|    11247|
|   MO|    10913|
|   LA|     9924|
|   AZ|     9912|
|   NJ|     8536|
|   NV|     7715|
|   AB|     5573|
+-----+---------+



In [31]:
#top 10 states with most reviews
output=spark.sql('''
SELECT state, SUM(review_count) AS total_reviews
FROM business_table
GROUP BY state
ORDER BY total_reviews DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_state_reviews.csv")

+-----+-------------+
|state|total_reviews|
+-----+-------------+
|   PA|      1540790|
|   FL|      1119926|
|   LA|       743176|
|   TN|       598195|
|   MO|       483897|
|   IN|       472565|
|   AZ|       412639|
|   NV|       409950|
|   CA|       339637|
|   NJ|       249837|
+-----+-------------+



In [30]:
#top 10 states with more stars
output=spark.sql('''
SELECT state, count(stars) AS total_stars
FROM business_table
GROUP BY state
ORDER BY total_stars DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_state_star.csv")

+-----+-----------+
|state|total_stars|
+-----+-----------+
|   PA|      34039|
|   FL|      26330|
|   TN|      12056|
|   IN|      11247|
|   MO|      10913|
|   LA|       9924|
|   AZ|       9912|
|   NJ|       8536|
|   NV|       7715|
|   AB|       5573|
+-----+-----------+



In [57]:
# all details of business with min star
spark.sql('''
select * 
from business_table
where stars=(select min(stars) from business_table)
''').show()

+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|        city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|904 Clearwater La...|{null, null, null...|jkY8aotwcAlSHCgoZ...|Home & Garden, Lo...|       Largo|                null|      0|   27.9251193|    -82.7959877|  Bay Area Appliance|      33770|           5|  1.0|   FL|
|                    |{null, null, null...|8oi1Gm7XLyK31Y3N3...|Plumbing, Home Se...|    Smithton|{9:0-17:0, 9:0-17...|      1|      38.

In [58]:
# all details of business with max star
spark.sql('''
select * 
from business_table
where stars=(select max(stars) from business_table)
''').show()

+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|         city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{null, null, null...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...|Santa Barbara|                null|      0|   34.4266787|   -119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|   15 N Missouri Ave|{null, null, null...|jaxMSoInw8Poo3XeM...|General Dentistry...|   Clearwater|{null, 7:30-15:30...|      1|    

In [59]:
# all details of business with max review counts
spark.sql('''
select * 
from business_table
where review_count=(select max(review_count) from business_table)
''').show()

+----------------+--------------------+--------------------+--------------------+-----------+--------------------+-------+----------+-----------+-----------------+-----------+------------+-----+-----+
|         address|          attributes|         business_id|          categories|       city|               hours|is_open|  latitude|  longitude|             name|postal_code|review_count|stars|state|
+----------------+--------------------+--------------------+--------------------+-----------+--------------------+-------+----------+-----------+-----------------+-----------+------------+-----+-----+
|724 Iberville St|{null, null, 'ful...|_ab50qdWOk0DdB6XO...|Live/Raw Food, Se...|New Orleans|{11:0-22:0, 11:0-...|      1|29.9542735|-90.0689651|Acme Oyster House|      70130|        7568|  4.0|   LA|
+----------------+--------------------+--------------------+--------------------+-----------+--------------------+-------+----------+-----------+-----------------+-----------+------------+-----+--

In [60]:
# all details of business with min review counts
spark.sql('''
select * 
from business_table
where review_count=(select min(review_count) from business_table)
''').show()

+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|          city|               hours|is_open|     latitude|      longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+
|  400 Pasadena Ave S|                null|qkRM_2X51Yqxk3btl...|Synagogues, Relig...|St. Petersburg|{9:0-17:0, 9:0-17...|      1|     27.76659|     -82.732983|      Temple Beth-El|      33707|           5|  3.5|   FL|
|        712 Adams St|{null, null, null...|M0XSSHqrASOnhgbWD...|Vape Shops, Tobac...|   New Orleans|{10:0-19:0, 10:0-...|      1

In [32]:
#top 10 categories with more stars
output=spark.sql('''
SELECT categories, count(stars) AS total_stars
FROM business_table
GROUP BY categories
ORDER BY total_stars DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_cat_stars.csv")

+--------------------+-----------+
|          categories|total_stars|
+--------------------+-----------+
|Beauty & Spas, Na...|       1012|
|  Restaurants, Pizza|        935|
|Nail Salons, Beau...|        934|
|  Pizza, Restaurants|        823|
|Restaurants, Mexican|        728|
|Restaurants, Chinese|        708|
|Mexican, Restaurants|        672|
|Chinese, Restaurants|        651|
|  Food, Coffee & Tea|        508|
|Beauty & Spas, Ha...|        493|
+--------------------+-----------+



In [33]:
#top 10 categories with more review_counts
output=spark.sql('''
SELECT categories, sum(review_count) AS total_review
FROM business_table
GROUP BY categories
ORDER BY total_review DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_cat_review.csv")

+--------------------+------------+
|          categories|total_review|
+--------------------+------------+
|Mexican, Restaurants|       53043|
|Restaurants, Mexican|       52996|
|  Restaurants, Pizza|       32746|
|  Pizza, Restaurants|       28477|
|Beauty & Spas, Na...|       27484|
|Restaurants, Italian|       26320|
|Italian, Restaurants|       25388|
|Restaurants, Chinese|       23985|
|Nail Salons, Beau...|       23100|
|Chinese, Restaurants|       22071|
+--------------------+------------+



In [34]:
#top 10 businesses with more review_counts
output=spark.sql('''
SELECT name, sum(review_count) AS total_review
FROM business_table
GROUP BY name
ORDER BY total_review DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_bus_review.csv")

+--------------------+------------+
|                name|total_review|
+--------------------+------------+
|           Starbucks|       20692|
|          McDonald's|       17359|
|             Dunkin'|        9864|
|Chipotle Mexican ...|        9383|
|         First Watch|        8921|
|   Acme Oyster House|        8372|
|           Taco Bell|        8325|
|         Chick-fil-A|        8004|
|        Oceana Grill|        7400|
|        Panera Bread|        7246|
+--------------------+------------+



In [35]:
#top 10 businesses with more stars
output=spark.sql('''
SELECT name, count(stars) AS total_stars
FROM business_table
GROUP BY name
ORDER BY total_stars DESC
LIMIT 10
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_ten_bus_stars.csv")

+------------+-----------+
|        name|total_stars|
+------------+-----------+
|   Starbucks|        724|
|  McDonald's|        703|
|     Dunkin'|        510|
|      Subway|        459|
|   Taco Bell|        365|
|CVS Pharmacy|        345|
|   Walgreens|        341|
| Burger King|        338|
|     Wendy's|        331|
|        Wawa|        307|
+------------+-----------+



In [65]:
#top business based on its count
spark.sql('''
select  name, count(name) as total
from business_table
group by name
having count(name)=
(
    SELECT MAX(bus_count)
    FROM (
        SELECT COUNT(*) AS bus_count
        FROM business_table
        GROUP BY name
    )
)
''').show()

+---------+-----+
|     name|total|
+---------+-----+
|Starbucks|  724|
+---------+-----+



In [36]:
#top 20 business based on its count
output=spark.sql('''
SELECT name, count(name) AS total_bus
FROM business_table
GROUP BY name
ORDER BY total_bus DESC
LIMIT 20
''')
output.show()
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_twenty_bus_count.csv")

+--------------------+---------+
|                name|total_bus|
+--------------------+---------+
|           Starbucks|      724|
|          McDonald's|      703|
|             Dunkin'|      510|
|              Subway|      459|
|           Taco Bell|      365|
|        CVS Pharmacy|      345|
|           Walgreens|      341|
|         Burger King|      338|
|             Wendy's|      331|
|                Wawa|      307|
|      Domino's Pizza|      295|
|       The UPS Store|      281|
|           Pizza Hut|      272|
|Enterprise Rent-A...|      232|
|   Papa John's Pizza|      196|
|         Great Clips|      185|
|        Jimmy John's|      175|
|      US Post Office|      174|
|                 KFC|      171|
|         Chick-fil-A|      162|
+--------------------+---------+



In [6]:
#Function that returns Y/N if business is available in the Morning/Night based on hours_df
def availability(t):
    if t is None:
        return [None,None]
    l=[]
    start_t,end_t=t.split('-')
    hour,time=start_t.split(':')
    if int(hour)<= 12 or int(hour)==0:
        l.append('Y')
    else:
        l.append('N')
    hour,time=end_t.split(':')
    if int(hour)> 12 or int(hour)==0:
        l.append('Y')
    else:
        l.append('N')
    return l

In [7]:
def star_weights(stars):
    if stars>=3 and stars<=5:
        return  500
    elif stars>=1 and stars<3:
        return  250
    else:
        return 0

In [8]:
def review_weights(review_count):
    if review_count<= 20692 and review_count>8000:
        return 100
    elif review_count<= 8000 and review_count>1000:
        return 200
    elif review_count<=1000 and review_count>=500:
        return 100
    elif review_count<=500 and review_count>=100:
        return 50
    elif review_count<=100 and review_count>=50:
        return 25
    elif review_count<=50 and review_count>=20:
        return 15
    elif review_count<=20 and review_count>=1:
        return 5
    else:
        return 0

In [9]:
def open_weights(is_open):
    if is_open==1:
        return 50
    elif is_open==0:
        return 10
    else:
        return 0

In [10]:
def count_name_weights(total_bus):
    if total_bus<=800 and total_bus>500:
        return 100
    elif total_bus<=500 and total_bus>100:
        return 70
    elif total_bus<=100 and total_bus>0:
        return 40
    else:
        return 0

In [11]:
# converting python function to spark UDF (user defined function)
udf_availability = udf(availability, ArrayType(StringType())) #mention function name and its return type
udf_star_weights = udf(star_weights,IntegerType())
udf_review_weights = udf(review_weights,IntegerType())
udf_open_weights = udf(open_weights,IntegerType())
udf_count_name_weights = udf(count_name_weights,IntegerType())

In [12]:
# Registering the function in spark with any name or same name to use in spark SQL
spark.udf.register("udf_availability", udf_availability)
spark.udf.register("udf_star_weights", udf_star_weights)
spark.udf.register("udf_review_weights", udf_review_weights)
spark.udf.register("udf_open_weights", udf_open_weights)
spark.udf.register("udf_count_name_weights", udf_count_name_weights)

<function __main__.count_name_weights(total_bus)>

In [17]:
#Assigning a custom score to each business
output=spark.sql('''
select 
business_id, 
    review_count,
    stars,
    tot_name,
    is_open,
    Monday,
    morning_flag,
    night_flag,
    score + 
    DECODE(NVL(morning_flag,'NA'),'Y',10,'N',5,'NA',0) +
    DECODE(NVL(night_flag,'NA'),'Y',20,'N',5,'NA',0)
    as custom_score
from 
(
SELECT 
    business_id, 
    review_count,
    stars,
    tot_name,
    is_open,
    Monday,
    udf_star_weights(NVL(stars,0)) +
    udf_review_weights(NVL(review_count,0)) +
    udf_open_weights(NVL(is_open,-1)) +
    udf_count_name_weights(NVL(tot_name,0))
    AS score,
    CASE 
        WHEN Monday IS NOT NULL THEN udf_availability(Monday)[0]
        ELSE NULL
    END AS morning_flag,
    CASE 
        WHEN Monday IS NOT NULL THEN udf_availability(Monday)[1] 
        ELSE NULL
    END AS night_flag
FROM 
(
    SELECT 
        b.business_id,
        b.review_count,
        b.stars,
        COUNT(b.name) OVER (PARTITION BY b.name) AS tot_name,
        b.is_open,
        h.Monday
    FROM
        business_table b,
        hours_table h
    WHERE 1=1
    AND b.business_id = h.business_id
    --and b.business_id ='mpf3x-BjTdTEA3yCZrAYPw'
    )
)
order by custom_score desc
''')
output.show(truncate=False)
output.write.option("header",True).csv("C:/Supriyaa-spark-notes/tableau files/Business/top_bus_custom_score.csv")

+----------------------+------------+-----+--------+-------+---------+------------+----------+------------+
|business_id           |review_count|stars|tot_name|is_open|Monday   |morning_flag|night_flag|custom_score|
+----------------------+------------+-----+--------+-------+---------+------------+----------+------------+
|5oX4G1cptixPZecih9L3dg|1147        |4.0  |1       |1      |0:0-0:0  |Y           |Y         |820.0       |
|gTC8IQ_i8zXytWSly3Ttvg|3837        |4.5  |2       |1      |11:0-22:0|Y           |Y         |820.0       |
|3WU1ZobAqXQ07xYoKE2Vyg|1437        |3.5  |1       |1      |7:0-14:0 |Y           |Y         |820.0       |
|Ih6_y2nnbg2Jw9Qdc876GA|1106        |4.0  |8       |1      |0:0-0:0  |Y           |Y         |820.0       |
|bp5Mk2d0qofUeF5uLauIbg|1350        |4.5  |1       |1      |0:0-0:0  |Y           |Y         |820.0       |
|Rl42JbSMsmNW3LRjsTMYAg|1270        |4.0  |2       |1      |11:0-23:0|Y           |Y         |820.0       |
|cE_gYW3CHEducTkEyxTDdA|1368