In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Assigment4').getOrCreate()

In [2]:
# Load the new hotels file
base_df = spark.read.csv('Hotels_data_Changed.csv',inferSchema=True,header=True)

In [3]:
base_df.show(5)

+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+--------------------+-----------+-------+------------+-------------+-------+
|Snapshot ID|      Snapshot Date|       Checkin Date|Days|Original Price|Discount Price|Discount Code|Available Rooms|          Hotel Name|Hotel Stars|WeekDay|DiscountDiff| DiscountPerc|DayDiff|
+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+--------------------+-----------+-------+------------+-------------+-------+
|          1|2015-07-17 00:00:00|2015-08-12 00:00:00|   5|          1178|          1040|            1|              6|Best Western Plus...|          3|    Wed|         138| 11.714770798|     26|
|          1|2015-07-17 00:00:00|2015-08-19 00:00:00|   5|          1113|           982|            1|              8|Best Western Plus...|          3|    Wed|         131|11.7699910153|     33|
|          1|2015-07-17 0

In [4]:
from pyspark.sql.functions import col

# Get 150 Hotels that have the most rows in data
tophotels = base_df.groupBy("Hotel Name").count().sort(col("count").desc()).head(150)
tophotels

[Row(Hotel Name='Newark Liberty International Airport Marriott', count=5346),
 Row(Hotel Name='Hilton Garden Inn Times Square', count=4892),
 Row(Hotel Name='Residence Inn Newark Elizabeth Liberty International Airport', count=4314),
 Row(Hotel Name='Westin New York at Times Square', count=3792),
 Row(Hotel Name='Loews Regency New York Hotel', count=3617),
 Row(Hotel Name='Viceroy New York', count=3565),
 Row(Hotel Name='Four Seasons Hotel New York', count=3243),
 Row(Hotel Name='Langham Place New York Fifth Avenue', count=3203),
 Row(Hotel Name='The Carlyle A Rosewood Hotel', count=3078),
 Row(Hotel Name='DoubleTree by Hilton Metropolitan - New York City', count=2866),
 Row(Hotel Name='Magnuson Convention Center Hotel', count=2862),
 Row(Hotel Name='Hilton Garden Inn New York West 35th Street', count=2822),
 Row(Hotel Name='Hilton Garden Inn New York-Times Square Central', count=2772),
 Row(Hotel Name='Conrad New York', count=2677),
 Row(Hotel Name='Wyndham Garden Brooklyn Sunset Park

In [5]:
# convert the name + count list to dataframe and create a view for it
top_hotel_names_df = spark.sparkContext.parallelize(tophotels).toDF(['NAME','COUNT'])
top_hotel_names_df.createOrReplaceTempView("topHotelNames")

# Change a column name to handle in query
base_df.withColumnRenamed('Hotel Name','Hotel_Name').createOrReplaceTempView("base_df")

# Reduced the rows listed to only the ones that are among top 150 names
top_hotels_filtered_df = spark.sql("SELECT * FROM base_df WHERE Hotel_Name IN (SELECT NAME FROM topHotelNames)")
top_hotels_filtered_df.show()

+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+-------------+-----------+-------+------------+-------------+-------+
|Snapshot ID|      Snapshot Date|       Checkin Date|Days|Original Price|Discount Price|Discount Code|Available Rooms|   Hotel_Name|Hotel Stars|WeekDay|DiscountDiff| DiscountPerc|DayDiff|
+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+-------------+-----------+-------+------------+-------------+-------+
|        101|2015-08-16 00:00:00|2015-08-17 00:00:00|   5|          2055|          1989|            1|              1|Bentley Hotel|          4|    Mon|          66|3.21167883212|      1|
|        101|2015-08-16 00:00:00|2015-09-06 00:00:00|   5|          1409|          1348|            2|              3|Bentley Hotel|          4|    Sun|          61|4.32931156849|     21|
|        101|2015-08-16 00:00:00|2015-09-05 00:00:00|   5|  

In [6]:
# Get 40 Checkin dates that have the most rows in top 150 hotels data
top_checkin_dates = top_hotels_filtered_df.groupBy("Checkin Date").count().sort(col("count").desc()).head(40)

# convert the checkin date + count list to dataframe and create a view for it
top_checkin_dates_df = spark.sparkContext.parallelize(top_checkin_dates).toDF()
top_checkin_dates_df.show()

+-------------------+-----+
|       Checkin Date|count|
+-------------------+-----+
|2015-11-11 00:00:00| 2302|
|2015-10-14 00:00:00| 1887|
|2015-11-04 00:00:00| 1885|
|2015-08-19 00:00:00| 1883|
|2015-10-28 00:00:00| 1861|
|2015-10-21 00:00:00| 1817|
|2015-11-06 00:00:00| 1808|
|2015-08-12 00:00:00| 1765|
|2015-11-05 00:00:00| 1684|
|2015-10-22 00:00:00| 1662|
|2015-11-12 00:00:00| 1649|
|2015-10-29 00:00:00| 1623|
|2015-09-10 00:00:00| 1623|
|2015-09-09 00:00:00| 1616|
|2015-11-18 00:00:00| 1582|
|2015-08-26 00:00:00| 1559|
|2015-11-10 00:00:00| 1548|
|2015-11-13 00:00:00| 1547|
|2015-10-15 00:00:00| 1473|
|2015-11-21 00:00:00| 1469|
+-------------------+-----+
only showing top 20 rows



In [9]:
# Change a column name to handle in query
top_hotels_filtered_df.withColumnRenamed('Checkin Date','Checkin_Date').createOrReplaceTempView("topCheckinDates")

# Change a column name to handle in query
top_hotels_filtered_df.withColumnRenamed('Checkin Date','Checkin_Date').createOrReplaceTempView("top_hotels_filtered")

# Reduced the rows listed to only the ones that are among top 40 checkin dates
hotel_rows_for_top_dates = spark.sql("SELECT * FROM top_hotels_filtered WHERE Checkin_Date IN (SELECT Checkin_Date FROM topCheckinDates)")

hotel_rows_for_top_dates.show()

+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+-------------+-----------+-------+------------+-------------+-------+
|Snapshot ID|      Snapshot Date|       Checkin_Date|Days|Original Price|Discount Price|Discount Code|Available Rooms|   Hotel_Name|Hotel Stars|WeekDay|DiscountDiff| DiscountPerc|DayDiff|
+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+-------------+-----------+-------+------------+-------------+-------+
|        101|2015-08-16 00:00:00|2015-08-17 00:00:00|   5|          2055|          1989|            1|              1|Bentley Hotel|          4|    Mon|          66|3.21167883212|      1|
|        101|2015-08-16 00:00:00|2015-09-06 00:00:00|   5|          1409|          1348|            2|              3|Bentley Hotel|          4|    Sun|          61|4.32931156849|     21|
|        101|2015-08-16 00:00:00|2015-09-05 00:00:00|   5|  