In [None]:
**Question 2 - Property Type Distribution:**


Using groupBy, create a count of listings by property type in Boston. Rename the resulting column to "Property Type Count."


from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

# Give the two columns used below presentation-friendly names.
# listing_df is expected to carry "property_type" and "city" columns;
# withColumnRenamed silently does nothing if a column is absent.
listing_df_new = (
    listing_df
    .withColumnRenamed("property_type", "Property Type")
    .withColumnRenamed("city", "City")
)

# Keep only the rows whose City value is exactly "Boston"
boston_listings = listing_df_new.where(listing_df_new["City"] == "Boston")

# One output row per property type; the built-in "count" column produced by
# GroupedData.count() is relabelled to the name the question asks for.
result_df = (
    boston_listings
    .groupBy("Property Type")
    .count()
    .withColumnRenamed("count", "Property Type Count")
)

# Display the per-property-type totals
result_df.show()


In [None]:
**Price Trend Over Time:**

Calculate the average price change for each listing compared to the previous month.


from pyspark.sql.window import Window
from pyspark.sql.functions import lag, avg 


# Month-over-month price change per listing.
#
# lag() looks back one *row*, so applying it directly to the raw calendar
# compares each entry with the previous calendar entry, not the previous
# month. The calendar is therefore first collapsed to one row per listing per
# calendar month (mean price of that month), and the lag is taken over months.
#
# NOTE(review): assumes "date" is a date or an ISO-formatted (yyyy-MM-dd)
# string and that "price" is numeric (or castable to a number) — TODO confirm
# against the schema of calender_df.
from pyspark.sql.functions import col, trunc

# Mean price of each listing within each calendar month; "month" holds the
# first day of the month the original date falls in.
calender_df_monthly = (
    calender_df
    .withColumn("month", trunc(col("date"), "month"))
    .groupBy("listing_id", "month")
    .agg(avg("price").alias("price"))
)

# Per-listing window, ordered chronologically by month
window_spec = Window.partitionBy("listing_id").orderBy("month")

# Price of the previous available month for the same listing
# (null for a listing's first month)
calender_df_lag = calender_df_monthly.withColumn(
    "lagged_price", lag("price").over(window_spec)
)

# Change versus the previous month; both operands are resolved against the
# DataFrame being extended rather than against an ancestor DataFrame.
calender_df_change = calender_df_lag.withColumn(
    "price_change", col("price") - col("lagged_price")
)

# Average monthly change per listing; avg() skips the null first-month rows
average_price_change = calender_df_change.groupBy("listing_id").agg(
    avg("price_change").alias("average_price_change")
)

# Show the resulting DataFrame
average_price_change.show()


In [None]:
**Property Type Popularity:**

Combine listing_df and calendar_df to find the most popular property type in Boston based on the number of bookings. Order the results by the number of bookings in descending order.
from pyspark.sql.functions import count, desc

# Expose the listing's property type under the name used for grouping
# (listing_df is expected to carry a "property_type" column).
listing_df_q = listing_df.withColumnRenamed("property_type", "PropertyType")

# The question is about Boston only, so restrict the listings to those whose
# city is "Boston" before joining; filtering first also shrinks the join input.
boston_listing_df = listing_df_q.filter(listing_df_q["city"] == "Boston")

# Attach each calendar row to its Boston listing (calendar.listing_id == listing.id).
# The inner join drops calendar rows that belong to non-Boston listings.
combined_df = calender_df.join(
    boston_listing_df,
    calender_df["listing_id"] == boston_listing_df["id"],
    "inner",
)

# NOTE(review): every joined calendar row is counted as one booking. If
# calender_df carries an availability flag, filter to the booked rows before
# this aggregation — TODO confirm against the calendar schema.
property_type_counts = (
    combined_df
    .groupBy("PropertyType")
    .agg(count("*").alias("BookingCount"))
    # Most-booked property type first
    .orderBy(desc("BookingCount"))
)

# Show the resulting DataFrame
property_type_counts.show()

In [None]:
**Question 3 - Superhosts and Response Times:**
Using groupBy, count the number of superhosts in Boston by their response time categories. Rename the resulting columns to "Response Time" and "Superhost Count."

from pyspark.sql.functions import col, count, when




# Relabel the two host columns up front so the grouped output already carries
# the requested "Response Time" header. listing_df is expected to provide
# "host_response_time", "host_is_superhost" and "city".
listing_df_q = (
    listing_df
    .withColumnRenamed("host_response_time", "Response Time")
    .withColumnRenamed("host_is_superhost", "Superhost")
)

# Rows that are both flagged as superhost and located in Boston.
# NOTE(review): the superhost flag is compared against the literal "true";
# verify that this matches how the flag is encoded in listing_df.
boston_superhosts_df = listing_df_q.where(
    (col("Superhost") == "true") & (col("city") == "Boston")
)

# Tally the non-null Superhost entries within each response-time category.
# Each row of listing_df contributes one to the tally.
superhost_count_df = boston_superhosts_df.groupBy("Response Time").agg(
    count("Superhost").alias("Superhost Count")
)

# Largest category first
ordered_superhost_count_df = superhost_count_df.sort(
    "Superhost Count", ascending=False
)

# Display the ordered counts
ordered_superhost_count_df.show()




In [None]:
**Top 5 Superhosts with the Most Listings:** 

Filter the DataFrame to select only superhosts, group them by host_name, count the number of listings each superhost has, and order the results by the count in descending order. Show only the top 5 superhosts.


from pyspark.sql.functions import count

# Relabel the host columns used in this cell
listing_df_q2 = (
    listing_df
    .withColumnRenamed("host_name", "HostName")
    .withColumnRenamed("host_is_superhost", "Superhost")
)

# Keep only the rows whose Superhost flag equals the literal "true"
superhosts_df = listing_df_q2.where(listing_df_q2["Superhost"] == "true")

# Number of rows (listings) per host name, largest first. The "count" column
# produced by GroupedData.count() is renamed to "ListingCount".
superhost_counts = (
    superhosts_df
    .groupBy("HostName")
    .count()
    .withColumnRenamed("count", "ListingCount")
    .orderBy("ListingCount", ascending=False)
)

# First five rows of the ordered result
top_5_superhosts = superhost_counts.limit(5)
top_5_superhosts.show()