In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType,FloatType,BooleanType,DateType


In [0]:
from  delta.tables import delta

In [0]:
# Unity Catalog coordinates used to build every volume path in this notebook.
catalog_name, schema_name = "practice_db_catalog", "airbnb"

In [0]:
# Load the raw listings JSON from the volume that belongs to the configured catalog/schema.
raw_listings_path = f"/Volumes/{catalog_name}/{schema_name}/data_volume/airbnb/raw/listings.json/"
df = spark.read.json(raw_listings_path)


In [0]:
# Flatten the amenities array: one output row per (listing, amenity) pair.
exploded = df.withColumn("amenities", F.explode("amenities"))

# Map the inconsistent spellings of the boolean flags onto canonical "True"/"False" strings.
truthy_spellings = ("true", "TRUE", "Yes", "yes")
falsy_spellings = ("false", "No", "no", "FALSE")
anomolies = {
    **{spelling: "True" for spelling in truthy_spellings},
    **{spelling: "False" for spelling in falsy_spellings},
}
normalized = exploded.replace(anomolies, subset=["has_parking", "is_superhost"])

# Cast the flag columns to booleans and the date columns to dates, keeping column positions.
target_types = {
    "has_parking": BooleanType(),
    "is_superhost": BooleanType(),
    "created_date": DateType(),
    "last_booked_date": DateType(),
}
typed = normalized
for column_name, target_type in target_types.items():
    typed = typed.withColumn(column_name, F.col(column_name).cast(target_type))

# Every row now carries a single amenity, so switch to the singular column name.
df_explod = typed.withColumnRenamed("amenities", "amenity")


In [0]:
# Columns that make up the silver-layer listings table.
silver_columns = ["id", "name", "amenity", "is_superhost", "has_parking"]
df_silver = df_explod.select(*silver_columns)

In [0]:

# Persist the silver frame as a Delta table, replacing whatever is already at the target path.
silver_listings_path = f"/Volumes/{catalog_name}/{schema_name}/data_volume/airbnb/silver/listings.json/"
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .save(silver_listings_path)
)

In [0]:
# Frame used by the analysis queries: the same five columns, one row per (listing, amenity).
analysis_columns = ["id", "name", "amenity", "is_superhost", "has_parking"]
df_final = df_explod.select(analysis_columns)

In [0]:
# NOTE: df_final holds one row per (listing, amenity) pair, because the amenities
# array was exploded upstream. Any "how many listings" question must therefore
# count distinct ids, not rows; counting rows would count a listing once per amenity.
# The scalar counts are printed explicitly because a notebook cell only auto-displays
# its last expression.

# 1. Which amenities are listed for listing id = 101?
df_final.filter(F.col("id") == 101).select("amenity").show()

# 2. How many listings have is_superhost = true?
superhost_listing_count = (
    df_final.filter(F.col("is_superhost")).select("id").distinct().count()
)
print(f"Listings with is_superhost = true: {superhost_listing_count}")

# 3. What are the unique amenities available for listing id = 103?
df_final.filter(F.col("id") == 103).select("amenity").distinct().show()

# 4. How many listings have has_parking = true?
parking_listing_count = (
    df_final.filter(F.col("has_parking")).select("id").distinct().count()
)
print(f"Listings with has_parking = true: {parking_listing_count}")

# 5. For each listing, how many distinct amenities are available?
df_final.groupBy("id").agg(
    F.countDistinct("amenity").alias("amenity_count")
).show()

+------------+
|     amenity|
+------------+
|         Gym|
|      Washer|
|   Fireplace|
|     Kitchen|
|        Wifi|
|Pet friendly|
|       Dryer|
|   Workspace|
+------------+

