In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import spark_partition_id
from pyspark.sql import functions as sf

In [2]:
# Start (or reuse, via getOrCreate) a Spark session running on the local master.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Bangloe")
    .getOrCreate()
)

In [3]:
# Glob pattern: every file matching Bengaluru_House_Data*.csv under ../data is read.
filepath = "../data/Bengaluru_House_Data*.csv"

# CSV reader: first row is the header, column types are inferred by Spark.
dataframe = (
    spark.read
    .format("csv")
    .option("path", filepath)
    .option("inferSchema", "true")
    .option("header", "true")
    .load()
)

In [4]:
# Peek at the first two rows to check how the columns were parsed.
dataframe.show(n=2)

+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|           area_type| availability|            location|     size|society|total_sqft|bath|balcony|price|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|Super built-up  Area|       19-Dec|Electronic City P...|    2 BHK|Coomee |      1056|   2|      1|39.07|
|          Plot  Area|Ready To Move|    Chikka Tirupathi|4 Bedroom|Theanmp|      2600|   5|      3|120.0|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
only showing top 2 rows



In [10]:
# Whole-table aggregates built from column functions in `sf`.
dataframe.select(
    sf.count("*").alias("count"),                   # number of rows
    sf.sum("price").alias("price_sum"),             # total of the price column
    sf.avg("bath").alias("bath_avg"),               # mean of the bath column
    sf.countDistinct("bath").alias("distinctCnt"),  # number of distinct bath values
).show()

+-----+------------------+------------------+-----------+
|count|         price_sum|          bath_avg|distinctCnt|
+-----+------------------+------------------+-----------+
|13320|1499374.1449999989|2.6926096474673513|         19|
+-----+------------------+------------------+-----------+



In [11]:
# Count / sum / average over the whole table, written as SQL expression strings.
# NOTE(review): the recorded price_sum differs from the sf.sum("price") result
# in its last digits - presumably floating-point summation order; confirm
# before relying on exact equality.
dataframe.selectExpr(
    "count(*) as count",
    "sum(price) as price_sum",
    "avg(bath) as bath_avg",
).show()

+-----+------------------+------------------+
|count|         price_sum|          bath_avg|
+-----+------------------+------------------+
|13320|1499374.1449999984|2.6926096474673513|
+-----+------------------+------------------+



## Grouping Aggregation

In [13]:
# Register the DataFrame as a temporary view so spark.sql can query it as "banglore".
dataframe.createOrReplaceTempView(name="banglore")

In [14]:
# Grouped summary per area_type via Spark SQL: total of (bath + balcony) and
# average price, read from the "banglore" view.
sql_summary = spark.sql(
    """
select area_type,sum(bath + balcony) as batch_bal,avg(price) as price_avg
from banglore group by area_type
"""
)

In [16]:
# Execute the grouped SQL query and print the result table.
sql_summary.show()
# Column-function form of the sum(bath + balcony) aggregate, kept for reference:
#sf.sum(sf.expr("bath + balcony")).alias("batch_bal"),

+--------------------+---------+------------------+
|           area_type|batch_bal|         price_avg|
+--------------------+---------+------------------+
|      Built-up  Area|     9376|104.28549834574028|
|Super built-up  Area|    34342| 92.97175711035274|
|          Plot  Area|     9378|208.49548641975312|
|        Carpet  Area|      312| 89.50235632183907|
+--------------------+---------+------------------+



In [29]:
# Grouped summary per area_type using the DataFrame API. The bath + balcony
# total is written two ways: a column function and a SQL expression string.
# NOTE(review): both aggregates are aliased "batch_bal", so the result carries
# two columns with the same name; give them distinct aliases before reusing
# this frame downstream.
dataframe.groupBy("area_type").agg(
    sf.avg("price").alias("price_avg"),
    sf.sum(sf.expr("bath + balcony")).alias("batch_bal"),
    sf.expr("sum(bath + balcony) as batch_bal"),
).show()

+--------------------+------------------+---------+---------+
|           area_type|         price_avg|batch_bal|batch_bal|
+--------------------+------------------+---------+---------+
|      Built-up  Area|104.28549834574028|     9376|     9376|
|Super built-up  Area| 92.97175711035274|    34342|    34342|
|          Plot  Area|208.49548641975312|     9378|     9378|
|        Carpet  Area| 89.50235632183907|      312|      312|
+--------------------+------------------+---------+---------+



# Find the second-largest employee salary