In [3]:
from pyspark.sql import SparkSession

In [51]:
from pyspark.sql.functions import spark_partition_id

In [52]:
from pyspark.sql import functions as sf

In [4]:
# Create (or reuse) a Spark session that runs locally on two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("Bangloe")
    .getOrCreate()
)

## Read the Bengaluru dataset

In [6]:
# Input location for the house-price data. The `*` makes this a glob pattern, so
# every CSV under ../data whose name starts with "Bengaluru_House_Data" is matched.
filepath = "../data/Bengaluru_House_Data*.csv"

In [9]:
# Load the CSV file(s) matched by `filepath`; the first row supplies the column
# names and Spark samples the data to infer each column's type.
dataframe = (
    spark.read
    .format("csv")
    .option("path", filepath)
    .option("inferSchema", "true")
    .option("header", "true")
    .load()
)

In [100]:
# Peek at the first five rows to sanity-check the load.
dataframe.show(n=5)

+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|           area_type| availability|            location|     size|society|total_sqft|bath|balcony|price|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|Super built-up  Area|       19-Dec|Electronic City P...|    2 BHK|Coomee |      1056|   2|      1|39.07|
|          Plot  Area|Ready To Move|    Chikka Tirupathi|4 Bedroom|Theanmp|      2600|   5|      3|120.0|
|      Built-up  Area|Ready To Move|         Uttarahalli|    3 BHK|   NULL|      1440|   2|      3| 62.0|
|Super built-up  Area|Ready To Move|  Lingadheeranahalli|    3 BHK|Soiewre|      1521|   3|      1| 95.0|
|Super built-up  Area|Ready To Move|            Kothanur|    2 BHK|   NULL|      1200|   2|      1| 51.0|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
only showing top 5 rows



In [41]:
# Print the column names, types and nullability of the loaded DataFrame.
dataframe.printSchema()

root
 |-- area_type: string (nullable = true)
 |-- availability: string (nullable = true)
 |-- location: string (nullable = true)
 |-- size: string (nullable = true)
 |-- society: string (nullable = true)
 |-- total_sqft: string (nullable = true)
 |-- bath: integer (nullable = true)
 |-- balcony: integer (nullable = true)
 |-- price: double (nullable = true)



In [46]:
# List the column names of the loaded DataFrame.
dataframe.columns

['area_type',
 'availability',
 'location',
 'size',
 'society',
 'total_sqft',
 'bath',
 'balcony',
 'price']

## Group by area type: count and sum

In [49]:
# Number of listings per area type, most frequent first.
(
    dataframe
    .groupBy("area_type")
    .count()
    .orderBy("count", ascending=False)
    .show()
)

+--------------------+-----+
|           area_type|count|
+--------------------+-----+
|Super built-up  Area| 8790|
|      Built-up  Area| 2418|
|          Plot  Area| 2025|
|        Carpet  Area|   87|
+--------------------+-----+



In [68]:
# Per area type, total of (bath + balcony) across all rows, largest first.
# The aggregated column is exposed under the name "bhk".
(
    dataframe
    .groupBy("area_type")
    .agg(
        sf.sum(sf.column("bath") + sf.column("balcony")).alias("bhk")
    )
    .orderBy("bhk", ascending=False)
    .show()
)

+--------------------+-----+
|           area_type|  bhk|
+--------------------+-----+
|Super built-up  Area|34342|
|          Plot  Area| 9378|
|      Built-up  Area| 9376|
|        Carpet  Area|  312|
+--------------------+-----+



In [None]:
# Total price per area type, rounded to two decimals, highest total first.
# The rounding must use Spark's column function `sf.round`: Python's builtin
# `round` cannot operate on a column name and raises TypeError.
(
    dataframe
    .groupBy("area_type")
    .agg(sf.sum("price").alias("price2"))
    .withColumn("price2", sf.round("price2", 2))
    .orderBy("price2", ascending=False)
    .show()
)

## Selecting columns

In [73]:
# Four equivalent ways to reference a column inside select(): a plain name
# string, sf.column(), sf.col(), and attribute access on the DataFrame.
# The functions are reached through the `sf` alias, which is the only handle on
# pyspark.sql.functions that this notebook imports.
(
    dataframe
    .select(
        "area_type",
        sf.column("availability"),
        sf.col("price"),
        dataframe.total_sqft,
    )
    .limit(2)
    .show()
)

+--------------------+-------------+-----+----------+
|           area_type| availability|price|total_sqft|
+--------------------+-------------+-----+----------+
|Super built-up  Area|       19-Dec|39.07|      1056|
|          Plot  Area|Ready To Move|120.0|      2600|
+--------------------+-------------+-----+----------+



In [77]:
# Row filtering with a SQL expression string, written once with `where` and
# once with `filter`; both calls print the same two rows.
(
    dataframe
    .where("balcony > 2")
    .select("area_type", "price")
    .limit(2)
    .show()
)
(
    dataframe
    .filter("balcony > 2")
    .select("area_type", "price")
    .limit(2)
    .show()
)

+--------------+-----+
|     area_type|price|
+--------------+-----+
|    Plot  Area|120.0|
|Built-up  Area| 62.0|
+--------------+-----+

+--------------+-----+
|     area_type|price|
+--------------+-----+
|    Plot  Area|120.0|
|Built-up  Area| 62.0|
+--------------+-----+



In [80]:
# Same filter as above, expressed as a Column comparison instead of a SQL string.
(
    dataframe
    .filter(dataframe.balcony > 2)
    .select("area_type", "price")
    .limit(2)
    .show()
)

+--------------+-----+
|     area_type|price|
+--------------+-----+
|    Plot  Area|120.0|
|Built-up  Area| 62.0|
+--------------+-----+



## Partitioning

In [82]:
# Number of partitions backing the DataFrame's underlying RDD.
dataframe.rdd.getNumPartitions()

1

In [83]:
# Redistribute the rows into 5 partitions. With only a partition count given,
# Spark shuffles the rows so that the partitions come out roughly equal in size.
repartition_df = dataframe.repartition(5)

In [84]:
# Total row count of the repartitioned DataFrame.
repartition_df.count()

13320

In [86]:
# Rows held by each partition: group on the partition id Spark assigns to
# every row and count the members of each group.
(
    repartition_df
    .groupBy(spark_partition_id())
    .count()
    .show()
)

+--------------------+-----+
|SPARK_PARTITION_ID()|count|
+--------------------+-----+
|                   0| 2664|
|                   1| 2664|
|                   2| 2664|
|                   3| 2664|
|                   4| 2664|
+--------------------+-----+



## Save the repartitioned data

In [88]:
# Output directory for the repartitioned CSV files. Only `save_path` is set
# here so that `filepath` keeps pointing at the input data.
save_path = "../data/output/"

In [92]:
# Write the repartitioned DataFrame as CSV under `save_path`, replacing
# whatever was written there before.
(
    repartition_df.write
    .format("csv")
    .mode("overwrite")
    .option("path", save_path)
    .save()
)

## Save PartitionBy data
`partitionBy` writes one sub-folder per distinct value of the given columns. When the data is read back with a filter on those columns, the Spark engine only has to look into the matching folders.

In [99]:
# Output directory for the JSON export. Only `save_path` is set here so that
# `filepath` keeps pointing at the input data.
save_path = "../data/output/json/"

# Write the data as JSON, one sub-folder per distinct `balcony` value, with at
# most 1000 records in any single output file; existing output is replaced.
(
    dataframe.write
    .format("json")
    .mode("overwrite")
    .option("path", save_path)
    .partitionBy("balcony")
    .option("maxRecordsPerFile", 1000)
    .save()
)

In [93]:
# Merge the repartitioned data's RDD down to 4 partitions and report the
# resulting partition count. The RDD is taken from `repartition_df`, which is
# defined earlier in this notebook.
repartition_df.rdd.coalesce(4).getNumPartitions()