In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import spark_partition_id
from pyspark.sql import functions as sf

In [2]:
# Start (or reuse) a Spark session running locally with two worker threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("sparkCols")
    .getOrCreate()
)

In [7]:
# Relative path with a `*` wildcard, so it can match more than one CSV file
# whose name starts with "Bengaluru_House_Data" under ../data.
filepath = "../data/Bengaluru_House_Data*.csv"

In [68]:
# Read the CSV data: the first line is the header and column types are inferred.
dataframe = (
    spark.read
    .format("csv")
    .option("path", filepath)
    .options(inferSchema="true", header="true")
    .load()
)

In [69]:
# Preview the first two rows of the loaded data.
dataframe.show(n=2)

+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|           area_type| availability|            location|     size|society|total_sqft|bath|balcony|price|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|Super built-up  Area|       19-Dec|Electronic City P...|    2 BHK|Coomee |      1056|   2|      1|39.07|
|          Plot  Area|Ready To Move|    Chikka Tirupathi|4 Bedroom|Theanmp|      2600|   5|      3|120.0|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
only showing top 2 rows



## Column String and Column Object

In [66]:
# Five interchangeable ways of referring to a column inside select().
dataframe.select(
    "area_type",             # plain column-name string
    sf.col("availability"),  # Column object built with col()
    sf.column("location"),   # Column object built with column()
    dataframe.size,          # attribute access on the DataFrame
    dataframe["society"],    # item access on the DataFrame
).show(n=2)

+--------------------+-------------+--------------------+---------+-------+
|           area_type| availability|            location|     size|society|
+--------------------+-------------+--------------------+---------+-------+
|Super built-up  Area|       19-Dec|Electronic City P...|    2 BHK|Coomee |
|          Plot  Area|Ready To Move|    Chikka Tirupathi|4 Bedroom|Theanmp|
+--------------------+-------------+--------------------+---------+-------+
only showing top 2 rows



## Column Expression
**1. String expression or SQL expression**<br>
**2. Column Object Expression**

In [24]:
# String (SQL) expression: both the concat call and the alias are written as SQL text.
dataframe.select(
    "area_type",
    sf.expr("concat(size,bath) as cat"),
).show(n=3)

+--------------------+----------+
|           area_type|       cat|
+--------------------+----------+
|Super built-up  Area|    2 BHK2|
|          Plot  Area|4 Bedroom5|
|      Built-up  Area|    3 BHK2|
+--------------------+----------+
only showing top 3 rows



In [25]:
# Column object expression: the same concat + alias, built with the functions API
# instead of a SQL string.
dataframe.select("area_type",sf.concat("size","bath").alias("cat")).show(3)

+--------------------+----------+
|           area_type|       cat|
+--------------------+----------+
|Super built-up  Area|    2 BHK2|
|          Plot  Area|4 Bedroom5|
|      Built-up  Area|    3 BHK2|
+--------------------+----------+
only showing top 3 rows



## User Defined Function
**To use a UDF, we have to register the user-defined function as a UDF.**

In [65]:
# Look at a couple of the distinct raw values in the "size" column.
dataframe.select("size").distinct().show(n=2)

+---------+
|     size|
+---------+
|   14 BHK|
|5 Bedroom|
+---------+
only showing top 2 rows



In [41]:
def bedrrom_to_bhk(size):
    """Return ``size`` with every occurrence of "Bedroom" replaced by "BHK".

    Falsy inputs (``None`` or an empty string) are returned unchanged.
    """
    if not size:
        return size
    return size.replace("Bedroom", "BHK")
        

In [64]:
# Wrap the plain Python function as a DataFrame (Column-level) UDF.
# The return type is passed as the DDL string "string" instead of
# sf.StringType(): StringType lives in pyspark.sql.types and is reachable
# through pyspark.sql.functions only as an incidental internal import.
bedrrom_to_bhk_udf = sf.udf(bedrrom_to_bhk, "string")  # Dataframe UDF
src = (
    dataframe
    .withColumn("size", bedrrom_to_bhk_udf("size"))
    .select("size")
    .distinct()
)
src.show(2)

+------+
|  size|
+------+
|12 BHK|
|14 BHK|
+------+
only showing top 2 rows



## Register UDF in SQL expression

In [49]:
# Optional check, left disabled: uncomment to print every entry returned by
# spark.catalog.listFunctions().
# for f in spark.catalog.listFunctions():
#     print(f)


In [51]:
# Register the Python function under a SQL-callable name so it can be used
# inside SQL expression strings. The return type is given as the DDL string
# "string" rather than sf.StringType(), which is not part of the public
# pyspark.sql.functions API (StringType belongs to pyspark.sql.types).
spark.udf.register("bedrrom_to_bhk_udf", bedrrom_to_bhk, "string")

<function __main__.bedrrom_to_bhk(size)>

In [63]:
# Call the SQL-registered UDF by name from within a SQL expression string.
dataframe2 = (
    dataframe
    .withColumn("size", sf.expr("bedrrom_to_bhk_udf(size)"))
    .select("size")
    .distinct()
)
dataframe2.show(n=2)

+------+
|  size|
+------+
|12 BHK|
|14 BHK|
+------+
only showing top 2 rows



In [70]:
# Show the source frame again: the withColumn results above were assigned to
# other variables, so `dataframe` itself still holds the original "size" values.
dataframe.show(n=2)

+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|           area_type| availability|            location|     size|society|total_sqft|bath|balcony|price|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|Super built-up  Area|       19-Dec|Electronic City P...|    2 BHK|Coomee |      1056|   2|      1|39.07|
|          Plot  Area|Ready To Move|    Chikka Tirupathi|4 Bedroom|Theanmp|      2600|   5|      3|120.0|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
only showing top 2 rows



In [79]:
# Conditional column update: rows with bath > 2 get bath + 100,
# all other rows keep their existing bath value.
dataframe.withColumn(
    "bath",
    sf.when(sf.col("bath") > 2, sf.col("bath") + 100).otherwise(sf.col("bath")),
).show(n=2)

+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|           area_type| availability|            location|     size|society|total_sqft|bath|balcony|price|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
|Super built-up  Area|       19-Dec|Electronic City P...|    2 BHK|Coomee |      1056|   2|      1|39.07|
|          Plot  Area|Ready To Move|    Chikka Tirupathi|4 Bedroom|Theanmp|      2600| 105|      3|120.0|
+--------------------+-------------+--------------------+---------+-------+----------+----+-------+-----+
only showing top 2 rows

