In this notebook we are going to learn how to:

* drop rows with null values based on different criteria
* handle null values by replacing them

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark session for this notebook, then display it.
session_builder = SparkSession.builder.appName("pyspark_dataframe_part_02")
spark_session = session_builder.getOrCreate()
spark_session

In [9]:
# Load the vendor CSV: the first line is used as the header and column types are inferred.
# NOTE(review): this is an absolute, machine-specific path — adjust it to where the file lives.
csv_path = "file:///mnt/92D26AE0D26AC7D5/Python/pyspark/ararental.csv"
spark_df = spark_session.read.csv(csv_path, header=True, inferSchema=True)
spark_df.show()

+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|          City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|         Pella|952  -913-2229|641  -628-4983|    4|
|   195852|         B & S Sales|  218 Maquan St|        Hanson|781  -760-4590|          null|    2|
|    35600|                null|    PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|    2|
|      761|                null|           null|   Lees Summit|816  -524-9666|816  -524-6983|    5|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+



In [18]:
# na.drop() with no arguments drops every row that contains at least one null value
# (a single null field is enough for the row to be removed)
spark_df.na.drop().show()

+---------+--------------------+---------------+-------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|         City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+-------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct|Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|        Pella|952  -913-2229|641  -628-4983|    4|
+---------+--------------------+---------------+-------------+--------------+--------------+-----+



In [20]:
# drop() accepts three arguments: 'how', 'thresh' and 'subset'
# with how="all" a row is dropped only when every one of its values is null
# (no row in this data is entirely null, so nothing is dropped here)
spark_df.na.drop(how="all").show()

+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|          City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|         Pella|952  -913-2229|641  -628-4983|    4|
|   195852|         B & S Sales|  218 Maquan St|        Hanson|781  -760-4590|          null|    2|
|    35600|                null|    PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|    2|
|      761|                null|           null|   Lees Summit|816  -524-9666|816  -524-6983|    5|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+



In [22]:
# with how="any" a row is dropped as soon as any one of its values is null
# this gives the same result as calling spark_df.na.drop() with no arguments
spark_df.na.drop(how="any").show()

+---------+--------------------+---------------+-------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|         City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+-------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct|Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|        Pella|952  -913-2229|641  -628-4983|    4|
+---------+--------------------+---------------+-------------+--------------+--------------+-----+



In [26]:
# 'thresh' takes an integer: a row is kept only if it has at least that many non-null values
# here any row with fewer than 5 non-null values is dropped
# (every row in this data has at least 5 non-null values, so all rows are kept)
spark_df.na.drop(thresh=5).show()

+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|          City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|         Pella|952  -913-2229|641  -628-4983|    4|
|   195852|         B & S Sales|  218 Maquan St|        Hanson|781  -760-4590|          null|    2|
|    35600|                null|    PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|    2|
|      761|                null|           null|   Lees Summit|816  -524-9666|816  -524-6983|    5|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+



In [30]:
# 'subset' restricts the null check to the listed columns; nulls in other columns are ignored
# here a row is dropped when its "Address" or its "Fax" value is null
spark_df.na.drop(subset=["Address", "Fax"]).show()

+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|          City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|         Pella|952  -913-2229|641  -628-4983|    4|
|    35600|                null|    PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|    2|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+



In [35]:
# these arguments can be combined
# here a row is dropped only if both its "Vendor_Name" and "Fax" values are null
# (no row has both of them null, so all rows are kept)
spark_df.na.drop(how="all", subset=["Vendor_Name", "Fax"]).show()

+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|          City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|         Pella|952  -913-2229|641  -628-4983|    4|
|   195852|         B & S Sales|  218 Maquan St|        Hanson|781  -760-4590|          null|    2|
|    35600|                null|    PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|    2|
|      761|                null|           null|   Lees Summit|816  -524-9666|816  -524-6983|    5|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+



In [37]:
# now we are going to fill the missing values with a replacement value
# NOTE(review): per the PySpark docs, a string value only fills string-typed columns —
# nulls in numeric columns would be left untouched
spark_df.na.fill("FILLED").show()

+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|          City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|         Pella|952  -913-2229|641  -628-4983|    4|
|   195852|         B & S Sales|  218 Maquan St|        Hanson|781  -760-4590|        FILLED|    2|
|    35600|              FILLED|    PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|    2|
|      761|              FILLED|         FILLED|   Lees Summit|816  -524-9666|816  -524-6983|    5|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+



In [39]:
# here we fill missing values only in the given columns
# ("Fax" is not listed, so its null value stays as it is)
spark_df.na.fill("FILLED", ["Vendor_Name", "Address"]).show()

+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|Vendor_ID|         Vendor_Name|        Address|          City|  Phone Number|           Fax|Years|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+
|   185227|American Rental S...|100 Grantley Ct| Sandy Springs|714  -580-4291|800  -714-7422|    3|
|    16054|        Attema Sales|  117 E 13th St|         Pella|952  -913-2229|641  -628-4983|    4|
|   195852|         B & S Sales|  218 Maquan St|        Hanson|781  -760-4590|          null|    2|
|    35600|              FILLED|    PO Box 3374|South Pasadena|626  -367-9157|818  -276-8409|    2|
|      761|              FILLED|         FILLED|   Lees Summit|816  -524-9666|816  -524-6983|    5|
+---------+--------------------+---------------+--------------+--------------+--------------+-----+



In [41]:
# now we are going to replace null values with a value calculated from the column (here: the mean)
from pyspark.ml.feature import Imputer

# Columns to impute; each one gets a matching "Imp_"-prefixed output column.
impute_cols = ["Years"]
imputed_cols = ["Imp_" + name for name in impute_cols]

imputer = Imputer(inputCols=impute_cols, outputCols=imputed_cols).setStrategy("mean")