## Checking Null in All the columns in a csv file
In this example, we'll see how to check for null values in all of the available columns using foldLeft ().
We can use column.isNull function one by one for all the columns but if dataframe has 100s of columns, then this method is not appropriate. 

In [10]:
import org.apache.spark.sql.types.{StructField, StructType, StringType, IntegerType, TimestampType}

val inputSchema = StructType(List(
    StructField("Date", StringType, false),
    StructField("Transaction", IntegerType, false),
    StructField("Name", StringType, false),
    StructField("Transaction_ID", StringType, false)))


import org.apache.spark.sql.types.{StructField, StructType, StringType, IntegerType, TimestampType}
inputSchema: org.apache.spark.sql.types.StructType = StructType(StructField(Date,StringType,false), StructField(Transaction,IntegerType,false), StructField(Name,StringType,false), StructField(Transaction_ID,StringType,false))


In [11]:
val inputDataDF = spark.read
        .option("header",true)
        .schema(inputSchema)
        .csv("./data/tran_data.csv")
inputDataDF.show(3)

+----------+-----------+----+--------------+
|      Date|Transaction|Name|Transaction_ID|
+----------+-----------+----+--------------+
|29-10-2020|       null| ABC|          tid1|
|30-10-2020|        120| ABC|          tid2|
|      null|        121| ABC|          tid3|
+----------+-----------+----+--------------+
only showing top 3 rows



inputDataDF: org.apache.spark.sql.DataFrame = [Date: string, Transaction: int ... 2 more fields]


In [12]:
inputDataDF.printSchema

root
 |-- Date: string (nullable = true)
 |-- Transaction: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Transaction_ID: string (nullable = true)



In [15]:
var ids = spark.emptyDataFrame
    .withColumn("Transaction_ID",lit(""))
    .withColumn("NullColumns",lit(""))

ids: org.apache.spark.sql.DataFrame = [Transaction_ID: string, NullColumns: string]


### Approach - 1: Chekcing with individual columns

In [23]:
inputDataDF.select("Transaction_Id").where(col("Date").isNull || col("Transaction").isNull || col("Name").isNull).show()

+--------------+
|Transaction_Id|
+--------------+
|          tid1|
|          tid3|
|          tid5|
|          tid6|
|          tid7|
|          tid8|
+--------------+



### Approach - 2: Using foldLeft()

In [28]:
inputDataDF.columns.foldLeft(inputDataDF) { (inputDataDF,colName) => 
    ids = ids.unionByName(inputDataDF.select("Transaction_Id").where(col(colName).isNull)
        .withColumn("NullColumns",lit(colName)))
    inputDataDF
}
ids.show(false)

+--------------+-----------+
|Transaction_ID|NullColumns|
+--------------+-----------+
|tid3          |Date       |
|tid6          |Date       |
|tid8          |Date       |
|tid1          |Transaction|
|tid5          |Transaction|
|tid6          |Transaction|
|tid7          |Transaction|
|tid8          |Name       |
|tid3          |Date       |
|tid6          |Date       |
|tid8          |Date       |
|tid1          |Transaction|
|tid5          |Transaction|
|tid6          |Transaction|
|tid7          |Transaction|
|tid8          |Name       |
|tid3          |Date       |
|tid6          |Date       |
|tid8          |Date       |
|tid1          |Transaction|
+--------------+-----------+
only showing top 20 rows



In [30]:
val finalDF = ids.dropDuplicates("Transaction_ID","NullColumns")
    .groupBy("Transaction_ID")
    .agg(collect_list(col("NullColumns")).as("NullColumns"))

finalDF.show(false)

+--------------+-------------------+
|Transaction_ID|NullColumns        |
+--------------+-------------------+
|tid8          |[Date, Name]       |
|tid5          |[Transaction]      |
|tid7          |[Transaction]      |
|tid1          |[Transaction]      |
|tid6          |[Transaction, Date]|
|tid3          |[Date]             |
+--------------+-------------------+



finalDF: org.apache.spark.sql.DataFrame = [Transaction_ID: string, NullColumns: array<string>]
