# In this post we will see how to check null values and fill them

In [3]:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.fs._
import org.apache.spark.sql
import org.apache.spark.sql.types._
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.functions.{expr,col,pow}
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.functions._

In [4]:
val conf = new SparkConf().setAppName("Flight status")
val sc = new SparkContext(conf)
val spark = new SQLContext(sc) 

conf = org.apache.spark.SparkConf@447743aa
sc = org.apache.spark.SparkContext@7b8b6cf0
spark = org.apache.spark.sql.SQLContext@57ce6a78




org.apache.spark.sql.SQLContext@57ce6a78

In [5]:
val retail_t = spark.read.format("csv").
            option("header","true").
option("delimiter", ",").
option("inferSchema", "true").
csv("/user/viswatejaster9073/nullvalues/Retail_Data_Transactions_null.csv")

retail_t = [customer_id: string, trans_date: string ... 1 more field]


[customer_id: string, trans_date: string ... 1 more field]

# Summary of our datasets

## If you see the count of the total its 125000 records and if you see the count for each column under summary there are mismatch, which means there are some missing values

In [15]:
retail_t.summary().show()

+-------+-----------+----------+------------------+
|summary|customer_id|trans_date|       tran_amount|
+-------+-----------+----------+------------------+
|  count|     124988|    124989|            124986|
|   mean|       null|      null| 64.99039892467957|
| stddev|       null|      null|22.859972905342932|
|    min|     CS1112| 01-Apr-12|                10|
|    25%|       null|      null|                47|
|    50%|       null|      null|                65|
|    75%|       null|      null|                83|
|    max|     CS9000| 31-Oct-14|               105|
+-------+-----------+----------+------------------+



lastException: Throwable = null


In [11]:
retail_t.count()

125000

In [46]:
retail_t.show(10)

+-----------+----------+-----------+
|customer_id|trans_date|tran_amount|
+-----------+----------+-----------+
|     CS5295| 11-Feb-13|         35|
|     CS4768| 15-Mar-15|         39|
|     CS2122| 26-Feb-13|         52|
|     CS1217| 16-Nov-11|         99|
|     CS1850| 20-Nov-13|         78|
|     CS5539|      null|         81|
|     CS2724| 06-Feb-12|       null|
|     CS5902| 30-Jan-15|       null|
|     CS6040| 08-Jan-13|         76|
|     CS3802| 20-Aug-13|         75|
+-----------+----------+-----------+
only showing top 10 rows



# How to count all null values in a DataFrame

### We are using map transformation to find if there are any null values with anyNull function and the response will be either false when no nulls or true when there are nulls

In [26]:
retail_t.map(x=>x.anyNull).show(7)

+-----+
|value|
+-----+
|false|
|false|
|false|
|false|
|false|
| true|
| true|
+-----+
only showing top 7 rows



### Now we are filtering  to find the the values of true and counting it
#### if you see the above result the name of the coulm is "value" so we are filtering based on it
#### As per the result we have 31 null values

In [22]:
retail_t.map(x=>x.anyNull).filter($"value" === true).count

31

### How to count null values for perticular column in a DataFrame
#### for customer_id there are 12 null values

In [4]:
retail_t.filter($"tran_amount".isNull).count()

14

# Filling/Imputing missing values
## we will fill the missing values with Mean for the column IntegerType and for stringtype with some hardcoarded values

### Below is the method to find the mean 
#### If you see the below  result for the mean the string is enclosed in square brackets, so we cant use it to fill the missing values for integertype columns, we are removing it by using regular expressions with replaceAll method

In [5]:
var mean =  retail_t.select(avg($"tran_amount")).first().toString()

mean = [64.99039892467957]


[64.99039892467957]

In [18]:
var mean =  retail_t.select(avg($"tran_amount")).first().toString().replaceAll("[\\[\\]]","")

mean = 64.99039892467957


64.99039892467957

### fill is the function used to fill the null values

In [34]:
val retail_fill = retail_t.na.fill(mean.toFloat,Seq("tran_amount"))

retail_fill = [customer_id: string, trans_date: string ... 1 more field]


[customer_id: string, trans_date: string ... 1 more field]

In [36]:
retail_fill.show()

+-----------+----------+-----------+
|customer_id|trans_date|tran_amount|
+-----------+----------+-----------+
|     CS5295| 11-Feb-13|         35|
|     CS4768| 15-Mar-15|         39|
|     CS2122| 26-Feb-13|         52|
|     CS1217| 16-Nov-11|         99|
|     CS1850| 20-Nov-13|         78|
|     CS5539|      null|         81|
|     CS2724| 06-Feb-12|         64|
|     CS5902| 30-Jan-15|         64|
|     CS6040| 08-Jan-13|         76|
|     CS3802| 20-Aug-13|         75|
|     CS3494| 02-Jul-13|         94|
|     CS3780| 25-Mar-13|         80|
|     CS1171| 03-Nov-12|         59|
|     CS2892|      null|         43|
|     CS5552| 29-Dec-14|         78|
|     CS6043| 15-Jan-14|         98|
|     CS4147| 08-Jul-13|         81|
|       null| 30-Dec-13|         93|
|     CS3904| 20-Jul-14|        103|
|     CS4102| 09-Jul-11|         64|
+-----------+----------+-----------+
only showing top 20 rows



# Now lets make it complex, I want to fill the null values for all missing values in all columns in my DataFrame
## lets say if my column is string I want ot replace with Mode if it is integer I need to replace with mean

### finding maximum transaction date and customer id 

In [15]:
val trans_date = retail_t.select($"trans_date").groupBy($"trans_date").count().orderBy($"count".desc).first()(0).toString.replaceAll("[\\[\\]]","")

trans_date = 16-Jul-11


16-Jul-11

### finding most visiting customer

In [16]:
val customer = retail_t.select($"customer_id").groupBy($"customer_id").count().orderBy($"count".desc).first()(0).toString.replaceAll("[\\[\\]]","")

customer = CS4424


CS4424

### now filling the missing values with mean, most visiting cutomer and maximum transaction date as shown below

In [19]:
val fillmissvalues = Map( "customer_id" -> customer,
                          "trans_date" -> trans_date,
                           "tran_amount" -> mean     // Mean we already calculated
                            )

fillmissvalues = Map(customer_id -> CS4424, trans_date -> 16-Jul-11, tran_amount -> 64.99039892467957)


Map(customer_id -> CS4424, trans_date -> 16-Jul-11, tran_amount -> 64.99039892467957)

In [20]:
val fillmiss = retail_t.na.fill(fillmissvalues)

fillmiss = [customer_id: string, trans_date: string ... 1 more field]


[customer_id: string, trans_date: string ... 1 more field]

In [21]:
fillmiss.show()

+-----------+----------+-----------+
|customer_id|trans_date|tran_amount|
+-----------+----------+-----------+
|     CS5295| 11-Feb-13|         35|
|     CS4768| 15-Mar-15|         39|
|     CS2122| 26-Feb-13|         52|
|     CS1217| 16-Nov-11|         99|
|     CS1850| 20-Nov-13|         78|
|     CS5539| 16-Jul-11|         81|
|     CS2724| 06-Feb-12|         64|
|     CS5902| 30-Jan-15|         64|
|     CS6040| 08-Jan-13|         76|
|     CS3802| 20-Aug-13|         75|
|     CS3494| 02-Jul-13|         94|
|     CS3780| 25-Mar-13|         80|
|     CS1171| 03-Nov-12|         59|
|     CS2892| 16-Jul-11|         43|
|     CS5552| 29-Dec-14|         78|
|     CS6043| 15-Jan-14|         98|
|     CS4147| 08-Jul-13|         81|
|     CS4424| 30-Dec-13|         93|
|     CS3904| 20-Jul-14|        103|
|     CS4102| 09-Jul-11|         64|
+-----------+----------+-----------+
only showing top 20 rows



# Dropping missing values

In [23]:
retail_t.count()

125000

In [24]:
retail_t.na.drop().count()

124969

In [25]:
retail_t.count()
retail_t.na.drop().count()

31

In [27]:
retail_t.na.drop("all", Seq("customer_id", "tran_amount")).count()

124997

# Difference between null and NAN
## Null means  missing values or no information present, where NaN means not an number some mathematical miscalculations 