In [1]:
# Spark Imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
# Obtain the Spark session for this notebook: getOrCreate() reuses an
# already-active session if one exists, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName('miss')
    .getOrCreate()
)

In [3]:
# Load the CSV: the 'header' option takes the column names from the
# first row, and 'inferSchema' has Spark work out each column's type
# from the data instead of reading everything as strings.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('Data/contains_null.csv')
)

In [4]:
# The data has 3 columns; the Name and Sales columns contain missing
# values. Some rows are missing one or the other, and one is missing both.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [5]:
# With no arguments, drop() removes every row that contains at least
# one null:
#     df.na.drop().show()
# 'thresh' keeps only rows with at least that many non-null values.
# (dropna() is the DataFrame-level alias of na.drop().) For this data:
#   thresh=0 -> drops 0 rows (no row has fewer than 0 non-nulls).
#   thresh=1 -> drops 0 rows (every row has at least its Id).
#   thresh=2 -> drops 1 row  (emp2 has only 1 non-null out of 3).
#   thresh=3 -> drops 3 rows (only emp4 has all 3 values).
#   thresh=4 -> drops all    (3 columns can never hold 4 non-nulls).
df.dropna(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [6]:
# The 'how' argument picks the drop rule: 'any' removes a row when any
# of its values is null, 'all' only when every value is null. No row
# here is entirely null, so nothing is dropped.
# Careful: when 'thresh' is given it takes precedence over 'how'.
df.dropna(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [7]:
# 'subset' restricts the null check to the listed columns. Missing
# data may be tolerable in some columns but not in others, such as
# 'Id' or 'Sales'; rows with a null in either of those are dropped.
df.dropna(subset=['Sales', 'Id']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [8]:
# Print the schema: the column types listed here determine which
# columns a fill value of a given type is applied to in the fills
# that follow.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [9]:
# Spark matches the fill value's type against the schema: a string
# value only fills nulls in string columns (numeric columns are left
# untouched), and likewise a numeric value only fills numeric columns.
df.fillna('FILL VALUE STR').show()

+----+--------------+-----+
|  Id|          Name|Sales|
+----+--------------+-----+
|emp1|          John| null|
|emp2|FILL VALUE STR| null|
|emp3|FILL VALUE STR|345.0|
|emp4|         Cindy|456.0|
+----+--------------+-----+



In [10]:
# fill() returns a new DataFrame, so fills can be chained, and the
# 'subset' argument limits a single fill value to specific columns.
# To give different columns different fill values in one call, pass a
# dict mapping column name -> fill value. This names the targeted
# columns explicitly instead of relying on a type-wide fill that would
# touch every numeric column, and it needs only one pass over the data.
# The integer -1 is cast to the column's type, so Sales shows -1.0.
df.na.fill({'Name': 'No Name', 'Sales': -1}).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| -1.0|
|emp2|No Name| -1.0|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [11]:
# A common strategy is to replace missing numbers with the column
# mean. Aggregate 'Sales' with mean() and collect() the result to the
# driver as a list of Row objects.
mean_vals = df.agg(f.mean(df['Sales'])).collect()

In [12]:
# collect() returned a list of Row objects, and a Row's fields can be
# accessed by position: take the first Row, then its first field,
# which holds the mean of 'Sales'.
mean_sales = mean_vals[0][0]

In [None]:
# Fill the nulls in 'Sales' (and only that column) with its mean.
df.fillna(mean_sales, subset=['Sales']).show()