In [28]:
# Section must be included at the beginning of each new notebook. Remember to change the app name.
# If you're using VirtualBox, change the path below to '/home/user/spark-2.1.1-bin-hadoop2.7'.
import findspark
# Point findspark at the local Spark installation. This call is made before
# `import pyspark`, so keep the two in this order.
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
# Build a session for the app named 'missing', or reuse one that already exists (getOrCreate).
spark = SparkSession.builder.appName('missing').getOrCreate()

In [29]:
# Load the CSV: the first line is used as the header row and Spark infers the column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('Datasets/IVS_Country.csv')
)

# Eyeball the first rows. You may notice nulls in the total visitor spend and total visitors columns.
df.show()

# Row count of the data as loaded, before any cleaning.
print("Total data points:", df.count())

+----+----+-------+-----------+--------------------+------------------+------------------------------+-------------------+--------------+
|Pkey|Year|Quarter|Year ending|Airport of departure|  Purpose of visit|Country of permanent residence|Total visitor spend|Total visitors|
+----+----+-------+-----------+--------------------+------------------+------------------------------+-------------------+--------------+
|   1|1997|      4| YEDec 1997|            Auckland|          Business|          Africa and Middle...|         4266524.07|          1684|
|   2|1997|      4| YEDec 1997|            Auckland|          Business|                     Australia|      1.325885058E8|         67277|
|   3|1997|      4| YEDec 1997|            Auckland|          Business|                        Canada|      1.558969421E7|          3596|
|   4|1997|      4| YEDec 1997|            Auckland|          Business|                         China|         9490117.52|          4510|
|   5|1997|      4| YEDec 1997|   

In [30]:
# 'na' stands for Not Available; df.na gives access to drop().
# The table printed by show() no longer contains the rows with null values.
df.na.drop().show()

# Let's see how many rows of data we have now.
# NOTE: the result of drop() was not assigned to anything above, so this still counts
# the rows of the original df.
print("Total data points:", df.count())

+----+----+-------+-----------+--------------------+------------------+------------------------------+-------------------+--------------+
|Pkey|Year|Quarter|Year ending|Airport of departure|  Purpose of visit|Country of permanent residence|Total visitor spend|Total visitors|
+----+----+-------+-----------+--------------------+------------------+------------------------------+-------------------+--------------+
|   1|1997|      4| YEDec 1997|            Auckland|          Business|          Africa and Middle...|         4266524.07|          1684|
|   2|1997|      4| YEDec 1997|            Auckland|          Business|                     Australia|      1.325885058E8|         67277|
|   3|1997|      4| YEDec 1997|            Auckland|          Business|                        Canada|      1.558969421E7|          3596|
|   4|1997|      4| YEDec 1997|            Auckland|          Business|                         China|         9490117.52|          4510|
|   5|1997|      4| YEDec 1997|   

In [31]:
# Look up the record with Pkey 55 in the original DataFrame: its total visitor spend and
# total visitors values are still null, so df itself has not been cleaned.
df.where("pkey=55").show()

+----+----+-------+-----------+--------------------+----------------+------------------------------+-------------------+--------------+
|Pkey|Year|Quarter|Year ending|Airport of departure|Purpose of visit|Country of permanent residence|Total visitor spend|Total visitors|
+----+----+-------+-----------+--------------------+----------------+------------------------------+-------------------+--------------+
|  55|1997|      4| YEDec 1997|        Christchurch|        Business|                        Canada|               null|          null|
+----+----+-------+-----------+--------------------+----------------+------------------------------+-------------------+--------------+



In [33]:
# Oops! The earlier drop() result was never assigned to a variable. This time we keep it:
# dropped_df holds only the records in which no column is null.
dropped_df = df.na.drop(how="any")
dropped_df.show()

# Count the rows that remain after removing records with null values.
remaining_rows = dropped_df.count()
print("Total data points:", remaining_rows)

+----+----+-------+-----------+--------------------+------------------+------------------------------+-------------------+--------------+
|Pkey|Year|Quarter|Year ending|Airport of departure|  Purpose of visit|Country of permanent residence|Total visitor spend|Total visitors|
+----+----+-------+-----------+--------------------+------------------+------------------------------+-------------------+--------------+
|   1|1997|      4| YEDec 1997|            Auckland|          Business|          Africa and Middle...|         4266524.07|          1684|
|   2|1997|      4| YEDec 1997|            Auckland|          Business|                     Australia|      1.325885058E8|         67277|
|   3|1997|      4| YEDec 1997|            Auckland|          Business|                        Canada|      1.558969421E7|          3596|
|   4|1997|      4| YEDec 1997|            Auckland|          Business|                         China|         9490117.52|          4510|
|   5|1997|      4| YEDec 1997|   