In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build (or reuse) the Spark session for this notebook, running locally on all cores.
spark = (
    SparkSession.builder
    .appName("YourSparkApplication")
    .master("local[*]")
    .getOrCreate()
)

In [3]:
# Location of the sample CSV. Setting the ZIPCODE_CSV_PATH environment variable
# overrides the original machine-specific default, so the notebook can run elsewhere.
zipcode_csv_path = os.environ.get(
    "ZIPCODE_CSV_PATH", r"C:\Users\Muthu\Music\small_zipcode.csv"
)

# header=True reads column names from the first row; inferSchema=True lets Spark
# pick each column's type from the data.
# NOTE(review): the schema printed below shows zipcode inferred as integer, so a zip
# code stored with leading zeros would lose them — confirm whether that matters here.
df = spark.read.csv(zipcode_csv_path, header=True, inferSchema=True)

In [4]:
# Print the column names, types and nullability of df as a tree.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- population: integer (nullable = true)



In [5]:
# Display the loaded rows as a text table; several cells in type, city and population are null.
df.show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



### fillna parameters (value, subset)

### value - an int, float, string, bool, or dict.

### subset - a string, tuple, or list naming the columns to consider.

### If a dict is passed as value, subset is not needed: the dict keys name the columns to fill.



In [6]:
# Fill nulls per column: each dict key is a column name, each dict value its replacement.
df.na.fill({'type': 'unknown', 'population': 0}).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704| unknown|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709| unknown|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|         0|
+---+-------+--------+-------------------+-----+----------+



In [7]:
# Ask for one string replacement across two columns, one of which is an integer column.
df.na.fill('unknown', subset=['type', 'population']).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704| unknown|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709| unknown|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



### In the cell above, 'unknown' is requested as the replacement for nulls in both the 'type' and 'population' columns. Because 'population' is an integer column, the string value 'unknown' is not applied to it, so its nulls remain.

In [8]:
# Show df again: the fillna calls above only displayed new DataFrames, so df still contains its nulls.
df.show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



### - dropna: by default (how='any'), a row is dropped if any of its columns is null.

### - 'all' : drops a row only if every one of its columns is null

### - subset : only the listed columns are checked for null values

In [9]:
# Keep only rows with no null in any column ('any' is the default mode, spelled out here).
df.na.drop(how='any').show()

+---+-------+------+-----------------+-----+----------+
| id|zipcode|  type|             city|state|population|
+---+-------+------+-----------------+-----+----------+
|  4|  76166|UNIQUE|CINGULAR WIRELESS|   TX|     84000|
+---+-------+------+-----------------+-----+----------+



In [10]:
# Drop a row only when every column in it is null; no row here qualifies, so all five remain.
df.na.drop(how='all').show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



In [11]:
# Only the population column is checked: rows whose population is null are removed.
df.na.drop(subset=['population']).show()

+---+-------+--------+-----------------+-----+----------+
| id|zipcode|    type|             city|state|population|
+---+-------+--------+-----------------+-----+----------+
|  1|    704|STANDARD|             null|   PR|     30100|
|  3|    709|    null|     BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|CINGULAR WIRELESS|   TX|     84000|
+---+-------+--------+-----------------+-----+----------+



In [12]:
# Remove the city column from the displayed result; df itself is not reassigned.
df.drop(col('city')).show()

+---+-------+--------+-----+----------+
| id|zipcode|    type|state|population|
+---+-------+--------+-----+----------+
|  1|    704|STANDARD|   PR|     30100|
|  2|    704|    null|   PR|      null|
|  3|    709|    null|   PR|      3700|
|  4|  76166|  UNIQUE|   TX|     84000|
|  5|  76177|STANDARD|   TX|      null|
+---+-------+--------+-----+----------+

