## PySpark: Handling Missing Values
- Dropping columns
- Dropping rows
- Various parameters of the drop functions
- Handling missing values with the mean, median, and mode

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the SparkSession used below to load the CSV.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [34]:
# Load the CSV, treating its first row as column names and letting Spark infer column types.
df_pyspark = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Info_2.csv')
)

In [4]:
# Print each column's name, data type and nullability.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [5]:
# Display the loaded rows; missing entries are rendered as null.
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Mohammed|  33|        10| 30000|
|  Ruhaan|  31|         8|  null|
|  Sabeel|  32|      null| 28000|
| Ibrahim|  34|      null| 32000|
|     Isa|null|        10|  null|
|   Rahul|  23|         2| 15000|
|   Virat|  30|      null| 25000|
|    null|  28|         5|  null|
+--------+----+----------+------+



In [6]:
# Drop the Name column. The result is a separate DataFrame; df_pyspark is not reassigned.
df_without_name = df_pyspark.drop('Name')
df_without_name.show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  33|        10| 30000|
|  31|         8|  null|
|  32|      null| 28000|
|  34|      null| 32000|
|null|        10|  null|
|  23|         2| 15000|
|  30|      null| 25000|
|  28|         5|  null|
+----+----------+------+



In [7]:
# Show df_pyspark again: the Name column is still present because the drop()
# result was never assigned back to df_pyspark.
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Mohammed|  33|        10| 30000|
|  Ruhaan|  31|         8|  null|
|  Sabeel|  32|      null| 28000|
| Ibrahim|  34|      null| 32000|
|     Isa|null|        10|  null|
|   Rahul|  23|         2| 15000|
|   Virat|  30|      null| 25000|
|    null|  28|         5|  null|
+--------+----+----------+------+



In [8]:
# With no arguments, na.drop() removes every row that contains at least one null.
rows_fully_populated = df_pyspark.na.drop()
rows_fully_populated.show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Mohammed| 33|        10| 30000|
|   Rahul| 23|         2| 15000|
+--------+---+----------+------+



In [26]:
# thresh=3 keeps only the rows that have at least 3 non-null values.
# When thresh is given it takes precedence over `how`, so `how` is omitted here.
df_pyspark.na.drop(thresh=3).show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Mohammed| 33|        10| 30000|
|  Ruhaan| 31|         8|  null|
|  Sabeel| 32|      null| 28000|
| Ibrahim| 34|      null| 32000|
|   Rahul| 23|         2| 15000|
|   Virat| 30|      null| 25000|
+--------+---+----------+------+



In [27]:
# how="all" drops a row only when every one of its columns is null.
rows_not_all_null = df_pyspark.na.drop(how="all")
rows_not_all_null.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Mohammed|  33|        10| 30000|
|  Ruhaan|  31|         8|  null|
|  Sabeel|  32|      null| 28000|
| Ibrahim|  34|      null| 32000|
|     Isa|null|        10|  null|
|   Rahul|  23|         2| 15000|
|   Virat|  30|      null| 25000|
|    null|  28|         5|  null|
+--------+----+----------+------+



In [28]:
# subset limits the null check to the listed columns:
# drop the rows whose Experience value is null.
rows_with_experience = df_pyspark.na.drop(how='any', subset=['Experience'])
rows_with_experience.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Mohammed|  33|        10| 30000|
|  Ruhaan|  31|         8|  null|
|     Isa|null|        10|  null|
|   Rahul|  23|         2| 15000|
|    null|  28|         5|  null|
+--------+----+----------+------+



In [29]:
# Drop the rows that have a null in Experience or in Age.
# Column names are spelled exactly as in the schema ('Age', not 'age') so the
# lookup does not depend on Spark's case-insensitive column resolution.
df_pyspark.na.drop(how='any', subset=['Experience', 'Age']).show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|Mohammed| 33|        10| 30000|
|  Ruhaan| 31|         8|  null|
|   Rahul| 23|         2| 15000|
|    null| 28|         5|  null|
+--------+---+----------+------+



In [38]:
# Filling missing values.
# NOTE: Experience and Age are integer columns, and a string fill value is only
# applied to string columns, so this call leaves their nulls untouched (the
# output is identical to the input). Pass a numeric value (e.g. 0) to actually
# fill these columns.
# Column names are spelled exactly as in the schema ('Age', not 'age').
df_pyspark.na.fill('Missing Values', ['Experience', 'Age']).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Mohammed|  33|        10| 30000|
|  Ruhaan|  31|         8|  null|
|  Sabeel|  32|      null| 28000|
| Ibrahim|  34|      null| 32000|
|     Isa|null|        10|  null|
|   Rahul|  23|         2| 15000|
|   Virat|  30|      null| 25000|
|    null|  28|         5|  null|
+--------+----+----------+------+



In [39]:
# The na.fill() result was only displayed, never assigned, so df_pyspark still
# holds the original data.
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|Mohammed|  33|        10| 30000|
|  Ruhaan|  31|         8|  null|
|  Sabeel|  32|      null| 28000|
| Ibrahim|  34|      null| 32000|
|     Isa|null|        10|  null|
|   Rahul|  23|         2| 15000|
|   Virat|  30|      null| 25000|
|    null|  28|         5|  null|
+--------+----+----------+------+



In [66]:
from pyspark.ml.feature import Imputer

# Impute the three numeric columns, writing each result to a "<column>_imputed" column.
numeric_cols = ['Age', 'Experience', 'Salary']
imputed_cols = ["{}_imputed".format(name) for name in numeric_cols]

# The "mode" strategy is selected after construction via setStrategy().
imputer = Imputer(inputCols=numeric_cols, outputCols=imputed_cols)
imputer = imputer.setStrategy("mode")

In [67]:
# Fit the imputer on df_pyspark, then display df_pyspark with the imputed columns appended.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+--------+----+----------+------+-----------+------------------+--------------+
|    Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+--------+----+----------+------+-----------+------------------+--------------+
|Mohammed|  33|        10| 30000|         33|                10|         30000|
|  Ruhaan|  31|         8|  null|         31|                 8|         15000|
|  Sabeel|  32|      null| 28000|         32|                10|         28000|
| Ibrahim|  34|      null| 32000|         34|                10|         32000|
|     Isa|null|        10|  null|         23|                10|         15000|
|   Rahul|  23|         2| 15000|         23|                 2|         15000|
|   Virat|  30|      null| 25000|         30|                10|         25000|
|    null|  28|         5|  null|         28|                 5|         15000|
+--------+----+----------+------+-----------+------------------+--------------+

