# dropDuplicates()

| Use Case                                | Code                                    |
| --------------------------------------- | --------------------------------------- |
| Drop exact duplicates                   | `df.dropDuplicates()`                   |
| Drop based on single column             | `df.dropDuplicates(["name"])`           |
| Drop based on multiple columns          | `df.dropDuplicates(["name", "salary"])` |
| Drop and keep latest/first using Window | Use `row_number()` + filter             |


In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
# Sample employee rows for the dropDuplicates() scenarios. EmpID is unique on
# every row; the trailing notes call out which other fields repeat.
columns = ["EmpID", "Name", "Dept", "Salary"]
data = [
    (1, "Alice", "IT", 60000),
    (2, "Bob", "HR", 50000),
    (3, "Alice", "IT", 60000),       # same Name/Dept/Salary as EmpID 1
    (4, "David", "HR", 50000),       # same Dept & Salary as Bob
    (5, "Eve", "Finance", 70000),
    (6, "Frank", "IT", 60000),       # same Dept & Salary as Alice
    (7, "Alice", "HR", 60000),       # same Name, different Dept
    (8, "Grace", "Finance", 70000),  # same Dept & Salary as Eve
    (9, "Eve", "Finance", 70000),    # same Name/Dept/Salary as EmpID 5
    (10, "Eve", "Finance", 72000),   # same Name, different Salary
]

# Column types are inferred from the tuples; only the names are supplied.
df = spark.createDataFrame(data, schema=columns)
df.display()

EmpID,Name,Dept,Salary
1,Alice,IT,60000
2,Bob,HR,50000
3,Alice,IT,60000
4,David,HR,50000
5,Eve,Finance,70000
6,Frank,IT,60000
7,Alice,HR,60000
8,Grace,Finance,70000
9,Eve,Finance,70000
10,Eve,Finance,72000


# Scenario 1: Drop Fully Duplicate Rows

In [0]:
# With no arguments dropDuplicates() compares every column. EmpID is unique in
# this sample, so no two rows match on all columns and nothing is removed here.
# Keep the DataFrame in df1 and display it separately: chaining .display()
# onto the assignment would bind df1 to display()'s return value (None).
df1 = df.dropDuplicates()
df1.display()


EmpID,Name,Dept,Salary
1,Alice,IT,60000
2,Bob,HR,50000
3,Alice,IT,60000
4,David,HR,50000
5,Eve,Finance,70000
6,Frank,IT,60000
7,Alice,HR,60000
8,Grace,Finance,70000
9,Eve,Finance,70000
10,Eve,Finance,72000


# Scenario 2: Drop Duplicates by Single Column (e.g., Name)

In [0]:
# Keep a single row per distinct Name. Which of the matching rows survives is
# not specified by dropDuplicates(); use a window + row_number() when a
# particular row must be kept.
# Assign first, then display, so df2 holds the DataFrame rather than None.
df2 = df.dropDuplicates(["Name"])
df2.display()

EmpID,Name,Dept,Salary
1,Alice,IT,60000
2,Bob,HR,50000
4,David,HR,50000
5,Eve,Finance,70000
6,Frank,IT,60000
8,Grace,Finance,70000


# Scenario 3: Drop Duplicates by Multiple Columns (e.g., Name + Dept)

In [0]:
# Keep a single row per distinct (Name, Dept) pair; rows that share a Name but
# differ in Dept are both retained.
# Assign first, then display, so df3 holds the DataFrame rather than None.
df3 = df.dropDuplicates(["Name", "Dept"])
df3.display()

EmpID,Name,Dept,Salary
1,Alice,IT,60000
2,Bob,HR,50000
5,Eve,Finance,70000
4,David,HR,50000
6,Frank,IT,60000
7,Alice,HR,60000
8,Grace,Finance,70000


# Scenario 4: Drop Duplicates Based on Business Logic

In [0]:
from pyspark.sql.window import Window

In [0]:
# Business rule: keep one row per Name, preferring the highest Salary.
# Several rows can share the top Salary for a Name (e.g. Alice at 60000), and
# row_number() assigns an arbitrary order among ties, so EmpID is added as a
# secondary sort key to make the surviving row deterministic (lowest EmpID).
highest_salary_window = Window.partitionBy("Name").orderBy(
    col("Salary").desc(), col("EmpID").asc()
)
df4 = (
    df.withColumn("row_num", row_number().over(highest_salary_window))
    .filter(col("row_num") == 1)
)

In [0]:
# Render the per-Name result; the helper column row_num is still present.
df4.display()

EmpID,Name,Dept,Salary,row_num
1,Alice,IT,60000,1
2,Bob,HR,50000,1
4,David,HR,50000,1
10,Eve,Finance,72000,1
6,Frank,IT,60000,1
8,Grace,Finance,70000,1


In [0]:
# Business rule: keep one row per Name, preferring the highest EmpID.
# EmpID is unique in the sample data, so this ordering has no ties.
highest_empid_window = Window.partitionBy("Name").orderBy(col("EmpID").desc())
# Assign first, then display, so df5 holds the DataFrame rather than None.
df5 = (
    df.withColumn("row_num", row_number().over(highest_empid_window))
    .filter(col("row_num") == 1)
)
df5.display()

EmpID,Name,Dept,Salary,row_num
7,Alice,HR,60000,1
2,Bob,HR,50000,1
4,David,HR,50000,1
10,Eve,Finance,72000,1
6,Frank,IT,60000,1
8,Grace,Finance,70000,1


# dropna()

| Use Case           | Code                                                    | Result                                    |
| ------------------ | ------------------------------------------------------- | ----------------------------------------- |
| Any Null           | `df.dropna()`                                           | Keep fully filled rows                    |
| All Null           | `df.dropna(how="all")`                                  | Drop fully empty rows                     |
| Subset Columns     | `df.dropna(subset=["Name", "Salary"])`                  | Drops if any of those columns are null    |
| Threshold          | `df.dropna(thresh=3)`                                   | Keeps rows with 3 or more filled columns  |
| Any null in subset | `df.dropna(how="any", subset=["Department", "Salary"])` | Drops based on condition in those columns |


In [0]:
# Sample rows for the dropna() scenarios. ID is always populated; Name, Salary
# and Department are each missing (None) on some rows.
columns = ["ID", "Name", "Salary", "Department"]

data = [
    (1, "Alice", 5000, "IT"),
    (2, None, None, "HR"),
    (3, "Bob", None, None),
    (4, "David", 4000, "Finance"),
    (5, None, None, None),
    (6, "Eva", 3000, "IT"),
    (7, "Frank", None, None),
    (8, "Grace", 3500, "HR"),
    (9, None, 4200, None),
    (10, "Hannah", None, "Finance"),
    (11, "Ian", 3900, "IT"),
    (12, "Jack", None, "Sales"),
    (13, "Kate", 2800, None),
    (14, None, None, None),
    (15, "Liam", 3100, "Finance"),
    (16, "Mia", None, "IT"),
    (17, "Nick", 4000, "Sales"),
    (18, "Olivia", None, None),
    (19, None, 2700, "HR"),
    (20, "Paul", 3600, "Marketing"),
]

# Column types are inferred from the tuples; only the names are supplied.
df = spark.createDataFrame(data, schema=columns)


In [0]:
# Preview the sample data, including the rows with missing values.
df.display()

ID,Name,Salary,Department
1,Alice,5000.0,IT
2,,,HR
3,Bob,,
4,David,4000.0,Finance
5,,,
6,Eva,3000.0,IT
7,Frank,,
8,Grace,3500.0,HR
9,,4200.0,
10,Hannah,,Finance


# 1.Drop rows with any null
 Removes any row where at least one column is null

 Keeps only rows that are completely filled.

In [0]:
# Default dropna(): drop every row that has a null in any column.
# Assign first, then display, so df1 holds the DataFrame rather than None.
df1 = df.dropna()
df1.display()

ID,Name,Salary,Department
1,Alice,5000,IT
4,David,4000,Finance
6,Eva,3000,IT
8,Grace,3500,HR
11,Ian,3900,IT
15,Liam,3100,Finance
17,Nick,4000,Sales
20,Paul,3600,Marketing


# 2.Drop rows where all values are null
Removes rows where every column is null

Keeps rows where at least one column has data.

In [0]:
# how="all" drops a row only when every column is null. ID is populated on
# every sample row, so no row qualifies and nothing is removed here.
# Assign first, then display, so df2 holds the DataFrame rather than None.
df2 = df.dropna(how="all")
df2.display()

ID,Name,Salary,Department
1,Alice,5000.0,IT
2,,,HR
3,Bob,,
4,David,4000.0,Finance
5,,,
6,Eva,3000.0,IT
7,Frank,,
8,Grace,3500.0,HR
9,,4200.0,
10,Hannah,,Finance


# 3.Drop rows where specific columns (e.g., Name & Salary) have null
 Only removes rows where either Name or Salary is null.

 Doesn't care about nulls in other columns.

In [0]:
# Drop rows where Name or Salary is null; nulls in other columns (for example
# Department) do not cause a row to be dropped.
# Assign first, then display, so df3 holds the DataFrame rather than None.
df3 = df.dropna(subset=["Name", "Salary"])
df3.display()

ID,Name,Salary,Department
1,Alice,5000,IT
4,David,4000,Finance
6,Eva,3000,IT
8,Grace,3500,HR
11,Ian,3900,IT
13,Kate,2800,
15,Liam,3100,Finance
17,Nick,4000,Sales
20,Paul,3600,Marketing


# 4.Drop rows with less than 3 non-null values
Keeps rows that have at least 3 non-null values

Drops rows with 0, 1, or 2 filled fields.

In [0]:
# thresh=3 keeps rows that have at least 3 non-null values. Every column is
# counted, including ID, so a row with ID plus two other filled fields stays.
# Assign first, then display, so df4 holds the DataFrame rather than None.
df4 = df.dropna(thresh=3)
df4.display()

ID,Name,Salary,Department
1,Alice,5000.0,IT
4,David,4000.0,Finance
6,Eva,3000.0,IT
8,Grace,3500.0,HR
10,Hannah,,Finance
11,Ian,3900.0,IT
12,Jack,,Sales
13,Kate,2800.0,
15,Liam,3100.0,Finance
16,Mia,,IT


# 5.Drop rows where any of a subset is null
Drops rows where either Department or Salary is null

In [0]:
# Drop rows where Department or Salary is null; a null Name alone does not
# cause a row to be dropped.
# Assign first, then display, so df5 holds the DataFrame rather than None.
df5 = df.dropna(how="any", subset=["Department", "Salary"])
df5.display()


ID,Name,Salary,Department
1,Alice,5000,IT
4,David,4000,Finance
6,Eva,3000,IT
8,Grace,3500,HR
11,Ian,3900,IT
15,Liam,3100,Finance
17,Nick,4000,Sales
19,,2700,HR
20,Paul,3600,Marketing


# fillna()

| **Scenario**                              | **Code**                                   | **Use**                                      |
| ----------------------------------------- | ------------------------------------------ | -------------------------------------------- |
| Fill all nulls with a single value        | `df.fillna("N/A")`                         | Fills only columns whose type matches the value (a string fills string columns only) |
| Fill nulls in specific column             | `df.fillna(0, subset=["salary"])`          | Replace nulls only in `salary` column        |
| Fill multiple columns with different vals | `df.fillna({"name": "Unknown", "age": 0})` | Set custom fill values for different columns |
| Fill numeric nulls with average           | `df.fillna({"salary": avg_salary})`        | Replace nulls with computed average          |



In [0]:
# Sample rows for the fillna() scenarios. ID is always populated; Name, Age and
# Salary are each missing (None) on some rows.
columns = ["ID", "Name", "Age", "Salary"]

data = [
    (1, "Alice", 25, 5000.0),
    (2, "Bob", None, None),
    (3, "Charlie", 30, 6000.0),
    (4, None, 35, None),
    (5, "Eve", None, 7000.0),
    (6, "Frank", 40, None),
    (7, None, None, 8000.0),
    (8, "Grace", 28, 5200.0),
    (9, "Heidi", None, None),
    (10, "Ivan", 32, 5800.0),
    (11, None, 31, None),
    (12, "Judy", None, 6100.0),
    (13, "Mallory", 29, 5700.0),
    (14, "Niaj", None, 5400.0),
    (15, None, 33, 6200.0),
    (16, "Olivia", 34, None),
    (17, "Peggy", None, None),
    (18, "Rupert", 27, 5900.0),
    (19, None, 26, 5300.0),
    (20, "Sybil", None, None),
]

# Column types are inferred from the tuples; only the names are supplied.
df_new = spark.createDataFrame(data, schema=columns)



In [0]:
# Preview the sample data, including the rows with missing values.
df_new.display()

ID,Name,Age,Salary
1,Alice,25.0,5000.0
2,Bob,,
3,Charlie,30.0,6000.0
4,,35.0,
5,Eve,,7000.0
6,Frank,40.0,
7,,,8000.0
8,Grace,28.0,5200.0
9,Heidi,,
10,Ivan,32.0,5800.0


#  1. Fill all nulls with a fixed value

In [0]:
# A string fill value is applied to string columns only: null Names become
# "Unknown", while nulls in the numeric Age and Salary columns are left as-is.
# Assign first, then display, so df_new1 holds the DataFrame rather than None.
df_new1 = df_new.fillna("Unknown")
df_new1.display()

ID,Name,Age,Salary
1,Alice,25.0,5000.0
2,Bob,,
3,Charlie,30.0,6000.0
4,Unknown,35.0,
5,Eve,,7000.0
6,Frank,40.0,
7,Unknown,,8000.0
8,Grace,28.0,5200.0
9,Heidi,,
10,Ivan,32.0,5800.0


#  2.Fill nulls in a specific column

In [0]:
# Replace nulls with 0 in the Age column only; Name and Salary nulls remain.
# Assign first, then display, so df_new2 holds the DataFrame rather than None.
df_new2 = df_new.fillna(0, subset=["Age"])
df_new2.display()



ID,Name,Age,Salary
1,Alice,25,5000.0
2,Bob,0,
3,Charlie,30,6000.0
4,,35,
5,Eve,0,7000.0
6,Frank,40,
7,,0,8000.0
8,Grace,28,5200.0
9,Heidi,0,
10,Ivan,32,5800.0


# 3.Fill nulls in multiple columns differently

In [0]:
# A dict maps each column name to its own replacement value, so every column
# listed gets a different default in a single call.
# Assign first, then display, so df_new3 holds the DataFrame rather than None.
df_new3 = df_new.fillna({"Name": "Unknown", "Age": 0, "Salary": 1000})
df_new3.display()

ID,Name,Age,Salary
1,Alice,25,5000.0
2,Bob,0,1000.0
3,Charlie,30,6000.0
4,Unknown,35,1000.0
5,Eve,0,7000.0
6,Frank,40,1000.0
7,Unknown,0,8000.0
8,Grace,28,5200.0
9,Heidi,0,1000.0
10,Ivan,32,5800.0


# 4.Fill nulls in numeric column with average

In [0]:
# Average of the Salary column, rounded to a whole number. `round` and `avg`
# are the pyspark.sql.functions versions (from the wildcard import), so the
# aggregation runs in Spark; .first()[0] pulls the single result back as a
# plain Python value. Despite its name, df_avg is a scalar, not a DataFrame.
# The column is spelled "Salary" to match the schema used everywhere else.
df_avg = df_new.select(round(avg("Salary"), 0)).first()[0]

# Assign first, then display, so df_new5 holds the DataFrame rather than None.
df_new5 = df_new.fillna({"Salary": df_avg})
df_new5.display()


ID,Name,Age,Salary
1,Alice,25.0,5000.0
2,Bob,,5967.0
3,Charlie,30.0,6000.0
4,,35.0,5967.0
5,Eve,,7000.0
6,Frank,40.0,5967.0
7,,,8000.0
8,Grace,28.0,5200.0
9,Heidi,,5967.0
10,Ivan,32.0,5800.0
