#### Removing duplicate rows based on updated date

In [0]:
# Seed DBFS with a small sample file in which several ids occur more than once
# with different `loc` / `updated_date` values.
# The trailing True is the overwrite flag: put() does not overwrite by default,
# so without it this cell fails on every re-run once the file exists.
dbutils.fs.put("/scenarios/duplicates.csv","""id,name,loc,updated_date
1,ravi,bangalore,2021-01-01
1,ravi,chennai,2022-02-02
1,ravi,Hyderabad,2022-06-10
2,Raj,bangalore,2021-01-01
2,Raj,chennai,2022-02-02
3,Raj,Hyderabad,2022-06-10
4,Prasad,bangalore,2021-01-01
5,Mahesh,chennai,2022-02-02
4,Prasad,Hyderabad,2022-06-10
""", True)

Wrote 274 bytes.
Out[1]: True

In [0]:
# Load the sample file: the first line supplies the column names and the
# column types are inferred from the data rather than declared up front.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/scenarios/duplicates.csv")
)

# Print the inferred schema so the column types can be checked.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- loc: string (nullable = true)
 |-- updated_date: timestamp (nullable = true)



In [0]:
# Render the loaded rows so the repeated ids are visible before de-duplicating.
display(df)

id,name,loc,updated_date
1,ravi,bangalore,2021-01-01T00:00:00.000+0000
1,ravi,chennai,2022-02-02T00:00:00.000+0000
1,ravi,Hyderabad,2022-06-10T00:00:00.000+0000
2,Raj,bangalore,2021-01-01T00:00:00.000+0000
2,Raj,chennai,2022-02-02T00:00:00.000+0000
3,Raj,Hyderabad,2022-06-10T00:00:00.000+0000
4,Prasad,bangalore,2021-01-01T00:00:00.000+0000
5,Mahesh,chennai,2022-02-02T00:00:00.000+0000
4,Prasad,Hyderabad,2022-06-10T00:00:00.000+0000


In [0]:
from pyspark.sql.functions import col, max as spark_max, struct

# Keep the most recently updated row for every id.
#
# Sorting and then calling dropDuplicates(["id"]) is NOT a reliable way to do
# this: dropDuplicates keeps an arbitrary row per key once the data has been
# shuffled, so the preceding orderBy does not guarantee the newest row survives.
#
# Instead, pack each row into a struct whose first field is `updated_date` and
# take the max struct per id. Structs compare field by field, so the winner is
# the row with the greatest updated_date (ties fall through to the remaining
# fields, which keeps the result deterministic).
# NOTE(review): this requires every non-key column to be of an orderable type.
payload_cols = [c for c in df.columns if c not in ("id", "updated_date")]

df_latest = (
    df.groupBy("id")
    .agg(spark_max(struct("updated_date", *payload_cols)).alias("latest"))
    .select("id", col("latest.*"))   # unpack the winning struct back into columns
    .select(*df.columns)             # restore the original column order
)
display(df_latest)

id,name,loc,updated_date
1,ravi,Hyderabad,2022-06-10T00:00:00.000+0000
2,Raj,chennai,2022-02-02T00:00:00.000+0000
3,Raj,Hyderabad,2022-06-10T00:00:00.000+0000
4,Prasad,Hyderabad,2022-06-10T00:00:00.000+0000
5,Mahesh,chennai,2022-02-02T00:00:00.000+0000


#### Removing duplicates with a window function and row_number()

In [0]:
from pyspark.sql.window import Window
# Import only the names used here: a wildcard import of pyspark.sql.functions
# rebinds Python builtins such as max, min, sum and round for every later cell.
from pyspark.sql.functions import col, row_number

# Number the rows of each id from newest to oldest, so rowid == 1 marks the
# latest record and rowid > 1 marks the older duplicates.
# NOTE(review): rows sharing both id and updated_date get an arbitrary relative
# order; add a tie-breaker column to the orderBy if that can occur.
latest_first = Window.partitionBy("id").orderBy(col("updated_date").desc())

# `df` is rebound on purpose: the cells that follow filter `df` on `rowid`.
df = df.withColumn("rowid", row_number().over(latest_first))

In [0]:
# Newest record per id: the row ranked first inside its id partition.
df_uniq = df.where("rowid=1")

In [0]:
# Older duplicates: every row ranked after the first inside its id partition.
df_baddata = df.where("rowid>1")

In [0]:
# Show the de-duplicated result: the rows that passed the rowid=1 filter.
display(df_uniq)

id,name,loc,updated_date,rowid
1,ravi,Hyderabad,2022-06-10T00:00:00.000+0000,1
2,Raj,chennai,2022-02-02T00:00:00.000+0000,1
3,Raj,Hyderabad,2022-06-10T00:00:00.000+0000,1
4,Prasad,Hyderabad,2022-06-10T00:00:00.000+0000,1
5,Mahesh,chennai,2022-02-02T00:00:00.000+0000,1
