In [2]:
# findspark locates the local Spark installation and makes pyspark importable,
# so this has to run before any pyspark import.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Build (or reuse, if one already exists) a Spark session that runs locally
# under the application name "Duplicate".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Duplicate")
    .getOrCreate()
)

# Handle on the underlying SparkContext (used for the log-level setting).
sc = spark.sparkContext

In [6]:
# Only log errors so that the cell output stays readable.
sc.setLogLevel("Error")

# Use a small, fixed number of shuffle partitions.
spark.conf.set("spark.sql.shuffle.partitions",3)
# A bare spark.conf.get(...) in the middle of a cell is evaluated and thrown
# away, so verify the setting explicitly instead of relying on cell output.
assert int(spark.conf.get("spark.sql.shuffle.partitions")) == 3

# Disable Adaptive Query Execution. With AQE on,
# spark.sql.adaptive.coalescePartitions.enabled makes the number of shuffle
# partitions dynamic, which would override the fixed value set above.
spark.conf.set("spark.sql.adaptive.enabled","false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")

# Goal: identify duplicate records and report them.
#   1. groupBy approach - group on every column and keep the groups whose
#      count is > 1. All columns have to be part of the grouping, and a lot
#      of shuffling is involved.
#   2. Window function approach - ranking, aggregate and analytic functions
#      evaluated over a window.

# Load the sample CSV.
#   header=True      - take the column names from the first row of the file.
#   delimiter=","    - fields are comma separated.
#   inferSchema=True - let Spark infer the column types from the data instead
#                      of leaving every column as a string; Spark has to scan
#                      the file to do this.
# NOTE(review): filepath is an absolute path on one machine - adjust it before
# running this notebook anywhere else.
filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"
df = (
    spark.read
    .options(header=True, delimiter=",", inferSchema=True)
    .csv(filepath + "IntDUplicate.csv")
)
df.show()

+------+---+---------+----+
|  Name|Age|Education|Year|
+------+---+---------+----+
|   RAM| 28|       BE|2012|
|Rakesh| 53|      MBA|1985|
| Madhu| 22|    B.Com|2018|
|Rakesh| 53|      MBA|1985|
|  Bill| 32|       ME|2007|
| Madhu| 22|    B.Com|2018|
|Rakesh| 53|      MBA|1985|
+------+---+---------+----+



In [10]:
# Method 1: group on *every* column and keep the groups that occur more than
# once. df.columns is used instead of a hand-written column list so the
# grouping cannot drift out of sync with the schema of the loaded file.
(
    df.groupBy(*df.columns)
    .count()                # adds a "count" column holding the group size
    .where("count > 1")     # a record seen more than once is a duplicate
    .drop("count")          # report only the original columns
    .show()
)

+------+---+---------+----+
|  Name|Age|Education|Year|
+------+---+---------+----+
|Rakesh| 53|      MBA|1985|
| Madhu| 22|    B.Com|2018|
+------+---+---------+----+



In [25]:
# Method 2: window functions.
# Ranking functions available over a window: row_number, rank, dense_rank,
# percent_rank, ntile.
from pyspark.sql.window import Window

# Number the rows inside each group of identical records.
# The window is partitioned by ALL columns: partitioning by "Name" alone would
# also number records that merely share a name but differ in other columns,
# and those would then be reported as duplicates.
# Ranking functions require an ordering, so orderBy is still supplied; within
# a partition of identical rows the order itself is irrelevant.
win = Window.partitionBy(*df.columns).orderBy(col("Year").desc())

In [26]:
# Preview: attach the row number each record gets inside its `win` partition
# (stored in a column named "rank") and display the result.
(
    df
    .withColumn("rank", row_number().over(win))
    .show()
)

+------+---+---------+----+----+
|  Name|Age|Education|Year|rank|
+------+---+---------+----+----+
|  Bill| 32|       ME|2007|   1|
|   RAM| 28|       BE|2012|   1|
|Rakesh| 53|      MBA|1985|   1|
|Rakesh| 53|      MBA|1985|   2|
|Rakesh| 53|      MBA|1985|   3|
| Madhu| 22|    B.Com|2018|   1|
| Madhu| 22|    B.Com|2018|   2|
+------+---+---------+----+----+



In [24]:
# Report the duplicated records: any row numbered above 1 inside its `win`
# partition is a repeat. Drop the helper column and collapse the repeats so
# that each duplicated record is listed once.
(
    df
    .withColumn("rank", row_number().over(win))
    .where("rank > 1")
    .drop("rank")
    .distinct()
    .show()
)

+------+---+---------+----+
|  Name|Age|Education|Year|
+------+---+---------+----+
|Rakesh| 53|      MBA|1985|
| Madhu| 22|    B.Com|2018|
+------+---+---------+----+

