# How to Remove Duplicate Values

In [1]:
import pandas as pd

In [49]:
# Sample student records used throughout the examples below.
# "Muhammad" occurs twice in Name and 89 occurs twice in Mark, so both
# columns contain a repeated value to experiment with.
student = {
    "Name": ["Muhammad", "Alam", "Rishan", "Ayaan", "Dil", "Muhammad"],
    "Mark": [71, 88, 89, 92, 89, 72],
    "City": ["Sydney", "Melbourn", "Perth", "Darwin", "Dhaka", "Sydney"],
}

In [50]:
# Build a pandas DataFrame from the dict; each key becomes a column.
df = pd.DataFrame(data=student)

In [51]:
# Show the full DataFrame so the starting rows are visible before any dropping.
df

Unnamed: 0,Name,Mark,City
0,Muhammad,71,Sydney
1,Alam,88,Melbourn
2,Rishan,89,Perth
3,Ayaan,92,Darwin
4,Dil,89,Dhaka
5,Muhammad,72,Sydney


In [52]:
# Column names are written as quoted strings in Python (in Power BI they go in []).
# Duplicates are judged only on the `subset` column; repeated values in any
# other column are ignored.
# When a duplicate is found the entire row is dropped, and keep="first"
# retains the first occurrence of each repeated value.
df.drop_duplicates(subset="Name", keep="first")

Unnamed: 0,Name,Mark,City
0,Muhammad,71,Sydney
1,Alam,88,Melbourn
2,Rishan,89,Perth
3,Ayaan,92,Darwin
4,Dil,89,Dhaka


In [53]:
# Now judge duplicates on the "Mark" column instead. With keep="last" the
# final occurrence of each repeated mark is kept and the earlier ones are dropped.
df.drop_duplicates(subset="Mark", keep="last")

Unnamed: 0,Name,Mark,City
0,Muhammad,71,Sydney
1,Alam,88,Melbourn
3,Ayaan,92,Darwin
4,Dil,89,Dhaka
5,Muhammad,72,Sydney


In [54]:
# keep=False (the boolean, not a string) keeps none of the duplicates:
# every row whose Name occurs more than once is dropped.
df.drop_duplicates(subset="Name", keep=False)

Unnamed: 0,Name,Mark,City
1,Alam,88,Melbourn
2,Rishan,89,Perth
3,Ayaan,92,Darwin
4,Dil,89,Dhaka


In [55]:
# However, my DataFrame is still unchanged! The drop_duplicates calls above
# did not pass inplace=True and their results were not assigned back to df.
df

Unnamed: 0,Name,Mark,City
0,Muhammad,71,Sydney
1,Alam,88,Melbourn
2,Rishan,89,Perth
3,Ayaan,92,Darwin
4,Dil,89,Dhaka
5,Muhammad,72,Sydney


In [56]:
# Passing inplace=True makes drop_duplicates modify df itself, so df loses
# both "Muhammad" rows from this cell onward (re-create df to get them back).
df.drop_duplicates(subset="Name", keep=False, inplace=True)
df

Unnamed: 0,Name,Mark,City
1,Alam,88,Melbourn
2,Rishan,89,Perth
3,Ayaan,92,Darwin
4,Dil,89,Dhaka


## So far, duplicates have been assessed based on a single column; multiple columns can be used instead.

In [73]:

# Second sample dataset: rows 0 and 5 are identical in every column, while
# rows 2 and 4 share Name and Mark but differ in City.
student_1 = {
    "Name": ["Muhammad", "Alam", "Rishan", "Ayaan", "Rishan", "Muhammad"],
    "Mark": [71, 88, 89, 92, 89, 71],
    "City": ["Sydney", "Melbourn", "Perth", "Darwin", "Dhaka", "Sydney"],
}

In [74]:
# Build the second DataFrame from the dict; each key becomes a column.
df_1 = pd.DataFrame(data=student_1)

In [76]:
# Show the full second DataFrame before dropping anything.
df_1

Unnamed: 0,Name,Mark,City
0,Muhammad,71,Sydney
1,Alam,88,Melbourn
2,Rishan,89,Perth
3,Ayaan,92,Darwin
4,Rishan,89,Dhaka
5,Muhammad,71,Sydney


In [78]:
# A list in `subset` means a row is a duplicate only when ALL listed columns
# match. Row 5 repeats row 0 in Name, Mark and City, so it is dropped; the two
# "Rishan" rows have different cities, so both are kept.
df_1.drop_duplicates(subset=["Name", "Mark", "City"], keep="first")

Unnamed: 0,Name,Mark,City
0,Muhammad,71,Sydney
1,Alam,88,Melbourn
2,Rishan,89,Perth
3,Ayaan,92,Darwin
4,Rishan,89,Dhaka


In [81]:
# Comparing on Name and Mark only: row 4 (Rishan, 89) and row 5 (Muhammad, 71)
# repeat earlier Name/Mark pairs, so both are dropped even though row 4's City
# differs from row 2's.
df_1.drop_duplicates(subset=["Name", "Mark"], keep="first")

Unnamed: 0,Name,Mark,City
0,Muhammad,71,Sydney
1,Alam,88,Melbourn
2,Rishan,89,Perth
3,Ayaan,92,Darwin
