# Removing Duplicates

In [2]:
import pandas as pd
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
 'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [3]:
#The DataFrame method duplicated returns a boolean Series indicating whether each
#row is a duplicate (has been observed in a previous row) or not:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

# Pandas drop_duplicates() Function Syntax
Pandas drop_duplicates() function removes duplicate rows from the DataFrame. Its syntax 

In [None]:
drop_duplicates(self, subset=None, keep="first", inplace=False)

In [4]:
#drop_duplicates returns a DataFrame where the duplicated array is False:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [None]:
#Suppose we had an additional column of values and wanted to filter duplicates only based on the 'k1' column:

In [6]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [8]:
data.drop_duplicates(['k1'])


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


# Drop Duplicates and Keep Last Row

In [9]:
#Duplicated and drop_duplicates by default keep the first observed value combina‐tion. Passing keep='last' will return the last one:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


# Identify Duplicate Rows based on Specific Columns

In [1]:

import pandas as pd

d1 = {'A': [1, 1, 1, 2], 'B': [2, 2, 2, 3], 'C': [3, 3, 4, 5]}

source_df = pd.DataFrame(d1)
print('Source DataFrame:\n', source_df)

result_df = source_df.drop_duplicates(subset=['A', 'B'])
print('Result DataFrame:\n', result_df)

Source DataFrame:
    A  B  C
0  1  2  3
1  1  2  3
2  1  2  4
3  2  3  5
Result DataFrame:
    A  B  C
0  1  2  3
3  2  3  5


The columns ‘A’ and ‘B’ are used to identify duplicate rows. Hence, rows 0, 1, and 2 are duplicates. So, rows 1 and 2 are removed from the output.

# Remove Duplicate Rows in place


In [None]:
source_df.drop_duplicates(inplace=True)
print(source_df)

# Transforming Data Using a Function or Mapping

In [27]:
data = pd.DataFrame({'food': ['Chilly', 'Apple', 'Pastry',
   'corn', 'Mango',
  'Chicken', 'nova lox'],
  'ounces': [4,  6, 7.5, 8, 3, 5, 6]})

data

Unnamed: 0,food,ounces
0,Chilly,4.0
1,Apple,6.0
2,Pastry,7.5
3,corn,8.0
4,Mango,3.0
5,Chicken,5.0
6,nova lox,6.0


In [28]:
lowercased = data['food'].str.lower()
lowercased

0      chilly
1       apple
2      pastry
3        corn
4       mango
5     chicken
6    nova lox
Name: food, dtype: object

In [29]:
#write down a mapping of each distinct meat type to the kind of animal:
Catogery = {
 'Pastry': 'Pastry',
 'corn': 'Fruits',
 'Mango': 'Fruits',
 'Chicken': 'Meat',
 'Apple': 'pig',
 'nova lox': 'salmon'
}

In [30]:
data['Food_Catogery'] = lowercased.map(Catogery)

#Using map is a convenient way to perform element-wise transformations and other data cleaning–related operations.

In [31]:
data

Unnamed: 0,food,ounces,Food_Catogery
0,Chilly,4.0,
1,Apple,6.0,
2,Pastry,7.5,
3,corn,8.0,Fruits
4,Mango,3.0,
5,Chicken,5.0,
6,nova lox,6.0,salmon


# Replacing Values
Filling in missing data with the fillna method is a special case of more general value
replacement.

In [44]:
s = pd.Series([27, 33, 13, 19])
s
s.replace(13, 42)

0    27
1    33
2    42
3    19
dtype: int64

In [45]:
s

0    27
1    33
2    13
3    19
dtype: int64

In [46]:
import pandas as pd

first = ('Mike', 'Dorothee', 'Tom', 'Bill', 'Pete', 'Kate')
last = ('Meyer', 'Maier', 'Meyer', 'Mayer', 'Meyr', 'Mair')
job = ('data analyst', 'programmer', 'computer scientist', 
       'data scientist', 'programmer', 'psychiatrist')
language = ('Python', 'Perl', 'Java', 'Pithon', 'Pythen', 'Brainfuck')

df = pd.DataFrame(list(zip(last, job, language)), 
                  columns =['last', 'job', 'language'],
                  index=first) 

In [47]:
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,programmer,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,programmer,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [48]:
df.replace("programmer", 
           "computer scientist", 
           inplace=True)
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,computer scientist,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,computer scientist,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [49]:
#Changing one value in DataFrame

# accessing the job of Bill:
print(df.loc['Bill', 'job'])
# alternative way to access it with at:
print(df.at['Bill', 'job'])

# setting the job of Bill to 'data analyst' with 'loc'
df.loc['Bill', 'job'] = 'data analyst'
# let us check it:
print(df.loc['Bill', 'job'])

# setting the job of Bill to 'computer scientist' with 'at'
df.at['Pete', 'language'] = 'Python'

data scientist
data scientist
data analyst


In [51]:
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,computer scientist,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data analyst,Pithon
Pete,Meyr,computer scientist,Python
Kate,Mair,psychiatrist,Brainfuck
