# Removing Duplicates

In [2]:
import pandas as pd
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
 'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [3]:
#The DataFrame method duplicated returns a boolean Series indicating whether each
#row is a duplicate (has been observed in a previous row) or not:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

# Pandas drop_duplicates() function removes duplicate rows from the DataFrame.

In [None]:
drop_duplicates(self, subset=None, keep="first", inplace=False)

keep: allowed values are {‘first’, ‘last’, False}, default ‘first’. 
    If ‘first’, duplicate rows except the first one is deleted. 
    If ‘last’, duplicate rows except the last one is deleted. 
    If False, all the duplicate rows are deleted.

In [4]:
#drop_duplicates returns a DataFrame where the duplicated array is False:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [5]:
data.drop_duplicates(keep=False)

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3


In [None]:
#Suppose we had an additional column of values and wanted to filter duplicates only based on the 'k1' column:

In [6]:
data['v1'] = range(7)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [8]:
data.drop_duplicates(['k1'])


Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


# Drop Duplicates and Keep Last Row

In [9]:
#Duplicated and drop_duplicates by default keep the first observed value combina‐tion. Passing keep='last' will return the last one:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


# Identify Duplicate Rows based on Specific Columns

In [6]:

import pandas as pd

d1 = {'A': [1, 1, 1, 2], 'B': [2, 2, 2, 3], 'C': [3, 3, 4, 5]}

source_df = pd.DataFrame(d1)
print('Source DataFrame:\n', source_df)

result_df = source_df.drop_duplicates(subset=['A', 'B'])
print('Result DataFrame:\n', result_df)

Source DataFrame:
    A  B  C
0  1  2  3
1  1  2  3
2  1  2  4
3  2  3  5
Result DataFrame:
    A  B  C
0  1  2  3
3  2  3  5


The columns ‘A’ and ‘B’ are used to identify duplicate rows. 

Hence, rows 0, 1, and 2 are duplicates. So, rows 1 and 2 are removed from the output.

# Transforming Data Using a Function or Mapping

In [27]:
data = pd.DataFrame({'food': ['Chilly', 'Apple', 'Pastry',
   'corn', 'Mango',
  'Chicken', 'nova lox'],
  'ounces': [4,  6, 7.5, 8, 3, 5, 6]})

data

Unnamed: 0,food,ounces
0,Chilly,4.0
1,Apple,6.0
2,Pastry,7.5
3,corn,8.0
4,Mango,3.0
5,Chicken,5.0
6,nova lox,6.0


In [28]:
lowercased = data['food'].str.lower()
lowercased

0      chilly
1       apple
2      pastry
3        corn
4       mango
5     chicken
6    nova lox
Name: food, dtype: object

In [29]:
#write down a mapping of each distinct meat type to the kind of animal:
Catogery = {
 'Pastry': 'Pastry',
 'corn': 'Fruits',
 'Mango': 'Fruits',
 'Chicken': 'Meat',
 'Apple': 'pig',
 'nova lox': 'salmon'
}

In [30]:
data['Food_Catogery'] = lowercased.map(Catogery)

#Using map is a convenient way to perform element-wise transformations and other data cleaning–related operations.

In [31]:
data

Unnamed: 0,food,ounces,Food_Catogery
0,Chilly,4.0,
1,Apple,6.0,
2,Pastry,7.5,
3,corn,8.0,Fruits
4,Mango,3.0,
5,Chicken,5.0,
6,nova lox,6.0,salmon


# Replacing Values
Filling in missing data with the fillna method is a special case of more general value
replacement.

In [44]:
s = pd.Series([27, 33, 13, 19])
s
s.replace(13, 42)

0    27
1    33
2    42
3    19
dtype: int64

In [45]:
s

0    27
1    33
2    13
3    19
dtype: int64

In [46]:
import pandas as pd

first = ('Mike', 'Dorothee', 'Tom', 'Bill', 'Pete', 'Kate')
last = ('Meyer', 'Maier', 'Meyer', 'Mayer', 'Meyr', 'Mair')
job = ('data analyst', 'programmer', 'computer scientist', 
       'data scientist', 'programmer', 'psychiatrist')
language = ('Python', 'Perl', 'Java', 'Pithon', 'Pythen', 'Brainfuck')

df = pd.DataFrame(list(zip(last, job, language)), 
                  columns =['last', 'job', 'language'],
                  index=first) 

In [47]:
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,programmer,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,programmer,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [48]:
df.replace("programmer", 
           "computer scientist", 
           inplace=True)
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,computer scientist,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,computer scientist,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [49]:
#Changing one value in DataFrame

# accessing the job of Bill:
print(df.loc['Bill', 'job'])
# alternative way to access it with at:
print(df.at['Bill', 'job'])

# setting the job of Bill to 'data analyst' with 'loc'
df.loc['Bill', 'job'] = 'data analyst'
# let us check it:
print(df.loc['Bill', 'job'])

# setting the job of Bill to 'computer scientist' with 'at'
df.at['Pete', 'language'] = 'Python'

data scientist
data scientist
data analyst


In [51]:
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,computer scientist,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data analyst,Pithon
Pete,Meyr,computer scientist,Python
Kate,Mair,psychiatrist,Brainfuck


# Rename

In [54]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
 index=['Ohio', 'Colorado', 'new York'],
 columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
new York,8,9,10,11


In [55]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [60]:
#rename can be used in conjunction with a dict-like object providing new val‐ues for a subset of the axis labels:
data.rename( columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
Ohio,0,1,2,3
Colorado,4,5,6,7
new York,8,9,10,11


In [61]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, Ohio to new York
Data columns (total 4 columns):
one      3 non-null int32
two      3 non-null int32
three    3 non-null int32
four     3 non-null int32
dtypes: int32(4)
memory usage: 152.0+ bytes


In [62]:
data.rename(columns={'three': 'peekaboo'}, inplace=True)

In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, Ohio to new York
Data columns (total 4 columns):
one         3 non-null int32
two         3 non-null int32
peekaboo    3 non-null int32
four        3 non-null int32
dtypes: int32(4)
memory usage: 72.0+ bytes


# Discretization and Binning

Continuous data is often discretized or otherwise separated into “bins” for analysis.
Suppose you have data about a group of people in a study, and you want to group them into discrete age buckets:


In [65]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]


In [66]:
#Let’s divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally 61 and older. To do so, you have to use cut, a function in pandas:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats


[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [67]:
 pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [68]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [69]:
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [70]:
 pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

# Computing Indicator/Dummy Variables


# Engineer labels of categorical variables

In this section, I will describe a variety of methods to transform the strings of categorical variables into numbers, so that we can feed these variables in machine learning algorithms using sklearn.

One Hot Encoding

One hot encoding, consists of replacing the categorical variable by different boolean variables, which take value 0 or 1, to indicate whether or not a certain category / label of the variable was present for that observation.

Each one of the boolean variables are also known as dummy variables or binary variables.


In [71]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})


In [72]:
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [73]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [74]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [75]:
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [76]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy 

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0
