In [1]:
import numpy as np
import pandas as pd

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas/NumPy;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Handling Missing Data with Pandas

In [2]:
pd.isnull(np.nan)

True

In [3]:
pd.isna(np.nan)

True

In [4]:
# The opposite ones also exist

pd.notnull(np.nan)

False

In [5]:
# The opposite ones also exist

pd.notna(np.nan)

False

In [6]:
pd.notnull(3)

True

These functions also work with Series and DataFrames.

In [9]:
# Series

pd.isnull(pd.Series([1,np.nan,7]))

0    False
1     True
2    False
dtype: bool

In [11]:
d = pd.DataFrame({
    'Column A' : [1,np.nan,7],
    'Column B' : [np.nan,2,3],
    'Column C' : [np.nan,2,np.nan]
})

d

Unnamed: 0,Column A,Column B,Column C
0,1.0,,
1,,2.0,2.0
2,7.0,3.0,


In [12]:
pd.isnull(d)

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


## Pandas Operations with Missing values

In [13]:
pd.Series([1,2,np.nan])

0    1.0
1    2.0
2    NaN
dtype: float64

In [14]:
pd.Series([1,2,np.nan]).count()

2

In [16]:
pd.Series([1,2,np.nan,np.nan,np.nan,np.nan]).count()

2

In [17]:
pd.Series([1,2,np.nan,np.nan,np.nan,444,np.nan]).count()

3

In [18]:
pd.Series([1,2,np.nan,np.nan,np.nan,444,np.nan]).sum()

447.0

In [19]:
pd.Series([1,2,np.nan,np.nan,np.nan,444,np.nan]).mean()

149.0

In [20]:
np.array([1,2,np.nan,np.nan,np.nan,444,np.nan]).mean()

nan

## Filtering missing data

In [21]:
s = pd.Series([1,2,3,np.nan,np.nan,4])
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [22]:
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [23]:
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [25]:
pd.isnull(s)

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [26]:
s[pd.isnull(s)]

3   NaN
4   NaN
dtype: float64

## Dropping null values

In [27]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [28]:
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

## Dropping null values on DataFrame

In [29]:
df = pd.DataFrame({
    'Column A' : [1,np.nan,30,np.nan],
    'Column B' : [2,8,31,np.nan],
    'Column C' : [np.nan,9,32,100],
    'Column D' : [5,8,34,110]
})
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [30]:
df.shape

(4, 4)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [32]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [34]:
df.isnull()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [35]:
df.isna()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [36]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [37]:
df.isna().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [38]:
# By default, dropna drops every row in which
# at least one null value is present

df.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [39]:
df.dropna(axis=0) # default is axis=0

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [40]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [41]:
df.dropna(axis=1)

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [42]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [43]:
df.dropna(axis='columns')

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [44]:
df.dropna(axis='rows')

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [45]:
df2 = pd.DataFrame({
    'Column A' : [1,np.nan,30],
    'Column B' : [2,np.nan,31],
    'COlumn C' : [np.nan,np.nan,100]
})
df2

Unnamed: 0,Column A,Column B,COlumn C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [47]:
df2.dropna(how='all')

Unnamed: 0,Column A,Column B,COlumn C
0,1.0,2.0,
2,30.0,31.0,100.0


In [48]:
df2.dropna(how='any') # default behavior

Unnamed: 0,Column A,Column B,COlumn C
2,30.0,31.0,100.0


In [49]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [50]:
df.dropna(thresh=3)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


In [51]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [52]:
df.dropna(thresh=3, axis='columns')

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,110


# Filling Null values

In [54]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

__Filling nulls with an arbitrary value__

In [55]:
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [56]:
s.mean()

2.5

In [57]:
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [58]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

__Filling nulls with contiguous (close) values__

In [59]:
# Forward fill: each NaN takes the last valid value that precedes it.
# Series.ffill() is the supported spelling; fillna(method='ffill') is deprecated.
s.ffill()

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [60]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [61]:
# Backward fill: each NaN takes the next valid value that follows it.
# Series.bfill() is the supported spelling; fillna(method='bfill') is deprecated.
s.bfill()

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

__This can still leave null values at the extremes of the Series or DataFrame.__

In [64]:
x = pd.Series([np.nan,3,np.nan,9])
x

0    NaN
1    3.0
2    NaN
3    9.0
dtype: float64

In [65]:
# Forward fill via Series.ffill() (fillna(method='ffill') is deprecated).
# A leading NaN has no earlier value to copy, so it stays NaN.
x.ffill()

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [66]:
x = pd.Series([np.nan,3,4,5,6,7,2,np.nan,9,np.nan,22,33,np.nan])
x

0      NaN
1      3.0
2      4.0
3      5.0
4      6.0
5      7.0
6      2.0
7      NaN
8      9.0
9      NaN
10    22.0
11    33.0
12     NaN
dtype: float64

In [68]:
# Forward fill first, then backward fill whatever is still NaN at the start.
# ffill()/bfill() replace the deprecated fillna(method=...) spelling.
x.ffill().bfill()

0      3.0
1      3.0
2      4.0
3      5.0
4      6.0
5      7.0
6      2.0
7      2.0
8      9.0
9      9.0
10    22.0
11    33.0
12    33.0
dtype: float64

## Filling null values on DataFrames

In [69]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


Let's fill the missing values in Column A with 0.

Let's fill the missing values in Column B with 99.

Let's fill the missing values in Column C with the column mean.

In [70]:
df.fillna({
    'Column A' : 0,
    'Column B' : 99,
    'Column C' : df['Column C'].mean()
})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,110


In [71]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [73]:
# Forward fill down each column (the default axis).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill()

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [75]:
# Same forward fill with the axis spelled out: axis=0 fills down the rows of each column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [76]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [77]:
# axis=1 forward fills across the columns of each row (left to right).
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


In [78]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [79]:
# axis=1 backward fills across the columns of each row (right to left).
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
df.bfill(axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,5.0,5.0
1,8.0,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,100.0,100.0,100.0,110.0


In [80]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [84]:
# Per-column fill strategy: a constant for A, forward-filled values for B,
# and the column mean for C. Column D is not listed, so it is left as is.
df.fillna({
    'Column A' : 0,
    # Series.ffill() replaces the deprecated fillna(method='ffill')
    'Column B' : df['Column B'].ffill(),
    'Column C' : df['Column C'].mean()
})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,31.0,100.0,110


# Cleaning not-null values

In [85]:
df = pd.DataFrame({
    'Sex' : ['M','F','F','D','?'],
    'Age' : [29,30,24,290,25]
})
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [86]:
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [87]:
df['Sex'].value_counts()

F    2
M    1
D    1
?    1
Name: Sex, dtype: int64

In [89]:
df['Sex'] = df['Sex'].replace('D','F')
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,290
4,?,25


In [90]:
df = pd.DataFrame({
    'Sex' : ['M','F','F','D','?'],
    'Age' : [29,30,24,290,25]
})
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [91]:
df['Sex'] = df['Sex'].replace({
    'D' : 'F',
    'N' : 'M'
})
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,290
4,?,25


In [92]:
df = pd.DataFrame({
    'Sex' : ['M','F','F','D','?'],
    'Age' : [29,30,24,290,25]
})
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


**If you have many columns to replace, you can apply it at the DataFrame level as well.**

In [93]:
df.replace({
    'Sex' : { 'D':'F' , 'N':'M' },
    'Age' : { 290:29 }
})


Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [94]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [95]:
df['Age']>100

0    False
1    False
2    False
3     True
4    False
Name: Age, dtype: bool

In [96]:
df[df['Age']>100]

Unnamed: 0,Sex,Age
3,D,290


In [97]:
# Select the rows whose age is above 100 once, then divide just those ages by 10.
implausible_age = df['Age'] > 100
df.loc[implausible_age, 'Age'] = df.loc[implausible_age, 'Age'] / 10
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


In [98]:
df = pd.DataFrame({
    'Sex' : ['M','F','F','D','?'],
    'Age' : [29,30,24,290,25]
})
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [99]:
def age_clean(x, threshold=100, divisor=10):
    """Scale down an age that is too large to be plausible (e.g. 290 -> 29.0).

    Parameters
    ----------
    x : number
        Age value to check.
    threshold : number, default 100
        Ages strictly greater than this are scaled down.
    divisor : number, default 10
        Value an out-of-range age is divided by.

    Returns
    -------
    number
        ``x / divisor`` when ``x > threshold`` (true division, so the result
        is a float for integer input); otherwise ``x`` unchanged.
    """
    if x > threshold:
        return x / divisor
    return x

In [100]:
df['Age'] = df['Age'].apply(age_clean)
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


# Duplicates

In [101]:
# Each ambassador's name (index label) paired with the country they represent (value).
# Dict insertion order becomes the Series order.
ambassadors = pd.Series({
    'Gerard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Peter Westmacott': 'United Kingdom',
    'Armando Varricchio': 'Italy',
    'Peter Wittig': 'Germany',
    'Peter Ammon': 'Germany',
    'Klaus Scharioth': 'Germany',
})
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

For dealing with duplicates, there are two important methods:

>1. duplicated: It tells you which values are duplicates

>2. drop_duplicates: It will just get rid of the duplicates

In [102]:
ambassadors.duplicated()

Gerard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [103]:
ambassadors.duplicated().sum()

3

In [104]:
ambassadors.duplicated(keep='last')

Gerard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [105]:
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [106]:
ambassadors.duplicated(keep=False)

Gerard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [107]:
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [108]:
ambassadors.drop_duplicates()

Gerard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [109]:
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [110]:
ambassadors.drop_duplicates(keep='last')

Gerard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [111]:
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [112]:
ambassadors.drop_duplicates(keep=False)

Gerard Araud          France
Armando Varricchio     Italy
dtype: object

__Duplicates in Dataframes__

In [113]:
players = pd.DataFrame({
    'Name' : [
        'Kobe Bryant',
        'LeBron James',
        'Kobe Bryant',
        'Carmelo Anthony',
        'Kobe Bryant'
    ],
    'Pos' : [
        'SG',
        'SF',
        'SG',
        'SF',
        'SF'
    ]
})
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [114]:
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [115]:
players.duplicated().sum()

1

By default, a row is flagged as duplicated only when all of its column values match an earlier row.

We can customize this with the subset parameter.

In [116]:
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [117]:
players.duplicated(subset=['Name'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [118]:
players.duplicated(subset=['Name'], keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [119]:
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [120]:
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [121]:
players.drop_duplicates(subset=['Name'])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [122]:
players.drop_duplicates(subset=['Name'], keep='last')

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


# Happy Learning