# Missing Data

Here are a few convenient pandas methods for dealing with missing data (`NaN` values): dropping it with `dropna` and filling it in with `fillna`.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Demo frame: columns A and B contain NaN gaps, column C is complete.
df = pd.DataFrame(dict(
    A=[1, 2, np.nan, 4],
    B=[5, np.nan, np.nan, 8],
    C=[10, 20, 30, 40],
))

## Keeping missing data

In [3]:
# Display the frame as-is; the np.nan entries in A and B are the missing values.
df

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
2,,,30
3,4.0,8.0,40


## Removing missing data

In [4]:
# Drop every row that has at least one NaN (the defaults, spelled out).
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,10
3,4.0,8.0,40


In [5]:
# Drop every column that has at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,C
0,10
1,20
2,30
3,40


### Threshold (`thresh=n` keeps only rows with at least `n` non-NA values)

In [10]:
# Keep only the rows that have two or more non-NA values.
df.dropna(axis='index', thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
3,4.0,8.0,40


## Filling in missing data

In [11]:
# Replace every NaN in the frame with the same placeholder string.
df.fillna('FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,10
1,2,FILL VALUE,20
2,FILL VALUE,FILL VALUE,30
3,4,8,40


In [12]:
# The fillna result was not assigned back, so df still holds its NaN entries.
df

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,,20
2,,,30
3,4.0,8.0,40


In [13]:
# Fill the gaps in column A only, using zero.
df['A'].fillna(0)

0    1.0
1    2.0
2    0.0
3    4.0
Name: A, dtype: float64

In [14]:
# Fill the gaps in column A with the mean of its non-missing values.
df['A'].fillna(value=df['A'].mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
Name: A, dtype: float64

In [15]:
# Fill each column's gaps with that column's own mean (df.mean() is matched by column label).
df.fillna(value=df.mean())

Unnamed: 0,A,B,C
0,1.0,5.0,10
1,2.0,6.5,20
2,2.333333,6.5,30
3,4.0,8.0,40
