# Missing Data

Here are a few convenient pandas methods for dealing with missing data: `dropna` to discard incomplete rows or columns, and `fillna` to replace missing values.

In [39]:
import numpy as np
import pandas as pd

In [40]:
# Small example frame: columns A, B and C contain NaN gaps, D is fully populated.
df = pd.DataFrame(
    {
        "A": [1, 2, np.nan, 6, 3],
        "B": [5, np.nan, np.nan, np.nan, 1],
        "C": [1, 2, np.nan, np.nan, 7],
        "D": [0, 2, 5, 6, 3],
    }
)

In [41]:
# Show the full frame; A, B and C contain missing values, D does not.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,0
1,2.0,,2.0,2
2,,,,5
3,6.0,,,6
4,3.0,1.0,7.0,3


In [42]:
# Drop every row that contains at least one NaN.
# axis=0 (rows) is the default, so this is the same as a bare df.dropna().
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,0
4,3.0,1.0,7.0,3


In [43]:
# Drop every column that contains at least one NaN (axis=1 / "columns").
df.dropna(axis="columns")

Unnamed: 0,D
0,0
1,2
2,5
3,6
4,3


In [44]:
# thresh is the minimum number of non-NaN values a row must have to be kept:
# rows with fewer than `thresh` non-NaN values are dropped.
# Here only row 2 (a single non-NaN value, in D) is removed.
df.dropna(thresh=2)

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,0
1,2.0,,2.0,2
3,6.0,,,6
4,3.0,1.0,7.0,3


In [45]:
# Replace every NaN with 0; the result is a new frame and df itself is not modified.
df.fillna(value=0)

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,0
1,2.0,0.0,2.0,2
2,0.0,0.0,0.0,5
3,6.0,0.0,0.0,6
4,3.0,1.0,7.0,3


In [46]:
# df is unchanged: the fillna call above returned a new frame instead of modifying df in place.
df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,0
1,2.0,,2.0,2
2,,,,5
3,6.0,,,6
4,3.0,1.0,7.0,3


In [47]:
# Impute column B with its own mean; mean() skips NaN, so it is (5 + 1) / 2 = 3.0.
df["B"].fillna(df["B"].mean())

0    5.0
1    3.0
2    3.0
3    3.0
4    1.0
Name: B, dtype: float64