# Missing Values

In [1]:
import numpy as np
import pandas as pd

### None

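* Pythonic missing data: `None` is a Python object, so an array that contains it falls back to `dtype=object`, where aggregations such as `sum()` fail.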

In [2]:
# None cannot be stored as a native integer, so the array holds generic
# Python objects.
vals1 = np.array([1, None, 3, 4], dtype=object)
print(vals1)

[1 None 3 4]


In [3]:
# Mixing integers with None leaves NumPy with the generic `object` dtype.
print(vals1.dtype)

object


In [4]:
# Summing an object array falls back to Python-level addition, and
# `int + None` raises TypeError. Catch only that error so unrelated failures
# (or a keyboard interrupt) are not silently swallowed.
try:
    print(vals1.sum())
except TypeError:
    print(False)

False


### NaN

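* Missing numerical data: `NaN` is a special floating-point value, so the array keeps a native `float64` dtype, but arithmetic and plain aggregations involving it return `NaN`.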

In [5]:
# NaN is itself a float, so every element is stored as a native float64.
vals2 = np.array([1.0, np.nan, 3.0, 4.0])
print(vals2)

[  1.  nan   3.   4.]


In [6]:
# Unlike the None case, the array keeps a native floating-point dtype.
print(vals2.dtype)

float64


In [7]:
# Both addition and multiplication with NaN yield NaN.
print(1 + np.nan, 0 * np.nan)

nan nan


In [8]:
# Plain reductions propagate NaN: a single missing entry makes every
# aggregate NaN.
print(*(reduce(vals2) for reduce in (np.sum, np.min, np.max)))

nan nan nan


In [9]:
# The nan-prefixed reductions skip missing entries instead of propagating them.
print(*(reduce(vals2) for reduce in (np.nansum, np.nanmin, np.nanmax)))

8.0 1.0 4.0


In [11]:
# A Series mixing a number, NaN, text and None is stored as generic objects.
data = pd.Series([1, np.nan, 'hello', None], dtype=object)
print(data)

0        1
1      NaN
2    hello
3     None
dtype: object

### isnull()

In [12]:
# Boolean mask: True wherever the entry is missing (both NaN and None count).
print(data.isnull())

0    False
1     True
2    False
3     True
dtype: bool


In [13]:
# Use the null mask as a row selector to keep only the missing entries.
print(data.loc[data.isnull()])

1     NaN
3    None
dtype: object


### notnull()

In [14]:
# Inverse mask: True wherever a value is present.
print(data.notnull())

0     True
1    False
2     True
3    False
dtype: bool


In [15]:
# Use the not-null mask as a row selector to keep only the present entries.
print(data.loc[data.notnull()])

0        1
2    hello
dtype: object


### dropna()

* `thresh`: the minimum number of non-null values a row/column must contain in order to be kept

In [16]:
# Drops the missing entries; the surviving values keep their original labels.
print(data.dropna())

0        1
2    hello
dtype: object


In [17]:
# Three rows; the first and the last row each contain one missing entry.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
print(df)

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [18]:
# By default dropna() removes every row that contains at least one missing value.
print(df.dropna())

     0    1  2
1  2.0  3.0  5


In [19]:
# axis='columns' removes every column that contains at least one missing value.
print(df.dropna(axis='columns'))

   2
0  2
1  5
2  6


In [20]:
# Drop every row that contains a missing value (same result as the default).
# 'index' is the documented label for axis 0; 'rows' is an undocumented alias.
print(df.dropna(axis='index'))

     0    1  2
1  2.0  3.0  5


In [21]:
# Add a column (label 3) that is entirely NaN.
# NOTE: this mutates df in place, so every later cell sees four columns.
df[3] = np.nan
print(df)

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN


In [22]:
# how='all' drops only columns in which every value is missing (here, column 3).
print(df.dropna(axis='columns', how='all'))

     0    1  2
0  1.0  NaN  2
1  2.0  3.0  5
2  NaN  4.0  6


In [23]:
# thresh=3 keeps only the rows that have at least three non-null values.
# 'index' is the documented label for axis 0; 'rows' is an undocumented alias.
print(df.dropna(axis='index', thresh=3))

     0    1  2   3
1  2.0  3.0  5 NaN


### fillna()

* `ffill`: forward-fill — propagate the last valid value forward into the gaps that follow it
* `bfill`: back-fill — propagate the next valid value backward into the gaps that precede it

In [24]:
# Both np.nan and None end up as NaN once the Series is stored as float64.
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
print(data)

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64


In [25]:
# Replace every missing entry with a constant.
print(data.fillna(0))

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64


In [26]:
# Forward-fill: each gap takes the last valid value before it.
# Series.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
print(data.ffill())

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64


In [27]:
# Back-fill: each gap takes the next valid value after it.
# Series.bfill() replaces fillna(method='bfill'), which pandas has deprecated.
print(data.bfill())

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64


In [28]:
# Recap of the frame, including the all-NaN column 3 added earlier.
print(df)

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  NaN  4.0  6 NaN


In [29]:
# A scalar fill value is applied to every missing cell in the frame.
print(df.fillna(0))

     0    1  2    3
0  1.0  0.0  2  0.0
1  2.0  3.0  5  0.0
2  0.0  4.0  6  0.0


In [30]:
# axis=0 fills down each column: a gap takes the value from the row above it.
# Gaps in the first row and the all-NaN column 3 have nothing to copy from.
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
print(df.ffill(axis=0))

     0    1  2   3
0  1.0  NaN  2 NaN
1  2.0  3.0  5 NaN
2  2.0  4.0  6 NaN


In [31]:
# axis=1 fills across each row: a gap takes the value from the column to its left.
# A gap in the first column has no left neighbour and stays NaN.
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
print(df.ffill(axis=1))

     0    1    2    3
0  1.0  1.0  2.0  2.0
1  2.0  3.0  5.0  5.0
2  NaN  4.0  6.0  6.0
