# Handling Missing Data

In [2]:
import pandas as pd
import numpy as np

In [9]:
# Build a Series of strings in which one entry (np.nan) is missing.
string_data = pd.Series(
    data=['Hello', 'world', np.nan, 'corona'],
)
string_data

0     Hello
1     world
2       NaN
3    corona
dtype: object

In [4]:
# Boolean mask: True wherever an entry of string_data is missing.
pd.isna(string_data)

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Overwrite the first entry with Python's None; the isnull() check that follows
# reports it as missing, just like NaN.
string_data[0] = None

In [6]:
# isna() is the method that isnull() aliases; both flag None and NaN as missing.
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

In [7]:
# Inverse mask: True wherever a value is present (notna() is what notnull() aliases).
string_data.notna()

0    False
1     True
2    False
3     True
dtype: bool

## Filtering Out Missing Data

In [10]:
from numpy import nan as NA

In [12]:
# Numeric Series with NA placeholders at positions 1 and 3.
data = pd.Series(
    data=[1, NA, 3.5, NA, 7],
)
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

**`dropna` returns only the non-null values, keeping their original index labels.**

In [13]:
# Returns a new Series without the NA entries; `data` itself is not reassigned here.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

**This is equivalent to:**

In [14]:
# Boolean-mask selection: keep only the positions where a value is present.
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

**With DataFrame objects, things are a bit more complex. You may want to drop rows
or columns that are all NA, or only those containing any NAs. By default, `dropna` drops
any row containing a missing value.**

In [16]:
# 4x3 frame mixing a complete row, partially missing rows and a fully missing row.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
# End the cell with the frame itself so the table displayed under this cell is
# actually produced by it (an assignment alone renders no output).
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
# Same as data.dropna() with its defaults spelled out: drop every row (axis=0)
# that contains at least one NA (how='any').
cleaned = data.dropna(axis=0, how='any')

In [18]:
# The original frame is unchanged: the dropna() result was assigned to `cleaned`.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
# Only row 0 has no missing values, so it is the only row kept.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


**Passing how='all' will only drop rows that are all NA**

In [20]:
# Row-wise (axis=0) drop that removes a row only when every value in it is NA.
cleaned = data.dropna(axis=0, how='all')

In [21]:
# Row 2, the only all-NA row, is gone; the partially filled rows 1 and 3 remain.
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


**To drop columns in the same way, pass axis=1**

In [23]:
# Add a column labelled 4 that is entirely NA.
# Note: this mutates `data` in place, so earlier displays of `data` no longer
# reflect its current contents.
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [24]:
# Column-wise drop: remove only the columns in which every value is NA.
cleaned = data.dropna(axis='columns', how='all')
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


**`dropna(thresh=2)` keeps only the rows that have at least two non-NA values; rows with fewer are dropped. By default `axis=0`, so the threshold is applied to rows.**

In [26]:
# 7x3 frame of standard-normal samples drawn from NumPy's global RNG.
# No seed is set in the cells shown, so the values change on every run.
data = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
data

Unnamed: 0,0,1,2
0,1.668366,-0.87062,1.506825
1,-0.622122,0.44715,0.091993
2,-0.890133,0.136171,0.407828
3,1.387941,-0.814674,2.16995
4,-0.623657,-1.050762,0.946606
5,-1.928626,0.290297,0.313907
6,-0.931288,-1.795846,1.084401


In [27]:
# Knock out leading values: rows 0-1 of column 2 and rows 0-3 of column 1 become NA.
data.iloc[0:2, 2] = NA
data.iloc[0:4, 1] = NA
data

Unnamed: 0,0,1,2
0,1.668366,,
1,-0.622122,,
2,-0.890133,,0.407828
3,1.387941,,2.16995
4,-0.623657,-1.050762,0.946606
5,-1.928626,0.290297,0.313907
6,-0.931288,-1.795846,1.084401


In [29]:
# Keep only the rows (axis=0) that hold at least two non-NA values.
cleaned = data.dropna(axis=0, thresh=2)
cleaned

Unnamed: 0,0,1,2
2,-0.890133,,0.407828
3,1.387941,,2.16995
4,-0.623657,-1.050762,0.946606
5,-1.928626,0.290297,0.313907
6,-0.931288,-1.795846,1.084401


## Filling In Missing Data

**Rather than filtering out missing data, we may want to fill in the “holes” in any number of ways**

In [30]:
# Replace every NA with 0 in a new frame; `data` itself is not reassigned here.
data.fillna(value=0)

Unnamed: 0,0,1,2
0,1.668366,0.0,0.0
1,-0.622122,0.0,0.0
2,-0.890133,0.0,0.407828
3,1.387941,0.0,2.16995
4,-0.623657,-1.050762,0.946606
5,-1.928626,0.290297,0.313907
6,-0.931288,-1.795846,1.084401


**When you call `fillna` with a dict, you can use a different fill value for each column.**

In [33]:
# Per-column fill values: column 1 gets 0.5, column 2 gets 0.2.
data.fillna(value={1: 0.5, 2: 0.2})

Unnamed: 0,0,1,2
0,1.668366,0.5,0.2
1,-0.622122,0.5,0.2
2,-0.890133,0.5,0.407828
3,1.387941,0.5,2.16995
4,-0.623657,-1.050762,0.946606
5,-1.928626,0.290297,0.313907
6,-0.931288,-1.795846,1.084401


**`fillna` returns a new object by default, but passing `inplace=True` modifies the existing object instead.**

In [38]:
# inplace=True changes `data` itself rather than producing a new frame, so the
# NA values are permanently replaced by 0 from this cell onward.
data.fillna(value=0, inplace=True)
data

Unnamed: 0,0,1,2
0,1.668366,0.0,0.0
1,-0.622122,0.0,0.0
2,-0.890133,0.0,0.407828
3,1.387941,0.0,2.16995
4,-0.623657,-1.050762,0.946606
5,-1.928626,0.290297,0.313907
6,-0.931288,-1.795846,1.084401


**Forward fill (`ffill`) replaces each NA value with the last valid value above it in the same column.**

In [39]:
# 6x3 frame of standard-normal samples drawn from NumPy's global RNG.
# No seed is set in the cells shown, so the values change on every run.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))

In [40]:
# Make the tail of the frame missing: column 2 from row 4 on, column 1 from row 2 on.
df.iloc[4:, 2] = NA
df.iloc[2:, 1] = NA
df

Unnamed: 0,0,1,2
0,0.74334,1.678901,0.189848
1,-0.330603,-0.151343,-0.640444
2,0.845828,,0.637942
3,-1.061721,,-1.103957
4,-0.707325,,
5,0.300496,,


In [41]:
# Forward fill: each NA takes the last valid value above it in the same column.
# df.ffill() is the supported spelling of fillna(method='ffill'); the `method`
# argument of fillna is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.74334,1.678901,0.189848
1,-0.330603,-0.151343,-0.640444
2,0.845828,-0.151343,0.637942
3,-1.061721,-0.151343,-1.103957
4,-0.707325,-0.151343,-1.103957
5,0.300496,-0.151343,-1.103957


**`limit=2` caps the fill at two consecutive NA values per gap, so any further NAs in that column remain missing.**

In [42]:
# Forward fill, but fill at most two consecutive NA values per gap (limit=2);
# NAs beyond that stay missing.
# df.ffill(limit=...) is the supported spelling of fillna(method='ffill', limit=...);
# the `method` argument of fillna is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.74334,1.678901,0.189848
1,-0.330603,-0.151343,-0.640444
2,0.845828,-0.151343,0.637942
3,-1.061721,-0.151343,-1.103957
4,-0.707325,,-1.103957
5,0.300496,,-1.103957
