In [2]:
import pandas as pd
import numpy as np

In [3]:
# Small Series of strings with one missing entry (np.nan) at position 2.
string_data = pd.Series(
    ["arrdvark", "artichoke", np.nan, "avocado"]
)

In [4]:
string_data

0     arrdvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
# isnull() returns a boolean mask that is True at each missing entry.
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [7]:
# Overwrite the first entry with Python's None; the isnull() output further
# down reports it as missing, just like NaN.
string_data[0] = None

In [8]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [9]:
# Both the None at position 0 and the NaN at position 2 are flagged as missing.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [10]:
# dropna() returns only the non-missing entries and keeps their index labels.
string_data.dropna()

1    artichoke
3      avocado
dtype: object

In [24]:
# Back-fill: each missing entry takes the next valid value that follows it.
# Series.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
string_data.bfill()

0    artichoke
1    artichoke
2      avocado
3      avocado
dtype: object

In [15]:
# notnull() is the negation of isnull(): True where a value is present.
string_data.notnull()

0    False
1     True
2    False
3     True
dtype: bool

# Filtering Out Missing Data

In [5]:
from numpy import nan as NA

In [18]:
# Numeric Series with missing values at positions 1 and 3.
data = pd.Series(
    [1, NA, 3.5, NA, 7]
)

In [21]:
# Drop the missing entries; the surviving values keep labels 0, 2 and 4.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [23]:
# Same result as data.dropna(), written as boolean-mask selection.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

This is Markdown
Only for summary
And Other important info

In [25]:
# 4x3 frame: row 0 is complete, row 2 is entirely NA,
# rows 1 and 3 are partially missing.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [26]:
# With default arguments, dropna() removes every row containing at least one NA.
cleaned = data.dropna()

In [27]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Passing `how='all'` will only drop rows that are all NA:

In [30]:
# Only row 2 (entirely NA) is removed; the partially filled rows 1 and 3 stay.
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


To drop columns in the same way, pass `axis=1`:

In [32]:
# Add a new column labelled 3 in which every value is NA (modifies `data` in place).
data[3] = NA

In [35]:
# Column-wise variant: drop only the columns in which every value is NA.
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [3]:
# 7x3 frame of standard-normal draws. A fixed seed makes the values identical
# on every Restart & Run All, so all outputs derived from `df` stay consistent.
df = pd.DataFrame(np.random.default_rng(42).standard_normal((7, 3)))

In [6]:
# Blank out the first four rows of column 1 (modifies `df` in place).
df.iloc[:4, 1] = NA

In [7]:
# Blank out the first two rows of column 2 (modifies `df` in place).
df.iloc[:2, 2] = NA

Suppose you want to keep only rows containing at least a certain number of observations.
You can indicate this with the `thresh` argument.

In [8]:
df

Unnamed: 0,0,1,2
0,-1.299109,,
1,0.349022,,
2,-0.064081,,-1.084766
3,0.221053,,0.397085
4,-0.041822,1.223723,-0.062882
5,0.492759,-1.532673,-0.014877
6,-0.308028,0.207413,-0.623296


In [51]:
# thresh=2 keeps only the rows that have at least two non-NA values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.064081,,-1.084766
3,0.221053,,0.397085
4,-0.041822,1.223723,-0.062882
5,0.492759,-1.532673,-0.014877
6,-0.308028,0.207413,-0.623296


# Filling In Missing Data

In [10]:

# Replace every missing value with the constant 0; a new frame is returned.
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.299109,0.0,0.0
1,0.349022,0.0,0.0
2,-0.064081,0.0,-1.084766
3,0.221053,0.0,0.397085
4,-0.041822,1.223723,-0.062882
5,0.492759,-1.532673,-0.014877
6,-0.308028,0.207413,-0.623296


Calling `fillna` with a dict, you can use a different fill value for each column:


In [11]:
# Dict keys are column labels: NAs in column 1 become 1.2, NAs in column 2 become 0.
df.fillna({1: 1.2, 2: 0})

Unnamed: 0,0,1,2
0,-1.299109,1.2,0.0
1,0.349022,1.2,0.0
2,-0.064081,1.2,-1.084766
3,0.221053,1.2,0.397085
4,-0.041822,1.223723,-0.062882
5,0.492759,-1.532673,-0.014877
6,-0.308028,0.207413,-0.623296


In [15]:
# Back-fill each column from the next valid value below it; limit=2 caps the
# fill at two consecutive NAs, so a longer gap keeps its topmost entries as NA.
# DataFrame.bfill() is used instead of fillna(method='bfill', ...) because the
# `method` argument of fillna is deprecated in recent pandas releases.
df.bfill(limit=2)

Unnamed: 0,0,1,2
0,-1.299109,,-1.084766
1,0.349022,,-1.084766
2,-0.064081,1.223723,-1.084766
3,0.221053,1.223723,0.397085
4,-0.041822,1.223723,-0.062882
5,0.492759,-1.532673,-0.014877
6,-0.308028,0.207413,-0.623296


In [99]:
# Float Series with two missing entries, used to demonstrate mean imputation.
data = pd.Series(
    [1.0, NA, 3.5, NA, 7]
)

In [101]:
# Impute missing entries with the mean of the observed values
# ((1 + 3.5 + 7) / 3); mean() ignores NA when computing it.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64