# Handling Missing Data

Missing data occurs commonly in many data analysis applications. One of the goals of pandas is to make working with missing data as painless as possible.




# All of the descriptive statistics on pandas objects exclude missing data by default.
 
For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data. We call this a sentinel value that can be easily detected:


<img src="Pandas_Images/Missing_1.JPG" alt="Missing data figure">

In [1]:
import pandas as pd
from numpy import nan as NA

# NA is just an alias for NaN; positions 1 and 3 are the missing entries.
raw_values = [1, NA, 3.5, NA, 7]
data = pd.Series(raw_values)
data


0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

# Checking whether missing values exist

In [2]:
data.isnull()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [3]:
data.notnull()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [7]:
# dropna filters out the missing entries, keeping only the non-null values:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [9]:
#This is equivalent to:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
# One complete row, two partially missing rows and one entirely missing row.
rows = [
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
]
data = pd.DataFrame(rows)
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
# With no arguments, dropna drops every row that contains at least one
# missing value, so only the fully populated row 0 survives:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
#Passing how='all' will only drop rows that are all NA:
data.dropna(how='all')


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [15]:
# To drop columns in the same way, pass axis=1 (done in the next cell).
# First add a column labelled 4 that is entirely NA, so there is something to drop:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [16]:
# axis=1 with how='all' drops only the columns whose values are all NA
# (here the column 4 that was just added):
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
import numpy as np

# Suppose you want to keep only rows containing a certain number of
# observations. You can indicate this with the thresh argument of dropna
# (demonstrated further below, once some values have been blanked out).
# Start from a 7x3 frame of random values with no missing data.
df = pd.DataFrame(np.random.randn(7, 3))
df

Unnamed: 0,0,1,2
0,-0.207841,-0.27232,-0.527367
1,0.114323,1.165434,0.838124
2,0.48692,0.222543,1.058602
3,-2.436952,-0.778711,-1.020797
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


In [31]:
# Blank out the first four rows of column 1 and the first two rows of
# column 2, so that rows carry different numbers of missing values:
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.207841,,
1,0.114323,,
2,0.48692,,1.058602
3,-2.436952,,-1.020797
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


In [21]:
df.dropna()

Unnamed: 0,0,1,2
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


In [32]:
df

Unnamed: 0,0,1,2
0,-0.207841,,
1,0.114323,,
2,0.48692,,1.058602
3,-2.436952,,-1.020797
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


In [34]:
# thresh=2 keeps only the rows that have at least two non-NA values;
# rows 0 and 1 hold a single observation each and are dropped:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.48692,,1.058602
3,-2.436952,,-1.020797
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


# Filling In Missing Data
 Calling fillna with a constant replaces missing values with that value:


In [35]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.207841,0.0,0.0
1,0.114323,0.0,0.0
2,0.48692,0.0,1.058602
3,-2.436952,0.0,-1.020797
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


In [36]:
df

Unnamed: 0,0,1,2
0,-0.207841,,
1,0.114323,,
2,0.48692,,1.058602
3,-2.436952,,-1.020797
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


In [33]:
# Count the missing values in each column.
df.isnull().sum()

0    0
1    4
2    2
dtype: int64

In [25]:
# Passing a dict to fillna uses a different fill value for each column,
# keyed by column label: 0.5 for column 1 and 0 for column 2.
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.207841,0.5,0.0
1,0.114323,0.5,0.0
2,0.48692,0.5,1.058602
3,-2.436952,0.5,-1.020797
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


In [27]:
#fillna returns a new object, but you can modify the existing object in-place:
# Here inplace=True makes fillna modify df itself rather than hand back a
# filled copy; the call's result is bound to the throwaway name `_` and unused.
_ = df.fillna(0, inplace=True)

In [28]:
df

Unnamed: 0,0,1,2
0,-0.207841,0.0,0.0
1,0.114323,0.0,0.0
2,0.48692,0.0,1.058602
3,-2.436952,0.0,-1.020797
4,-1.33311,2.373465,2.366912
5,-2.252334,1.675538,0.488997
6,-1.124258,-1.072068,-0.570589


In [37]:
# NOTE(review): the cell numbers suggest this ran (In [37]) after In [31]
# re-inserted the NaNs, i.e. not directly after the in-place fill shown above
# (In [27]); that would explain why missing values are still counted here.
# Re-run the notebook top to bottom to confirm.
df.isnull().sum()

0    0
1    4
2    2
dtype: int64

In [40]:
# Collapse the per-column counts into a single flag: True if any column
# has a non-zero number of missing values.
df.isnull().sum().any()

True

In [53]:
df = pd.DataFrame(np.random.randn(6, 3))

In [54]:
df.iloc[2:, 1] = NA

In [55]:
df.iloc[4:, 2] = NA

In [56]:
df

Unnamed: 0,0,1,2
0,-0.57337,2.169192,0.472998
1,1.542298,-1.444009,0.781495
2,-1.096382,,-0.539153
3,-0.058582,,0.092827
4,0.394718,,
5,-1.301954,,


In [57]:
# Forward-fill: carry the last valid value in each column down into the
# missing entries below it. DataFrame.ffill() is used instead of
# fillna(method='ffill') because the `method` argument of fillna is deprecated.
df.ffill()

Unnamed: 0,0,1,2
0,-0.57337,2.169192,0.472998
1,1.542298,-1.444009,0.781495
2,-1.096382,-1.444009,-0.539153
3,-0.058582,-1.444009,0.092827
4,0.394718,-1.444009,0.092827
5,-1.301954,-1.444009,0.092827


In [58]:
# limit=2 caps the forward-fill at two consecutive missing entries per gap,
# so the longer run in column 1 is only partially filled. DataFrame.ffill()
# is used instead of fillna(method='ffill') because the `method` argument of
# fillna is deprecated.
df.ffill(limit=2)


Unnamed: 0,0,1,2
0,-0.57337,2.169192,0.472998
1,1.542298,-1.444009,0.781495
2,-1.096382,-1.444009,-0.539153
3,-0.058582,-1.444009,0.092827
4,0.394718,,0.092827
5,-1.301954,,0.092827
