In [2]:
import numpy as np
import pandas as pd

# Missing Data

- Find Data -> Column wise
- Drop Data -> Row Wise (the default; `dropna(axis=1)` drops columns instead)
- Fill Data -> Column Wise

### Finding Missing Data

In [14]:
# Row labels: the zero-padded strings '01' through '05'.
i = [f'{n:02d}' for n in range(1, 6)]

# Four columns of five values each; every column contains at least one NaN
# (A and B have one, C and D have two).
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[np.nan, 2, 3, 4, 5],
    C=[1, 2, 3, np.nan, np.nan],
    D=[1, 2, np.nan, np.nan, 5],
)

# Build the example frame used by every cell below, then display it.
df = pd.DataFrame(data=data, index=i)
df

Unnamed: 0,A,B,C,D
1,1.0,,1.0,1.0
2,2.0,2.0,2.0,2.0
3,,3.0,3.0,
4,4.0,4.0,,
5,5.0,5.0,,5.0


In [15]:
# Boolean mask with the same rows and columns as df:
# True where the value is null (NaN), False where a value is present.
df.isna()


Unnamed: 0,A,B,C,D
1,False,True,False,False
2,False,False,False,False
3,True,False,False,True
4,False,False,True,True
5,False,False,True,False


In [16]:
# Number of nulls in each column (summing the boolean mask counts the True entries).
df.isna().sum()


A    1
B    1
C    2
D    2
dtype: int64

In [17]:
# Per column: True if the column contains at least one null, False if it has none.
# Every column of df has a missing value, so all four report True here.
df.isna().any()


A    True
B    True
C    True
D    True
dtype: bool

### Removing Missing Data

In [19]:
# Show df again as the starting point for the drop examples;
# the isna() calls above return new objects and leave df unchanged.
df

Unnamed: 0,A,B,C,D
1,1.0,,1.0,1.0
2,2.0,2.0,2.0,2.0
3,,3.0,3.0,
4,4.0,4.0,,
5,5.0,5.0,,5.0


In [None]:
# By default .dropna() works row-wise (axis=0); pass axis=1 to drop columns instead.

df.dropna()  # Removes all rows with at least one null

Unnamed: 0,A,B,C,D
2,2.0,2.0,2.0,2.0


In [None]:
# thresh is the minimum number of non-null values a row needs in order to be kept.
# Rows '03' and '04' have only 2 non-null values each, so they are dropped.
df.dropna(thresh = 3)

Unnamed: 0,A,B,C,D
1,1.0,,1.0,1.0
2,2.0,2.0,2.0,2.0
5,5.0,5.0,,5.0


In [21]:
# With thresh=1 a row is kept as long as it has at least one non-null value.
# Every row of df qualifies, so nothing is dropped.
df.dropna(thresh = 1)

Unnamed: 0,A,B,C,D
1,1.0,,1.0,1.0
2,2.0,2.0,2.0,2.0
3,,3.0,3.0,
4,4.0,4.0,,
5,5.0,5.0,,5.0


### Filling the Missing Data

In [22]:
# Replaces every NaN with 0 in the returned frame; df itself is not modified.
df.fillna(0)

Unnamed: 0,A,B,C,D
1,1.0,0.0,1.0,1.0
2,2.0,2.0,2.0,2.0
3,0.0,3.0,3.0,0.0
4,4.0,4.0,0.0,0.0
5,5.0,5.0,0.0,5.0


In [23]:
# Per-column fill: pass a dict that maps each column label to the value used
# to replace the NaNs in that column. Existing non-null values are untouched.
#
# The mapping is named `fill_values` so that it does not clobber `i`, the list
# of index labels used to build df.
fill_values = {'A': 5, 'B': 10, 'C': 3, 'D': 0}
df.fillna(value=fill_values)

Unnamed: 0,A,B,C,D
1,1.0,10.0,1.0,1.0
2,2.0,2.0,2.0,2.0
3,5.0,3.0,3.0,0.0
4,4.0,4.0,3.0,0.0
5,5.0,5.0,3.0,5.0


In [26]:
# df still contains its NaNs: the fillna() calls above returned new frames
# instead of modifying df.
df

Unnamed: 0,A,B,C,D
1,1.0,,1.0,1.0
2,2.0,2.0,2.0,2.0
3,,3.0,3.0,
4,4.0,4.0,,
5,5.0,5.0,,5.0


In [None]:

# df.mean() yields one mean per column (computed over the non-null values);
# fillna() then fills each column's nulls with that column's own mean.
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
1,1.0,3.5,1.0,1.0
2,2.0,2.0,2.0,2.0
3,3.0,3.0,3.0,2.666667
4,4.0,4.0,2.0,2.666667
5,5.0,5.0,2.0,5.0
