## Class 2: Treating Missing Values

By default, pandas represents missing values as NaN ("Not a Number").

If your dataset uses placeholder values such as 0, 99, or 999 to stand for missing data, be sure to either drop them or approximate them, just as you would with any other missing value.

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

In [5]:
# Sentinel used throughout the notebook to mark a missing entry.
missing = np.nan

# Eight-element Series with gaps at positions 2 and 6.
series_obj = Series(['row 1', 'row 2', missing, 'row 4', 'row 5', 'row 6', missing, 'row 8'])

# A bare trailing expression is required for the notebook to render the Series.
series_obj


0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [15]:
# Boolean mask: True wherever the Series holds a missing value.
series_obj.isna()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

In [17]:
# Substitute a placeholder label for every missing entry; series_obj itself is left unchanged.
filled_series = series_obj.fillna(value='Unfilled Row')
filled_series

0           row 1
1           row 2
2    Unfilled Row
3           row 4
4           row 5
5           row 6
6    Unfilled Row
7           row 8
dtype: object

### Filling the missing values

In [7]:
# Fix the seed so the random frame is the same on every run.
np.random.seed(25)

# 6x6 frame of uniform random floats in [0, 1).
DF_obj = DataFrame(np.random.rand(6, 6))
DF_obj

In [10]:
# Blank out rows 3-5 of column 0 and rows 1-4 of column 5
# (label-based selection, so both end labels are included).
DF_obj.loc[[3, 4, 5], 0] = missing
DF_obj.loc[[1, 2, 3, 4], 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.559242,0.745284,0.828346,0.823694,0.07714,0.644862
1,0.309258,0.524254,0.958092,0.883201,0.295432,
2,0.088702,0.641717,0.132421,0.766486,0.076742,
3,,0.509213,0.655146,0.60212,0.719055,
4,,0.825139,0.712552,0.097937,0.842154,
5,,0.913676,0.547778,0.251937,0.027474,0.206257


In [11]:
# Replace every missing cell with the same scalar; DF_obj itself is left unchanged.
filled_df = DF_obj.fillna(value=0)
filled_df

Unnamed: 0,0,1,2,3,4,5
0,0.559242,0.745284,0.828346,0.823694,0.07714,0.644862
1,0.309258,0.524254,0.958092,0.883201,0.295432,0.0
2,0.088702,0.641717,0.132421,0.766486,0.076742,0.0
3,0.0,0.509213,0.655146,0.60212,0.719055,0.0
4,0.0,0.825139,0.712552,0.097937,0.842154,0.0
5,0.0,0.913676,0.547778,0.251937,0.027474,0.206257


In [12]:
# Per-column replacement values: column label -> fill value.
fill_values = {0: 0.1, 5: 1.25}
filled_df = DF_obj.fillna(value=fill_values)
filled_df

Unnamed: 0,0,1,2,3,4,5
0,0.559242,0.745284,0.828346,0.823694,0.07714,0.644862
1,0.309258,0.524254,0.958092,0.883201,0.295432,1.25
2,0.088702,0.641717,0.132421,0.766486,0.076742,1.25
3,0.1,0.509213,0.655146,0.60212,0.719055,1.25
4,0.1,0.825139,0.712552,0.097937,0.842154,1.25
5,0.1,0.913676,0.547778,0.251937,0.027474,0.206257


In [14]:
# Forward-fill: each missing cell takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
fill_DF = DF_obj.ffill()
fill_DF

Unnamed: 0,0,1,2,3,4,5
0,0.559242,0.745284,0.828346,0.823694,0.07714,0.644862
1,0.309258,0.524254,0.958092,0.883201,0.295432,0.644862
2,0.088702,0.641717,0.132421,0.766486,0.076742,0.644862
3,0.088702,0.509213,0.655146,0.60212,0.719055,0.644862
4,0.088702,0.825139,0.712552,0.097937,0.842154,0.644862
5,0.088702,0.913676,0.547778,0.251937,0.027474,0.206257


## Counting Missing Values

In [18]:
# Rebuild the demo frame from the same seed so this section starts from a known state.
np.random.seed(25)
DF_obj = DataFrame(np.random.rand(6, 6))

# Re-introduce the gaps: rows 3-5 of column 0 and rows 1-4 of column 5.
DF_obj.loc[[3, 4, 5], 0] = missing
DF_obj.loc[[1, 2, 3, 4], 5] = missing
DF_obj


Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [19]:
# Count missing cells per column: build the boolean mask, then sum it column-wise.
DF_obj.isna().sum()

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

## Filtering Out Missing Values

In [22]:
# Drop every row that contains at least one missing value.
DF_NoNAN = DF_obj.dropna(axis=0, how='any')
DF_NoNAN

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [23]:
# Drop every column that contains at least one missing value.
DF_NoNAN = DF_obj.dropna(axis='columns', how='any')
DF_NoNAN

Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804
