# Chapter 2 - Data Preparation Basics
## Segment 2 - Treating missing values

In [1]:
import numpy as np
import pandas as pd 

from pandas import Series, DataFrame

### Figuring out what data is missing

In [3]:
# Sentinel reused by later cells to mark an absent value.
missing = np.nan

# Eight entries; positions 2 and 6 are deliberately left missing.
series_obj = Series(
    [
        "row 1",
        "row 2",
        missing,
        "row 4",
        "row 5",
        "row 6",
        missing,
        "row 8",
    ]
)

series_obj

0    row 1
1    row 2
2      NaN
3    row 4
4    row 5
5    row 6
6      NaN
7    row 8
dtype: object

In [4]:
# Boolean mask: True wherever the Series holds a missing value.
# (`isna` is the same check as `isnull` under its other name.)
series_obj.isna()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

### Filling in for missing values

In [5]:
# Seed NumPy's legacy global generator so the random frame below is
# the same on every run.
np.random.seed(25)

# 6 x 6 frame of uniform random draws, requested directly in its final shape.
DF_obj = DataFrame(np.random.rand(6, 6))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [10]:
# Punch holes into the frame in place: rows 3-5 of column 0 and
# rows 1-4 of column 5. `.loc` slices by label, so both end points
# of each slice are included.
for row_labels, col_label in ((slice(3, 5), 0), (slice(1, 4), 5)):
    DF_obj.loc[row_labels, col_label] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [11]:
# Replace every missing entry with 0; DF_obj itself is left unchanged
# because fillna returns a new frame.
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
3,0.0,0.836375,0.481343,0.516502,0.383048,0.0
4,0.0,0.559053,0.03445,0.71993,0.421004,0.0
5,0.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [12]:
# Per-column replacements: each key is a column label, each value is what
# that column's missing entries are filled with.
fill_by_column = {0: 0.1, 5: 1.25}
filled_DF = DF_obj.fillna(value=fill_by_column)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,1.25
2,0.447031,0.585445,0.161985,0.520719,0.326051,1.25
3,0.1,0.836375,0.481343,0.516502,0.383048,1.25
4,0.1,0.559053,0.03445,0.71993,0.421004,1.25
5,0.1,0.900274,0.669612,0.456069,0.289804,0.525819


In [14]:
# Forward fill: each missing entry takes the last valid value above it in
# the same column. The dedicated `ffill()` method is used because the
# `method=` keyword of `fillna` is deprecated in recent pandas releases;
# the resulting frame is the same.
filled_DF = DF_obj.ffill()
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.117376
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


### Counting missing values

In [17]:
# Re-seed so this larger frame is reproducible as well.
np.random.seed(25)

# 10 x 10 frame of uniform random draws, requested directly in its final shape.
DF_obj = DataFrame(np.random.rand(10, 10))

# Blank out rows 3-5 of column 0 and rows 1-4 of column 5
# (label-based slices, so both end points are included).
for row_labels, col_label in ((slice(3, 5), 0), (slice(1, 4), 5)):
    DF_obj.loc[row_labels, col_label] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969,0.437611,0.556229,0.36708
1,0.402366,0.113041,0.447031,0.585445,0.161985,,0.326051,0.699186,0.366395,0.836375
2,0.481343,0.516502,0.383048,0.997541,0.514244,,0.03445,0.71993,0.421004,0.436935
3,,0.900274,0.669612,0.456069,0.289804,,0.559242,0.745284,0.828346,0.823694
4,,0.644862,0.309258,0.524254,0.958092,,0.295432,0.512376,0.088702,0.641717
5,,0.766486,0.076742,0.331044,0.679852,0.509213,0.655146,0.60212,0.719055,0.415219
6,0.396542,0.825139,0.712552,0.097937,0.842154,0.440821,0.373989,0.913676,0.547778,0.251937
7,0.027474,0.206257,0.590885,0.163652,0.836928,0.775203,0.169041,0.766994,0.335366,0.472398
8,0.215064,0.912094,0.759208,0.676561,0.021376,0.660874,0.09444,0.831163,0.112749,0.56683
9,0.174626,0.790607,0.033683,0.795971,0.689437,0.491846,0.088554,0.93755,0.084362,0.469394


In [18]:
# Count missing entries per column: build the boolean NaN mask, then sum
# down the rows (True counts as 1).
DF_obj.isna().sum(axis=0)

0    3
1    0
2    0
3    0
4    0
5    4
6    0
7    0
8    0
9    0
dtype: int64

### Filtering out missing values

In [21]:
# Drop every column that contains at least one missing value.
DF_no_NaN = DF_obj.dropna(axis="columns")
DF_no_NaN

Unnamed: 0,1,2,3,4,6,7,8,9
0,0.582277,0.278839,0.185911,0.4111,0.684969,0.437611,0.556229,0.36708
1,0.113041,0.447031,0.585445,0.161985,0.326051,0.699186,0.366395,0.836375
2,0.516502,0.383048,0.997541,0.514244,0.03445,0.71993,0.421004,0.436935
3,0.900274,0.669612,0.456069,0.289804,0.559242,0.745284,0.828346,0.823694
4,0.644862,0.309258,0.524254,0.958092,0.295432,0.512376,0.088702,0.641717
5,0.766486,0.076742,0.331044,0.679852,0.655146,0.60212,0.719055,0.415219
6,0.825139,0.712552,0.097937,0.842154,0.373989,0.913676,0.547778,0.251937
7,0.206257,0.590885,0.163652,0.836928,0.169041,0.766994,0.335366,0.472398
8,0.912094,0.759208,0.676561,0.021376,0.09444,0.831163,0.112749,0.56683
9,0.790607,0.033683,0.795971,0.689437,0.088554,0.93755,0.084362,0.469394


In [22]:
# Drop every row that contains at least one missing value
# (row-wise is dropna's default direction; spelled out here for clarity).
DF_no_NaN = DF_obj.dropna(axis="index")
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376,0.684969,0.437611,0.556229,0.36708
6,0.396542,0.825139,0.712552,0.097937,0.842154,0.440821,0.373989,0.913676,0.547778,0.251937
7,0.027474,0.206257,0.590885,0.163652,0.836928,0.775203,0.169041,0.766994,0.335366,0.472398
8,0.215064,0.912094,0.759208,0.676561,0.021376,0.660874,0.09444,0.831163,0.112749,0.56683
9,0.174626,0.790607,0.033683,0.795971,0.689437,0.491846,0.088554,0.93755,0.084362,0.469394
