# Data Preparation Basics
## Segment 2 - Treating missing values

In [91]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

### Figuring out what data is missing

In [92]:
# Sentinel used below to mark cells as missing; NaN is the float value pandas
# treats as "no data" in numeric columns.
missing = np.nan

### Filling in for missing values

In [93]:
# Seed NumPy's global RNG so the same "random" numbers appear on every run.
np.random.seed(25)

# 6x6 frame of uniform draws from [0, 1); rows and columns get the default
# integer labels 0..5.
DF_obj = DataFrame(np.random.rand(6, 6))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [94]:
# Blank out the lower-right 3x3 corner in place. `.loc` slices by label and
# includes BOTH endpoints, so 3:5 selects rows 3, 4, 5 and columns 3, 4, 5.
DF_obj.loc[3:5, 3:5] = missing
# Tip: DF_obj.isnull() would show the same frame as a True/False mask of the
# missing cells.
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,,,
4,0.514244,0.559053,0.03445,,,
5,0.281701,0.900274,0.669612,,,


In [95]:
# Replace every missing value with 0. fillna returns a new frame; DF_obj
# itself still contains the NaNs.
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.0,0.0,0.0
4,0.514244,0.559053,0.03445,0.0,0.0,0.0
5,0.281701,0.900274,0.669612,0.0,0.0,0.0


In [96]:
# A dict gives a separate fill value per column: keys are column labels,
# values are what to put in that column's missing cells.
# Column 0 has no NaNs, so the 7 is never used; column 5 gets 100; columns 3
# and 4 are not in the dict and keep their NaNs.
filled_DF = DF_obj.fillna(value={0: 7, 5: 100})
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,,,100.0
4,0.514244,0.559053,0.03445,,,100.0
5,0.281701,0.900274,0.669612,,,100.0


In [97]:
# Forward-fill: each missing cell takes the last valid value ABOVE it in the
# same column (the fill runs down the rows by default), so rows 3-5 of
# columns 3-5 repeat the values from row 2.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated since pandas 2.1.
filled_DF = DF_obj.ffill()
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.520719,0.326051,0.699186
4,0.514244,0.559053,0.03445,0.520719,0.326051,0.699186
5,0.281701,0.900274,0.669612,0.520719,0.326051,0.699186


### Counting missing values

In [98]:
# Count the missing values in each column: isna() marks NaN cells as True,
# and summing down the rows (axis=0) adds up the Trues per column.
DF_obj.isna().sum(axis=0)

0    0
1    0
2    0
3    3
4    3
5    3
dtype: int64

### Filtering out missing values

In [99]:
# Drop every ROW that contains at least one missing value (the dropna
# defaults, spelled out here). Only rows 0-2 are complete, so only they remain.
DF_no_NaN = DF_obj.dropna(axis="index", how="any")
DF_no_NaN

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186


In [100]:
# Drop every COLUMN that contains at least one missing value (axis=1 means
# columns). Columns 3-5 hold NaNs, so only columns 0-2 remain, with all 6 rows.
DF_no_NaN = DF_obj.dropna(axis="columns")
DF_no_NaN

Unnamed: 0,0,1,2
0,0.870124,0.582277,0.278839
1,0.684969,0.437611,0.556229
2,0.447031,0.585445,0.161985
3,0.366395,0.836375,0.481343
4,0.514244,0.559053,0.03445
5,0.281701,0.900274,0.669612
