# Data Cleaning and Preparation

### Handling Missing Data

In [3]:
# Example
import pandas as pd
import numpy as np

# Small Series of strings in which the third entry is missing (NaN).
string_data = pd.Series(
    ["aardvark", "artichoke", np.nan, "avocado"],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
# Element-wise mask: True where an entry is missing, False otherwise.
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

- __Both 'NaN' (NA) and 'None' are treated as missing values__

In [6]:
# Overwrite the entry labelled 0 with None; pandas treats None as missing too.
string_data.loc[0] = None

In [8]:
# Re-check the mask: the entry that was set to None is now reported as missing.
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

#### Filtering Out Missing Data
- __The 'dropna' function returns only the non-missing values__

In [13]:
# filter data
from numpy import nan as NA

# Numeric Series with two missing entries (positions 1 and 5).
data = pd.Series([1, NA, 1.3, 5, 7, NA])
# dropna() returns a new Series holding only the non-missing values.
data.dropna()

0    1.0
2    1.3
3    5.0
4    7.0
dtype: float64

##### 'dropna' function with DataFrame
- __By default, 'dropna' drops every row that contains a missing value, returning only the rows in which nothing is missing__

In [14]:
# 4x3 frame: row 0 is complete, rows 1 and 3 are partly missing,
# and row 2 is missing entirely.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)
# With no arguments, any row holding at least one missing value is dropped.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


- __Passing how='all' drops only the rows in which every value is missing__

In [15]:
# Row-wise (axis=0 is the default): drop a row only when all of its values are missing.
data.dropna(axis=0, how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


#### Drop data
- __to drop columns instead of rows, pass 'axis=1'__

In [18]:
# Add a new column labelled 4 in which every value is missing.
# NOTE: this mutates `data` in place, so later cells see the extra column.
data[4] = NA
# Display the frame with the new all-missing column.
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [21]:
# Column-wise: drop a column only when all of its values are missing.
data.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### Filling In Missing Data
- __The 'fillna' function is used to fill in missing data__

In [22]:
data = pd.DataFrame([
    [1., 6.5, 3.], 
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.]
])

data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


- __By calling 'fillna' with a dict, you can use a different fill value for each column__

In [26]:
# Dict keys are column labels; each column gets its own fill value.
data.fillna(
    {
        0: 11,
        1: 0.5,
        2: 10,
    }
)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.5,10.0
2,11.0,0.5,10.0
3,11.0,6.5,3.0


- __'fillna' returns a new object, but you can modify the existing object in-place:__

In [28]:
# With inplace=True, fillna mutates `data` directly and returns None, so the
# call is written as a bare statement: there is no result worth assigning, and
# binding it to `__` would overwrite IPython's output-history variable.
data.fillna(0, inplace=True)
# Display the frame to show that the missing values were replaced in place.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


- __Missing values can also be filled by propagating neighbouring valid values (e.g. forward fill)__

In [34]:
# 6x3 frame of standard-normal draws (unseeded, so the numbers differ per run).
df = pd.DataFrame(np.random.standard_normal((6, 3)))
# Blank out the tail of two columns so they end in gaps of different lengths.
df.iloc[2:, 1] = NA  # column 1: missing from row 2 onward
df.iloc[4:, 2] = NA  # column 2: missing from row 4 onward
df

Unnamed: 0,0,1,2
0,-0.235308,1.090288,-1.265837
1,-0.318357,-0.460245,-0.566642
2,-0.705942,,-0.134484
3,0.354891,,0.210572
4,-1.542925,,
5,0.279995,,


In [None]:
# Forward fill ('ffill'): propagate the last valid value down each column.

In [36]:
# Forward fill: each missing value takes the last valid value above it in the
# same column. DataFrame.ffill() is used because the `method=` keyword of
# fillna is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-0.235308,1.090288,-1.265837
1,-0.318357,-0.460245,-0.566642
2,-0.705942,-0.460245,-0.134484
3,0.354891,-0.460245,0.210572
4,-1.542925,-0.460245,0.210572
5,0.279995,-0.460245,0.210572
