#### Here and throughout the book, we’ll refer to missing data in general as null, NaN, or NA values. 

#### 2 main strategies
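i - masking - a separate Boolean array (or a reserved bit in the data representation) globally indicates which values are missing<br>
ii - sentinel value - a reserved, data-specific value (for example NaN for floats) marks a missing entry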

In [1]:
import numpy as np
import pandas as pd

In [2]:
vals1 = np.array([1,None,3,4])

In [3]:
vals1

array([1, None, 3, 4], dtype=object)

In [5]:
# Aggregations such as sum() fail on an object array if any of its values is not numeric (here: None)

In [4]:
# Summing an object array falls back to Python-level `+`, which fails on None.
# The error is the point of this cell, so catch and display it instead of
# letting it abort a Restart & Run All.
try:
    vals1.sum()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

## NaN: MISSING NUMERICAL DATA

In [12]:
vals2 = np.array([1,np.nan,3,4])

##### np.nan is a native floating-point value, so an array containing it keeps a numeric dtype (float64) instead of falling back to object

In [13]:
vals2.dtype

dtype('float64')

#### np.nan is contagious: any arithmetic involving NaN yields NaN

In [14]:
1 + np.nan

nan

##### Aggregations over an array containing np.nan run without error, but the result is NaN, which is rarely useful

In [15]:
vals2.sum(), vals2.min(), vals2.max()

  return umr_minimum(a, axis, None, out, keepdims, initial)
  return umr_maximum(a, axis, None, out, keepdims, initial)


(nan, nan, nan)

### NaN-aware aggregations: NumPy provides np.nansum, np.nanmin and np.nanmax, which ignore NaN values

In [16]:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

## NAN AND NONE IN PANDAS 

Pandas is built to handle the two of them nearly interchangeably, converting between them where appropriate

In [17]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [18]:
x = pd.Series(range(2), dtype=int)

In [19]:
x

0    0
1    1
dtype: int64

In [20]:
x[0]=None

In [21]:
x

0    NaN
1    1.0
dtype: float64

### DETECTING NULL VALUES 

In [25]:
data = pd.Series([1,np.nan,'hello',None])

In [26]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [27]:
data[data.notnull()]

0        1
2    hello
dtype: object

### DROPPING NULL VALUES 

In [28]:
data.dropna()

0        1
2    hello
dtype: object

In [32]:
# 3x3 example frame: columns 0 and 1 each contain one missing value, column 2 has none.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)

In [33]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


##### Alternatively, you can drop NA values along a different axis; axis=1 drops all columns containing a null value: 

In [34]:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [38]:
df.dropna(axis=1)

Unnamed: 0,2
0,2
1,5
2,6


#### `axis=1` and `axis='columns'` are equivalent

In [35]:
# 'index' is the documented string label for axis 0 (drop rows containing any NA).
df.dropna(axis='index')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [36]:
df.dropna(axis=0)

Unnamed: 0,0,1,2
1,2.0,3.0,5


Unnamed: 0,2
0,2
1,5
2,6


In [39]:
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [40]:
df[3]=np.nan

In [41]:
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


##### how='all' drops only those columns in which every value is NA

In [47]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


##### thresh sets the minimum number of non-null values a row (or column) must have to be kept

In [48]:
# Keep only rows that have at least 3 non-null values; 'index' is the documented label for axis 0.
df.dropna(axis='index', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


Here the first and last row have been dropped, because they contain only two non-null values.

# FILLING NULL VALUES 