In [1]:
# Handling Missing Data 

# two strategies:
# mask
# sentinel (NaN - pandas / None - python)

# Python does not support arithmetic operations on None
# a NumPy array that contains None is converted to dtype=object

0.825503355704698

In [2]:
import numpy as np
import pandas as pd 

In [3]:
# None is a plain Python object, so NumPy falls back to an object array
vals1 = np.array([1, None, 2, 3])
vals1

array([1, None, 2, 3], dtype=object)

In [4]:
# Summing an object array that holds None fails when Python tries to add
# an int and None. Catch only that TypeError so that unrelated problems
# (typos, KeyboardInterrupt, ...) are not silently swallowed.
try:
    vals1.sum()
except TypeError:
    print("non supported with None")

non supported with None


In [5]:
# NaN: the floating-point sentinel for missing numerical data
# (the array keeps a native float dtype instead of falling back to object)
vals2 = np.array([1, np.nan, 3, 4])
vals2

array([ 1., nan,  3.,  4.])

In [6]:
# plain aggregations propagate NaN, so every result here is nan
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [7]:
# the nan-prefixed aggregations ignore NaN entries instead of propagating them
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

In [8]:
# NaN and None in Pandas
# None is converted to NaN, and the integers are upcast to float64

pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [None]:
# NOTE(review): this cell has no execution count; the following cell
# creates the same `x` again, so this one looks like a leftover
x = pd.Series(range(2), dtype=int)

In [9]:
# Assigning None to an integer Series converts the whole Series to dtype=float64:
x = pd.Series(range(2), dtype=int)
print(x)  # before the assignment: integer dtype
# NOTE(review): recent pandas releases deprecate this implicit upcast on
# assignment — confirm the behavior against the installed version
x[0] = None 
print(x)  # after: None is stored as NaN and the dtype is float64

0    0
1    1
dtype: int64
0    NaN
1    1.0
dtype: float64


In [10]:
# Pandas Nullable Dtypes
# with the nullable "Int32" dtype, np.nan, None and pd.NA are all shown as <NA>
# and the remaining values stay integers instead of being upcast to float
pd.Series([1, np.nan, 2, None, pd.NA], dtype="Int32")

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

In [11]:
# operating on Null Values 
# isnull
# notnull
# dropna
# fillna 

In [12]:
# Detecting Null Values 
# sample Series mixing a number, a string and two kinds of missing value
# (np.nan and None)
data = pd.Series([1, np.nan, "hello", None])

In [13]:
# boolean mask: True where the entry is missing (np.nan and None both count)
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [14]:
# boolean-mask indexing: keep only the entries that are not null
data[data.notnull()]

0        1
2    hello
dtype: object

In [15]:
# Dropping Null Values 
# for a Series this gives the same result as data[data.notnull()]
data.dropna()

0        1
2    hello
dtype: object

In [16]:
# 3x3 frame with one missing value in column 0 and one in column 1;
# column 2 is complete
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [17]:
# drop every column that contains at least one null; only column 2 survives
df.dropna(axis="columns")

Unnamed: 0,2
0,2
1,5
2,6


In [18]:
# drop nulls only if whole row/column is null
# first add a column 3 that is entirely NaN (this modifies df in place)
df[3] = np.nan
df 

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [19]:
# how="all" drops only the columns in which every value is null (column 3 here)
df.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [20]:
# thresh=3 keeps only rows with at least 3 non-null values (row 1 here)
df.dropna(axis="rows",thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


In [22]:
# Filling Null Values
# labelled nullable-integer Series with two missing entries ("b" and "d")
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=list("abcde"),
    dtype="Int32",
)
data

a       1
b    <NA>
c       2
d    <NA>
e       3
dtype: Int32

In [23]:
# replace every missing entry with the constant 0
data.fillna(0)

a    1
b    0
c    2
d    0
e    3
dtype: Int32

In [24]:
# forward fill: each missing entry takes the last valid value before it.
# Series.ffill() replaces fillna(method="ffill"), whose `method` keyword is
# deprecated in recent pandas releases.
data.ffill()

a    1
b    1
c    2
d    2
e    3
dtype: Int32

In [25]:
# back fill: each missing entry takes the next valid value after it.
# Series.bfill() replaces fillna(method="bfill"), whose `method` keyword is
# deprecated in recent pandas releases.
data.bfill()

a    1
b    2
c    2
d    3
e    3
dtype: Int32

In [26]:
# current state of df, including the all-NaN column 3 added earlier
df 

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [27]:
# forward fill along each row (axis=1): values are carried left-to-right.
# when a previous value is not available (the NaN in the first column) it
# is not filled.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` keyword
# is deprecated in recent pandas releases.
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0
