- Note: This notebook is based on Chapter 07 of < Python for Data Analysis: Data Wrangling with Pandas, NumPy, and IPython (3rd ed.) > (2023, Wes McKinney)

# 0. Overview

In [1]:
# Import modules
import pandas as pd
import numpy as np

# 1. Basics

In [6]:
# Sentinel value for the 'float64' dtype: a missing entry is stored as NaN
float_data = pd.Series(
    data=[2.4, -31, np.nan, 0, 32.2],
)

float_data

0     2.4
1   -31.0
2     NaN
3     0.0
4    32.2
dtype: float64

In [7]:
# Flag missing entries: isna() gives True where the value is NaN and False elsewhere
# (the built-in Python 'None' value is treated as NA as well)
float_data.isna()

0    False
1    False
2     True
3    False
4    False
dtype: bool

- NA Handling Object Methods
  - 'dropna': Drop axis labels (rows or columns) that contain missing values
  - 'fillna': Fill in NA with other values
  - 'isna': Return 'True' for NA (and 'False' for non-missing values)
  - 'notna': Return 'False' for NA (the negation of 'isna')

# 2. Drop Missing Data

- 'dropna'
   - Series: Use 'dropna' to return a Series without NA
   - DataFrame: Drop rows or columns that 1) are all NA (how="all"); or 2) contain any NA (the default)
   

In [9]:
# Series: dropna() keeps only the non-missing entries, with their original index labels

data = pd.Series(
    [3, np.nan, 2.1, 3.2, np.nan, 5, 1],
)

data.dropna()

0    3.0
2    2.1
3    3.2
5    5.0
6    1.0
dtype: float64

In [12]:
# DataFrame: a 4x4 frame with scattered NAs; the last row is entirely NA

data = pd.DataFrame(
    [
        [1.0, 3.1, 5.3, 1.4],
        [3.5, np.nan, 3.0, 3.1],
        [np.nan, 6.4, 4.0, 1.0],
        [np.nan, np.nan, np.nan, np.nan],
    ]
)
data

Unnamed: 0,0,1,2,3
0,1.0,3.1,5.3,1.4
1,3.5,,3.0,3.1
2,,6.4,4.0,1.0
3,,,,


In [14]:
# With the default arguments, dropna() drops every row that contains at least one NA,
# so only row 0 (the one fully populated row) is kept
data.dropna()

Unnamed: 0,0,1,2,3
0,1.0,3.1,5.3,1.4


In [15]:
# how="all" drops only the rows in which every value is NA (row 3 here);
# no column is dropped, because dropna works on rows unless another axis is given
data.dropna(how="all")

Unnamed: 0,0,1,2,3
0,1.0,3.1,5.3,1.4
1,3.5,,3.0,3.1
2,,6.4,4.0,1.0


# 3. Fill In Missing Data

- 'fillna'
   - Fill in missing values with specific values

In [21]:
# Build a 10x5 DataFrame of standard-normal draws.
# No seed is set in this cell, so the exact values differ between runs.
df = pd.DataFrame(np.random.standard_normal(size=(10, 5)))

# Overwrite a few blocks of cells (selected by position) with NA
df.iloc[4:8, 1] = np.nan  # rows 4-7 of column 1
df.iloc[:3, 2] = np.nan   # rows 0-2 of column 2
df.iloc[5:7, 4] = np.nan  # rows 5-6 of column 4

df

Unnamed: 0,0,1,2,3,4
0,-0.28987,0.165609,,0.199031,-0.00082
1,0.209653,0.494726,,-0.035999,-0.89358
2,0.78187,-0.826988,,-0.25475,-0.547827
3,-1.943551,0.333328,-0.213465,0.881631,-1.952164
4,1.192532,,0.212951,0.774165,-0.756945
5,-1.171044,,-0.885033,-0.793543,
6,-0.278768,,-0.762153,0.877884,
7,-0.699923,,1.42311,1.120154,-2.147784
8,0.580638,-0.351248,0.083375,-0.246006,0.067944
9,-1.335462,-1.765141,-0.592251,0.257692,0.022198


In [22]:
# Fill in every missing value with the constant '0';
# the result is only displayed here and is not assigned back to df
df.fillna(0)

Unnamed: 0,0,1,2,3,4
0,-0.28987,0.165609,0.0,0.199031,-0.00082
1,0.209653,0.494726,0.0,-0.035999,-0.89358
2,0.78187,-0.826988,0.0,-0.25475,-0.547827
3,-1.943551,0.333328,-0.213465,0.881631,-1.952164
4,1.192532,0.0,0.212951,0.774165,-0.756945
5,-1.171044,0.0,-0.885033,-0.793543,0.0
6,-0.278768,0.0,-0.762153,0.877884,0.0
7,-0.699923,0.0,1.42311,1.120154,-2.147784
8,0.580638,-0.351248,0.083375,-0.246006,0.067944
9,-1.335462,-1.765141,-0.592251,0.257692,0.022198


In [23]:
# Pass a dict of {column label: fill value} to use a different value for each column;
# columns without an entry (column 4 here) keep their NAs
df.fillna(
    {
        1: 0.123456,
        2: 0.10040,
    }
)

Unnamed: 0,0,1,2,3,4
0,-0.28987,0.165609,0.1004,0.199031,-0.00082
1,0.209653,0.494726,0.1004,-0.035999,-0.89358
2,0.78187,-0.826988,0.1004,-0.25475,-0.547827
3,-1.943551,0.333328,-0.213465,0.881631,-1.952164
4,1.192532,0.123456,0.212951,0.774165,-0.756945
5,-1.171044,0.123456,-0.885033,-0.793543,
6,-0.278768,0.123456,-0.762153,0.877884,
7,-0.699923,0.123456,1.42311,1.120154,-2.147784
8,0.580638,-0.351248,0.083375,-0.246006,0.067944
9,-1.335462,-1.765141,-0.592251,0.257692,0.022198


In [24]:
# Fill in missing values with the 'mean' of the Series
# (mean() skips NA, so it is the average of the five observed values)
data = pd.Series(
    [3, np.nan, 2.1, 3.2, np.nan, 5, 1],
)

data.fillna(value=data.mean())

0    3.00
1    2.86
2    2.10
3    3.20
4    2.86
5    5.00
6    1.00
dtype: float64