In [38]:
import pandas as pd
import numpy as np

# pandas.DataFrame.dropna

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html

In [39]:
# Toy dataset used to demonstrate dropna():
#   * row 0 is NaN in every column,
#   * feature5 is NaN in every row,
#   * feature1..feature3 have a few scattered NaN values.
data = {
    "feature1": [np.nan, 2, np.nan, 4, 5, np.nan, 7, 8, 9, 10],
    "feature2": [np.nan, 2, 3, np.nan, 5, 6, 7, 8, 9, 10],
    "feature3": [np.nan, 2, 3, 4, 5, 6, 7, np.nan, 9, np.nan],
    "feature4": [np.nan, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "feature5": [np.nan] * 10,
    "label": [np.nan, "B", "A", "B", "A", "B", "A", "B", "A", "B"],
}

df = pd.DataFrame(data)
df

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,label
0,,,,,,
1,2.0,2.0,2.0,2.0,,B
2,,3.0,3.0,3.0,,A
3,4.0,,4.0,4.0,,B
4,5.0,5.0,5.0,5.0,,A
5,,6.0,6.0,6.0,,B
6,7.0,7.0,7.0,7.0,,A
7,8.0,8.0,,8.0,,B
8,9.0,9.0,9.0,9.0,,A
9,10.0,10.0,,10.0,,B


## 全部根據預設
> DataFrame.dropna(*, axis=0, how=_NoDefault.no_default, thresh=_NoDefault.no_default, subset=None, inplace=False, ignore_index=False)

In [40]:
# Call dropna() with every argument left at its default (axis=0, how='any'):
# each row that contains at least one NaN is dropped. feature5 is NaN in
# every row, so no row survives and the result is an empty DataFrame.
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,label


## axis
> {0 or ‘index’, 1 or ‘columns’}, default 0

Determine if rows or columns which contain missing values are removed.

- 0, or ‘index’ : Drop rows which contain missing values.

- 1, or ‘columns’ : Drop columns which contain missing values.

Only a single axis is allowed.

In [41]:
# Row-wise drop, written with the string alias of axis=0: remove every row
# that contains a NaN. feature5 is NaN in every row, so the result is empty.
df_cleaned_row = df.dropna(axis="index")
df_cleaned_row

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,label


In [42]:
# Column-wise drop, written with the string alias of axis=1: remove every
# column that contains a NaN. Row 0 is NaN in every column, so all columns
# are removed and only the index remains.
df_cleaned_column = df.dropna(axis="columns")
df_cleaned_column

0
1
2
3
4
5
6
7
8
9


## subset
> column label or sequence of labels, optional

Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include.

In [43]:
# Only look at feature1 when deciding which rows to drop: rows whose
# feature1 is NaN are removed, NaN values in other columns are ignored.
df1 = df.dropna(axis="index", subset=["feature1"])
df1

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,label
1,2.0,2.0,2.0,2.0,,B
3,4.0,,4.0,4.0,,B
4,5.0,5.0,5.0,5.0,,A
6,7.0,7.0,7.0,7.0,,A
7,8.0,8.0,,8.0,,B
8,9.0,9.0,9.0,9.0,,A
9,10.0,10.0,,10.0,,B


## how
> {‘any’, ‘all’}, default ‘any’

Determine if row or column is removed from DataFrame, when we have at least one NA or all NA.

- ‘any’ : If any NA values are present, drop that row or column.

- ‘all’ : If all values are NA, drop that row or column.



In [44]:
# how="all" on rows: a row is dropped only when every one of its values is
# NaN. In this frame that is row 0 alone.
df_cleaned_all_row = df.dropna(axis="index", how="all")
df_cleaned_all_row

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,label
1,2.0,2.0,2.0,2.0,,B
2,,3.0,3.0,3.0,,A
3,4.0,,4.0,4.0,,B
4,5.0,5.0,5.0,5.0,,A
5,,6.0,6.0,6.0,,B
6,7.0,7.0,7.0,7.0,,A
7,8.0,8.0,,8.0,,B
8,9.0,9.0,9.0,9.0,,A
9,10.0,10.0,,10.0,,B


In [45]:
# how="all" on columns: a column is dropped only when every one of its
# values is NaN. In this frame that is feature5 alone.
df_cleaned_all_column = df.dropna(axis="columns", how="all")
df_cleaned_all_column

Unnamed: 0,feature1,feature2,feature3,feature4,label
0,,,,,
1,2.0,2.0,2.0,2.0,B
2,,3.0,3.0,3.0,A
3,4.0,,4.0,4.0,B
4,5.0,5.0,5.0,5.0,A
5,,6.0,6.0,6.0,B
6,7.0,7.0,7.0,7.0,A
7,8.0,8.0,,8.0,B
8,9.0,9.0,9.0,9.0,A
9,10.0,10.0,,10.0,B


In [46]:
# how="any" on rows (the default behaviour, spelled out): a row is dropped
# as soon as it contains a single NaN. feature5 is NaN in every row, so the
# result is empty.
df_cleaned_any = df.dropna(axis="index", how="any")
df_cleaned_any

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,label


## thresh
> int, optional

Require that many non-NA values. Cannot be combined with how.

(至少有 n 個非 NaN 值的行（row）才會被保留下來)

In [59]:
# thresh=5 keeps only the rows that have at least 5 non-NaN values.
# feature5 is NaN in every row, so a row holds at most 5 non-NaN values;
# raising thresh to 6 would leave no rows at all.
df_threshold = df.dropna(axis="index", thresh=5)
df_threshold

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,label
1,2.0,2.0,2.0,2.0,,B
4,5.0,5.0,5.0,5.0,,A
6,7.0,7.0,7.0,7.0,,A
8,9.0,9.0,9.0,9.0,,A


## 確認 DataFrame 是否存在 NaN 的方法

In [60]:
# Element-wise NaN mask: same shape as df, True where the value is missing.
print(df.isna())

   feature1  feature2  feature3  feature4  feature5  label
0      True      True      True      True      True   True
1     False     False     False     False      True  False
2      True     False     False     False      True  False
3     False      True     False     False      True  False
4     False     False     False     False      True  False
5      True     False     False     False      True  False
6     False     False     False     False      True  False
7     False     False      True     False      True  False
8     False     False     False     False      True  False
9     False     False      True     False      True  False


In [61]:
# Per-column check: True when the column contains at least one NaN.
# axis="index" reduces over the rows, which is the default direction.
print(df.isna().any(axis="index"))

feature1    True
feature2    True
feature3    True
feature4    True
feature5    True
label       True
dtype: bool


In [64]:
# Second toy dataset: row 0 is fully populated, feature4 and label have no
# missing values, and feature5 is NaN everywhere except row 0.
data1 = {
    "feature1": [1, 2, np.nan, 4, 5, np.nan, 7, 8, 9, 10],
    "feature2": [1, 2, 3, np.nan, 5, 6, 7, 8, 9, 10],
    "feature3": [1, 2, 3, 4, 5, 6, 7, np.nan, 9, np.nan],
    "feature4": list(range(1, 11)),
    "feature5": [1] + [np.nan] * 9,
    "label": ["A", "B"] * 5,
}

# NOTE: this rebinds `df`, replacing the frame built earlier in the notebook;
# the cells below operate on this new frame.
df = pd.DataFrame(data1)
df

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,label
0,1.0,1.0,1.0,1,1.0,A
1,2.0,2.0,2.0,2,,B
2,,3.0,3.0,3,,A
3,4.0,,4.0,4,,B
4,5.0,5.0,5.0,5,,A
5,,6.0,6.0,6,,B
6,7.0,7.0,7.0,7,,A
7,8.0,8.0,,8,,B
8,9.0,9.0,9.0,9,,A
9,10.0,10.0,,10,,B


In [69]:
# Per-column check: True when the column contains at least one NaN.
# axis="index" reduces over the rows, which is the default direction.
print(df.isna().any(axis="index"))

feature1     True
feature2     True
feature3     True
feature4    False
feature5     True
label       False
dtype: bool


In [70]:
# Per-column count of NaN values (True counts as 1 when summed).
# axis="index" reduces over the rows, which is the default direction.
print(df.isna().sum(axis="index"))

feature1    2
feature2    1
feature3    2
feature4    0
feature5    9
label       0
dtype: int64


In [66]:
# Per-row check: True when the row contains at least one NaN.
# axis="columns" is the string alias of axis=1.
print(df.isna().any(axis="columns"))

0    False
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
dtype: bool


In [71]:
# Per-row count of NaN values.
# axis="columns" is the string alias of axis=1.
print(df.isna().sum(axis="columns"))

0    0
1    1
2    2
3    2
4    1
5    2
6    1
7    2
8    1
9    2
dtype: int64


In [72]:
# Single yes/no answer for the whole frame: convert the boolean mask to a
# NumPy array and check whether any element is True.
print(df.isna().to_numpy().any())

True


In [73]:
# Print a summary of the frame: index range, non-null count and dtype of
# each column, and memory usage. The non-null counts reveal missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   feature1  8 non-null      float64
 1   feature2  9 non-null      float64
 2   feature3  8 non-null      float64
 3   feature4  10 non-null     int64  
 4   feature5  1 non-null      float64
 5   label     10 non-null     object 
dtypes: float64(4), int64(1), object(1)
memory usage: 612.0+ bytes
