In [1]:
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate synthetic data: 1,000 samples with 2 features, seeded for
# reproducibility. The labels returned by make_blobs are discarded.
feature, _ = make_blobs(n_samples=1000, n_features=2, random_state=1)

# Standardize the features: fit the scaler first, then apply it to the same data.
scaler = StandardScaler().fit(feature)
X_std = scaler.transform(feature)
X_std

array([[ 0.87301861,  1.31426523],
       [-0.67073178, -0.22369263],
       [ 2.1048424 ,  1.45332359],
       ...,
       [ 1.18998798,  1.33439442],
       [ 1.22406396,  1.27667052],
       [-0.21664919, -1.19113343]])

In [2]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.impute import SimpleImputer

# Inject missing values at positions (0, 0) and (1, 1).
# NOTE: this overwrites X_std in place, so any earlier display of X_std is stale.
nan_rows, nan_cols = [0, 1], [0, 1]
X_std[nan_rows, nan_cols] = np.nan

# Inject outliers at positions (1, 0), (2, 1) and (-1, 1).
out_rows, out_cols = [1, 2, -1], [0, 1, 1]
X_std[out_rows, out_cols] = [50, -30, 30]
X_std

array([[         nan,   1.31426523],
       [ 50.        ,          nan],
       [  2.1048424 , -30.        ],
       ...,
       [  1.18998798,   1.33439442],
       [  1.22406396,   1.27667052],
       [ -0.21664919,  30.        ]])

In [3]:
# Build an imputer that fills missing values with the per-feature mean,
# fit it on the data, then apply it.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_std)
X_imp = imputer.transform(X_std)

# Show the first two rows (the ones that had missing values) after imputation.
print("遺漏值填補：{}".format(X_imp[:2]))

遺漏值填補：[[ 4.98475607e-02  1.31426523e+00]
 [ 5.00000000e+01 -3.85360658e-05]]


In [4]:
# Build an elliptic-envelope outlier detector.
# contamination=.01 sets the expected proportion of outliers to 1%.
# random_state is fixed because the underlying robust covariance fit is
# randomized; without a seed the predictions may differ between runs.
outlier = EllipticEnvelope(contamination=.01, random_state=1)
outlier.fit(X_imp)

# Predictions for the first five samples (per scikit-learn: 1 = inlier, -1 = outlier).
outlier.predict(X_imp)[:5]

array([ 1, -1, -1,  1,  1])

In [5]:
# Detect outliers on each individual feature using the interquartile range (IQR).

def outlier_idx(x, whisker=1.5):
    """Return the indices of the IQR outliers in ``x``.

    A value is an outlier when it lies below ``q1 - whisker * IQR`` or above
    ``q3 + whisker * IQR``, where ``q1``/``q3`` are the 25th/75th percentiles.

    Parameters
    ----------
    x : array-like
        Values to inspect. NaN entries are ignored when computing the
        percentiles and are never reported as outliers.
    whisker : float, default 1.5
        Multiplier applied to the IQR to build the lower and upper bounds.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where`` on the outlier mask (one index array per
        dimension of ``x``).
    """
    values = np.asarray(x)

    # Percentiles are undefined for empty input; report "no outliers" instead.
    if values.size == 0:
        return np.where(np.zeros(values.shape, dtype=bool))

    # nanpercentile keeps the bounds finite when some values are missing;
    # plain percentile would yield NaN bounds and silently flag nothing.
    q1, q3 = np.nanpercentile(values, [25, 75])
    spread = q3 - q1
    lower_bound = q1 - whisker * spread
    upper_bound = q3 + whisker * spread

    return np.where((values < lower_bound) | (values > upper_bound))

# Walk over the columns (features) of the imputed matrix and report, for each
# one, the indices that outlier_idx flags as outliers.
for i, column in enumerate(X_imp.T):
    flagged = outlier_idx(column)
    print("特徵{} -> 離群值的索引值：{}".format(i, flagged))

特徵0 -> 離群值的索引值：(array([1], dtype=int64),)
特徵1 -> 離群值的索引值：(array([  2, 999], dtype=int64),)
