In [1]:
# Remove outliers
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from feature_engine.outliers import OutlierTrimmer

  from pandas.core import (


In [2]:
# Load the California housing data from scikit-learn; as_frame=True returns
# pandas objects (features in X, target in y).
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

# Hold out 30% of the rows as a test set. The fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Display the shapes of both partitions.
X_train.shape, X_test.shape

((14448, 8), (6192, 8))

In [3]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
1989,1.975,52.0,2.8,0.7,193.0,4.825,36.73,-119.79
256,2.2604,43.0,3.67148,1.184116,836.0,3.018051,37.77,-122.21
7887,6.299,17.0,6.478022,1.087912,1387.0,3.81044,33.87,-118.04
4581,1.7199,17.0,2.518,1.196,3051.0,3.051,34.06,-118.28
1993,2.2206,50.0,4.622754,1.161677,606.0,3.628743,36.73,-119.81


In [4]:
def find_limits(df, variable, fold):
    """Return the IQR-based (lower, upper) boundaries for one column.

    The boundaries follow the inter-quartile range proximity rule:
    ``Q1 - fold * IQR`` and ``Q3 + fold * IQR``, where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : object indexable by ``variable`` (e.g. a pandas DataFrame)
        Data holding the column to inspect.
    variable : str
        Name of the column whose limits are computed.
    fold : float
        Multiplier applied to the IQR.

    Returns
    -------
    tuple
        ``(lower_limit, upper_limit)``.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)

    # Distance from each quartile to its boundary.
    margin = (third_quartile - first_quartile) * fold

    return first_quartile - margin, third_quartile + margin

In [5]:
# Compute the MedInc boundaries on the training set, using 3 times the IQR.

lower_limit, upper_limit = find_limits(df=X_train, variable="MedInc", fold=3)

# Display both limits.
lower_limit, upper_limit

(-3.925900000000002, 11.232600000000001)

In [6]:
# Remove outliers on the left tail: keep only the rows whose MedInc is
# greater than or equal to the lower limit.

inliers = X_train["MedInc"].ge(lower_limit)
X_train = X_train.loc[inliers]
# Keep the target aligned with the surviving feature rows (selected by label).
y_train = y_train.loc[X_train.index]

# Apply the same limit (found on the training set) to the test set.
inliers = X_test["MedInc"].ge(lower_limit)
X_test = X_test.loc[inliers]
y_test = y_test.loc[X_test.index]

In [7]:
# Remove outliers on the right tail: keep only the rows whose MedInc is
# less than or equal to the upper limit.

inliers = X_train["MedInc"].le(upper_limit)
X_train = X_train.loc[inliers]
# Keep the target aligned with the surviving feature rows (selected by label).
y_train = y_train.loc[X_train.index]

# Apply the same limit (found on the training set) to the test set.
inliers = X_test["MedInc"].le(upper_limit)
X_test = X_test.loc[inliers]
y_test = y_test.loc[X_test.index]

In [8]:
# Remove outliers with Feature-engine.
# Split the full data again so this section starts from untrimmed
# training and testing sets.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Display the shapes of both partitions.
X_train.shape, X_test.shape

((14448, 8), (6192, 8))

In [9]:
# Configure a trimmer that applies the IQR method (fold 1.5) on both tails
# of the three selected variables.
trimmer = OutlierTrimmer(
    capping_method="iqr",
    tail="both",
    fold=1.5,
    variables=["MedInc", "HouseAge", "Population"],
)

# Fit on the training set only.
trimmer.fit(X_train)

In [10]:
# Lower limit per variable, as stored on the fitted trimmer.
trimmer.left_tail_caps_

{'MedInc': -0.6776500000000012, 'HouseAge': -10.5, 'Population': -626.0}

In [11]:
# Upper limit per variable, as stored on the fitted trimmer.
trimmer.right_tail_caps_

{'MedInc': 7.984350000000001, 'HouseAge': 65.5, 'Population': 3134.0}

In [12]:
# Shapes before trimming.
print(X_train.shape, X_test.shape)

# Trim both sets with the fitted trimmer.
X_train = trimmer.transform(X_train)
X_test = trimmer.transform(X_test)

# The trimmer only sees the features, so re-select the targets by index label
# to keep them aligned with the remaining rows.
y_train = y_train.loc[X_train.index]
y_test = y_test.loc[X_test.index]

# Shapes after trimming.
print(X_train.shape, X_test.shape)

(14448, 8) (6192, 8)
(13165, 8) (5619, 8)
