In [1]:
from sklearn.base import OneToOneFeatureMixin, TransformerMixin, BaseEstimator

class OutlierDetector(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Replace per-column IQR outliers with the column median.

    For every column, ``fit`` learns the interval
    ``[Q1 - threshold * IQR, Q3 + threshold * IQR]`` (Q1/Q3 are the 25th and
    75th percentiles, ``IQR = Q3 - Q1``) together with the column median.
    ``transform`` swaps every value that falls strictly outside that interval
    for the learned median of its column.

    Parameters
    ----------
    threshold : float, default=1.5
        Multiplier applied to the IQR when building the bounds.

    Attributes
    ----------
    lower_bound, upper_bound : per-column bounds learned by ``fit``.
    median_values : per-column medians learned by ``fit`` (``None`` before).
    n_features_in_ : int
        Number of columns seen by ``fit`` (set for 2-D input only).
    feature_names_in_ : ndarray of str
        Column labels seen by ``fit``; only set when ``X`` exposes
        all-string ``columns``.
    """

    def __init__(self, threshold=1.5):
        self.threshold = threshold
        # Not a hyper-parameter; kept so existing code that checks
        # `median_values is None` on an unfitted instance keeps working.
        self.median_values = None

    def fit(self, X, y=None):
        """Learn the per-column IQR bounds and medians from ``X``.

        NaN entries are ignored when computing the statistics, so a few
        missing values no longer turn a column's bounds into NaN (which would
        silently disable outlier detection for that column).

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        values = np.asarray(X)

        # Drop metadata from a previous fit so a refit never reports stale info.
        for stale in ("n_features_in_", "feature_names_in_"):
            self.__dict__.pop(stale, None)

        # OneToOneFeatureMixin.get_feature_names_out() derives the output names
        # from these attributes; without them no feature names can be produced
        # for the pandas output wrapper.
        if values.ndim == 2:
            self.n_features_in_ = values.shape[1]
        column_labels = getattr(X, "columns", None)
        if column_labels is not None and all(
            isinstance(label, str) for label in column_labels
        ):
            self.feature_names_in_ = np.asarray(column_labels, dtype=object)

        q1, q3 = np.nanpercentile(values, [25, 75], axis=0)
        iqr = q3 - q1

        self.lower_bound = q1 - self.threshold * iqr
        self.upper_bound = q3 + self.threshold * iqr
        self.median_values = np.nanmedian(values, axis=0)

        return self

    def transform(self, X):
        """Return a NumPy array equal to ``X`` with outliers set to the median.

        A value is an outlier when it is strictly below ``lower_bound`` or
        strictly above ``upper_bound`` for its column. NaN entries compare
        false on both sides and are therefore passed through unchanged.
        """
        values = np.asarray(X)
        is_outlier = (values < self.lower_bound) | (values > self.upper_bound)
        return np.where(is_outlier, self.median_values, values)

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Columns that receive both imputation and outlier replacement; every other
# column is passed through untouched by each ColumnTransformer.
numeric_columns = ["a", "b", "c"]

# Stage 1: fill missing values in the numeric columns with the column median.
# set_output() returns the estimator itself, so it can be chained.
missing = ColumnTransformer(
    transformers=[("missing", SimpleImputer(strategy="median"), numeric_columns)],
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Stage 2: replace IQR outliers in the same columns with the column median.
outlier = ColumnTransformer(
    transformers=[("outliers", OutlierDetector(), numeric_columns)],
    remainder="passthrough",
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# Imputation has to run first so the outlier step sees complete columns.
preprocessor = Pipeline(
    steps=[
        ("missing", missing),
        ("outlier", outlier),
    ]
)

In [3]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Reproducible synthetic binary-classification data: 1000 rows, 10 features
# (5 informative, 2 redundant).
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=2,
    n_classes=2,
    random_state=42,
)

# Wrap the feature matrix in a DataFrame with single-letter column labels a..j.
X = pd.DataFrame(X, columns=list("abcdefghij"))
X

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1.125100,1.178124,0.493516,0.790880,-0.614278,1.347020,1.419515,1.357325,0.966041,-1.981139
1,-0.564641,3.638629,-1.522415,-1.541705,1.616697,4.781310,3.190292,-0.890254,1.438826,-3.828748
2,0.516313,2.165426,-0.628486,-0.386923,0.492518,1.442381,1.332905,-1.958175,-0.348803,-1.804124
3,0.537282,0.966618,-0.115420,0.670755,-0.958516,0.871440,0.508186,-1.034471,-1.654176,-1.910503
4,0.278385,1.065828,-1.724917,-2.235667,0.715107,0.731249,-0.674119,0.598330,-0.524283,1.047610
...,...,...,...,...,...,...,...,...,...,...
995,-0.906303,-0.527162,-1.511787,-1.697166,-0.585131,0.160046,-2.225249,1.480886,-0.934154,1.151678
996,1.225603,-1.389881,-0.406775,-1.606446,2.500944,-1.089977,0.452517,-1.765429,1.297249,4.705105
997,1.650153,-0.692165,-2.049206,-1.610471,0.119157,-0.876080,-1.368269,-1.302577,-1.285505,3.328569
998,-1.186603,-1.414598,-0.121520,-1.440709,1.630283,-2.034632,-1.537456,-1.421465,-0.028340,3.413932


In [4]:
# Corrupt every 4th row (1, 5, 9, ...) so the preprocessing pipeline has
# something to repair. Both ranges are derived from the frame length instead
# of a hard-coded 1000 so they always line up with X.
indexes = np.arange(1, len(X), 4)
# One synthetic outlier per corrupted row: 10, 50, 90, ...
outliers = 10 + 40 * np.arange(len(indexes))

# Columns 0-1 ("a", "b") become missing; column 2 ("c") gets the outliers.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
X.iloc[indexes, 0:2] = np.nan
X.iloc[indexes, 2] = outliers
X

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1.125100,1.178124,0.493516,0.790880,-0.614278,1.347020,1.419515,1.357325,0.966041,-1.981139
1,,,10.000000,-1.541705,1.616697,4.781310,3.190292,-0.890254,1.438826,-3.828748
2,0.516313,2.165426,-0.628486,-0.386923,0.492518,1.442381,1.332905,-1.958175,-0.348803,-1.804124
3,0.537282,0.966618,-0.115420,0.670755,-0.958516,0.871440,0.508186,-1.034471,-1.654176,-1.910503
4,0.278385,1.065828,-1.724917,-2.235667,0.715107,0.731249,-0.674119,0.598330,-0.524283,1.047610
...,...,...,...,...,...,...,...,...,...,...
995,-0.906303,-0.527162,-1.511787,-1.697166,-0.585131,0.160046,-2.225249,1.480886,-0.934154,1.151678
996,1.225603,-1.389881,-0.406775,-1.606446,2.500944,-1.089977,0.452517,-1.765429,1.297249,4.705105
997,,,9970.000000,-1.610471,0.119157,-0.876080,-1.368269,-1.302577,-1.285505,3.328569
998,-1.186603,-1.414598,-0.121520,-1.440709,1.630283,-2.034632,-1.537456,-1.421465,-0.028340,3.413932


In [5]:
# Fit the two-step pipeline (median imputation, then outlier replacement) on X
# and apply it to the same data in one call.
X_transformed = preprocessor.fit_transform(X)
# NOTE(review): in the recorded output the NaNs in "a"/"b" are filled, and
# c=9970.0 (row 997) is replaced by 0.370869, but c=10.0 (row 1) is left as is.
# Presumably the injected values (a quarter of the column) inflate Q3/IQR enough
# that the smallest ones fall inside the upper bound -- confirm if unexpected.
X_transformed

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
0,1.125100,1.178124,0.493516,0.790880,-0.614278,1.347020,1.419515,1.357325,0.966041,-1.981139
1,0.623894,0.048131,10.000000,-1.541705,1.616697,4.781310,3.190292,-0.890254,1.438826,-3.828748
2,0.516313,2.165426,-0.628486,-0.386923,0.492518,1.442381,1.332905,-1.958175,-0.348803,-1.804124
3,0.537282,0.966618,-0.115420,0.670755,-0.958516,0.871440,0.508186,-1.034471,-1.654176,-1.910503
4,0.278385,1.065828,-1.724917,-2.235667,0.715107,0.731249,-0.674119,0.598330,-0.524283,1.047610
...,...,...,...,...,...,...,...,...,...,...
995,-0.906303,-0.527162,-1.511787,-1.697166,-0.585131,0.160046,-2.225249,1.480886,-0.934154,1.151678
996,1.225603,-1.389881,-0.406775,-1.606446,2.500944,-1.089977,0.452517,-1.765429,1.297249,4.705105
997,0.623894,0.048131,0.370869,-1.610471,0.119157,-0.876080,-1.368269,-1.302577,-1.285505,3.328569
998,-1.186603,-1.414598,-0.121520,-1.440709,1.630283,-2.034632,-1.537456,-1.421465,-0.028340,3.413932
