In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from feature_engine import outlier_removers as outr

# Winsorizer

In [None]:
class feature_engine.outlier_removers.Winsorizer(distribution='gaussian', tail='right', fold=3, 
                                                 variables=None, missing_values='raise')

`distribution`: the distribution assumed when computing the caps. Can take ‘gaussian’, ‘skewed’ or ‘quantiles’.

In [37]:
# Load dataset
def load_titanic(source='https://www.openml.org/data/get_csv/16826755/phpMYEkMl'):
    """Load the Titanic passenger data and apply light cleaning.

    Parameters
    ----------
    source : str or file-like, optional
        Anything ``pd.read_csv`` accepts. Defaults to the OpenML CSV export
        used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The data with '?' treated as missing, ``cabin`` reduced to its first
        character ('n' when missing), ``pclass`` cast to object, ``fare`` and
        ``age`` cast to float with gaps filled by the column median, and
        missing ``embarked`` filled with 'C'.
    """
    data = pd.read_csv(source)
    # '?' is treated as the missing-value marker.
    data = data.replace('?', np.nan)
    # Keep only the first character of the cabin. Missing cabins are filled
    # with 'n' first (the letter that 'nan'[0] used to yield) so the label does
    # not depend on how astype(str) renders missing values.
    data['cabin'] = data['cabin'].fillna('n').astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    # Assign the filled column back: fillna(inplace=True) on a column selection
    # is chained assignment and is not guaranteed to update `data`.
    data['embarked'] = data['embarked'].fillna('C')
    data['fare'] = data['fare'].astype('float')
    data['fare'] = data['fare'].fillna(data['fare'].median())
    data['age'] = data['age'].astype('float')
    data['age'] = data['age'].fillna(data['age'].median())
    return data

# Fetch the cleaned Titanic data
data = load_titanic()

# Hold out 30% of the rows as a test set; 'survived' is the target and
# 'name' / 'ticket' are left out of the predictors
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived', 'name', 'ticket']),
    data['survived'],
    test_size=0.3,
    random_state=0,
)

# Configure a Winsorizer for the right tail of 'age' and 'fare'
capper = outr.Winsorizer(
    variables=['age', 'fare'],
    distribution='gaussian',
    tail='right',
    fold=3,
)

# Learn the caps from the training set only
capper.fit(X_train)

# Apply the learned caps to both splits
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)

In [38]:
# Right-tail caps held by the fitted capper, keyed by variable
capper.right_tail_caps_

{'age': 67.49048447470311, 'fare': 174.78162171790427}

In [39]:
# Maxima of the transformed training set
train_t[['fare', 'age']].max()

fare    174.781622
age      67.490484
dtype: float64

In [40]:
X_train[['fare', 'age']].max()   # maxima before capping; in the transformed frame above they are limited to the caps

fare    512.3292
age      74.0000
dtype: float64

In [41]:
# With distribution='skewed', fold is the value that multiplies the IQR.

# Configure a Winsorizer for the right tail of 'age' and 'fare'
capper = outr.Winsorizer(
    variables=['age', 'fare'],
    distribution='skewed',
    tail='right',
    fold=3,
)

# Learn the caps from the training set only
capper.fit(X_train)

# Apply the learned caps to both splits
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)

In [42]:
# Right-tail caps held by the fitted capper, keyed by variable
capper.right_tail_caps_

{'age': 71.0, 'fare': 101.4126}

In [43]:
# Maxima of the transformed training set
train_t[['fare', 'age']].max()

fare    101.4126
age      71.0000
dtype: float64

In [44]:
# With distribution='quantiles', fold is the percentile on each tail that
# should be censored.

# Configure a Winsorizer for the right tail of 'age' and 'fare'
capper = outr.Winsorizer(
    variables=['age', 'fare'],
    distribution='quantiles',
    tail='right',
    fold=0.1,
)

# Learn the caps from the training set only
capper.fit(X_train)

# Apply the learned caps to both splits
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)

In [45]:
# Right-tail caps held by the fitted capper, keyed by variable
capper.right_tail_caps_

{'age': 48.0, 'fare': 79.025}

In [46]:
# Maxima of the transformed training set
train_t[['fare', 'age']].max()

fare    79.025
age     48.000
dtype: float64

In [47]:
# Shape of the training set before transformation
X_train.shape

(916, 11)

In [48]:
train_t.shape   # same shape as X_train: values are capped, no rows are dropped

(916, 11)

In [50]:
# Original training rows whose age is above 48
X_train[X_train['age']>48]

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked,boat,body,home.dest
294,1,male,49.0,1,1,110.8833,C,C,,,"Haverford, PA"
367,2,male,52.0,0,0,13.5000,n,S,,130,"Bronx, NY"
279,1,male,62.0,0,0,26.5500,C,S,,,"Wimbledon Park, London / Hayling Island, Hants"
188,1,female,51.0,0,1,39.4000,D,S,9,,"Paris, France"
316,1,male,51.0,0,1,61.3792,n,C,,,"Geneva, Switzerland / Radnor, PA"
...,...,...,...,...,...,...,...,...,...,...,...
335,2,male,51.0,0,0,12.5250,n,S,,174,"Jacksonville, FL"
115,1,male,64.0,1,4,263.0000,C,S,,,"Winnipeg, MB"
174,1,male,58.0,0,0,29.7000,B,C,,258,"Buffalo, NY"
551,2,female,50.0,0,0,10.5000,n,S,13,,"London, England / Marietta, Ohio and Milwaukee..."


In [51]:
train_t.loc[[294,367]] # ages 49.0 and 52.0 in X_train now equal the cap, 48.0

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked,boat,body,home.dest
294,1,male,48.0,1,1,79.025,C,C,,,"Haverford, PA"
367,2,male,48.0,0,0,13.5,n,S,,130.0,"Bronx, NY"


In [52]:
# Minima of the untransformed training set
X_train[['fare', 'age']].min()

fare    0.0000
age     0.1667
dtype: float64

In [53]:
train_t[['fare', 'age']].min()  

# tail='right' was chosen for the transformer, so the left-side (minimum) values are unchanged

fare    0.0000
age     0.1667
dtype: float64

# ArbitraryOutlierCapper

In [None]:
class feature_engine.outlier_removers.ArbitraryOutlierCapper(max_capping_dict=None, 
                                                             min_capping_dict=None, 
                                                             missing_values='raise')

In [22]:
# Load dataset
def load_titanic(source='https://www.openml.org/data/get_csv/16826755/phpMYEkMl'):
    """Load the Titanic passenger data and apply light cleaning.

    Parameters
    ----------
    source : str or file-like, optional
        Anything ``pd.read_csv`` accepts. Defaults to the OpenML CSV export
        used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The data with '?' treated as missing, ``cabin`` reduced to its first
        character ('n' when missing), ``pclass`` cast to object, ``fare`` and
        ``age`` cast to float with gaps filled by the column median, and
        missing ``embarked`` filled with 'C'.
    """
    data = pd.read_csv(source)
    # '?' is treated as the missing-value marker.
    data = data.replace('?', np.nan)
    # Keep only the first character of the cabin. Missing cabins are filled
    # with 'n' first (the letter that 'nan'[0] used to yield) so the label does
    # not depend on how astype(str) renders missing values.
    data['cabin'] = data['cabin'].fillna('n').astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    # Assign the filled column back: fillna(inplace=True) on a column selection
    # is chained assignment and is not guaranteed to update `data`.
    data['embarked'] = data['embarked'].fillna('C')
    data['fare'] = data['fare'].astype('float')
    data['fare'] = data['fare'].fillna(data['fare'].median())
    data['age'] = data['age'].astype('float')
    data['age'] = data['age'].fillna(data['age'].median())
    return data

# Fetch the cleaned Titanic data
data = load_titanic()

# Hold out 30% of the rows as a test set; 'survived' is the target and
# 'name' / 'ticket' are left out of the predictors
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived', 'name', 'ticket']),
    data['survived'],
    test_size=0.3,
    random_state=0,
)

# Configure the capper with hand-picked maxima for 'age' and 'fare';
# no minimum caps are given
capper = outr.ArbitraryOutlierCapper(
    min_capping_dict=None,
    max_capping_dict={'age': 50, 'fare': 200},
)

# Fit on the training set
capper.fit(X_train)

# Apply the caps to both splits
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)

In [23]:
# Right-tail caps held by the fitted capper; they match the max_capping_dict passed in
capper.right_tail_caps_

{'age': 50, 'fare': 200}

In [24]:
# Maxima of the transformed training set
train_t[['fare', 'age']].max()

fare    200.0
age      50.0
dtype: float64

In [25]:
# Maxima of the untransformed training set, for comparison
X_train[['fare', 'age']].max()

fare    512.3292
age      74.0000
dtype: float64

In [26]:
# Shape of the training set before transformation
X_train.shape

(916, 11)

In [27]:
train_t.shape  # same shape as X_train: rows are not dropped

(916, 11)

In [29]:
# Original training rows whose age is above 50
X_train[X_train['age'] > 50]

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked,boat,body,home.dest
367,2,male,52.0,0,0,13.5000,n,S,,130,"Bronx, NY"
279,1,male,62.0,0,0,26.5500,C,S,,,"Wimbledon Park, London / Hayling Island, Hants"
188,1,female,51.0,0,1,39.4000,D,S,9,,"Paris, France"
316,1,male,51.0,0,1,61.3792,n,C,,,"Geneva, Switzerland / Radnor, PA"
264,1,male,56.0,0,0,35.5000,A,C,3,,"Basel, Switzerland"
...,...,...,...,...,...,...,...,...,...,...,...
123,1,male,60.0,1,1,79.2000,B,C,5,,"Zurich, Switzerland"
201,1,male,54.0,0,0,51.8625,E,S,,175,"Dorchester, MA"
335,2,male,51.0,0,0,12.5250,n,S,,174,"Jacksonville, FL"
115,1,male,64.0,1,4,263.0000,C,S,,,"Winnipeg, MB"


In [36]:
train_t.loc[[367,279]]  # ages 52.0 and 62.0 in X_train now equal the configured maximum, 50

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,cabin,embarked,boat,body,home.dest
367,2,male,50.0,0,0,13.5,n,S,,130.0,"Bronx, NY"
279,1,male,50.0,0,0,26.55,C,S,,,"Wimbledon Park, London / Hayling Island, Hants"


# OutlierTrimmer

In [None]:
class feature_engine.outlier_removers.OutlierTrimmer(distribution='gaussian', 
                                                     tail='right', 
                                                     fold=3, 
                                                     variables=None, 
                                                     missing_values='raise')

`distribution` can be ‘gaussian’, ‘skewed’ (i.e. IQR-based) or ‘quantiles’.

In [54]:
# Load dataset
def load_titanic(source='https://www.openml.org/data/get_csv/16826755/phpMYEkMl'):
    """Load the Titanic passenger data and apply light cleaning.

    Parameters
    ----------
    source : str or file-like, optional
        Anything ``pd.read_csv`` accepts. Defaults to the OpenML CSV export
        used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The data with '?' treated as missing, ``cabin`` reduced to its first
        character ('n' when missing), ``pclass`` cast to object, ``fare`` and
        ``age`` cast to float with gaps filled by the column median, and
        missing ``embarked`` filled with 'C'.
    """
    data = pd.read_csv(source)
    # '?' is treated as the missing-value marker.
    data = data.replace('?', np.nan)
    # Keep only the first character of the cabin. Missing cabins are filled
    # with 'n' first (the letter that 'nan'[0] used to yield) so the label does
    # not depend on how astype(str) renders missing values.
    data['cabin'] = data['cabin'].fillna('n').astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    # Assign the filled column back: fillna(inplace=True) on a column selection
    # is chained assignment and is not guaranteed to update `data`.
    data['embarked'] = data['embarked'].fillna('C')
    data['fare'] = data['fare'].astype('float')
    data['fare'] = data['fare'].fillna(data['fare'].median())
    data['age'] = data['age'].astype('float')
    data['age'] = data['age'].fillna(data['age'].median())
    return data

# Fetch the cleaned Titanic data
data = load_titanic()

# Hold out 30% of the rows as a test set; 'survived' is the target and
# 'name' / 'ticket' are left out of the predictors
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived', 'name', 'ticket']),
    data['survived'],
    test_size=0.3,
    random_state=0,
)

# Configure the trimmer for the right tail of 'age' and 'fare'
# (the variable keeps the name `capper` used by the cells that follow)
capper = outr.OutlierTrimmer(
    variables=['age', 'fare'],
    distribution='skewed',
    tail='right',
    fold=1.5,
)

# Learn the limits from the training set only
capper.fit(X_train)

# Apply the trimmer to both splits
train_t = capper.transform(X_train)
test_t = capper.transform(X_test)

In [55]:
# Right-tail limits held by the fitted trimmer, keyed by variable
capper.right_tail_caps_

{'age': 53.0, 'fare': 66.34379999999999}

In [56]:
# Maxima after trimming: the fare maximum (65.0) is below the 66.34 limit rather than equal to it
train_t[['fare', 'age']].max()

fare    65.0
age     53.0
dtype: float64

In [57]:
# Maxima of the untransformed training set, for comparison
X_train[['fare', 'age']].max()

fare    512.3292
age      74.0000
dtype: float64

In [58]:
# Shape of the training set after trimming
train_t.shape

(757, 11)

In [59]:
X_train.shape # original shape; the trimmed frame above has fewer rows because outlier rows were removed

(916, 11)

In [61]:
train_t[['fare', 'age']].min() # tail='right' was chosen, so the minima are the same as in X_train

fare    0.0000
age     0.1667
dtype: float64

In [60]:
# Minima of the untransformed training set, for comparison
X_train[['fare', 'age']].min() 

fare    0.0000
age     0.1667
dtype: float64