## Capping at arbitrary points

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from feature_engine.outliers import ArbitraryOutlierCapper

In [2]:
# function to load the titanic dataset

def load_titanic(path='../../titanic.csv'):
    """Load the Titanic dataset and apply light preprocessing.

    Parameters
    ----------
    path : str, default='../../titanic.csv'
        Location of the CSV file. The default keeps the original hard-coded
        path, so existing ``load_titanic()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The loaded data with ``cabin`` reduced to the first character of its
        string representation, ``pclass`` cast to object dtype, and missing
        ``embarked`` values replaced by ``'C'``.
    """
    data = pd.read_csv(path)
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the in-place form acts on an intermediate object
    # and leaves the frame untouched under pandas Copy-on-Write.
    data['embarked'] = data['embarked'].fillna('C')
    return data

In [3]:
# read the preprocessed Titanic frame and preview its first five rows
data = load_titanic()
data.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C,S,,,"Montreal, PQ / Chesterville, ON"


## ArbitraryOutlierCapper

The ArbitraryOutlierCapper caps the minimum and/or maximum values of a variable at arbitrary limits specified by the user.

In [4]:
# find the maximum age and the maximum fare in the Titanic data

data['age'].max(), data['fare'].max()

(80.0, 512.3292)

In [5]:
# upper-end capping only: limit age to 50 and fare to 200
capper = ArbitraryOutlierCapper(
    min_capping_dict=None,
    max_capping_dict=dict(age=50, fare=200),
)

# the frame is passed with its missing values replaced by 0
capper.fit(data.fillna(value=0))

In [6]:
# inspect the right-tail (maximum) caps held by the fitted capper
capper.right_tail_caps_

{'age': 50, 'fare': 200}

In [7]:
# inspect the left-tail (minimum) caps; min_capping_dict was None, so the dict is empty
capper.left_tail_caps_

{}

In [8]:
# apply the caps to the same zero-filled frame that was used for fitting
data_tr = capper.transform(data.fillna(value=0))

# maxima of the two capped variables after the transformation
data_tr['age'].max(), data_tr['fare'].max()

(50.0, 200.0)

### Minimum capping

In [9]:
# lower-end capping only: raise age to at least 10 and fare to at least 100
capper = ArbitraryOutlierCapper(
    min_capping_dict=dict(age=10, fare=100),
    max_capping_dict=None,
)

# the frame is passed with its missing values replaced by 0
capper.fit(data.fillna(value=0))

In [10]:
# list the variables registered on the fitted capper (the keys of the capping dictionary)
capper.variables_

['age', 'fare']

In [11]:
# inspect the right-tail (maximum) caps; max_capping_dict was None, so the dict is empty
capper.right_tail_caps_

{}

In [12]:
# inspect the left-tail (minimum) caps held by the fitted capper
capper.left_tail_caps_

{'age': 10, 'fare': 100}

In [13]:
# apply the caps to the same zero-filled frame that was used for fitting
data_tr = capper.transform(data.fillna(value=0))

# minima of the two capped variables after the transformation
data_tr['age'].min(), data_tr['fare'].min()

(10.0, 100.0)

### Both ends capping

In [14]:
# cap both ends: age is kept within [10, 50] and fare within [100, 200]
capper = ArbitraryOutlierCapper(
    min_capping_dict=dict(age=10, fare=100),
    max_capping_dict=dict(age=50, fare=200),
)

# the frame is passed with its missing values replaced by 0
capper.fit(data.fillna(value=0))

In [15]:
# inspect the right-tail (maximum) caps held by the fitted capper
capper.right_tail_caps_

{'age': 50, 'fare': 200}

In [16]:
# inspect the left-tail (minimum) caps held by the fitted capper
capper.left_tail_caps_

{'age': 10, 'fare': 100}

In [17]:
# apply the caps to the same zero-filled frame that was used for fitting
data_tr = capper.transform(data.fillna(value=0))

# minima of the two capped variables after the transformation
data_tr['age'].min(), data_tr['fare'].min()

(10.0, 100.0)

In [18]:
# maxima of the two capped variables after the transformation
data_tr['age'].max(), data_tr['fare'].max()

(50.0, 200.0)