In [3]:
import pandas as pd
from feature_engine.imputation import MeanMedianImputer 
from sklearn.pipeline import Pipeline

# Toy frame with gaps in both numeric columns.
df = pd.DataFrame({
    'Age': [22, 35, None, 58, 42, None],
    'Salary': [25000, 40000, 38000, None, 52000, None]
})

# Fill the gaps in each listed column with that column's mean.
imputer = MeanMedianImputer(imputation_method='mean', variables=['Age', 'Salary'])

# Wrap the imputer in a single-step pipeline.
pipe = Pipeline([
    ('imputer', imputer)
])

# Fit on the frame, then leave the imputed frame as the last expression so
# the notebook displays it.
pipe.fit(df)
pipe.transform(df)

Unnamed: 0,Age,Salary
0,22.0,25000.0
1,35.0,40000.0
2,39.25,38000.0
3,58.0,38750.0
4,42.0,52000.0
5,39.25,38750.0


In [None]:
# Practice frame: one text column plus two numeric columns with gaps.
df_practice = pd.DataFrame(
    {
        'model': ['Honda', 'Ford', 'Nissan', 'Audi'],
        'year': [2015, 2014, None, 2019],
        'mileage': [None, 15000, 25000, None],
    }
)

# Median imputation, restricted to the two numeric columns.
imputer = MeanMedianImputer(variables=['year', 'mileage'], imputation_method='median')

pipe = Pipeline(steps=[('imputer', imputer)])

# fit() returns the pipeline itself, so the two calls can be chained.
pipe.fit(df_practice).transform(df_practice)

Unnamed: 0,model,year,mileage
0,Honda,2015.0,20000.0
1,Ford,2014.0,15000.0
2,Nissan,2015.0,25000.0
3,Audi,2019.0,20000.0


In [11]:
from feature_engine.imputation import CategoricalImputer

# Two categorical columns, each with missing entries.
df = pd.DataFrame(
    {
        'Color': ['Red', 'Blue', None, 'Green', 'Blue', None],
        'Size': ['S', 'M', 'M', 'L', None, None],
    }
)

# 'frequent' fills each gap with the most common category of its column.
cat_imputer = CategoricalImputer(variables=['Color', 'Size'], imputation_method='frequent')
cat_imputer.fit_transform(df)


Unnamed: 0,Color,Size
0,Red,S
1,Blue,M
2,Blue,M
3,Green,L
4,Blue,M
5,Blue,M


In [23]:
from feature_engine.imputation import RandomSampleImputer

# Same car data as the median example: gaps in 'year' and 'mileage'.
df = pd.DataFrame({
    'model': ['Honda', 'Ford', 'Nissan', 'Audi'],
    'year': [2015, 2014, None, 2019],
    'mileage': [None, 15000, 25000, None]
})

# Fill each gap with a value sampled at random from the same column.
# random_state pins the draw so a Restart & Run All reproduces the same frame;
# without it the imputed values change from run to run.
rand_imputer = RandomSampleImputer(variables=['year', 'mileage'], random_state=42)
rand_imputer.fit(df).transform(df)

Unnamed: 0,model,year,mileage
0,Honda,2015.0,15000.0
1,Ford,2014.0,15000.0
2,Nissan,2015.0,25000.0
3,Audi,2019.0,15000.0


In [None]:
from feature_engine.encoding import OneHotEncoder, CountFrequencyEncoder

# Small example: a categorical 'City' column next to a binary flag.
df = pd.DataFrame(
    {
        'City': ['London', 'Paris', 'London', 'Berlin', 'Berlin', 'Paris'],
        'Brought': [1, 0, 0, 1, 0, 1],
    }
)

# One-hot encode City into indicator columns (drop_last=True omits one category).
onehot = OneHotEncoder(drop_last=True, variables=['City'])
# Replace each city with its relative frequency in the data.
count_enc = CountFrequencyEncoder(variables=['City'], encoding_method='frequency')

# Print both encoded frames back to back.
print(onehot.fit_transform(df), count_enc.fit_transform(df), sep='\n')

   Brought  City_London  City_Paris
0        1            1           0
1        0            0           1
2        0            1           0
3        1            0           0
4        0            0           0
5        1            0           1
       City  Brought
0  0.333333        1
1  0.333333        0
2  0.333333        0
3  0.333333        1
4  0.333333        0
5  0.333333        1


In [37]:
from feature_engine.encoding import OrdinalEncoder

df_ordinal = pd.DataFrame({
    'Education': ["High School", "Bachelor", "Master", "PhD", "Bachelor", "Master", "PhD"]
})

# Desired ranking of the education levels, lowest first.
order = [["High School", "Bachelor", "Master", "PhD"]]

# encoding_method='ordered' needs a target to rank the categories by, so build
# one that carries each row's rank in `order` instead of typing the ranks by
# hand and keeping them in sync with the column manually.
rank_by_level = {level: rank for rank, level in enumerate(order[0], start=1)}
y = pd.Series([rank_by_level[level] for level in df_ordinal['Education']])

encoder = OrdinalEncoder(
    encoding_method='ordered', variables=['Education']
)

# The target is passed alongside the frame because 'ordered' encoding is supervised.
df_ordinal_encoded = encoder.fit_transform(df_ordinal, y)
print(df_ordinal_encoded)

   Education
0          0
1          1
2          2
3          3
4          1
5          2
6          3
