# One-hot Encoding 

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder


In [2]:
# Load the weekly sales per country; the "week" column is parsed
# as dates and used as the index of the frame.
df = pd.read_csv(
    "../Datasets/online_retail_dataset_countries.csv",
    index_col="week",
    parse_dates=["week"],
)

df.head()

Unnamed: 0_level_0,country,quantity,revenue
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-12-06,Belgium,143,439.1
2009-12-13,Belgium,10,8.5
2009-12-20,Belgium,0,0.0
2009-12-27,Belgium,0,0.0
2010-01-03,Belgium,0,0.0


In [4]:
# Set up the one-hot encoder

ohe = OneHotEncoder(
    drop="first",  # to encode into k-1 dummies (the first category is the baseline)
    # to return a dense np array. `sparse_output` replaces `sparse`,
    # which was deprecated in scikit-learn 1.2 and removed in 1.4:
    sparse_output=False,
    # if an unknown category appears in transform,
    # it will be encoded as all zeroes. Note that with drop="first"
    # the dropped (baseline) category is also encoded as all zeroes,
    # so an unknown category cannot be told apart from it:
    handle_unknown="ignore",
)

In [5]:
# Wrap the encoder in a ColumnTransformer so that only
# the variable "country" is one-hot encoded.

ct = ColumnTransformer(
    transformers=[
        ("ohe", ohe, ["country"]),  # (name, transformer, columns to encode)
    ],
    # columns not listed above are returned unchanged in the resulting array
    remainder="passthrough",
)

In [6]:
# Ideally, the data is split into train and test sets first,
# and the transformer is fitted on the train set only.

# The split is skipped here to keep the demo short, so the
# categories of "country" are learned from the whole frame.

ct.fit(df)



In [7]:
# Encode country by applying the fitted ColumnTransformer to the frame

tmp = ct.transform(df)

# There are 6 countries, which yield 5 dummies after the encoding
# (drop="first"), plus "quantity" and "revenue" = 7 variables in total

tmp.shape

(636, 7)

In [8]:
# Inspect the encoded array (numpy abbreviates the printed rows and columns)
tmp

array([[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        1.4300000e+02, 4.3910000e+02],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        1.0000000e+01, 8.5000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       ...,
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        1.3399800e+05, 2.1074176e+05],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        1.2304100e+05, 2.2021399e+05],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 1.0000000e+00,
        2.0428100e+05, 3.7294626e+05]])

In [9]:
# Rebuild a DataFrame from the encoded array, labelling the columns
# with the names generated by the ColumnTransformer.
# Note: the array carries no index, so the frame gets a default
# integer index instead of the "week" index of df.
feature_names = ct.get_feature_names_out()
df_t = pd.DataFrame(data=tmp, columns=feature_names)

df_t.head()

Unnamed: 0,ohe__country_EIRE,ohe__country_France,ohe__country_Germany,ohe__country_Spain,ohe__country_United Kingdom,remainder__quantity,remainder__revenue
0,0.0,0.0,0.0,0.0,0.0,143.0,439.1
1,0.0,0.0,0.0,0.0,0.0,10.0,8.5
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Ordinal Encoding 

In [11]:
# Reload the raw dataset so this section starts from the original frame;
# "week" is parsed as dates and used as the index.
df = pd.read_csv(
    "../Datasets/online_retail_dataset_countries.csv",
    index_col="week",
    parse_dates=["week"],
)

df.head()

Unnamed: 0_level_0,country,quantity,revenue
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-12-06,Belgium,143,439.1
2009-12-13,Belgium,10,8.5
2009-12-20,Belgium,0,0.0
2009-12-27,Belgium,0,0.0
2010-01-03,Belgium,0,0.0


In [12]:
# Ordinal-encode only the variable "country"; every other column
# is passed through unchanged in the resulting array.
o_enc = OrdinalEncoder()
ct = ColumnTransformer(
    transformers=[("o_enc", o_enc, ["country"])],
    remainder="passthrough",
)

# Learn the category-to-code mapping and encode country
# (fit returns the fitted transformer, so the calls can be chained).
tmp = ct.fit(df).transform(df)

# The result is a numpy array in which the original variable
# was replaced by numeric codes, stored as floats
# (0.0 and 5.0 are visible in the output below).

tmp


array([[0.0000000e+00, 1.4300000e+02, 4.3910000e+02],
       [0.0000000e+00, 1.0000000e+01, 8.5000000e+00],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
       ...,
       [5.0000000e+00, 1.3399800e+05, 2.1074176e+05],
       [5.0000000e+00, 1.2304100e+05, 2.2021399e+05],
       [5.0000000e+00, 2.0428100e+05, 3.7294626e+05]])

In [13]:
# Back to a DataFrame, with the column names generated by the
# ColumnTransformer. The array has no index, so the frame gets a
# default integer index rather than the "week" index of df.
df_t = pd.DataFrame(
    data=tmp,
    columns=ct.get_feature_names_out(),
)

df_t.head()

Unnamed: 0,o_enc__country,remainder__quantity,remainder__revenue
0,0.0,143.0,439.1
1,0.0,10.0,8.5
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0


# Mean Encoding 

In [14]:
from feature_engine.encoding import MeanEncoder

In [15]:
# Reload the raw dataset for the mean-encoding section;
# "week" is parsed as dates and used as the index.
df = pd.read_csv(
    "../Datasets/online_retail_dataset_countries.csv",
    index_col="week",
    parse_dates=["week"],
)

df.head()

Unnamed: 0_level_0,country,quantity,revenue
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-12-06,Belgium,143,439.1
2009-12-13,Belgium,10,8.5
2009-12-20,Belgium,0,0.0
2009-12-27,Belgium,0,0.0
2010-01-03,Belgium,0,0.0


In [16]:
# Chronological split: weeks up to and including 2011-06-30 form the
# training set, later weeks form the test set.
split_date = pd.to_datetime("2011-06-30")

X_train = df.loc[df.index <= split_date]
X_test = df.loc[df.index > split_date]

# The target is the revenue; note that the column also remains in X.
y_train, y_test = X_train["revenue"], X_test["revenue"]

In [17]:
# Mean encoder created with its default arguments
enc = MeanEncoder()

In [18]:
# Fit on the training set only, with the revenue as the target
enc.fit(X_train, y_train)


In [19]:
# No variables were passed to the encoder, so Feature-engine
# found the categorical variables by itself; the attribute
# below lists the ones it will encode

enc.variables_

['country']

In [20]:
# Mapping learned during fit: one numeric value per country, i.e. the
# mean of the target (revenue) over the training rows of that country
enc.encoder_dict_

{'country': {'Belgium': 511.37853658536585,
  'EIRE': 5579.161829268293,
  'France': 2872.7475609756098,
  'Germany': 3764.180012195122,
  'Spain': 919.3335365853659,
  'United Kingdom': 129124.83931707316}}

In [21]:
# Replace "country" by its learned value in both partitions,
# reusing the encoder fitted on the training set.

X_train_t, X_test_t = enc.transform(X_train), enc.transform(X_test)

X_train_t.head()

Unnamed: 0_level_0,country,quantity,revenue
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2009-12-06,511.378537,143,439.1
2009-12-13,511.378537,10,8.5
2009-12-20,511.378537,0,0.0
2009-12-27,511.378537,0,0.0
2010-01-03,511.378537,0,0.0
