### Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
# pip install category_encoders
import category_encoders as ce

### Create example data

In [2]:
# Training frame: column C carries one missing value for the imputer to fill.
df_train = pd.DataFrame(dict(
    A=['a', 'b', 'a'],
    B=['d', 'e', 'f'],
    C=[1, np.nan, 3],
))
# Test frame: column A holds a value ('c') absent from training, and column B
# never takes the value 'f' that appears in training.
df_test = pd.DataFrame(dict(
    A=['a', 'b', 'c'],
    B=['d', 'e', 'e'],
    C=[1, 2, 4],
))

In [3]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows its input down to the configured columns.

    Parameters
    ----------
    attribute_names
        Key used to index the input in ``transform`` (``X[attribute_names]``).
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names]`` without converting it via ``.values``."""
        selected = X[self.attribute_names]
        return selected
    
class CategoricalWarrior(BaseEstimator, TransformerMixin):
    """One-hot encoder for every column of a 2-D input.

    The set of categories per column is frozen in ``fit``; ``transform`` then
    encodes against exactly those categories, so the output always has the
    same dummy columns regardless of which values appear in the new data.

    Attributes set by this class
    ----------------------------
    ncols : number of columns seen in ``fit``.
    categoricals : dict mapping column position -> list of unique values
        found in that column during ``fit``.
    columns : dummy column labels of the most recent ``transform`` output.
    """

    def fit(self, X, y=None):
        """Record the unique values of each column.

        Builds a dictionary whose keys are column positions and whose values
        are the unique values found in that column, and stores it as
        ``self.categoricals``.

        Parameters
        ----------
        X : 2-D array-like (ndarray or DataFrame)
        y : ignored

        Returns
        -------
        self
        """
        # Coerce to ndarray so the positional slicing below also works when a
        # DataFrame is passed in; ndarray input goes through unchanged.
        X = np.asarray(X)
        self.ncols = X.shape[1]
        self.categoricals = {
            col: np.unique(X[:, col]).tolist() for col in range(self.ncols)
        }
        return self

    def transform(self, X, y=None):
        """One-hot encode ``X`` using the categories learned in ``fit``.

        Each column is first converted to a pandas ``Categorical`` restricted
        to the fitted categories, so a value that was not seen in ``fit`` is
        ignored (its row gets zeros in every dummy of that column). The
        categorical columns are then expanded with ``pd.get_dummies``.

        Parameters
        ----------
        X : 2-D array-like (ndarray or DataFrame); only the first
            ``self.ncols`` columns are read.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Dummy columns named ``<column position>_<category>``. The frame is
            built from scratch, so any index on ``X`` is not carried over.
        """
        X = np.asarray(X)
        encoded = {
            col: pd.Categorical(X[:, col], categories=self.categoricals[col])
            for col in range(self.ncols)
        }
        new_df = pd.get_dummies(pd.DataFrame(encoded), drop_first=False)
        # Kept on the instance in case the labels are needed later; note this
        # is overwritten on every call to transform.
        self.columns = new_df.columns
        return new_df


### Define 2 pipelines

In [11]:
# Column labels handed to DataFrameSelector in both pipelines.
cat_att = ["A", "B", "C"]

# Approach 1: select columns -> fill gaps with the most frequent value ->
# one-hot encode with category_encoders.
# NOTE(review): recent category_encoders releases document the handle_unknown
# options as 'error', 'return_nan', 'value' and 'indicator' — confirm that
# 'ignore' is accepted by the installed version.
cat_pipeline1 = Pipeline(steps=[
    ("select_cat", DataFrameSelector(cat_att)),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", ce.OneHotEncoder(handle_unknown="ignore", use_cat_names=True)),
])

# Approach 2: same selection and imputation, followed by the hand-written
# CategoricalWarrior encoder.
cat_pipeline2 = Pipeline(steps=[
    ("select_cat", DataFrameSelector(cat_att)),
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("cat_maker", CategoricalWarrior()),
])


### Test the first approach

In [12]:
# Fit pipeline 1 on the training frame and encode that frame in the same call.
dftrain_onehot1 = cat_pipeline1.fit_transform(df_train)
# Encode the test frame with the pipeline as fitted on df_train above.
dftest_onehot1 = cat_pipeline1.transform(df_test)

In [13]:
# Display the training frame as encoded by cat_pipeline1.
dftrain_onehot1

Unnamed: 0,0_a,0_b,1_d,1_e,1_f,2
0,1,0,1,0,0,1.0
1,0,1,0,1,0,1.0
2,1,0,0,0,1,3.0


In [14]:
# Display the test frame as encoded by cat_pipeline1.
dftest_onehot1

Unnamed: 0,0_a,0_b,1_d,1_e,1_f,2
0,1.0,0.0,1,0,0,1
1,0.0,1.0,0,1,0,2
2,,,0,1,0,4


- This approach does not one-hot encode the last column C; it automatically treats C as a numerical feature
- The unseen category "c" in the first column of the test data is converted to NaN, so an additional imputation step is needed after one-hot encoding

### Test the second approach

In [5]:
# Fit pipeline 2 on the training frame and encode that frame in the same call.
dftrain_onehot2 = cat_pipeline2.fit_transform(df_train)
# Encode the test frame with the pipeline as fitted on df_train above.
dftest_onehot2 = cat_pipeline2.transform(df_test)

In [6]:
# Display the training frame as encoded by cat_pipeline2.
dftrain_onehot2

Unnamed: 0,0_a,0_b,1_d,1_e,1_f,2_1.0,2_3.0
0,1,0,1,0,0,1,0
1,0,1,0,1,0,1,0
2,1,0,0,0,1,0,1


In [7]:
# Display the test frame as encoded by cat_pipeline2.
dftest_onehot2

Unnamed: 0,0_a,0_b,1_d,1_e,1_f,2_1.0,2_3.0
0,1,0,1,0,0,1,0
1,0,1,0,1,0,0,0
2,0,0,0,1,0,0,0


- For the first column, the train data has 2 categories (a and b), and these are used as the reference for encoding the test data, so this approach ignores the unseen category "c" in the test set (that row gets zeros in both "0_a" and "0_b")
- For the second column, there are fewer categories in the test set than in the train set. This is not a problem: the encoder still emits the column "1_f", filled with zeros

### Conclusion

Comparing these 2 approaches, I prefer the 2nd one. It is more manual, but it gives much better control over the transformed data