In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_diabetes, fetch_california_housing, load_boston

In [3]:
from category_encoders import TargetEncoder

In [4]:
class targetencoder:
    """Minimal mean target encoder for categorical features.

    Each category of each column is replaced by the mean of the target values
    observed for that category while fitting (no smoothing is applied).
    Missing values and categories not seen while fitting are encoded with the
    global target mean (the "prior") when the matching ``handle_*`` option is
    ``'values'``; with any other option value they are encoded as NaN.

    Parameters
    ----------
    handle_missing : str, default 'values'
        ``'values'`` encodes missing entries with the prior, anything else
        leaves them as NaN.
    handle_unknown : str, default 'values'
        ``'values'`` encodes unseen categories with the prior, anything else
        encodes them as NaN.

    Attributes
    ----------
    mapping : dict
        ``{column: {category: target mean, ..., 'missing': prior, 'unknown': prior}}``.
        Columns are keyed by name for DataFrame/Series input and by position
        for ndarray input.  It is created by ``fit`` / ``fit_transform``.

    Notes
    -----
    ``'missing'`` and ``'unknown'`` are reserved keys of the per-column
    dictionaries: a real category with one of these names is overwritten by
    the prior.
    """

    def __init__(self, handle_missing='values', handle_unknown='values'):
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown

    @staticmethod
    def _as_2d(X):
        """Return ``X`` as a 2-D ndarray or DataFrame (1-D input becomes one column)."""
        if isinstance(X, np.ndarray):
            return X.reshape(-1, 1) if X.ndim < 2 else X
        return pd.DataFrame(X) if len(X.shape) < 2 else X

    @staticmethod
    def _iter_columns(X):
        """Yield ``(column key, values as an object ndarray)`` for every column of 2-D ``X``."""
        if isinstance(X, np.ndarray):
            for position in range(X.shape[1]):
                yield position, np.asarray(X[:, position], dtype=object)
        else:
            for name in X.columns:
                yield name, np.asarray(X[name], dtype=object)

    def fit(self, X, y):
        """Learn the per-category target means of every column of ``X``.

        Parameters
        ----------
        X : numpy.ndarray, pandas.DataFrame or pandas.Series
            Categorical features, one column per feature (1-D input is
            treated as a single column).
        y : array-like
            Numeric target, aligned with the rows of ``X`` by position.

        Returns
        -------
        targetencoder
            The fitted encoder itself.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        X = self._as_2d(X)
        # Positional view of the target: a Series with a shuffled index (e.g.
        # after train_test_split) must not be looked up by label, and a plain
        # ndarray has no ``.values``.
        target = np.asarray(y, dtype=float).ravel()
        if target.shape[0] != X.shape[0]:
            raise ValueError(
                'X and y must have the same number of rows, got %d and %d'
                % (X.shape[0], target.shape[0])
            )

        # Global target mean; it equals the count-weighted mean of the
        # per-category means when no value is missing.
        prior = target.mean()

        self.mapping = dict()
        for key, values in self._iter_columns(X):
            # Missing entries get no category of their own; they are encoded
            # through the 'missing' fallback.
            observed = ~pd.isna(values)
            means = pd.Series(target[observed]).groupby(values[observed]).mean()
            encoding = means.to_dict()
            encoding['missing'] = prior
            encoding['unknown'] = prior
            self.mapping[key] = encoding
        return self

    def fit_transform(self, X, y):
        """Fit the encoder on ``(X, y)`` and return the encoded ``X``.

        See ``fit`` for the parameters and ``transform`` for the return value.
        """
        return self.fit(X, y).transform(X)

    def _apply(self, col, x):
        """Encode the single value ``x`` of column ``col`` as a float."""
        encoding = self.mapping[col]
        if x in encoding:
            return encoding[x]
        if pd.isna(x):
            return encoding['missing'] if self.handle_missing == 'values' else np.nan
        return encoding['unknown'] if self.handle_unknown == 'values' else np.nan

    def transform(self, X,):
        """Replace every value of ``X`` by its learned target encoding.

        Parameters
        ----------
        X : numpy.ndarray, pandas.DataFrame or pandas.Series
            Data with the same columns as the data used for fitting.

        Returns
        -------
        numpy.ndarray or pandas.DataFrame
            A new float ndarray of shape ``(n_rows, n_columns)`` for ndarray
            input, otherwise a copy of the DataFrame with float columns.
            The input is never modified.
        """
        X = self._as_2d(X)
        if isinstance(X, np.ndarray):
            # Allocate a float result instead of writing into a copy of X:
            # a string or integer array would silently truncate the encodings.
            encoded = np.empty(X.shape, dtype=float)
            for position, values in self._iter_columns(X):
                encoded[:, position] = [self._apply(position, v) for v in values]
            return encoded

        encoded = X.copy()
        for name, values in self._iter_columns(X):
            encoded[name] = np.array([self._apply(name, v) for v in values], dtype=float)
        return encoded

In [5]:
# Load the Boston housing dataset bundle and keep its feature names for the column labels.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the pinned scikit-learn version before re-running this notebook.
loading =  load_boston()
feat_names = loading.feature_names

In [6]:
# Wrap the raw arrays in labelled pandas objects: features as a DataFrame, target as a Series.
X = pd.DataFrame(loading['data'], columns=feat_names)
y = pd.Series(loading['target'], name='label')

In [7]:
# Print the number of distinct values of every feature (NaN, if present, counts as one value).
for col in X.columns:
    print(col, X[col].nunique(dropna=False))

CRIM 504
ZN 26
INDUS 76
CHAS 2
NOX 81
RM 446
AGE 356
DIS 412
RAD 9
TAX 66
PTRATIO 46
B 357
LSTAT 455


In [8]:
# Derive a string-valued copy of the CHAS flag ('No' where CHAS == 0.0, 'Yes' otherwise)
# and place it directly after CHAS.  The guard keeps the cell re-runnable:
# DataFrame.insert raises ValueError when the column already exists.
if 'CHAS_cat' not in X.columns:
    X.insert(
        X.columns.get_loc('CHAS') + 1,
        'CHAS_cat',
        X['CHAS'].apply(lambda x: 'No' if x == .0 else 'Yes'),
    )
# X.insert(X.columns.tolist().index('RAD')+1, 'RAD_cat', X['RAD'].apply(lambda x: 'idx'+ str(x)) )

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X.copy(),
    y.copy(),
    test_size=0.2,
    random_state=42,
)

# Plant a missing value and a category absent from the original data in the rows
# labelled 173 and 491 (presumably part of the test split for this seed — verify).
X_test.loc[173, ['CHAS_cat']] = np.nan
X_test.loc[491, ['CHAS_cat']] = 'pending'

In [10]:
# Reference run with the category_encoders TargetEncoder: fit on the training
# column only, then reuse the fitted encoder on the test column.
# Both frames are overwritten in place with the encoded values.
en = TargetEncoder()
X_train.loc[:,['CHAS_cat']] = en.fit_transform(X_train['CHAS_cat'], y_train)
X_test.loc[:,['CHAS_cat']] = en.transform(X_test['CHAS_cat'], )

  elif pd.api.types.is_categorical(cols):


In [18]:
# Show the encodings learned by the library encoder.
en.mapping

{'CHAS_cat': CHAS_cat
  1    22.302400
  2    29.186207
 -1    22.796535
 -2    22.796535
 dtype: float64}

In [23]:
# Show the mapping held by the library encoder's `ordinal_encoder` attribute.
en.ordinal_encoder.mapping

[{'col': 'CHAS_cat',
  'mapping': No     1
  Yes    2
  NaN   -2
  dtype: int64,
  'data_type': dtype('O')}]

In [11]:
# Keep the library-encoded test column as an array for the final comparison.
t = X_test.loc[:,['CHAS_cat']].values

In [12]:
# Redo the same seeded 80/20 split so the custom encoder starts from un-encoded data.
X_train, X_test, y_train, y_test = train_test_split(
    X.copy(),
    y.copy(),
    test_size=0.2,
    random_state=42,
)

# Plant the same missing value and extra category in the rows labelled 173 and 491
# (presumably part of the test split for this seed — verify).
X_test.loc[173, ['CHAS_cat']] = np.nan
X_test.loc[491, ['CHAS_cat']] = 'pending'

In [13]:
# Same experiment with the hand-written `targetencoder`: fit on the training
# column only, then reuse the fitted encoder on the test column.
# Both frames are overwritten in place with the encoded values.
en0 = targetencoder()
X_train.loc[:,['CHAS_cat']] = en0.fit_transform(X_train['CHAS_cat'], y_train)
X_test.loc[:,['CHAS_cat']] = en0.transform(X_test['CHAS_cat'], )

In [17]:
# Show the encodings learned by the hand-written encoder.
en0.mapping

{'CHAS_cat': {'No': 22.302400000000002,
  'Yes': 29.186206896551727,
  'missing': 22.79653465346535,
  'unknown': 22.79653465346535}}

In [14]:
# Keep the custom-encoded test column as an array for the final comparison.
t0 = X_test.loc[:,['CHAS_cat']].values

* The result of my target encoder is the same as the result of the `category_encoders` library's `TargetEncoder`.

In [24]:
# Compare the two encoded test columns element-wise within floating-point tolerance.
np.allclose(t, t0)

True