In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [184]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer, load_diabetes, fetch_california_housing, load_boston

In [3]:
from category_encoders import TargetEncoder

In [179]:
class targetencoder:
    """Mean (target) encoder for categorical features.

    Each category of each column is replaced by the mean of ``y`` over the
    training rows that carry that category.

    Parameters
    ----------
    handle_missing : str, default 'values'
        ``'values'`` encodes missing entries (NaN / None) with the overall
        training mean of ``y``; any other setting encodes them as NaN.
    handle_unknown : str, default 'values'
        ``'values'`` encodes categories not seen during fitting with the
        overall training mean of ``y``; any other setting encodes them as NaN.

    Attributes
    ----------
    mapping : dict
        Set by ``fit`` / ``fit_transform``. Maps a column key (column label
        for DataFrames, integer position for arrays) to a dict
        ``{category: mean_of_y}`` plus the fallback keys ``'missing'`` and
        ``'unknown'``.

    Notes
    -----
    A real category literally named ``'missing'`` or ``'unknown'`` collides
    with the fallback keys and is overwritten by the overall mean.
    """

    def __init__(self, handle_missing='values', handle_unknown='values'):
        self.handle_missing = handle_missing
        self.handle_unknown = handle_unknown

    @staticmethod
    def _as_2d(X):
        """Return X as a DataFrame (pandas input) or a 2-D ndarray (anything else)."""
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, pd.Series):
            return pd.DataFrame(X)
        # dtype=object keeps NaN as NaN when a plain list mixes strings and NaN.
        arr = X if isinstance(X, np.ndarray) else np.asarray(X, dtype=object)
        return arr.reshape(-1, 1) if arr.ndim < 2 else arr

    @staticmethod
    def _iter_columns(X):
        """Yield ``(column_key, values_as_object_array)`` for every column of X."""
        if isinstance(X, pd.DataFrame):
            for col in X.columns:
                yield col, X[col].to_numpy(dtype=object)
        else:
            for col in range(X.shape[1]):
                yield col, X[:, col].astype(object)

    @staticmethod
    def _fit_column(values, y):
        """Build the ``{category: mean_of_y}`` dict (plus fallbacks) for one column."""
        # Missing entries are not a category; they are served by the fallback.
        present = ~pd.isna(values)
        uniq, inverse, counts = np.unique(
            values[present], return_inverse=True, return_counts=True
        )
        # Per-category sum of y in one pass instead of one mask per category.
        sums = np.bincount(inverse.ravel(), weights=y[present])
        dic = {u: float(s / c) for u, s, c in zip(uniq, sums, counts)}
        # Fallback is the overall training mean of the target.
        imputer = float(y.mean()) if y.size else np.nan
        dic['missing'] = imputer
        dic['unknown'] = imputer
        return dic

    def fit(self, X, y):
        """Learn the per-category target means.

        Parameters
        ----------
        X : DataFrame, Series, ndarray or sequence
            Categorical feature(s); 1-D input is treated as a single column.
        y : array-like
            Numeric target, aligned with the rows of ``X`` by position.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        X = self._as_2d(X)
        # Positional alignment: a pandas index on y (e.g. shuffled by
        # train_test_split) must not be used for label lookups.
        targets = np.asarray(y, dtype=float).ravel()
        if targets.shape[0] != X.shape[0]:
            raise ValueError(
                'X and y must have the same number of rows, got %d and %d'
                % (X.shape[0], targets.shape[0])
            )
        self.mapping = {
            col: self._fit_column(values, targets)
            for col, values in self._iter_columns(X)
        }
        return self

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the encoded ``X`` (see ``transform``)."""
        return self.fit(X, y).transform(X)

    def _apply(self, col, x):
        """Encode the single value ``x`` of column ``col``."""
        codes = self.mapping[col]
        if x in codes:
            return codes[x]
        if pd.isna(x):
            return codes['missing'] if self.handle_missing == 'values' else np.nan
        return codes['unknown'] if self.handle_unknown == 'values' else np.nan

    def transform(self, X,):
        """Encode ``X`` with the fitted mapping.

        Returns a copy of the DataFrame with every column encoded for pandas
        input, or a new float ndarray of shape ``(n_rows, n_columns)`` for
        array input. The input is never modified.
        """
        X = self._as_2d(X)
        if isinstance(X, pd.DataFrame):
            X1 = X.copy()
            for col in X1.columns:
                X1[col] = X1[col].apply(lambda x, col=col: self._apply(col, x))
            return X1

        # A fresh float array: writing means back into a copy of X would
        # truncate them when X has a fixed-width string dtype.
        encoded = np.empty(X.shape, dtype=float)
        for col in range(X.shape[1]):
            encoded[:, col] = [self._apply(col, x) for x in X[:, col]]
        return encoded

In [180]:
# Load the Boston housing dataset and keep its feature names; both are used
# to build the X / y frames in the next cell.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# pinned scikit-learn version still provides it.
loading =  load_boston()
feat_names = loading.feature_names

In [181]:
# Feature matrix with named columns, and the target as a Series called 'label'.
X = pd.DataFrame(loading['data'], columns=feat_names)
y = pd.Series(loading['target'], name='label')

In [117]:
# for col in X.columns:
#     print(col, len(pd.unique(X[col])))

In [182]:
# Add a string version of the binary CHAS flag right after it, to have a
# categorical column to encode. The guard keeps the cell re-runnable:
# DataFrame.insert raises ValueError when the column already exists.
if 'CHAS_cat' not in X.columns:
    X.insert(
        X.columns.get_loc('CHAS') + 1,
        'CHAS_cat',
        np.where(X['CHAS'] == 0, 'No', 'Yes'),
    )

In [202]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [203]:
# train_test_split returns a slice of X; take an explicit copy so the edits
# below land on X_test itself and do not raise SettingWithCopyWarning.
X_test = X_test.copy()
X_test.loc[173, 'CHAS_cat'] = np.nan       # simulate a missing category
X_test.loc[491, 'CHAS_cat'] = 'pending'    # simulate a category unseen in training

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [204]:
# Fit the encoder on the training split only, then apply it to both splits.
# assign() builds new frames, so nothing is written into a slice of X (no
# SettingWithCopyWarning) and CHAS_cat ends up with a numeric dtype.
en = targetencoder()
X_train = X_train.assign(
    CHAS_cat=en.fit_transform(X_train[['CHAS_cat']], y_train)['CHAS_cat']
)
X_test = X_test.assign(
    CHAS_cat=en.transform(X_test[['CHAS_cat']])['CHAS_cat']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, v)


In [205]:
# Display the encoded test set. Rows 173 (injected NaN) and 491 (injected
# 'pending') show the same CHAS_cat value: the encoder's fallback for
# missing / unknown categories.
X_test

Unnamed: 0,CRIM,ZN,INDUS,CHAS,CHAS_cat,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
173,0.09178,0.0,4.05,0.0,22.796535,0.510,6.416,84.1,2.6463,5.0,296.0,16.6,395.50,9.04
274,0.05644,40.0,6.41,1.0,29.186207,0.447,6.758,32.9,4.0776,4.0,254.0,17.6,396.90,3.53
491,0.10574,0.0,27.74,0.0,22.796535,0.609,5.983,98.8,1.8681,4.0,711.0,20.1,390.11,18.07
72,0.09164,0.0,10.81,0.0,22.302400,0.413,6.065,7.8,5.2873,4.0,305.0,19.2,390.91,5.52
452,5.09017,0.0,18.10,0.0,22.302400,0.713,6.297,91.8,2.3682,24.0,666.0,20.2,385.09,17.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412,18.81100,0.0,18.10,0.0,22.302400,0.597,4.628,100.0,1.5539,24.0,666.0,20.2,28.79,34.37
436,14.42080,0.0,18.10,0.0,22.302400,0.740,6.461,93.3,2.0026,24.0,666.0,20.2,27.49,18.05
411,14.05070,0.0,18.10,0.0,22.302400,0.597,6.657,100.0,1.5275,24.0,666.0,20.2,35.05,21.22
86,0.05188,0.0,4.49,0.0,22.302400,0.449,6.015,45.1,4.4272,3.0,247.0,18.5,395.99,12.86


In [173]:
# Exercise the ndarray code path of the encoder on column 4.
# NOTE(review): Xn is not defined in any visible cell — presumably X.values
# (column 4 would then be CHAS_cat); confirm before Restart & Run All.
en = targetencoder()
Xn[:,4:5] = en.fit_transform(Xn[:,4], y.values)

In [176]:
# Inspect the learned per-category target means and the 'missing' /
# 'unknown' fallback values.
en.mapping

{0: {'No': 22.093842887473464,
  'Yes': 28.439999999999998,
  'missing': 22.532806324110673,
  'unknown': 22.532806324110673}}

In [64]:
# Run the category_encoders TargetEncoder on the same column — presumably as
# a reference to compare the hand-written encoder against.
# NOTE(review): this rebinds `en` to a different class and overwrites
# X['CHAS_cat'] in place, so re-running the cell encodes already-encoded values.
en = TargetEncoder()
X['CHAS_cat'] = en.fit_transform(X['CHAS_cat'], y)

  elif pd.api.types.is_categorical(cols):


In [68]:
# Inspect the ordinal encoder held by the fitted TargetEncoder (category ->
# integer code).
en.ordinal_encoder

OrdinalEncoder(cols=['CHAS_cat'],
               mapping=[{'col': 'CHAS_cat', 'data_type': dtype('O'),
                         'mapping': No     1
Yes    2
NaN   -2
dtype: int64}])

In [54]:
# Distinct values of CHAS_cat and the number of rows carrying each.
# NOTE(review): presumably run after the column was target-encoded, so the
# values are numeric means — confirm the cell order.
values, counts = np.unique(X['CHAS_cat'], return_counts=True)

In [57]:
# Count-weighted average of the distinct values.
(values * counts).sum()/counts.sum()

22.53280632411069

In [51]:
# Inspect the fitted mapping. `en` here is presumably the category_encoders
# TargetEncoder: the output is keyed by ordinal codes rather than by labels.
en.mapping

{'CHAS_cat': CHAS_cat
  1    22.093843
  2    28.440000
 -1    22.532806
 -2    22.532806
 dtype: float64}