In [3]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin 

In [4]:
# para criar custom transformer 
class catencoder(BaseEstimator, TransformerMixin): 
    """
    semelhante ao labelencoder do sklearn mas funciona com categorias ineditas
    Permite o uso dentro do pipeline
    """ 
        
    def __init__(self):
        self.dict_ = {} 
        self.reverse_dict_ = {} 
        self.columns_ = [] 
        
        
    def fit(self, df, lista_cols):
        
        self.columns_ = lista_cols
        
        for col in lista_cols:
            uniques = df[col].unique()
            self.dict_[col] = {}
            self.reverse_dict_[col] = {}

            for num, cat in enumerate(uniques):
                print(num,cat)
                self.dict_[col][cat] = num
                self.reverse_dict_[col][num] = cat
                
        return self              
           
        
        
    def transform(self, df, inplace=True):

        new_df = df.copy()
        
        lista_cols = self.columns_
        
        for col in lista_cols:
            
            new_df[col].fillna("NaN",inplace=True)
            new_df.replace({col: self.dict_[col]}, inplace=True)
        
        return new_df


    
    def inverse_transform(self, df, inplace=True):
        
        new_df = df.copy()
        
        lista_cols = self.columns_
        
        for col in lista_cols:
            
            new_df.replace({col: self.reverse_dict_[col]},inplace=True)
        
        return new_df

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv')

In [6]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [7]:
catenc = catencoder()

In [8]:
lista = ["ocean_proximity"]

catenc.fit(df,lista)

# print(catenc.columns_)
# print(catenc.dict_)
# print(catenc.reverse_dict_)

0 NEAR BAY
1 <1H OCEAN
2 INLAND
3 NEAR OCEAN
4 ISLAND


catencoder()

In [9]:
df.head()
df["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [10]:
transformed = catenc.transform(df)
transformed["ocean_proximity"].value_counts()

1    9136
2    6551
3    2658
0    2290
4       5
Name: ocean_proximity, dtype: int64

In [11]:
catenc.dict_

{'ocean_proximity': {'NEAR BAY': 0,
  '<1H OCEAN': 1,
  'INLAND': 2,
  'NEAR OCEAN': 3,
  'ISLAND': 4}}

In [12]:
catenc.reverse_dict_

{'ocean_proximity': {0: 'NEAR BAY',
  1: '<1H OCEAN',
  2: 'INLAND',
  3: 'NEAR OCEAN',
  4: 'ISLAND'}}

In [13]:
inversed_transformed = catenc.inverse_transform(transformed)
inversed_transformed["ocean_proximity"].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64