# Fonte:

https://stackoverflow.com/questions/57528350/can-you-consistently-keep-track-of-column-labels-using-sklearns-transformer-api/57534118

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.linear_model import LinearRegression


### df

In [2]:
# Toy frame for the column-name tracking demo: numeric and categorical
# features with missing values in 'age', 'Gender' and 'sales', one column
# that no transformer touches ('foo') and the target ('y').
df = pd.DataFrame({'age': [23, 12, 12, np.nan],
                   'Gender': ['M', 'F', np.nan, 'F'],
                   'income': ['high', 'low', 'low', 'medium'],
                   # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
                   'sales': [10000, 100020, np.nan, 100],
                   'foo': [1, 0, 0, 1],
                   'y': [0, 1, 1, 1]})

df

Unnamed: 0,age,Gender,income,sales,foo,y
0,23.0,M,high,10000.0,1,0
1,12.0,F,low,100020.0,0,1
2,12.0,,low,,0,1
3,,F,medium,100.0,1,1


In [4]:
# Columns routed to each preprocessing branch. 'sales' gets its own scaler
# below, and any column not listed in a transformer (here 'foo') is kept
# unchanged through remainder='passthrough'.
numeric_columns = ['age']
cat_columns     = ['Gender', 'income']

# Numeric branch: median imputation followed by standardisation.
numeric_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
# Categorical branch: most-frequent imputation followed by one-hot encoding.
cat_pipeline     = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())

transformers = [
    ('num', numeric_pipeline, numeric_columns),
    ('cat', cat_pipeline, cat_columns),
    # No imputer on this branch, so the missing 'sales' value stays NaN
    # in the transformed output.
    ('simple_transformer', MinMaxScaler(), ['sales']),
]

combined_pipe = ColumnTransformer(transformers, remainder='passthrough')

# Use the `columns=` keyword: passing the axis positionally (df.drop('y', 1))
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
transformed_data = combined_pipe.fit_transform(df.drop(columns='y'), df['y'])

In [5]:
def get_feature_out(estimator, feature_in):
    """Return the output feature names of a single fitted transformer.

    Parameters
    ----------
    estimator : fitted transformer, or a placeholder such as 'passthrough'
        The step whose output names are wanted.
    feature_in : sequence of str
        Names of the features fed into ``estimator``.

    Returns
    -------
    sequence of str
        * vectorizers: the vocabulary terms, each prefixed with ``vec_``;
        * transformers exposing ``get_feature_names_out`` (or the legacy
          ``get_feature_names``): whatever that method returns for
          ``feature_in``;
        * feature selectors: the entries of ``feature_in`` kept by
          ``get_support()``;
        * anything else: ``feature_in`` unchanged.
    """
    # Prefer the current scikit-learn API. `get_feature_names` was deprecated
    # in scikit-learn 1.0 and removed in 1.2, so relying on it alone silently
    # falls through to "names unchanged" on recent releases.
    names_getter = getattr(estimator, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = getattr(estimator, 'get_feature_names', None)

    if callable(names_getter):
        if isinstance(estimator, _VectorizerMixin):
            # Vectorizers derive their names from the fitted vocabulary,
            # not from the input column names.
            return [f'vec_{f}' for f in names_getter()]
        return names_getter(feature_in)

    if isinstance(estimator, SelectorMixin):
        # Selectors keep a subset of the inputs; mask the incoming names.
        return np.array(feature_in)[estimator.get_support()]

    # No naming information available: assume the names pass through as-is.
    return feature_in


def get_ct_feature_names(ct):
    """Collect the output column names of a fitted ColumnTransformer.

    Walks ``ct.transformers_`` in order and, for each entry, resolves the
    names it contributes:

    * a Pipeline is resolved step by step with ``get_feature_out``;
    * any other transformer is resolved with a single ``get_feature_out`` call;
    * entries set to ``'drop'`` contribute nothing;
    * a ``remainder='passthrough'`` entry contributes the original names of
      the remaining input columns, which requires the transformer to have
      been fitted on input that carries column names (e.g. a DataFrame).

    A remainder that is itself an estimator is not resolved and contributes
    no names.

    Parameters
    ----------
    ct : fitted ColumnTransformer

    Returns
    -------
    list
        Output feature names, in the order of the transformed columns.

    Raises
    ------
    ValueError
        If the passthrough remainder is stored as integer positions and
        ``ct`` exposes no input feature names to translate them.
    """
    output_features = []

    for name, estimator, features in ct.transformers_:
        # A dropped column group emits no output columns, so it must not
        # contribute names (otherwise names and data columns go out of sync).
        if isinstance(estimator, str) and estimator == 'drop':
            continue

        if name != 'remainder':
            if isinstance(estimator, Pipeline):
                # Thread the names through every step of the pipeline.
                current_features = features
                for step in estimator:
                    current_features = get_feature_out(step, current_features)
                features_out = current_features
            else:
                features_out = get_feature_out(estimator, features)
            output_features.extend(features_out)
        elif isinstance(estimator, str) and estimator == 'passthrough':
            remainder_cols = list(features)
            if all(isinstance(col, str) for col in remainder_cols):
                # The remainder columns are already stored by name.
                output_features.extend(remainder_cols)
            else:
                # The remainder columns are stored as integer positions:
                # map them back to names. Use the public attribute first and
                # fall back to the private one used by older releases.
                input_names = getattr(ct, 'feature_names_in_', None)
                if input_names is None:
                    input_names = getattr(ct, '_feature_names_in', None)
                if input_names is None:
                    raise ValueError(
                        "Cannot name the passthrough remainder columns: the "
                        "ColumnTransformer has no input feature names "
                        "(fit it on a DataFrame)."
                    )
                output_features.extend(np.asarray(input_names)[remainder_cols])

    return output_features

In [6]:
# Show the transformed matrix labelled with the recovered output column names.
pd.DataFrame(data=transformed_data, columns=get_ct_feature_names(combined_pipe))

Unnamed: 0,age,Gender_F,Gender_M,income_high,income_low,income_medium,sales,foo
0,1.732051,0.0,1.0,1.0,0.0,0.0,0.099079,1.0
1,-0.57735,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2,-0.57735,1.0,0.0,0.0,1.0,0.0,,0.0
3,-0.57735,1.0,0.0,0.0,0.0,1.0,0.0,1.0


---
# Incluindo um Regressor

In [7]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# Second toy frame for the regression example: two categorical columns and
# one numeric column, each with a missing value, plus the numeric target.
# np.nan replaces np.NaN, an alias that was removed in NumPy 2.0.
df = pd.DataFrame({'brand': ['A', 'B', 'C', np.nan],
                   'num1': [1, 1, np.nan, 0],
                   'category': ['A', 'A', np.nan, 'D'],
                   'target': [2, 4, 8, 10]})


df

Unnamed: 0,brand,num1,category,target
0,A,1.0,A,2
1,B,1.0,A,4
2,C,,,8
3,,0.0,D,10


In [14]:
# Numeric branch: median imputation followed by standardisation.
numeric_features = ['num1']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical branch: missing values become their own 'missing' category,
# then one-hot encoding; categories unseen during fit are ignored at
# transform time instead of raising.
categorical_features = ['brand', 'category']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    # Pay CLOSE attention to the ColumnTransformer arguments:
    # keep any column not listed above instead of dropping it.
    remainder='passthrough',
    # Per the scikit-learn docs, the stacked output is returned as a sparse
    # matrix when its overall density is below this threshold.
    sparse_threshold=0.3,
    n_jobs=None,
    transformer_weights=None,
    verbose=False,)

# Preprocessor & Regressor chained into a single estimator.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor',  LinearRegression())])

# Fit. Use the `columns=` keyword: passing the axis positionally
# (df.drop('target', 1)) was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 on.
clf.fit(df.drop(columns='target'), df['target'])

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['num1']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehot',
                         

In [19]:
# Re-apply the preprocessor (already fitted as part of `clf`) to the features.
# Use `columns=` rather than a positional axis: df.drop('target', 1) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
transformed_data = preprocessor.transform(df.drop(columns='target'))

# Label the transformed matrix with the recovered output column names.
pd.DataFrame(transformed_data,
             columns=get_ct_feature_names(preprocessor))

Unnamed: 0,num1,brand_A,brand_B,brand_C,brand_missing,category_A,category_D,category_missing
0,0.57735,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.57735,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.57735,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,-1.732051,0.0,0.0,0.0,1.0,0.0,1.0,0.0


<br>
<br>
<br>
<br>
<br>

---

# Next step
## A full pipeline with both preparation and prediction

<br>
<br>
<br>
<br>
<br>

---

# Next step - Incluindo funções de criação de variável