# Fonte:

https://stackoverflow.com/questions/57528350/can-you-consistently-keep-track-of-column-labels-using-sklearns-transformer-api/57534118

# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import _VectorizerMixin
from sklearn.feature_selection._base import SelectorMixin
from sklearn.linear_model import LinearRegression


### df

In [2]:
# Load the housing dataset and expose the label column under the name `target`.
df = pd.read_csv(
    'https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv'
).rename(columns={'median_house_value': 'target'})

df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,target,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Columns routed to each preprocessing branch.
numeric_columns = ['housing_median_age', 'total_rooms']
cat_columns     = ['ocean_proximity']


# Numeric branch: median imputation followed by StandardScaler.
numeric_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
# Categorical branch: most-frequent imputation followed by one-hot encoding.
cat_pipeline     = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())

transformers = [
('num', numeric_pipeline, numeric_columns),
('cat', cat_pipeline, cat_columns),
('simple_transformer', MinMaxScaler(), ['population']),
]

# remainder='passthrough' keeps the columns not listed in `transformers`
# instead of dropping them.
combined_pipe = ColumnTransformer(transformers, remainder='passthrough')

# Name the axis by keyword: the positional form `df.drop('target', 1)` was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
transformed_data = combined_pipe.fit_transform(df.drop(columns='target'), df['target'])

In [4]:
def get_feature_out(estimator, feature_in):
    """Return the output feature names produced by one transformer step.

    Parameters
    ----------
    estimator : estimator object or str
        A transformer, or one of the string placeholders ``'passthrough'``
        / ``'drop'`` that ColumnTransformer and Pipeline accept.
    feature_in : sequence of str
        Names of the features fed into ``estimator``.

    Returns
    -------
    sequence of str
        Names of the features coming out of ``estimator``. ``feature_in`` is
        returned unchanged when the estimator exposes no naming API.
    """
    # String placeholders are not estimators: 'drop' emits no columns at all,
    # 'passthrough' forwards its input columns untouched.
    if isinstance(estimator, str):
        return [] if estimator == 'drop' else feature_in

    # `get_feature_names` was removed in scikit-learn 1.2 in favour of
    # `get_feature_names_out`; prefer the new API and fall back to the old one.
    has_new_api = hasattr(estimator, 'get_feature_names_out')
    has_old_api = hasattr(estimator, 'get_feature_names')

    if isinstance(estimator, _VectorizerMixin) and (has_new_api or has_old_api):
        # Vectorizers derive names from their vocabulary, not from feature_in.
        if has_new_api:
            vocabulary = estimator.get_feature_names_out()
        else:
            vocabulary = estimator.get_feature_names()
        return [f'vec_{f}' for f in vocabulary]

    if isinstance(estimator, SelectorMixin):
        # Selectors keep a subset of the input columns.
        return np.array(feature_in)[estimator.get_support()]

    if has_new_api:
        return estimator.get_feature_names_out(feature_in)
    if has_old_api:
        return estimator.get_feature_names(feature_in)

    # No naming API available: assume a one-to-one mapping of columns.
    return feature_in


def _remainder_column_names(ct, columns):
    """Translate the remainder column spec of a fitted ColumnTransformer into names."""
    if not isinstance(columns, slice):
        columns = list(columns)
        if not columns:
            return []
        # Some scikit-learn versions already store the remainder as names.
        if all(isinstance(column, str) for column in columns):
            return columns

    # Public attribute first; `_feature_names_in` is the private name used by
    # older scikit-learn releases.
    input_names = getattr(ct, 'feature_names_in_', None)
    if input_names is None:
        input_names = getattr(ct, '_feature_names_in', None)
    if input_names is None:
        raise ValueError(
            "Cannot name the remainder columns: the ColumnTransformer was not "
            "fitted on data with column names (e.g. a pandas DataFrame)."
        )
    return list(np.asarray(input_names, dtype=object)[columns])


def get_ct_feature_names(ct):
    """Return the output column names of a fitted ColumnTransformer.

    Handles plain estimators, Pipelines and the ``'passthrough'`` / ``'drop'``
    placeholders, including the ``remainder`` entry. Naming the remainder
    columns requires the transformer to have been fitted on data that carries
    column names.

    Parameters
    ----------
    ct : fitted ColumnTransformer

    Returns
    -------
    list of str
        Feature names in the order of ``ct.transformers_``.

    Raises
    ------
    ValueError
        If remainder columns are given by position and ``ct`` holds no input
        column names.
    """
    output_features = []

    for name, estimator, features in ct.transformers_:
        if isinstance(estimator, str):
            # 'drop' contributes no output columns; 'passthrough' forwards
            # its input columns unchanged.
            if estimator == 'drop':
                continue
            if name == 'remainder':
                features = _remainder_column_names(ct, features)
            output_features.extend(features)
            continue

        if name == 'remainder':
            # A remainder estimator also produces output columns.
            features = _remainder_column_names(ct, features)

        # Thread the names through every step so that each step sees the
        # names produced by the previous one.
        steps = estimator if isinstance(estimator, Pipeline) else [estimator]
        current_features = features
        for step in steps:
            current_features = get_feature_out(step, current_features)
        output_features.extend(current_features)

    return output_features

In [5]:
# Show the transformed array with its recovered column labels.
pd.DataFrame(
    data=transformed_data,
    columns=get_ct_feature_names(combined_pipe),
)

Unnamed: 0,housing_median_age,total_rooms,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN,population,longitude,latitude,total_bedrooms,households,median_income
0,0.982143,-0.804819,0.0,0.0,0.0,1.0,0.0,0.008941,-122.23,37.88,129.0,126.0,8.3252
1,-0.607019,2.045890,0.0,0.0,0.0,1.0,0.0,0.067210,-122.22,37.86,1106.0,1138.0,8.3014
2,1.856182,-0.535746,0.0,0.0,0.0,1.0,0.0,0.013818,-122.24,37.85,190.0,177.0,7.2574
3,1.856182,-0.624215,0.0,0.0,0.0,1.0,0.0,0.015555,-122.25,37.85,235.0,219.0,5.6431
4,1.856182,-0.462404,0.0,0.0,0.0,1.0,0.0,0.015752,-122.25,37.85,280.0,259.0,3.8462
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.289187,-0.444985,0.0,1.0,0.0,0.0,0.0,0.023599,-121.09,39.48,374.0,330.0,1.5603
20636,-0.845393,-0.888704,0.0,1.0,0.0,0.0,0.0,0.009894,-121.21,39.49,150.0,114.0,2.5568
20637,-0.924851,-0.174995,0.0,1.0,0.0,0.0,0.0,0.028140,-121.22,39.43,485.0,433.0,1.7000
20638,-0.845393,-0.355600,0.0,1.0,0.0,0.0,0.0,0.020684,-121.32,39.43,409.0,349.0,1.8672


---
# Incluindo um Regressor

In [6]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression


In [7]:
# Preview the first rows of the frame loaded above; the label column is named `target`.
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,target,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Train Test Split

In [8]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=75)

print("df.shape:      ", df.shape)
print("df_train.shape:", df_train.shape)
print("df_test.shape: ", df_test.shape)

df.shape:       (20640, 10)
df_train.shape: (16512, 10)
df_test.shape:  (4128, 10)


In [9]:
# Separate predictors from the label on the training split.
y_train = df_train["target"].copy()
X_train = df_train.drop(columns="target")

## PipeLine

In [10]:
# Route the predictors by dtype: numeric columns to one branch, object columns to the other.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include='object').columns

# Numeric branch: median imputation, then standard scaling.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on unseen categories.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Pay close attention to the ColumnTransformer arguments: they are all
# spelled out explicitly here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
    sparse_threshold=0.3,
    n_jobs=None,
    transformer_weights=None,
    verbose=False,
)

# Chain preprocessing and the regressor into a single estimator.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

# Fitting the pipeline also fits `preprocessor` in place.
clf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                     

In [11]:
# Apply the already-fitted preprocessor and label the resulting columns.
transformed_data = preprocessor.transform(X_train)

pd.DataFrame(
    data=transformed_data,
    columns=get_ct_feature_names(preprocessor),
)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.641994,-0.687318,-0.931081,0.567626,1.280680,1.400900,1.308212,-0.798046,1.0,0.0,0.0,0.0,0.0
1,-1.419730,0.974303,1.851934,-0.577377,-0.625127,-0.621030,-0.592813,-0.104371,0.0,0.0,0.0,1.0,0.0
2,-1.120930,0.773601,-0.931081,-0.617772,-0.781933,-0.535029,-0.743520,0.331722,1.0,0.0,0.0,0.0,0.0
3,0.602154,-0.818007,-0.613022,0.030877,0.031050,-0.251045,-0.066660,0.914037,1.0,0.0,0.0,0.0,0.0
4,1.120075,-1.126060,-1.090111,0.134420,0.137196,-0.261795,0.213603,-0.672563,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.656934,-0.785335,0.818243,-0.638202,-0.511743,-0.466049,-0.452682,-1.200747,1.0,0.0,0.0,0.0,0.0
16508,0.706734,-0.864682,-0.294963,1.315639,1.760750,1.422401,1.799994,-0.333496,0.0,0.0,0.0,0.0,1.0
16509,1.174855,-1.303424,-0.215449,-0.456190,-0.661313,-0.652385,-0.632472,0.430166,0.0,0.0,0.0,0.0,1.0
16510,-0.109988,0.470215,0.420669,-0.170172,-0.270502,-0.213419,-0.241163,-1.102040,0.0,1.0,0.0,0.0,0.0


<br>
<br>
<br>
<br>
<br>

---

# Exercícios

In [12]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVR

In [13]:
# Hold out 20% of the rows as the test set; same seed and test size as the earlier split.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=75)

print("df.shape:      ", df.shape)
print("df_train.shape:", df_train.shape)
print("df_test.shape: ", df_test.shape)

df.shape:       (20640, 10)
df_train.shape: (16512, 10)
df_test.shape:  (4128, 10)


In [14]:
# Separate predictors from the label on the training split.
y_train = df_train["target"].copy()
X_train = df_train.drop(columns="target")

In [29]:
# Route the predictors by dtype: numeric columns to one branch, object columns to the other.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include='object').columns

# Numeric branch: median imputation, then standard scaling.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the literal 'missing', then one-hot encode.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Pay close attention to the ColumnTransformer arguments: they are all
# spelled out explicitly here.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
    sparse_threshold=0.3,
    n_jobs=None,
    transformer_weights=None,
    verbose=False,
)

# Hyper-parameter candidates: one linear-kernel setting and two RBF settings
# (3 candidates in total).
param_grid = [
    {'kernel': ['linear'], 'C': [1]},
    {'kernel': ['rbf'], 'C': [1.0], 'gamma': [0.01, 0.03]},
]

svm_reg = SVR()

# A RandomizedSearchCV (n_iter=2, random_state=42, otherwise the same settings)
# was tried here previously; the exhaustive grid search below is what is used.
grid_search = GridSearchCV(
    svm_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=10,
    refit=True,
)

# NOTE(review): the search is the last step of the pipeline, so the
# preprocessor is fitted once on all rows passed to fit() before the CV
# folds are made — confirm this is acceptable for the exercise.
prepare_select_and_predict_pipeline = Pipeline(
    [
        ('preparation', preprocessor),
        ('svm_reg', grid_search),
    ]
)

In [31]:
# Sanity check: (rows, columns) of the training predictors.
X_train.shape

(16512, 9)

In [34]:
# Fit preprocessing + grid search on every training row except the first 100
# (positional slice).
# NOTE(review): the reason for skipping the first 100 rows is not stated — confirm.
prepare_select_and_predict_pipeline.fit(X_train[100:], y_train[100:])

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START C=1, kernel=linear..........................................
[CV 1/5; 1/3] END ........................C=1, kernel=linear; total time=   6.2s
[CV 2/5; 1/3] START C=1, kernel=linear..........................................
[CV 2/5; 1/3] END ........................C=1, kernel=linear; total time=   6.2s
[CV 3/5; 1/3] START C=1, kernel=linear..........................................
[CV 3/5; 1/3] END ........................C=1, kernel=linear; total time=   6.6s
[CV 4/5; 1/3] START C=1, kernel=linear..........................................
[CV 4/5; 1/3] END ........................C=1, kernel=linear; total time=   6.6s
[CV 5/5; 1/3] START C=1, kernel=linear..........................................
[CV 5/5; 1/3] END ........................C=1, kernel=linear; total time=   6.3s
[CV 1/5; 2/3] START C=1.0, gamma=0.01, kernel=rbf...............................
[CV 1/5; 2/3] END .............C=1.0, gamma=0.01,

Pipeline(steps=[('preparation',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                      

In [36]:
# Tabulate the per-candidate cross-validation results; scores are negative MSE,
# so values closer to zero are better.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,5.5107,0.13987,0.8727,0.046052,1,linear,,"{'C': 1, 'kernel': 'linear'}",-13708290000.0,-12933280000.0,-12294390000.0,-12050790000.0,-12565530000.0,-12710460000.0,578721700.0,1
1,7.393006,0.187803,1.920808,0.071406,1,rbf,0.01,"{'C': 1.0, 'gamma': 0.01, 'kernel': 'rbf'}",-15110190000.0,-14256650000.0,-13592340000.0,-13366670000.0,-13853880000.0,-14035950000.0,613280900.0,3
2,7.595931,0.494457,1.943532,0.091094,1,rbf,0.03,"{'C': 1.0, 'gamma': 0.03, 'kernel': 'rbf'}",-15079600000.0,-14231670000.0,-13564710000.0,-13342020000.0,-13826580000.0,-14008920000.0,611765200.0,2


In [37]:
# Apply the preprocessor (fitted during the pipeline fit on X_train[100:])
# to the full training predictors and label the resulting columns.
transformed_data = preprocessor.transform(X_train)

pd.DataFrame(
    data=transformed_data,
    columns=get_ct_feature_names(preprocessor),
)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,0.641957,-0.687359,-0.931896,0.568306,1.281352,1.401314,1.309160,-0.796611,1.0,0.0,0.0,0.0,0.0
1,-1.419607,0.973828,1.851441,-0.576868,-0.624704,-0.620630,-0.592290,-0.102840,0.0,0.0,0.0,1.0,0.0
2,-1.120830,0.773179,-0.931896,-0.617269,-0.781531,-0.534628,-0.743030,0.333314,1.0,0.0,0.0,0.0,0.0
3,0.602120,-0.818014,-0.613800,0.031477,0.031558,-0.250642,-0.066019,0.915710,1.0,0.0,0.0,0.0,0.0
4,1.120001,-1.125987,-1.090944,0.135035,0.137718,-0.261392,0.214306,-0.671110,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16507,0.656896,-0.785350,0.817630,-0.637702,-0.511306,-0.465647,-0.452127,-1.199368,1.0,0.0,0.0,0.0,0.0
16508,0.706692,-0.864677,-0.295705,1.316431,1.761485,1.422815,1.801052,-0.331996,0.0,0.0,0.0,0.0,1.0
16509,1.174777,-1.303305,-0.216181,-0.455663,-0.660895,-0.651985,-0.631958,0.431772,0.0,0.0,0.0,0.0,1.0
16510,-0.109966,0.469872,0.420010,-0.169602,-0.270033,-0.213016,-0.240561,-1.100648,0.0,1.0,0.0,0.0,0.0
