### Regression Pipeline

In [25]:
import pandas as pd
import numpy as np
from wrangler import Wrangler
from wrangler.data import PandasDataset, CSVDataset
import wrangler.transformers as tr
import wrangler.transformers.text as text_tr
import wrangler.transformers.ml as ml_tr
from wrangler import logger as wrangler_logger

from sklearn.datasets import make_classification, make_regression
from sklearn.linear_model import LinearRegression, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Enable wrangler's logger; the catalog/pipeline/node log lines shown under the cells below come from it.
wrangler_logger.enable()

In [26]:
## Define the input data
## Define the different feature-selection methods
## Define the prediction model to use
## Group the results of each method.
## Select and save the best method.

In [27]:
# Synthetic regression problem: 1000 samples, 10 features (5 informative), fixed seed.
raw_features, raw_target = make_regression(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    random_state=45,
)

# Wrap the arrays in DataFrames with explicit column names for the pipeline datasets.
feature_names = [f"feature_{idx}" for idx in range(raw_features.shape[1])]
X_reg = pd.DataFrame(raw_features, columns=feature_names)
y_reg = pd.DataFrame(raw_target, columns=['target'])

In [28]:
# Create the pipeline object that datasets and nodes are registered on.
# Per the log below, construction already adds an 'intermediate' dataset to the catalog.
wrangler = Wrangler()

2022-02-22 at 17:45:31 | INFO | catalog | Adding dataset: intermediate


In [29]:
# Wrap the feature and target frames as datasets named 'X' and 'y'.
X_ds, y_ds = PandasDataset("X", X_reg), PandasDataset("y", y_reg)

In [30]:
# Register both input datasets with the wrangler, features first.
for input_dataset in (X_ds, y_ds):
    wrangler.add_dataset(input_dataset)

2022-02-22 at 17:45:32 | INFO | catalog | Adding dataset: X
2022-02-22 at 17:45:32 | INFO | catalog | Adding dataset: y


In [31]:
# wrangler.data_catalog.load("X")

In [32]:
# Node 'correlation_selector': CorrelationFeatureSelector(threshold=0.2),
# reading 'X' and 'y' and writing 'X_correlation'.
correlation_selector = ml_tr.CorrelationFeatureSelector(threshold=0.2)
wrangler.add_node(
    name="correlation_selector",
    transformer=correlation_selector,
    inputs=["X", "y"],
    outputs=["X_correlation"],
)

2022-02-22 at 17:45:32 | INFO | pipeline | Node correlation_selector added to Pipeline 


In [33]:

# Node 'rfe_selector': RFEFeatureSelector asking for 5 features with a
# LinearRegression estimator, reading 'X' and 'y' and writing 'X_rfe'.
rfe_selector = ml_tr.RFEFeatureSelector(
    n_features=5,
    estimator=LinearRegression(),
)
wrangler.add_node(
    name="rfe_selector",
    transformer=rfe_selector,
    inputs=["X", "y"],
    outputs=["X_rfe"],
)

2022-02-22 at 17:45:32 | INFO | pipeline | Node rfe_selector added to Pipeline 


In [34]:

# Node 'kbest_selector': KBestFeatureSelectorRegression(n_features=5),
# reading 'X' and 'y' and writing 'X_kbest'.
kbest_selector = ml_tr.KBestFeatureSelectorRegression(n_features=5)
wrangler.add_node(
    name="kbest_selector",
    transformer=kbest_selector,
    inputs=["X", "y"],
    outputs=["X_kbest"],
)

2022-02-22 at 17:45:33 | INFO | pipeline | Node kbest_selector added to Pipeline 


In [35]:
# Node 'seq_selector': BackwardForwardFeatureSelector asking for 5 features with
# a LinearRegression estimator, reading 'X' and 'y' and writing 'X_seq'.
seq_selector = ml_tr.BackwardForwardFeatureSelector(n_features=5, estimator=LinearRegression())
wrangler.add_node(
    name="seq_selector",
    transformer=seq_selector,
    inputs=["X", "y"],
    outputs=["X_seq"],
)

2022-02-22 at 17:45:33 | INFO | pipeline | Node seq_selector added to Pipeline 


In [36]:
# Node 'vif_selector': VIFFeatureSelector(threshold=2),
# reading 'X' and 'y' and writing 'X_vif'.
vif_selector = ml_tr.VIFFeatureSelector(threshold=2)
wrangler.add_node(
    name="vif_selector",
    transformer=vif_selector,
    inputs=["X", "y"],
    outputs=["X_vif"],
)

2022-02-22 at 17:45:33 | INFO | pipeline | Node vif_selector added to Pipeline 


In [37]:
# Node 'from_model_selector': FromModelFeatureSelector asking for 5 features with
# an ElasticNet estimator, reading 'X' and 'y' and writing 'X_from_model'.
from_model_selector = ml_tr.FromModelFeatureSelector(
    n_features=5,
    estimator=ElasticNet(),
)
wrangler.add_node(
    name="from_model_selector",
    transformer=from_model_selector,
    inputs=["X", "y"],
    outputs=["X_from_model"],
)

2022-02-22 at 17:45:33 | INFO | pipeline | Node from_model_selector added to Pipeline 


In [38]:
# Baseline model node: a random forest on the full, unselected feature set
# ('X', 'y' -> 'y_full').
# The transformer used to be labelled 'rf_clf' even though it wraps a regressor;
# it is now 'rf_full', in line with the node name and the 'rf_<method>' labels
# used by the sibling model cells.
# random_state is pinned (45, the same seed as make_regression) so a fresh
# Restart & Run All rebuilds the same forest.
wrangler.add_node(
    name='model_full',
    transformer=ml_tr.SklearnModelTransformer(
        name='rf_full',
        model=RandomForestRegressor(n_estimators=200, random_state=45),
    ),
    inputs=['X', 'y'],
    outputs=['y_full']
)

2022-02-22 at 17:45:33 | INFO | pipeline | Node model_full added to Pipeline 


In [39]:

# Model node for the correlation-selected features ('X_correlation', 'y' -> 'y_correlation').
# random_state is pinned (45, the same seed as make_regression) so the forest is
# reproducible across runs and the comparison between selection methods is not
# blurred by forest-to-forest randomness.
wrangler.add_node(
    name='model_corr',
    transformer=ml_tr.SklearnModelTransformer(
        name='rf_corr',
        model=RandomForestRegressor(n_estimators=200, random_state=45),
    ),
    inputs=['X_correlation', 'y'],
    outputs=['y_correlation']
)

2022-02-22 at 17:45:34 | INFO | pipeline | Node model_corr added to Pipeline 


In [40]:
# Model node for the RFE-selected features ('X_rfe', 'y' -> 'y_rfe').
# random_state is pinned (45, the same seed as make_regression) so the forest is
# reproducible across runs and comparable between selection methods.
wrangler.add_node(
    name='model_rfe',
    transformer=ml_tr.SklearnModelTransformer(
        name='rf_rfe',
        model=RandomForestRegressor(n_estimators=200, random_state=45),
    ),
    inputs=['X_rfe', 'y'],
    outputs=['y_rfe']
)

2022-02-22 at 17:45:34 | INFO | pipeline | Node model_rfe added to Pipeline 


In [41]:
# Model node for the KBest-selected features ('X_kbest', 'y' -> 'y_kbest').
# random_state is pinned (45, the same seed as make_regression) so the forest is
# reproducible across runs and comparable between selection methods.
wrangler.add_node(
    name='model_kbest',
    transformer=ml_tr.SklearnModelTransformer(
        name='rf_kbest',
        model=RandomForestRegressor(n_estimators=200, random_state=45),
    ),
    inputs=['X_kbest', 'y'],
    outputs=['y_kbest']
)

2022-02-22 at 17:45:34 | INFO | pipeline | Node model_kbest added to Pipeline 


In [42]:
# Model node for the sequentially selected features ('X_seq', 'y' -> 'y_seq').
# random_state is pinned (45, the same seed as make_regression) so the forest is
# reproducible across runs and comparable between selection methods.
wrangler.add_node(
    name='model_seq',
    transformer=ml_tr.SklearnModelTransformer(
        name='rf_seq',
        model=RandomForestRegressor(n_estimators=200, random_state=45),
    ),
    inputs=['X_seq', 'y'],
    outputs=['y_seq']
)

2022-02-22 at 17:45:35 | INFO | pipeline | Node model_seq added to Pipeline 


In [43]:
# Model node for the VIF-selected features ('X_vif', 'y' -> 'y_vif').
# random_state is pinned (45, the same seed as make_regression) so the forest is
# reproducible across runs and comparable between selection methods.
wrangler.add_node(
    name='model_vif',
    transformer=ml_tr.SklearnModelTransformer(
        name='rf_vif',
        model=RandomForestRegressor(n_estimators=200, random_state=45),
    ),
    inputs=['X_vif', 'y'],
    outputs=['y_vif']
)

2022-02-22 at 17:45:35 | INFO | pipeline | Node model_vif added to Pipeline 


In [44]:
# Model node for the model-based selected features ('X_from_model', 'y' -> 'y_from_model').
# random_state is pinned (45, the same seed as make_regression) so the forest is
# reproducible across runs and comparable between selection methods.
wrangler.add_node(
    name='model_from_model',
    transformer=ml_tr.SklearnModelTransformer(
        name='rf_from_model',
        model=RandomForestRegressor(n_estimators=200, random_state=45),
    ),
    inputs=['X_from_model', 'y'],
    outputs=['y_from_model']
)

2022-02-22 at 17:45:36 | INFO | pipeline | Node model_from_model added to Pipeline 


In [45]:

# Evaluation node for the VIF-selected features ('X_vif', 'y' -> 'results_vif').
# random_state is pinned (45, the same seed as make_regression) so the reported
# metrics are reproducible and differences between methods reflect the feature
# sets rather than forest randomness.
wrangler.add_node(
    name='evaluate_vif',
    transformer=ml_tr.RegressionModelEvaluator(
        model=RandomForestRegressor(n_estimators=200, random_state=45),
        name='rf_vif',
    ),
    inputs=['X_vif', 'y'],
    outputs=['results_vif']
)

2022-02-22 at 17:45:36 | INFO | pipeline | Node evaluate_vif added to Pipeline 


In [46]:
# Evaluation node for the KBest-selected features ('X_kbest', 'y' -> 'results_kbest').
# random_state is pinned (45, the same seed as make_regression) so the reported
# metrics are reproducible and comparable between selection methods.
wrangler.add_node(
    name='evaluate_kbest',
    transformer=ml_tr.RegressionModelEvaluator(
        model=RandomForestRegressor(n_estimators=200, random_state=45),
        name='rf_kbest',
    ),
    inputs=['X_kbest', 'y'],
    outputs=['results_kbest']
)

2022-02-22 at 17:45:37 | INFO | pipeline | Node evaluate_kbest added to Pipeline 


In [47]:
# Evaluation node for the sequentially selected features ('X_seq', 'y' -> 'results_seq').
# random_state is pinned (45, the same seed as make_regression) so the reported
# metrics are reproducible and comparable between selection methods.
wrangler.add_node(
    name='evaluate_seq',
    transformer=ml_tr.RegressionModelEvaluator(
        model=RandomForestRegressor(n_estimators=200, random_state=45),
        name='rf_seq',
    ),
    inputs=['X_seq', 'y'],
    outputs=['results_seq']
)

2022-02-22 at 17:45:37 | INFO | pipeline | Node evaluate_seq added to Pipeline 


In [48]:
# Evaluation node for the RFE-selected features ('X_rfe', 'y' -> 'results_rfe').
# random_state is pinned (45, the same seed as make_regression) so the reported
# metrics are reproducible and comparable between selection methods.
wrangler.add_node(
    name='evaluate_rfe',
    transformer=ml_tr.RegressionModelEvaluator(
        model=RandomForestRegressor(n_estimators=200, random_state=45),
        name='rf_rfe',
    ),
    inputs=['X_rfe', 'y'],
    outputs=['results_rfe']
)

2022-02-22 at 17:45:38 | INFO | pipeline | Node evaluate_rfe added to Pipeline 


In [49]:
# Evaluation node for the model-based selected features
# ('X_from_model', 'y' -> 'results_from_model').
# random_state is pinned (45, the same seed as make_regression) so the reported
# metrics are reproducible and comparable between selection methods.
wrangler.add_node(
    name='evaluate_from_model',
    transformer=ml_tr.RegressionModelEvaluator(
        model=RandomForestRegressor(n_estimators=200, random_state=45),
        name='rf_from_model',
    ),
    inputs=['X_from_model', 'y'],
    outputs=['results_from_model']
)

2022-02-22 at 17:45:38 | INFO | pipeline | Node evaluate_from_model added to Pipeline 


In [50]:
# Evaluation node for the correlation-selected features
# ('X_correlation', 'y' -> 'results_correlation').
# The output used to be 'results_from_model' (copy-paste slip), which collided
# with the dataset written by the 'evaluate_from_model' node; it now has its own
# 'results_correlation' dataset so neither result can shadow the other.
# random_state is pinned (45, the same seed as make_regression) so the reported
# metrics are reproducible and comparable between selection methods.
wrangler.add_node(
    name='evaluate_correlation',
    transformer=ml_tr.RegressionModelEvaluator(
        model=RandomForestRegressor(n_estimators=200, random_state=45),
        name='rf_corr',
    ),
    inputs=['X_correlation', 'y'],
    outputs=['results_correlation']
)

2022-02-22 at 17:45:39 | INFO | pipeline | Node evaluate_correlation added to Pipeline 


In [51]:

# Baseline evaluation node: no feature selection, the full 'X' is evaluated
# ('X', 'y' -> 'results_nothing').
# random_state is pinned (45, the same seed as make_regression) so the baseline
# metrics are reproducible and comparable with the per-method evaluations.
wrangler.add_node(
    name='evaluate_nothing',
    transformer=ml_tr.RegressionModelEvaluator(
        model=RandomForestRegressor(n_estimators=200, random_state=45),
        name='rf',
    ),
    inputs=['X', 'y'],
    outputs=['results_nothing']
)

2022-02-22 at 17:45:40 | INFO | pipeline | Node evaluate_nothing added to Pipeline 


In [22]:
# NOTE(review): disabled draft of a node that would join the per-method predictions.
# As written it is incomplete: concat_predictions never returns the concatenated
# frame, and the node's inputs/outputs lists are empty. Finish it or delete the cell.
# def concat_predictions(y_full, y_correlation, y_rfe, y_kbest, y_seq, y_vif, y_from_model):
#     pd.concat([y_full, y_correlation, y_rfe, y_kbest, y_seq, y_vif, y_from_model])

# wrangler.add_node(
#     name='join_predictions',
#     transformer=tr.DataframeTransformer(function=concat_predictions),
#     inputs=[],
#     outputs=[]
# )

In [52]:
# Fit and run the registered nodes. Per the log below, each node loads its inputs,
# fits and applies its transformer, and adds its output dataset to the catalog.
wrangler.fit_transform()

2022-02-22 at 17:45:43 | INFO | node | Running Node: correlation_selector
2022-02-22 at 17:45:43 | INFO | base | Loading PandasDataset(name='X', data=DataFrame)
2022-02-22 at 17:45:43 | INFO | base | Loading PandasDataset(name='y', data=DataFrame)
2022-02-22 at 17:45:43 | DEBUG | base | Fitting CorrelationFeatureSelector(threshold=0.2)
2022-02-22 at 17:45:43 | DEBUG | base | Transforming CorrelationFeatureSelector(threshold=0.2, selected_features=['feature_0', 'feature_3', 'feature_5'])
2022-02-22 at 17:45:43 | INFO | catalog | Adding dataset: X_correlation
2022-02-22 at 17:45:43 | INFO | node | Running Node: rfe_selector
2022-02-22 at 17:45:43 | INFO | base | Loading PandasDataset(name='X', data=DataFrame)
2022-02-22 at 17:45:43 | INFO | base | Loading PandasDataset(name='y', data=DataFrame)
2022-02-22 at 17:45:43 | DEBUG | base | Fitting RFEFeatureSelector(n_features=5, estimator=LinearRegression)
2022-02-22 at 17:45:43 | DEBUG | base | Transforming RFEFeatureSelector(n_features=5, e

Features selected by Correlation: ['feature_0', 'feature_3', 'feature_5'] 

Features selected by RFE: ['feature_0', 'feature_3', 'feature_5', 'feature_8', 'feature_9'] 



2022-02-22 at 17:45:44 | DEBUG | base | Transforming KBestFeatureSelectorRegression(n_features=5, selected_features=set)
2022-02-22 at 17:45:44 | INFO | catalog | Adding dataset: X_kbest
2022-02-22 at 17:45:44 | INFO | node | Running Node: seq_selector
2022-02-22 at 17:45:44 | INFO | base | Loading PandasDataset(name='X', data=DataFrame)
2022-02-22 at 17:45:44 | INFO | base | Loading PandasDataset(name='y', data=DataFrame)
2022-02-22 at 17:45:44 | DEBUG | base | Fitting BackwardForwardFeatureSelector(n_features=5, estimator=LinearRegression)


Features selected by Select KBest: {'feature_1', 'feature_9', 'feature_5', 'feature_2', 'feature_0', 'feature_7', 'feature_3'} 



2022-02-22 at 17:45:45 | DEBUG | base | Transforming BackwardForwardFeatureSelector(n_features=5, estimator=LinearRegression, selected_features=set)
2022-02-22 at 17:45:45 | INFO | catalog | Adding dataset: X_seq
2022-02-22 at 17:45:45 | INFO | node | Running Node: vif_selector
2022-02-22 at 17:45:45 | INFO | base | Loading PandasDataset(name='X', data=DataFrame)
2022-02-22 at 17:45:45 | INFO | base | Loading PandasDataset(name='y', data=DataFrame)
2022-02-22 at 17:45:45 | DEBUG | base | Fitting VIFFeatureSelector(threshold=2)
2022-02-22 at 17:45:45 | DEBUG | base | Transforming VIFFeatureSelector(threshold=2, selected_features=['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9'])
2022-02-22 at 17:45:45 | INFO | catalog | Adding dataset: X_vif
2022-02-22 at 17:45:45 | INFO | node | Running Node: from_model_selector
2022-02-22 at 17:45:45 | INFO | base | Loading PandasDataset(name='X', data=DataFrame)
2022-02-

Features selected by union of back and forward sequential selection: {'feature_9', 'feature_5', 'feature_8', 'feature_0', 'feature_3'} 

Features selected by VIF: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9'] 

Features selected from Model ElasticNet(): {'feature_5', 'feature_0', 'feature_3'} 



2022-02-22 at 17:45:46 | DEBUG | base | Transforming SklearnModelTransformer(name='rf_clf', model=RandomForestRegressor)
2022-02-22 at 17:45:46 | INFO | catalog | Adding dataset: y_full
2022-02-22 at 17:45:46 | INFO | node | Running Node: model_corr
2022-02-22 at 17:45:46 | INFO | base | Loading PandasDataset(name='X_correlation', data=DataFrame)
2022-02-22 at 17:45:46 | INFO | base | Loading PandasDataset(name='y', data=DataFrame)
2022-02-22 at 17:45:46 | DEBUG | base | Fitting SklearnModelTransformer(name='rf_corr', model=RandomForestRegressor)
2022-02-22 at 17:45:47 | DEBUG | base | Transforming SklearnModelTransformer(name='rf_corr', model=RandomForestRegressor)
2022-02-22 at 17:45:47 | INFO | catalog | Adding dataset: y_correlation
2022-02-22 at 17:45:47 | INFO | node | Running Node: model_rfe
2022-02-22 at 17:45:47 | INFO | base | Loading PandasDataset(name='X_rfe', data=DataFrame)
2022-02-22 at 17:45:47 | INFO | base | Loading PandasDataset(name='y', data=DataFrame)
2022-02-22 a

In [54]:
# Inspect the results stored by the 'evaluate_nothing' node (full feature set, no selection).
# NOTE(review): the table below looks like per-fold cross-validation scores
# (r2, MSE, RMSE for train and test) — confirm against RegressionModelEvaluator.
wrangler.data_catalog.load('results_nothing')

2022-02-22 at 17:48:44 | INFO | base | Loading PandasDataset(name='results_nothing', data=DataFrame)


Unnamed: 0,fit_time,score_time,test_r2,train_r2,test_neg_mean_squared_error,train_neg_mean_squared_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
0,0.835879,0.020859,0.957673,0.992921,-622.585598,-102.716966,-24.951665,-10.134938
1,0.827275,0.019296,0.95655,0.992826,-648.608791,-103.884036,-25.467799,-10.192352
2,0.809443,0.01885,0.940524,0.993589,-850.449612,-93.838508,-29.162469,-9.687028
3,0.797382,0.021045,0.952276,0.993078,-694.653162,-100.852602,-26.356274,-10.04254
4,0.80291,0.020463,0.949559,0.992689,-717.168778,-107.075505,-26.780007,-10.347729
