# Ejemplo de Selección de Variables

In [1]:
import pandas as pd
import numpy as np
from wrangler import Wrangler
from wrangler.data import PandasDataset, CSVDataset
import wrangler.transformers as tr
import wrangler.transformers.text as text_tr
import wrangler.transformers.ml as ml_tr
from wrangler import logger as wrangler_logger

from sklearn.datasets import make_classification, make_regression
from sklearn.linear_model import LinearRegression, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

### Regression

In [2]:
# Synthetic regression problem: 1000 samples, 10 features of which 5 are
# informative. The fixed random_state keeps the generated data reproducible.
X_reg, y_reg = make_regression(n_samples=1000, n_features=10, n_informative=5, random_state=45)

# Wrap the raw arrays in DataFrames: features are labelled feature_0..feature_N-1
# and the target becomes a single-column frame named 'target'.
X_reg = pd.DataFrame(X_reg, columns=list(map("feature_{}".format, range(X_reg.shape[1]))))
y_reg = pd.DataFrame(y_reg, columns=['target'])


In [3]:
# Fit a ConstantFeatureSelector on the regression features (no target is passed)
# and display the transformed frame as the cell output.
# NOTE(review): the meaning of threshold=0.9 is defined by wrangler — presumably
# the share of a single value above which a column counts as constant; confirm.
constant_selector = ml_tr.ConstantFeatureSelector(threshold=0.9)
constant_selector.fit(X_reg).transform(X_reg)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,1.033322,-1.929203,-0.742612,-1.111879,0.525046,-0.429861,0.684294,1.143700,1.108751,-0.647347
1,0.332632,-1.654263,2.248090,-1.393813,-0.499546,-0.606944,0.673129,1.670300,-0.627695,-1.541367
2,-0.253937,-0.472065,0.270377,1.162081,1.371881,0.930392,0.141970,0.354050,-0.568870,0.276627
3,0.906857,0.307474,0.853668,-0.287983,0.656502,-0.450929,1.246239,1.216485,-1.455277,-0.530376
4,-1.222388,-1.143100,0.607359,0.665962,-0.056158,0.570558,0.003816,-0.252877,0.776616,-0.841119
...,...,...,...,...,...,...,...,...,...,...
995,0.060660,0.066531,-0.155566,-0.424515,-0.470683,-1.173683,1.088382,0.064572,1.179325,-0.081126
996,1.512642,-1.220905,1.744725,2.362092,1.626819,-0.731330,-0.738586,1.280592,-0.584389,-0.465799
997,0.470598,-0.215780,0.063875,-2.706865,1.026021,0.070994,0.657090,-1.373408,1.473614,1.186668
998,-1.495550,-1.779412,0.413934,1.027929,-0.713468,0.409360,0.282206,-0.337900,1.169105,0.436709


In [4]:
# Fit a CorrelationFeatureSelector against the regression target and display
# the columns it keeps.
# NOTE(review): presumably threshold=0.2 is a minimum absolute correlation with
# the target — confirm against the wrangler implementation.
corr_selector = ml_tr.CorrelationFeatureSelector(threshold=0.2)
corr_selector.fit(X_reg, y_reg).transform(X_reg)


Features selected by Correlation: ['feature_0', 'feature_3', 'feature_5'] 



Unnamed: 0,feature_0,feature_3,feature_5
0,1.033322,-1.111879,-0.429861
1,0.332632,-1.393813,-0.606944
2,-0.253937,1.162081,0.930392
3,0.906857,-0.287983,-0.450929
4,-1.222388,0.665962,0.570558
...,...,...,...
995,0.060660,-0.424515,-1.173683
996,1.512642,2.362092,-0.731330
997,0.470598,-2.706865,0.070994
998,-1.495550,1.027929,0.409360


In [5]:
# RFE-based selection (per the name and the message the fit prints) using a
# LinearRegression estimator, asking for 5 features; the transformed frame is
# the cell output.
rfe_reg = ml_tr.RFEFeatureSelector(n_features=5, estimator=LinearRegression())
rfe_reg.fit(X_reg, y_reg).transform(X_reg)

Features selected by RFE: ['feature_0', 'feature_3', 'feature_5', 'feature_8', 'feature_9'] 



Unnamed: 0,feature_0,feature_3,feature_5,feature_8,feature_9
0,1.033322,-1.111879,-0.429861,1.108751,-0.647347
1,0.332632,-1.393813,-0.606944,-0.627695,-1.541367
2,-0.253937,1.162081,0.930392,-0.568870,0.276627
3,0.906857,-0.287983,-0.450929,-1.455277,-0.530376
4,-1.222388,0.665962,0.570558,0.776616,-0.841119
...,...,...,...,...,...
995,0.060660,-0.424515,-1.173683,1.179325,-0.081126
996,1.512642,2.362092,-0.731330,-0.584389,-0.465799
997,0.470598,-2.706865,0.070994,1.473614,1.186668
998,-1.495550,1.027929,0.409360,1.169105,0.436709


In [6]:
# Select-K-best selection for the regression data; the transformed frame is
# the cell output.
# NOTE(review): the recorded output lists 7 features although n_features=5 —
# confirm how wrangler interprets n_features for this selector.
kbest_reg = ml_tr.KBestFeatureSelectorRegression(n_features=5)
kbest_reg.fit(X_reg, y_reg).transform(X_reg)

Features selected by Select KBest: {'feature_5', 'feature_2', 'feature_7', 'feature_1', 'feature_0', 'feature_3', 'feature_9'} 



Unnamed: 0,feature_5,feature_2,feature_7,feature_1,feature_0,feature_3,feature_9
0,-0.429861,-0.742612,1.143700,-1.929203,1.033322,-1.111879,-0.647347
1,-0.606944,2.248090,1.670300,-1.654263,0.332632,-1.393813,-1.541367
2,0.930392,0.270377,0.354050,-0.472065,-0.253937,1.162081,0.276627
3,-0.450929,0.853668,1.216485,0.307474,0.906857,-0.287983,-0.530376
4,0.570558,0.607359,-0.252877,-1.143100,-1.222388,0.665962,-0.841119
...,...,...,...,...,...,...,...
995,-1.173683,-0.155566,0.064572,0.066531,0.060660,-0.424515,-0.081126
996,-0.731330,1.744725,1.280592,-1.220905,1.512642,2.362092,-0.465799
997,0.070994,0.063875,-1.373408,-0.215780,0.470598,-2.706865,1.186668
998,0.409360,0.413934,-0.337900,-1.779412,-1.495550,1.027929,0.436709


In [7]:
# Sequential selection with a LinearRegression estimator. The message printed
# on fit describes the result as the union of the backward and forward
# sequential selections.
seq_reg = ml_tr.BackwardForwardFeatureSelector(n_features=5, estimator=LinearRegression())
seq_reg.fit(X_reg, y_reg).transform(X_reg)

Features selected by union of back and forward sequential selection: {'feature_5', 'feature_0', 'feature_3', 'feature_8', 'feature_9'} 



Unnamed: 0,feature_5,feature_0,feature_3,feature_8,feature_9
0,-0.429861,1.033322,-1.111879,1.108751,-0.647347
1,-0.606944,0.332632,-1.393813,-0.627695,-1.541367
2,0.930392,-0.253937,1.162081,-0.568870,0.276627
3,-0.450929,0.906857,-0.287983,-1.455277,-0.530376
4,0.570558,-1.222388,0.665962,0.776616,-0.841119
...,...,...,...,...,...
995,-1.173683,0.060660,-0.424515,1.179325,-0.081126
996,-0.731330,1.512642,2.362092,-0.584389,-0.465799
997,0.070994,0.470598,-2.706865,1.473614,1.186668
998,0.409360,-1.495550,1.027929,1.169105,0.436709


In [8]:
# VIF-based selection on the regression features; the transformed frame is
# the cell output.
# NOTE(review): presumably columns whose variance inflation factor exceeds
# threshold=2 are dropped — confirm against the wrangler implementation.
vif_reg = ml_tr.VIFFeatureSelector(threshold=2)
vif_reg.fit(X_reg, y_reg).transform(X_reg)

Features selected by VIF: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9'] 



Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,1.033322,-1.929203,-0.742612,-1.111879,0.525046,-0.429861,0.684294,1.143700,1.108751,-0.647347
1,0.332632,-1.654263,2.248090,-1.393813,-0.499546,-0.606944,0.673129,1.670300,-0.627695,-1.541367
2,-0.253937,-0.472065,0.270377,1.162081,1.371881,0.930392,0.141970,0.354050,-0.568870,0.276627
3,0.906857,0.307474,0.853668,-0.287983,0.656502,-0.450929,1.246239,1.216485,-1.455277,-0.530376
4,-1.222388,-1.143100,0.607359,0.665962,-0.056158,0.570558,0.003816,-0.252877,0.776616,-0.841119
...,...,...,...,...,...,...,...,...,...,...
995,0.060660,0.066531,-0.155566,-0.424515,-0.470683,-1.173683,1.088382,0.064572,1.179325,-0.081126
996,1.512642,-1.220905,1.744725,2.362092,1.626819,-0.731330,-0.738586,1.280592,-0.584389,-0.465799
997,0.470598,-0.215780,0.063875,-2.706865,1.026021,0.070994,0.657090,-1.373408,1.473614,1.186668
998,-1.495550,-1.779412,0.413934,1.027929,-0.713468,0.409360,0.282206,-0.337900,1.169105,0.436709


In [9]:
# Model-driven selection using an ElasticNet estimator with default settings;
# the transformed frame is the cell output.
# NOTE(review): the recorded output keeps 3 features although n_features=5 —
# presumably n_features is an upper bound here; confirm.
from_model_reg = ml_tr.FromModelFeatureSelector(n_features=5, estimator=ElasticNet())
from_model_reg.fit(X_reg, y_reg).transform(X_reg)

Features selected from Model ElasticNet(): {'feature_0', 'feature_3', 'feature_5'} 



Unnamed: 0,feature_0,feature_3,feature_5
0,1.033322,-1.111879,-0.429861
1,0.332632,-1.393813,-0.606944
2,-0.253937,1.162081,0.930392
3,0.906857,-0.287983,-0.450929
4,-1.222388,0.665962,0.570558
...,...,...,...
995,0.060660,-0.424515,-1.173683
996,1.512642,2.362092,-0.731330
997,0.470598,-2.706865,0.070994
998,-1.495550,1.027929,0.409360


### Classification

In [16]:
# Synthetic binary classification problem: 1000 samples, 20 features
# (10 informative, 2 redundant, none repeated). The fixed random_state keeps
# the generated data reproducible.
X_clf, y_clf = make_classification(
    n_samples=1000, n_features=20,
    n_informative=10, n_redundant=2, n_repeated=0,
    n_classes=2,
    random_state=45,
)

# Wrap the raw arrays in DataFrames: features are labelled feature_0..feature_N-1
# and the labels become a single-column frame named 'target'.
X_clf = pd.DataFrame(X_clf, columns=list(map("feature_{}".format, range(X_clf.shape[1]))))
y_clf = pd.DataFrame(y_clf, columns=['target'])

In [11]:
# Same constant-feature filter as in the regression section, now fitted on the
# classification features. This rebinds `constant_selector`, so the instance
# fitted on X_reg is no longer reachable after this cell runs.
constant_selector = ml_tr.ConstantFeatureSelector(threshold=0.9)
constant_selector.fit(X_clf).transform(X_clf)

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19
0,-1.580361,-0.887034,1.540749,0.067591,0.360646,0.575869,2.758616,3.087983,-2.676556,-1.129382,1.366087,-0.900623,-1.178522,-0.904983,0.552733,-2.275645,1.718287,0.762156,1.548975,-0.491114
1,-0.424849,-2.956863,0.609998,2.752760,-1.147849,0.624683,2.490057,1.021246,-1.053026,1.423420,1.931774,0.963936,-1.369744,1.919941,1.080734,3.353631,-0.898501,-0.392887,0.312617,-0.020858
2,1.179244,-0.576736,-0.432089,0.445219,0.815847,-0.524605,-1.905888,0.534551,1.454013,-1.575065,1.430067,0.098367,-0.226979,1.389737,-0.218011,-2.936310,-2.402560,0.540674,1.678322,-1.276937
3,0.307293,0.200963,2.824127,-0.720151,1.858719,0.956398,-0.601655,0.382753,-0.218382,-2.576670,-1.309205,4.428591,0.179411,-1.494399,-0.410658,-0.344793,1.110467,-0.315244,-2.236413,0.289424
4,2.309082,1.155669,2.578536,2.239298,1.073119,0.900238,0.066015,4.139504,2.241478,1.226906,0.442540,0.893552,0.106806,2.658415,0.152683,1.887397,0.941726,0.550348,2.182732,0.616452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2.635478,0.820427,2.670196,0.348308,-1.744886,-2.453938,-0.032924,3.271964,0.427774,-0.573307,-0.782703,2.072406,-0.220179,0.128878,0.505389,0.479860,0.014539,1.883090,0.442077,-0.627550
996,-4.638027,0.160292,-2.456673,0.403680,3.035365,0.368870,-2.514847,3.702176,-1.806755,-3.040096,-2.208105,-2.032693,2.204714,2.699989,1.314662,-0.276036,1.277271,-0.050341,-3.873266,-0.273772
997,-0.659840,0.071146,-3.714766,-0.969586,1.599312,0.760526,-2.742966,1.815548,0.322718,-1.376190,5.663168,0.175041,-0.900622,0.167702,-1.105180,-3.389155,-1.293801,-0.867524,1.377204,0.834560
998,5.273949,-1.091498,1.640252,-1.293241,-2.994171,-0.756625,1.611930,0.754943,-0.144511,2.195906,-4.608270,0.786892,-0.183316,1.616925,-0.004689,1.323990,-0.724467,0.239161,2.823661,-1.298770


In [12]:
# RFE-based selection for the classification data, driven by a random forest.
# The forest is seeded (same seed as the data generation) so that the selected
# feature set is reproducible across kernel restarts; an unseeded forest can
# rank features differently on every run.
# NOTE(review): the previously recorded output lists 11 features although
# n_features=5 — confirm how wrangler interprets n_features for this selector.
rfe_clf = ml_tr.RFEFeatureSelector(
    n_features=5,
    estimator=RandomForestClassifier(random_state=45),
)
rfe_clf.fit(X_clf, y_clf).transform(X_clf)

Features selected by RFE: ['feature_0', 'feature_2', 'feature_4', 'feature_7', 'feature_9', 'feature_10', 'feature_11', 'feature_13', 'feature_15', 'feature_16', 'feature_18'] 



Unnamed: 0,feature_0,feature_2,feature_4,feature_7,feature_9,feature_10,feature_11,feature_13,feature_15,feature_16,feature_18
0,-1.580361,1.540749,0.360646,3.087983,-1.129382,1.366087,-0.900623,-0.904983,-2.275645,1.718287,1.548975
1,-0.424849,0.609998,-1.147849,1.021246,1.423420,1.931774,0.963936,1.919941,3.353631,-0.898501,0.312617
2,1.179244,-0.432089,0.815847,0.534551,-1.575065,1.430067,0.098367,1.389737,-2.936310,-2.402560,1.678322
3,0.307293,2.824127,1.858719,0.382753,-2.576670,-1.309205,4.428591,-1.494399,-0.344793,1.110467,-2.236413
4,2.309082,2.578536,1.073119,4.139504,1.226906,0.442540,0.893552,2.658415,1.887397,0.941726,2.182732
...,...,...,...,...,...,...,...,...,...,...,...
995,2.635478,2.670196,-1.744886,3.271964,-0.573307,-0.782703,2.072406,0.128878,0.479860,0.014539,0.442077
996,-4.638027,-2.456673,3.035365,3.702176,-3.040096,-2.208105,-2.032693,2.699989,-0.276036,1.277271,-3.873266
997,-0.659840,-3.714766,1.599312,1.815548,-1.376190,5.663168,0.175041,0.167702,-3.389155,-1.293801,1.377204
998,5.273949,1.640252,-2.994171,0.754943,2.195906,-4.608270,0.786892,1.616925,1.323990,-0.724467,2.823661


In [13]:
# Select-K-best selection for the classification data; the transformed frame
# is the cell output.
# Bound to `kbest_clf` rather than `kbest_reg` so the regression selector
# fitted earlier in the notebook is not silently overwritten, and so the name
# matches the other classification selectors (rfe_clf, seq_clf, from_model_clf).
# NOTE(review): the recorded output lists 9 features although n_features=6 —
# confirm how wrangler interprets n_features for this selector.
kbest_clf = ml_tr.KBestFeatureSelectorClassification(n_features=6)
kbest_clf.fit(X_clf, y_clf).transform(X_clf)

Features selected by Select KBest: {'feature_11', 'feature_2', 'feature_7', 'feature_0', 'feature_18', 'feature_15', 'feature_10', 'feature_8', 'feature_4'} 



Unnamed: 0,feature_11,feature_2,feature_7,feature_0,feature_18,feature_15,feature_10,feature_8,feature_4
0,-0.900623,1.540749,3.087983,-1.580361,1.548975,-2.275645,1.366087,-2.676556,0.360646
1,0.963936,0.609998,1.021246,-0.424849,0.312617,3.353631,1.931774,-1.053026,-1.147849
2,0.098367,-0.432089,0.534551,1.179244,1.678322,-2.936310,1.430067,1.454013,0.815847
3,4.428591,2.824127,0.382753,0.307293,-2.236413,-0.344793,-1.309205,-0.218382,1.858719
4,0.893552,2.578536,4.139504,2.309082,2.182732,1.887397,0.442540,2.241478,1.073119
...,...,...,...,...,...,...,...,...,...
995,2.072406,2.670196,3.271964,2.635478,0.442077,0.479860,-0.782703,0.427774,-1.744886
996,-2.032693,-2.456673,3.702176,-4.638027,-3.873266,-0.276036,-2.208105,-1.806755,3.035365
997,0.175041,-3.714766,1.815548,-0.659840,1.377204,-3.389155,5.663168,0.322718,1.599312
998,0.786892,1.640252,0.754943,5.273949,2.823661,1.323990,-4.608270,-0.144511,-2.994171


In [14]:
# Sequential selection with a LogisticRegression estimator. The message printed
# on fit describes the result as the union of the backward and forward
# sequential selections, and the recorded run kept 10 columns for n_features=5.
seq_clf = ml_tr.BackwardForwardFeatureSelector(n_features=5, estimator=LogisticRegression())
seq_clf.fit(X_clf, y_clf).transform(X_clf)

Features selected by union of back and forward sequential selection: {'feature_16', 'feature_2', 'feature_6', 'feature_0', 'feature_18', 'feature_10', 'feature_13', 'feature_15', 'feature_9', 'feature_4'} 



Unnamed: 0,feature_16,feature_2,feature_6,feature_0,feature_18,feature_10,feature_13,feature_15,feature_9,feature_4
0,1.718287,1.540749,2.758616,-1.580361,1.548975,1.366087,-0.904983,-2.275645,-1.129382,0.360646
1,-0.898501,0.609998,2.490057,-0.424849,0.312617,1.931774,1.919941,3.353631,1.423420,-1.147849
2,-2.402560,-0.432089,-1.905888,1.179244,1.678322,1.430067,1.389737,-2.936310,-1.575065,0.815847
3,1.110467,2.824127,-0.601655,0.307293,-2.236413,-1.309205,-1.494399,-0.344793,-2.576670,1.858719
4,0.941726,2.578536,0.066015,2.309082,2.182732,0.442540,2.658415,1.887397,1.226906,1.073119
...,...,...,...,...,...,...,...,...,...,...
995,0.014539,2.670196,-0.032924,2.635478,0.442077,-0.782703,0.128878,0.479860,-0.573307,-1.744886
996,1.277271,-2.456673,-2.514847,-4.638027,-3.873266,-2.208105,2.699989,-0.276036,-3.040096,3.035365
997,-1.293801,-3.714766,-2.742966,-0.659840,1.377204,5.663168,0.167702,-3.389155,-1.376190,1.599312
998,-0.724467,1.640252,1.611930,5.273949,2.823661,-4.608270,1.616925,1.323990,2.195906,-2.994171


In [15]:
# Model-driven selection for the classification data using a random forest.
# The forest is seeded (same seed as the data generation) so that the selected
# feature set is reproducible across kernel restarts; an unseeded forest can
# yield different feature importances on every run.
# NOTE(review): the previously recorded output lists 7 features although
# n_features=5 — confirm how wrangler interprets n_features for this selector.
from_model_clf = ml_tr.FromModelFeatureSelector(
    n_features=5,
    estimator=RandomForestClassifier(random_state=45),
)
from_model_clf.fit(X_clf, y_clf).transform(X_clf)

Features selected from Model RandomForestClassifier(): {'feature_2', 'feature_7', 'feature_0', 'feature_18', 'feature_13', 'feature_10', 'feature_16'} 



Unnamed: 0,feature_2,feature_7,feature_0,feature_18,feature_13,feature_10,feature_16
0,1.540749,3.087983,-1.580361,1.548975,-0.904983,1.366087,1.718287
1,0.609998,1.021246,-0.424849,0.312617,1.919941,1.931774,-0.898501
2,-0.432089,0.534551,1.179244,1.678322,1.389737,1.430067,-2.402560
3,2.824127,0.382753,0.307293,-2.236413,-1.494399,-1.309205,1.110467
4,2.578536,4.139504,2.309082,2.182732,2.658415,0.442540,0.941726
...,...,...,...,...,...,...,...
995,2.670196,3.271964,2.635478,0.442077,0.128878,-0.782703,0.014539
996,-2.456673,3.702176,-4.638027,-3.873266,2.699989,-2.208105,1.277271
997,-3.714766,1.815548,-0.659840,1.377204,0.167702,5.663168,-1.293801
998,1.640252,0.754943,5.273949,2.823661,1.616925,-4.608270,-0.724467
