# Feature Selection : Wrapper Methods

Los métodos más comunes son:

1. Forward Selection
2. Backward elimination
3. Bi-directional elimination (stepwise)

In [2]:
import pandas as pd
import matplotlib as mpl #para graficar por python
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [7]:
# Load the Boston house-prices CSV into a pandas DataFrame.
# NOTE(review): the path is an absolute location on one Windows machine, so this
# cell only runs there — consider a path relative to the notebook.
boston = pd.read_csv('C:\\Users\\VICTUS\\Documents\\2024\\DATA SCIENCE\\14. Validacion de un Modelo\\boston_house_prices.csv')

In [None]:
# Do not run.
# NOTE(review): `boston` is created with pd.read_csv in the cell above, so the
# attributes used here (data, feature_names, target, DESCR) are presumably not
# available on it — confirm which object this cell was written for.
print(boston.data.shape)         # dataset dimensions
print(boston.feature_names)      # feature names
print(boston.target)             # target variable
print(boston.DESCR)              # data description

In [None]:
import pandas as pd
# Build one DataFrame holding the predictors plus the target in a 'Price' column.
# NOTE(review): this expects `boston` to expose `.data`, `.feature_names` and
# `.target`; the `boston` created with pd.read_csv earlier is a plain DataFrame —
# confirm which loader this cell is meant to follow.
bos = pd.DataFrame(boston.data, columns = boston.feature_names)
bos['Price'] = boston.target

# Split the frame into the variable to predict and the remaining variables.
# `columns=` is used because the positional `axis` argument of DataFrame.drop
# (the old `bos.drop("Price", 1)` form) was removed in pandas 2.0.
X = bos.drop(columns="Price")  # feature matrix
y = bos['Price']               # target variable
bos.head()

In [11]:
import statsmodels.api as sm
def forward_selection(data, target, significance_level=0.05):
    """Greedy forward feature selection driven by OLS p-values.

    Starting from an empty model, every round fits one OLS regression (with an
    intercept added by ``sm.add_constant``) per column that has not been
    selected yet, and records the p-value of that candidate's coefficient.
    The candidate with the smallest p-value is added if that p-value is below
    ``significance_level``; otherwise the search stops.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictors, one per column.
    target : array-like
        Response variable handed to ``sm.OLS``.
    significance_level : float, default 0.05
        A candidate is only added when its p-value is strictly below this.

    Returns
    -------
    list
        Labels of the selected columns, in the order they were added.
    """
    initial_features = data.columns.tolist()
    best_features = []

    while True:
        # Keep the original column order (instead of a set difference) so that
        # ties between p-values are always resolved the same way.
        remaining_features = [col for col in initial_features if col not in best_features]
        if not remaining_features:
            # Every column has already been selected.
            break

        # Explicit float dtype: do not rely on the implicit dtype of a data-less Series.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            # Model = features selected so far + this single candidate.
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]

        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            # No remaining candidate is significant (or all p-values are NaN).
            break
    return best_features

In [None]:
forward_selection(X,y) # returns the list of features kept by forward selection

# Implementación de feature selection usando MLXTEND

In [13]:
!pip install mlxtend



In [14]:
import sys
import joblib
# Register the standalone joblib module in sys.modules under the name
# 'sklearn.externals.joblib', so lookups of that module name return joblib.
# NOTE(review): presumably a shim for an mlxtend release that still imports
# joblib from sklearn.externals — confirm it is still needed.
sys.modules['sklearn.externals.joblib'] = joblib

In [16]:
#Librerias
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Sequential Forward Selection (SFS) wrapped around a linear regression
sfs = SFS(LinearRegression(),
          k_features=11, # number of features to keep; can be varied
          forward=True,
          floating=False,
          scoring = 'r2', # scoring metric
          cv = 0)

In [None]:
sfs.fit(X, y)
sfs.k_feature_names_     # final list of selected features

# this shows which variables the selector kept as important for the model

# Backward Selection

- Va eliminando las variables que no son importantes y se queda con el mínimo indispensable de variables
- El código es parecido al anterior, pero no se le indica la cantidad de variables con las que se tiene que quedar

In [18]:
def backward_elimination(data, target,significance_level = 0.05):
    """Backward feature elimination driven by OLS p-values.

    Starts with every column of ``data`` in the model. Each round fits an OLS
    regression on the current features (``sm.add_constant`` supplies the
    intercept) and removes the feature with the largest p-value, as long as
    that p-value is greater than or equal to ``significance_level``.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictors, one per column.
    target : array-like
        Response variable handed to ``sm.OLS``.
    significance_level : float, default 0.05
        Features whose p-value is at or above this value are eliminated.

    Returns
    -------
    list
        Labels of the features that remain, in their original column order.
    """
    features = data.columns.tolist()  # start from all features and prune
    while features:
        features_with_constant = sm.add_constant(data[features])
        fitted = sm.OLS(target, features_with_constant).fit()
        # Pick the feature p-values by label rather than dropping "row 0":
        # add_constant leaves the frame untouched when it already contains a
        # constant column, in which case row 0 is a real feature.
        p_values = fitted.pvalues.loc[features]
        max_p_value = p_values.max()  # p-value of the least significant feature

        if max_p_value >= significance_level:
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
        else:
            # Every remaining feature is significant (or all p-values are NaN).
            break
    return features

In [None]:
backward_elimination(X,y)
# returns the features of the dataset that are finally kept as important

# Eliminacion Bidireccional (stepwise)

- Es una mezcla de los dos anteriores

In [19]:
def stepwise_selection(data, target,SL_in=0.05,SL_out = 0.05):
    """Bidirectional (stepwise) feature selection driven by OLS p-values.

    Each round performs one forward step followed by a backward pass:

    1. Forward: fit one OLS model per unselected column (selected features +
       that candidate, intercept added by ``sm.add_constant``) and add the
       candidate with the smallest p-value if it is below ``SL_in``.
    2. Backward: refit on the selected features and repeatedly drop the one
       with the largest p-value while that p-value is at or above ``SL_out``.

    The search ends when no candidate can enter, when every column has been
    selected, or when the set of selected features returns to a state already
    seen at the start of an earlier round (which would otherwise repeat
    forever, e.g. when ``SL_in`` is larger than ``SL_out``).

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate predictors, one per column.
    target : array-like
        Response variable handed to ``sm.OLS``.
    SL_in : float, default 0.05
        Entry threshold: a candidate enters when its p-value is below it.
    SL_out : float, default 0.05
        Removal threshold: a feature leaves when its p-value is at or above it.

    Returns
    -------
    list
        Labels of the selected columns.
    """
    initial_features = data.columns.tolist()
    best_features = []
    visited_states = set()  # selections already seen at the start of a round

    while True:
        # A round is a deterministic function of the current selection, so a
        # repeated selection means the add/remove steps are cycling.
        state = tuple(best_features)
        if state in visited_states:
            break
        visited_states.add(state)

        # Keep the original column order so p-value ties resolve reproducibly.
        remaining_features = [col for col in initial_features if col not in best_features]
        if not remaining_features:
            break

        # Explicit float dtype: do not rely on the implicit dtype of a data-less Series.
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]

        min_p_value = new_pval.min()
        if min_p_value < SL_in:
            best_features.append(new_pval.idxmin())
            # Backward pass over the features selected so far.
            while best_features:
                best_features_with_constant = sm.add_constant(data[best_features])
                fitted = sm.OLS(target, best_features_with_constant).fit()
                # Select by label: add_constant adds nothing when the data
                # already holds a constant column, so row 0 may be a feature.
                p_values = fitted.pvalues.loc[best_features]
                max_p_value = p_values.max()
                if max_p_value >= SL_out:
                    excluded_feature = p_values.idxmax()
                    best_features.remove(excluded_feature)
                else:
                    break
        else:
            break
    return best_features

In [None]:
stepwise_selection(X,y) # returns the features kept by stepwise selection

# Métricas para algoritmos de clasificación

In [20]:
 from sklearn.datasets import load_breast_cancer
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn import metrics
 import pandas as pd
 import numpy as np
 from matplotlib import pyplot as plt
 import seaborn as sns
 sns.set_style('whitegrid')

In [21]:
# Load the breast-cancer dataset
data = load_breast_cancer()
# Build the design matrix X and the response vector y
# (these rebind the X and y names used in the feature-selection cells above)
X = pd.DataFrame(data['data'], columns=data['feature_names'])
# |target - 1| swaps the two labels, assuming the target is coded 0/1 — TODO confirm
y = abs(pd.Series(data['target'])-1)
# Split into train/test with an 80/20 ratio
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=1)
# Create a Random Forest model with default parameters (fixed random_state)
modelo = RandomForestClassifier(random_state=1)
modelo.fit(X_train, y_train)
# Get the model's predictions for X_test
preds = modelo.predict(X_test)

In [None]:
# Plot the confusion matrix of the fitted model on the test split.
# `metrics.plot_confusion_matrix` was removed in scikit-learn 1.2; the
# `ConfusionMatrixDisplay.from_estimator` class method (scikit-learn >= 1.0)
# is its replacement.
fig, ax = plt.subplots(figsize=(10,6))
# Passing `ax` draws the matrix on the 10x6 figure created above; without it the
# display creates its own figure and the sized one stays empty.
metrics.ConfusionMatrixDisplay.from_estimator(modelo, X_test, y_test, display_labels=['Negative', 'Positive'], ax=ax)

# This displays the confusion matrix.

In [None]:
# Confusion matrix of the true test labels against the predictions,
# then shown flattened into a 1-D array
confusion = metrics.confusion_matrix(y_test, preds)
confusion.ravel()

In [None]:
# Accuracy of the predictions on the test split
accuracy = metrics.accuracy_score(y_test, preds)
accuracy

# The recorded result was about 0.95, i.e. roughly 95% of the test samples were predicted correctly

In [None]:
# Precision is evaluated separately for each class
precision_positiva = metrics.precision_score(y_test, preds, pos_label=1)
precision_negativa = metrics.precision_score(y_test, preds, pos_label=0)
precision_positiva, precision_negativa
# Recorded result (1.0, 0.9): positive predictions were always correct, negative predictions about 90% of the time

In [None]:
# Recall with the positive class as reference (sensitivity) and with the
# negative class as reference (specificity)
recall_sensibilidad = metrics.recall_score(y_test, preds, pos_label=1)
recall_especificidad= metrics.recall_score(y_test, preds, pos_label=0)
recall_sensibilidad, recall_especificidad
# (sensitivity, specificity)

In [23]:
# All the metrics in a single report
print(metrics.classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        72
           1       1.00      0.88      0.94        42

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



# Métricas para algoritmos de regresión

In [24]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Load an example dataset as a (features, target) pair and display the features
diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)
diabetes_X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

In [25]:
# Display the target values of the diabetes dataset
diabetes_y

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [26]:
from sklearn.model_selection import train_test_split
# 80/20 train/test split of the diabetes data
# (rebinds X_train, X_test, y_train, y_test from the classification section)
X_train,X_test,y_train,y_test = train_test_split(diabetes_X,diabetes_y,test_size=0.2,random_state=2)
from sklearn.linear_model import LinearRegression
# Create the model
lr = LinearRegression()
# Fit the model with X_train and y_train
lr.fit(X_train,y_train)
# Predict with X_test
y_pred = lr.predict(X_test)

In [33]:
from sklearn.metrics import mean_absolute_error
print("MAE",mean_absolute_error(y_test,y_pred))
# an error of about 45 is fairly large
# MEAN ABSOLUTE ERROR: the average of the absolute prediction errors

MAE 45.213034190469024


In [29]:
from sklearn.metrics import mean_squared_error
print("MSE",mean_squared_error(y_test,y_pred))# mean of the SQUARED errors

MSE 3094.4566715660626


In [30]:
# Root mean squared error: square root of the MSE
print("RMSE",np.sqrt(mean_squared_error(y_test,y_pred)))

RMSE 55.627840795469155


In [31]:
# Natural logarithm of the RMSE. The label says so explicitly: the previous
# "RMSE" label was the same as the plain RMSE printed in the cell before.
# NOTE(review): this is log(RMSE), not the RMSLE metric (which takes logs of the
# values before computing the error) — confirm which one was intended.
print("log(RMSE)",np.log(np.sqrt(mean_squared_error(y_test,y_pred))))

RMSE 4.018683809662696


In [34]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test,y_pred)
print(r2)

# R2: share of the variability of the data explained by the model; higher is better
# here the model explains only about 44% of the variability, which is fairly poor

0.4399338661568968
