# Examples of the `SelectByTargetMeanPerformance` class with `r2_score` scoring (regression)

## Load libraries and dataset.

In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.datasets import fetch_openml
warnings.filterwarnings('ignore')

def load_housing():
    """Fetch the California housing dataset through scikit-learn.

    Returns
    -------
    The object produced by ``fetch_california_housing(as_frame=True)``.
    The notebook reads its ``'data'`` and ``'target'`` entries.
    """
    # Imported locally, as in the original cell, so the function is self-contained.
    from sklearn.datasets import fetch_california_housing

    return fetch_california_housing(as_frame=True)
    
# Fetch the dataset once, then split it into features (X) and target (y).
output = load_housing()

# Both parts are wrapped in DataFrames, so y is a one-column frame rather than
# a Series (the next cell reads y.columns).
X = pd.DataFrame(output["data"])
y = pd.DataFrame(output["target"])

In [2]:
# Sanity check: shapes and column labels of the feature frame and the target frame.
X.shape, y.shape, X.columns, y.columns

((20640, 8),
 (20640, 1),
 Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
        'Latitude', 'Longitude'],
       dtype='object'),
 Index(['MedHouseVal'], dtype='object'))

## Import the class

In [3]:
from feature_engine.selection import SelectByTargetMeanPerformance

In [4]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [5]:
# Preview the first rows of the target frame.
y.head()

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [6]:
# Column dtypes, non-null counts and memory usage of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
dtypes: float64(8)
memory usage: 1.3 MB


In [7]:
# Split the column names by dtype: object-typed columns go to the categorical
# list, float/integer columns go to the numerical list.
variables_categorical_ = X.select_dtypes(include="O").columns.tolist()
variables_numerical_ = (
    X.select_dtypes(include=["float", "integer"]).columns.tolist()
)

In [8]:
# Show the object-dtype column names found in X.
variables_categorical_

[]

In [9]:
# Show the float/integer column names found in X.
variables_numerical_

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [10]:
# Count missing values per column.
X.isnull().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
dtype: int64

In [11]:
# Display the feature frame.
X

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


## Example #1

In [12]:
# Example 1: fit the selector on a copy of the full feature frame with
# r2_score scoring and a 0.68 threshold.
# NOTE(review): in the saved run this cell ended with
# "ValueError: Input contains NaN, infinity or a value too large for
# dtype('float64')"; the root cause is not visible in this notebook.
X_ = X.copy()  # work on a copy so X itself is left untouched

sel = SelectByTargetMeanPerformance(
    variables=None, scoring="r2_score", threshold=0.68, cv=2, random_state=1
)

sel.fit(X_, y)
sel.transform(X_)
sel.feature_performance_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Example #2

In [None]:
# Example 2: r2_score scoring with a 0.68 threshold.
# The input must contain at least one feature column. Indexing with an empty
# list (`X[[]]`) returns a frame with zero columns, so use a copy of the full
# feature frame, exactly as Example #1 does.
X_ = X.copy()
sel_ = SelectByTargetMeanPerformance(
    variables=None,
    scoring="r2_score",
    threshold=0.68,
    cv=2,
    random_state=1,
)
sel_.fit(X_, y)
sel_.transform(X_)
# Per-feature performance computed during fit.
sel_.feature_performance_

## Example #3

In [None]:
# Example 3: r2_score scoring with a stricter 0.78 threshold.
# The input must contain at least one feature column. Indexing with an empty
# list (`X[[]]`) returns a frame with zero columns, so use a copy of the full
# feature frame, exactly as Example #1 does.
X_ = X.copy()
sel_ = SelectByTargetMeanPerformance(
    variables=None,
    scoring="r2_score",
    threshold=0.78,
    cv=2,
    random_state=1,
)
sel_.fit(X_, y)
sel_.transform(X_)
# Per-feature performance computed during fit.
sel_.feature_performance_

In [None]:
# Variant using fit_transform in a single call, with a 0.88 threshold and a
# different random_state (42).
# The input must contain at least one feature column. Indexing with an empty
# list (`X[[]]`) returns a frame with zero columns, so use a copy of the full
# feature frame, exactly as Example #1 does.
X_ = X.copy()
sel_ = SelectByTargetMeanPerformance(
    variables=None,
    scoring="r2_score",
    threshold=0.88,
    cv=2,
    random_state=42,
)
sel_.fit_transform(X_, y)
# Per-feature performance computed during fit.
sel_.feature_performance_

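+ Note: the features ['cabin', 'name', 'ticket', 'home.dest'] don't work. These column names are not part of the California housing data loaded above (see the `X.columns` output), so this note appears to be carried over from a different dataset and should be re-checked.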