# Examples of the `SelectByTargetMeanPerformance` selector with ROC-AUC scoring for classification

## Load libraries and dataset.

In [13]:
import pandas as pd
import numpy as np
import warnings

from sklearn.datasets import fetch_openml
warnings.filterwarnings('ignore')

def load_titanic(source='https://www.openml.org/data/get_csv/16826755/phpMYEkMl'):
    """Load the Titanic dataset and normalise its column types.

    Parameters
    ----------
    source : str, path or file-like, optional
        Anything accepted by ``pandas.read_csv``. Defaults to the OpenML CSV
        export, so calling ``load_titanic()`` with no arguments behaves as
        before; pass a local path or buffer to work offline.

    Returns
    -------
    pandas.DataFrame
        The data with '?' markers replaced by NaN, ``pclass`` cast to object,
        missing ``embarked`` values filled with 'C', and ``fare``, ``parch``,
        ``sibsp``, ``age`` and ``body`` cast to float.
    """
    data = pd.read_csv(source)
    # The raw file encodes missing values as '?'.
    data = data.replace('?', np.nan)
    data['pclass'] = data['pclass'].astype('O')
    # Assign the result back instead of a chained ``inplace=True`` call, which
    # may act on a temporary object under pandas Copy-on-Write.
    data['embarked'] = data['embarked'].fillna('C')
    # numerical
    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # maps to the same float64 dtype.
    for column in ('fare', 'parch', 'sibsp', 'age', 'body'):
        data[column] = data[column].astype(float)

    return data

# Load the cleaned Titanic frame once; the cells below derive X and y from it.
df = load_titanic()

In [2]:
# Quick sanity check: frame dimensions and column labels.
df.shape, df.columns

((1309, 14),
 Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
        'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
       dtype='object'))

## Import the class

In [3]:
from feature_engine.selection import SelectByTargetMeanPerformance

## Split features and target

In [4]:
# Features: every column except the target.
X = df.drop(columns=['survived'])
# Target: kept as a one-column DataFrame, as in the rest of the notebook.
y = df[['survived']]

In [5]:
# Preview the first rows of the feature frame.
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Preview the first rows of the target frame.
y.head()

Unnamed: 0,survived
0,1
1,1
2,0
3,0
4,0


In [7]:
# Dtypes and non-null counts per feature; shows which columns contain missing values.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   object 
 1   name       1309 non-null   object 
 2   sex        1309 non-null   object 
 3   age        1046 non-null   float64
 4   sibsp      1309 non-null   float64
 5   parch      1309 non-null   float64
 6   ticket     1309 non-null   object 
 7   fare       1308 non-null   float64
 8   cabin      295 non-null    object 
 9   embarked   1309 non-null   object 
 10  boat       486 non-null    object 
 11  body       121 non-null    float64
 12  home.dest  745 non-null    object 
dtypes: float64(5), object(8)
memory usage: 133.1+ KB


## Filling missing values in features

In [8]:
# Split the feature names by dtype so each kind gets its own fill strategy.
variables_categorical_ = list(X.select_dtypes(include="O").columns)
variables_numerical_ = list(X.select_dtypes(include=["float", "integer"]).columns)
missing_cols = ['age','fare','cabin','boat','body','home.dest']

# Work on an explicit copy and assign results back. Chained calls such as
# ``X[col].fillna(..., inplace=True)`` may operate on a temporary object
# (pandas Copy-on-Write) and leave X unchanged.
X = X.copy()
for col in missing_cols:
    if col in variables_numerical_:
        # Numerical gaps are filled with 0.
        X[col] = X[col].fillna(0)
    elif col in variables_categorical_:
        # Categorical gaps become 'M'; any leftover '?' marker becomes
        # 'Missing'. Only the first character of each value is kept, so both
        # end up as the single category 'M'.
        X[col] = (
            X[col]
            .fillna('M')
            .replace('?', 'Missing')
            .astype(str)
            .str[0]
        )

## Example #1

In [9]:
# Example 1: evaluate four features against a 0.78 threshold.
# NOTE(review): in the recorded output below every score is under 0.78 —
# confirm how the selector behaves when no feature reaches the threshold.
X_ = X.loc[:, ['fare', 'sex', 'pclass', 'age']]

sel = SelectByTargetMeanPerformance(
    variables=None, scoring="roc_auc_score", threshold=0.78, cv=2, random_state=1,
)

sel.fit(X_, y)
sel.transform(X_)

# Per-feature score computed during fit (dict: feature name -> score).
sel.feature_performance_

{'fare': 0.5453941892592249,
 'sex': 0.7605160237906083,
 'pclass': 0.6666098851766213,
 'age': 0.5265563622630385}

## Example #2

In [10]:
# Example 2: same settings as Example 1 but with a lower threshold (0.58)
# and two extra features, 'boat' and 'body'.
X_ = X.loc[:, ['fare', 'sex', 'pclass', 'age', 'boat', 'body']]

sel_ = SelectByTargetMeanPerformance(
    variables=None, scoring="roc_auc_score", threshold=0.58, cv=2, random_state=1,
)

sel_.fit(X_, y)
sel_.transform(X_)

# Per-feature score computed during fit (dict: feature name -> score).
sel_.feature_performance_

{'fare': 0.5453941892592249,
 'sex': 0.7605160237906083,
 'pclass': 0.6666098851766213,
 'age': 0.5265563622630385,
 'boat': 0.97272255043677,
 'body': 0.5587092018986104}

## Example #3

In [11]:
# Example 3: one numerical and two categorical features, threshold 0.78.
# NOTE(review): in the recorded output below every score is under 0.78 —
# confirm how the selector behaves when no feature reaches the threshold.
X_ = X.loc[:, ['age', 'sex', 'embarked']]

sel_ = SelectByTargetMeanPerformance(
    variables=None, scoring="roc_auc_score", threshold=0.78, cv=2, random_state=1,
)

sel_.fit(X_, y)
sel_.transform(X_)

# Per-feature score computed during fit (dict: feature name -> score).
sel_.feature_performance_

{'age': 0.5265563622630385,
 'sex': 0.7605160237906083,
 'embarked': 0.5810591007926145}

In [12]:
# Further example: six features, a stricter threshold (0.88) and a different
# random_state (42). Uses fit_transform rather than separate fit/transform calls.
X_ = X.loc[:, ['pclass', 'sibsp', 'parch', 'fare', 'body', 'boat']]

sel_ = SelectByTargetMeanPerformance(
    variables=None, scoring="roc_auc_score", threshold=0.88, cv=2, random_state=42,
)

sel_.fit_transform(X_, y)

# Per-feature score computed during fit (dict: feature name -> score).
sel_.feature_performance_

{'pclass': 0.6652427348009367,
 'sibsp': 0.5141025894302392,
 'parch': 0.5307729298063786,
 'fare': 0.5448867797481282,
 'body': 0.5588005849312889,
 'boat': 0.9733184232234993}

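+ Note: the features `cabin`, `name`, `ticket` and `home.dest` do not work with `SelectByTargetMeanPerformance` in this setup, so they are left out of the examples above.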