## 特徴量エンジニアリングのライブラリ xfeat を使ってみた

In [1]:
%load_ext lab_black

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline


from functools import partial

import optuna
from sklearn.model_selection import KFold
from xfeat import (ArithmeticCombinations, ConcatCombination,
                   GBDTFeatureExplorer, GBDTFeatureSelector, LabelEncoder,
                   Pipeline, SelectCategorical, SelectNumerical, TargetEncoder,
                   aggregation)

#### データの読み込み
タイタニックデータを使用

In [3]:
# Read the Titanic training and test CSV files into pandas DataFrames.
train_df, test_df = (
    pd.read_csv("../data/titanic/train.csv"),
    pd.read_csv("../data/titanic/test.csv"),
)

In [4]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## xfeatを利用
#### カテゴリカルデータのみを抽出

In [5]:
# Keep only the columns that xfeat's SelectCategorical picks out, then preview them.
categorical_df = SelectCategorical().fit_transform(train_df)
categorical_df.head()

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S


#### 数値データのみを抽出

In [6]:
# Keep only the columns that xfeat's SelectNumerical picks out, then preview them.
numerical_df = SelectNumerical().fit_transform(train_df)
numerical_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.25
1,2,1,1,38.0,1,0,71.2833
2,3,1,3,26.0,0,0,7.925
3,4,1,1,35.0,1,0,53.1
4,5,0,3,35.0,0,0,8.05


#### Label Encoding
特定のカテゴリカルデータに対し、Label Encodingを実施

In [7]:
# Label-encode the categorical columns, leaving out "Name" and "Ticket".
# An empty `output_suffix` is passed so no suffix is appended to the
# encoded column names.
label_encoding_steps = [
    SelectCategorical(exclude_cols=["Name", "Ticket"]),
    LabelEncoder(output_suffix=""),
]
encoder = Pipeline(label_encoding_steps)

encoded_df = encoder.fit_transform(train_df)
encoded_df.head()

Unnamed: 0,Sex,Cabin,Embarked
0,0,-1,0
1,1,0,1
2,1,-1,0
3,1,1,0
4,0,-1,0


#### Target Encoding
目的変数を用いてカテゴリカルデータを数値に変換する方法

In [8]:
# Target-encode "Cabin" using the "Survived" label. The encoder is given a
# 5-split, unshuffled KFold as its `fold` argument and writes the encoded
# values to a new column carrying the "_re" suffix.
fold = KFold(shuffle=False, n_splits=5)
encoder = TargetEncoder(
    fold=fold,
    input_cols=["Cabin"],
    target_col="Survived",
    output_suffix="_re",
)

encoded_df = encoder.fit_transform(train_df)
# Show the label, the raw category and its encoded value side by side.
encoded_df.loc[:, ["Survived", "Cabin", "Cabin_re"]].head()

Unnamed: 0,Survived,Cabin,Cabin_re
0,0,,0.303867
1,1,C85,0.303867
2,1,,0.303867
3,1,C123,0.303867
4,0,,0.303867


### カテゴリカルデータの組み合わせ
#### 二つのカテゴリカルデータを組み合わせる

In [9]:
# Concatenate the categorical columns two at a time (`r=2`), ignoring
# "Ticket" and "Name". When there are many categorical columns, the
# `input_cols` kwarg can be used to name the columns to combine.
# `drop_origin` is not passed here, so the encoder's default applies.
pairwise_concat_steps = [
    SelectCategorical(exclude_cols=["Ticket", "Name"]),
    ConcatCombination(r=2, output_suffix="_re"),
]
encoder = Pipeline(pairwise_concat_steps)

encoded_df = encoder.fit_transform(train_df)
encoded_df.head(3)

Unnamed: 0,Sex,Cabin,Embarked,SexCabin_re,SexEmbarked_re,CabinEmbarked_re
0,male,,S,male_NaN_,maleS,_NaN_S
1,female,C85,C,femaleC85,femaleC,C85C
2,female,,S,female_NaN_,femaleS,_NaN_S


#### 三つのカテゴリカルデータを組み合わせる

In [10]:
# Concatenate the categorical columns three at a time (`r=3`), ignoring
# "Ticket" and "Name". When there are many categorical columns, the
# `input_cols` kwarg can be used to name the columns to combine.
# `drop_origin` is not passed here, so the encoder's default applies.
triple_concat_steps = [
    SelectCategorical(exclude_cols=["Ticket", "Name"]),
    ConcatCombination(r=3, output_suffix="_re"),
]
encoder = Pipeline(triple_concat_steps)

encoded_df = encoder.fit_transform(train_df)
encoded_df.head()

Unnamed: 0,Sex,Cabin,Embarked,SexCabinEmbarked_re
0,male,,S,male_NaN_S
1,female,C85,C,femaleC85C
2,female,,S,female_NaN_S
3,female,C123,S,femaleC123S
4,male,,S,male_NaN_S


### 数値データの加算
#### 兄弟/配偶者、両親/子供の数を加算した特徴量を作成

In [11]:
# 2-order Arithmetic combinations.
encoder = Pipeline(
    [
        SelectNumerical(),
        ArithmeticCombinations(
            # 兄弟/配偶者、両親/子供の数を加算した特徴量を作成する
            input_cols=["SibSp", "Parch"],
            drop_origin=True,
            operator="+",
            r=2,
        ),
    ]
)

encoded_df = encoder.fit_transform(train_df)

In [12]:
# Show the two source columns so they can be compared with their sum.
train_df.loc[:, ["SibSp", "Parch"]].head(n=5)

Unnamed: 0,SibSp,Parch
0,1,0
1,1,0
2,0,0
3,1,0
4,0,0


In [13]:
# Preview the first five rows of the combined feature.
encoded_df.head(n=5)

Unnamed: 0,SibSpParch_combi
0,1
1,1
2,0
3,1
4,0


### Aggregation
#### 性別ごとに、年齢、Pclass の平均、最大を集計した特徴量を作成

In [14]:
from copy import deepcopy

# Manual pandas version of the aggregation, for comparison with xfeat's
# `aggregation` helper: per-"Sex" mean and max of "Age" and "Pclass".
# Work on a copy so that `train_df` itself is not modified.
aggregated_df = deepcopy(train_df)

# Per-sex statistics; each result is a Series indexed by the "Sex" value.
sex_groups = train_df.groupby("Sex")
sex_mean_df = sex_groups["Age"].mean()
sex_max_df = sex_groups["Age"].max()
pclass_mean_df = sex_groups["Pclass"].mean()
pclass_max_df = sex_groups["Pclass"].max()

# Broadcast each statistic back onto the rows by looking up every row's "Sex".
# The earlier version selected rows for the Pclass features with
# `aggregated_df["Pclass"] == "female"` / `== "male"`; Pclass is numeric, so
# those masks never matched and both Pclass columns were left entirely NaN.
# Mapping on "Sex" fixes that and removes the per-sex copy-paste.
per_sex_features = {
    "agg_mean_Age_grpby_Sex": sex_mean_df,
    "agg_max_Age_grpby_Sex": sex_max_df,
    "agg_mean_Pclass_grpby_Sex": pclass_mean_df,
    "agg_max_Pclass_grpby_Sex": pclass_max_df,
}
for feature_name, per_sex_value in per_sex_features.items():
    aggregated_df[feature_name] = aggregated_df["Sex"].map(per_sex_value)

上と同じ集計を、xfeat の `aggregation` 関数で実施

In [15]:
# Per-"Sex" mean and max of "Age" and "Pclass" using xfeat's `aggregation`.
# The call's two results are unpacked into the DataFrame and the list of
# aggregated column names.
aggregated_df, aggregated_cols = aggregation(
    train_df,
    agg_methods=["mean", "max"],
    group_values=["Age", "Pclass"],
    group_key="Sex",
)

# Display the grouping key next to the generated aggregate columns.
cols_to_show = ["Sex", *aggregated_cols]
aggregated_df.loc[:, cols_to_show].head()

Unnamed: 0,Sex,agg_mean_Age_grpby_Sex,agg_mean_Pclass_grpby_Sex,agg_max_Age_grpby_Sex,agg_max_Pclass_grpby_Sex
0,male,30.726645,2.389948,80.0,3
1,female,27.915709,2.159236,63.0,3
2,female,27.915709,2.159236,63.0,3
3,female,27.915709,2.159236,63.0,3
4,male,30.726645,2.389948,80.0,3
