In [2]:
# ピアソンの相関係数を用いて、相関の高い特徴量を探す
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset and unpack features, column names, and target
data = fetch_california_housing()
X, col_names, y = data["data"], data["feature_names"], data["target"]

# Wrap the feature matrix in a DataFrame with named columns
df = pd.DataFrame(X, columns=col_names)
# Add a feature that is highly correlated with MedInc (its square root)
df["MedInc_sqrt"] = np.sqrt(df["MedInc"])

# Display the Pearson correlation matrix (last expression -> rich cell output)
df.corr()


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedInc_sqrt
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176,0.984329
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197,-0.132797
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754,0.326688
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344,-0.06691
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773,0.018415
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476,0.015266
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664,-0.084303
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0,-0.015569
MedInc_sqrt,0.984329,-0.132797,0.326688,-0.06691,0.018415,0.015266,-0.084303,-0.015569,1.0


In [None]:
# 単変量特徴量選択を特徴量のタイプと分析タイプに応じて実行するためのラッパークラス
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile


class UnivariateFeatureSelection:
    """Wrapper that builds a scikit-learn univariate feature selector from
    a problem type, a scoring-function name, and a feature count/fraction.

    The constructed selector is stored in ``self.selection``; ``fit``,
    ``transform`` and ``fit_transform`` simply delegate to it.
    """

    def __init__(self, n_features, problem_type, scoring):
        """Create the underlying selector.

        :param n_features: an ``int`` selects ``SelectKBest`` with
            ``k=n_features``; a ``float`` selects ``SelectPercentile`` with
            ``percentile=n_features * 100``.
        :param problem_type: ``"classification"`` enables the classification
            scorers; any other value is treated as regression.
        :param scoring: name of the scoring function (string). Must be one of
            the keys valid for the chosen problem type.
        :raises ValueError: if ``scoring`` is not valid for ``problem_type``.
        :raises TypeError: if ``n_features`` is neither ``int`` nor ``float``.
        """
        # Scoring functions available for each problem type
        if problem_type == "classification":
            valid_scoring = {
                "f_classif": f_classif,
                "chi2": chi2,
                "mutual_info_classif": mutual_info_classif,
            }
        else:
            valid_scoring = {
                "f_regression": f_regression,
                "mutual_info_regression": mutual_info_regression,
            }

        # Reject a scoring name that does not match the problem type
        if scoring not in valid_scoring:
            raise ValueError("Invalid scoring function")

        if isinstance(n_features, int):
            # Integer -> keep the k best features
            self.selection = SelectKBest(
                valid_scoring[scoring],
                k=n_features
            )
        elif isinstance(n_features, float):
            # Float -> keep the given fraction of features, as a percentile
            self.selection = SelectPercentile(
                valid_scoring[scoring],
                percentile=(n_features * 100)
            )
        else:
            raise TypeError("Invalid type of n_features: expected int or float")

    def fit(self, X, y):
        """Fit the underlying selector on ``X`` and ``y`` and return its result."""
        return self.selection.fit(X, y)

    def transform(self, X, y=None):
        """Reduce ``X`` to the selected features.

        ``y`` is accepted only for backward compatibility and is ignored.
        """
        return self.selection.transform(X)

    def fit_transform(self, X, y):
        """Fit the underlying selector on ``X`` and ``y``, then return the reduced ``X``."""
        return self.selection.fit_transform(X, y)
        