### このノートの目的
- どんなデータを入力してもエラーの出ない特徴量選択関数の作成

In [78]:
import itertools
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
import random
from pyqubo import Array, OneHotEncInteger, solve_qubo
import scipy.stats

## 特徴量選択関数を作るのに試したいデータの複数作成

#### 特徴量5

In [79]:
# Every 0/1 pattern of length 5 (2**5 = 32 rows), each stored as a list.
bina_len5 = list(map(list, itertools.product([0, 1], repeat=5)))
len(bina_len5)

32

In [80]:
# 5 samples x 5 binary features, assembled from hand-picked rows of bina_len5.
SNP_df55 = pd.DataFrame(
    [bina_len5[row] for row in (9, 30, 10, 20, 3)],
    columns=list('ABCDE'),
)
# SNP_df55

In [81]:
# 6 samples x 5 binary features, assembled from hand-picked rows of bina_len5.
SNP_df65 = pd.DataFrame(
    [bina_len5[row] for row in (11, 29, 23, 18, 5, 9)],
    columns=list('ABCDE'),
)
# SNP_df65

In [82]:
# CSV ファイル として出力
#SNP_df55.to_csv("../input/SNP_df55.csv")

In [83]:
# CSV ファイル として出力
#SNP_df65.to_csv("../input/SNP_df65.csv")

In [84]:
X_ori55 = pd.read_csv("../input/SNP_df55.csv", sep=',', index_col=0)
#X_ori55

In [85]:
X_ori65 = pd.read_csv("../input/SNP_df65.csv", sep=',', index_col=0)
#X_ori65.shape

#### 特徴量10

In [86]:
# Every 0/1 pattern of length 10 (2**10 = 1024 rows), each stored as a list.
bina_len10 = list(map(list, itertools.product([0, 1], repeat=10)))
len(bina_len10)

1024

In [87]:
# 10 samples x 10 binary features, assembled from hand-picked rows of bina_len10.
SNP_df1010 = pd.DataFrame(
    [bina_len10[row] for row in (1000, 3, 200, 80, 500, 700, 30, 800, 50, 300)],
    columns=list('ABCDEFGHIJ'),
)
# SNP_df1010.head()

In [88]:
# 11 samples x 10 binary features, assembled from hand-picked rows of bina_len10.
SNP_df1110 = pd.DataFrame(
    [bina_len10[row] for row in (1010, 10, 401, 79, 550, 690, 25, 810, 49, 310, 200)],
    columns=list('ABCDEFGHIJ'),
)
# SNP_df1110.head()

In [89]:
# CSV ファイルとして出力
#SNP_df1010.to_csv("../input/SNP_df1010.csv")

In [90]:
# CSV ファイルとして出力
#SNP_df1110.to_csv("../input/SNP_df1110.csv")

In [91]:
X_ori1010 = pd.read_csv("../input/SNP_df1010.csv", sep=',', index_col=0)
#X_ori1010.head()

In [92]:
X_ori1110 = pd.read_csv("../input/SNP_df1110.csv", sep=',', index_col=0)
#X_ori1110.head()

#### y（サンプル数（長さ）別）

In [93]:
ori_y5 = pd.Series([1, 0, 1, 1, 0])
#ori_y5

In [119]:
ori_y6 = pd.Series([1, 0, 0, 1, 0, 1])
#ori_y6

In [95]:
ori_y10 = pd.Series([1, 0, 1, 1, 0, 1, 1, 0, 0 , 0])
#ori_y10

In [96]:
ori_y11 = pd.Series([1, 1, 0, 1, 1, 0, 1, 1, 0, 0 , 0])
#ori_y11

In [97]:
# Candidate values for the `sel_col_num` argument of the selection functions
# (how many feature columns to pick).
sel_col_num1 = 1
sel_col_num2 = 2
sel_col_num3 = 3

### 関数定義

In [98]:
#標準偏差が0である列をなくす
def drop_str0_X(X):
    """Remove the columns of ``X`` whose standard deviation is exactly 0.

    The standard deviation is the one computed by ``Series.std()`` for each
    column.  ``X`` itself is never modified: when at least one column is
    removed a new DataFrame is returned, otherwise ``X`` is returned as is.

    Parameters
    ----------
    X : pandas.DataFrame
        Table whose constant columns should be discarded.

    Returns
    -------
    pandas.DataFrame
        ``X`` without its zero-standard-deviation columns.
    """
    constant_cols = [name for name in X.columns if X[name].std() == 0.0]
    if not constant_cols:
        # Nothing to discard: hand back the very same object.
        return X
    return X.drop(constant_cols, axis=1)

In [36]:
#内積を使った特徴量選択の関数
#最大値が同点で複数あるとき、list.index() は最初に見つかった位置だけを返すため、最も左の列1つだけが選ばれる
def featrure_InnerProduct_sel1(X, y, sel_col_num):
    """Pick the single column of ``X`` whose inner product with ``y`` is largest.

    ``sel_col_num`` is accepted but not used: exactly one column is always
    selected.  When several columns share the largest inner product, the
    left-most of them is chosen.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; one column per candidate feature.
    y : array-like
        Target values used in ``np.dot(column, y)``.
    sel_col_num : int
        Ignored.

    Returns
    -------
    select_cols : list
        One-element list holding the label of the selected column.
    else_cols : list
        Labels of all other columns, in their original order.
    """
    scores = [np.dot(X[name], y) for name in X.columns]

    # list.index() stops at the first hit, so ties go to the left-most column.
    best_pos = scores.index(max(scores))

    labels = list(X.columns)
    select_cols = [X.columns[best_pos]]
    else_cols = labels[:best_pos] + labels[best_pos + 1:]
    return select_cols, else_cols

In [126]:
def featrure_InnerProduct_sel_mul(X, y, sel_col_num):
    """Select the ``sel_col_num`` columns of ``X`` with the largest inner product with ``y``.

    Columns are ranked by ``np.dot(column, y)`` in descending order.  Tied
    scores are resolved in favour of the left-most column, and every column
    is selected at most once.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; one column per candidate feature.
    y : array-like
        Target values used in ``np.dot(column, y)``.
    sel_col_num : int
        Number of columns to select (``0 <= sel_col_num <= X.shape[1]``).

    Returns
    -------
    select_cols : list
        Labels of the selected columns, best score first.
    else_cols : list
        Labels of the remaining columns, in their original order.

    Raises
    ------
    ValueError
        If ``sel_col_num`` is negative or exceeds the number of columns.
    """
    n_cols = len(X.columns)
    if not 0 <= sel_col_num <= n_cols:
        raise ValueError(
            f"sel_col_num should be >=0, <= n_features = {n_cols}; "
            f"got {sel_col_num!r}."
        )

    # Address columns by position so duplicated labels cannot confuse the lookup.
    scores = [np.dot(X.iloc[:, pos], y) for pos in range(n_cols)]

    # Rank column *positions* rather than score values: looking a score value
    # up with list.index() returns the same position for every tied score.
    # sorted() is stable even with reverse=True, so ties keep left-to-right order.
    ranked = sorted(range(n_cols), key=scores.__getitem__, reverse=True)
    chosen = ranked[:sel_col_num]
    chosen_set = set(chosen)

    # Return labels (not integer positions) so they line up with else_cols.
    select_cols = [X.columns[pos] for pos in chosen]
    else_cols = [
        label for pos, label in enumerate(X.columns) if pos not in chosen_set
    ]
    return select_cols, else_cols

### dataframeの調整（adjust）

In [99]:
X_ori55_ad = drop_str0_X(X_ori55)
X_ori55_ad.shape

(5, 5)

In [100]:
X_ori65_ad = drop_str0_X(X_ori65)
X_ori65_ad.shape

(6, 5)

In [101]:
X_ori1010_ad = drop_str0_X(X_ori1010)
X_ori1010_ad.shape

(10, 10)

In [102]:
X_ori1110_ad = drop_str0_X(X_ori1110)
X_ori1110_ad.shape

(11, 10)

### 特徴量選択の試行

#### 特徴量選択数：1

In [114]:
inner_product_list55 = [np.dot(X_ori55_ad[each_col], ori_y5) for each_col in X_ori55_ad.columns]
inner_product_list55

[1, 2, 1, 1, 1]

In [103]:
select_cols, else_cols = featrure_InnerProduct_sel1(X_ori55_ad, ori_y5, sel_col_num1)
select_cols

['B']

In [117]:
X_ori65_ad.shape

(6, 5)

In [120]:
ori_y6.shape

(6,)

In [121]:
inner_product_list65 = [np.dot(X_ori65_ad[each_col], ori_y6) for each_col in X_ori65_ad.columns]
inner_product_list65

[1, 2, 0, 2, 2]

In [124]:
max_index65 = inner_product_list65.index(max(inner_product_list65))
max_index65 #1つしか出力されない。そして一番若い番号が出力される。

1

In [107]:
select_cols, else_cols = featrure_InnerProduct_sel1(X_ori1010_ad, ori_y10, sel_col_num1)
select_cols

['G']

In [108]:
# Inner product of each column of the adjusted frame with y.
# Iterate the adjusted frame's own columns: X_ori1010 may still contain
# zero-variance columns that drop_str0_X removed from X_ori1010_ad, and
# looking those up in X_ori1010_ad would raise KeyError.
inner_product_list1010 = [np.dot(X_ori1010_ad[each_col], ori_y10) for each_col in X_ori1010_ad.columns]
inner_product_list1010

[2, 1, 3, 3, 2, 3, 4, 2, 1, 0]

In [122]:
max_index1010 = inner_product_list1010.index(max(inner_product_list1010))
max_index1010 #4という数字は一つだったので、特徴量選択数一つという条件において問題にならなかった

6

In [116]:
inner_product_list1110 = [np.dot(X_ori1110_ad[each_col], ori_y11) for each_col in X_ori1110_ad.columns]
inner_product_list1110

[3, 2, 1, 2, 3, 2, 4, 2, 5, 2]

In [123]:
max_index1110 = inner_product_list1110.index(max(inner_product_list1110))
max_index1110 #5という数字は一つだったので、特徴量選択数一つという条件において問題にならなかった

8

#### 特徴量選択数: 2以上

In [142]:
def featrure_InnerProduct_sel_mul(X, y, sel_col_num):
    """Select the ``sel_col_num`` columns of ``X`` with the largest inner product with ``y``.

    Columns are ranked by ``np.dot(column, y)`` in descending order.  Tied
    scores are resolved in favour of the left-most column, and every column
    is selected at most once.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; one column per candidate feature.
    y : array-like
        Target values used in ``np.dot(column, y)``.
    sel_col_num : int
        Number of columns to select (``0 <= sel_col_num <= X.shape[1]``).

    Returns
    -------
    select_cols : list
        Labels of the selected columns, best score first.
    else_cols : list
        Labels of the remaining columns, in their original order.

    Raises
    ------
    ValueError
        If ``sel_col_num`` is negative or exceeds the number of columns.
    """
    n_cols = len(X.columns)
    if not 0 <= sel_col_num <= n_cols:
        raise ValueError(
            f"sel_col_num should be >=0, <= n_features = {n_cols}; "
            f"got {sel_col_num!r}."
        )

    # Address columns by position so duplicated labels cannot confuse the lookup.
    scores = [np.dot(X.iloc[:, pos], y) for pos in range(n_cols)]

    # Rank column *positions* rather than score values: looking a score value
    # up with list.index() returns the same position for every tied score,
    # which selects one column several times and then fails in list.remove().
    # sorted() is stable even with reverse=True, so ties keep left-to-right order.
    ranked = sorted(range(n_cols), key=scores.__getitem__, reverse=True)
    chosen = ranked[:sel_col_num]
    chosen_set = set(chosen)

    select_cols = [X.columns[pos] for pos in chosen]
    else_cols = [
        label for pos, label in enumerate(X.columns) if pos not in chosen_set
    ]
    return select_cols, else_cols

In [143]:
inner_product_list55 = [np.dot(X_ori55_ad[each_col], ori_y5) for each_col in X_ori55_ad.columns]
inner_product_list55

[1, 2, 1, 1, 1]

In [146]:
select_cols, else_cols = featrure_InnerProduct_sel_mul(X_ori55_ad, ori_y5, sel_col_num2)
select_cols

['B', 'A']

In [147]:
else_cols

['C', 'D', 'E']

In [144]:
sort_inner_product_list55 = sorted(inner_product_list55,reverse=True)
sort_inner_product_list55

[2, 1, 1, 1, 1]

In [145]:
# Step-by-step replay of the selection loop on the 5x5 data, using the score
# list and its descending-sorted copy computed in the cells above.
select_cols = []
for i in range(sel_col_num2):
    # list.index() returns the first position holding the given score value.
    iindex55 = inner_product_list55.index(sort_inner_product_list55[i])
    select_cols.append(iindex55)  # stores the column position, not its label
select_cols

[1, 0]

### featrure_InnerProduct_sel_mulを採用する

In [None]:
def featrure_InnerProduct_sel(X, y, sel_col_num):
    """Select the ``sel_col_num`` columns of ``X`` with the largest inner product with ``y``.

    Columns are ranked by ``np.dot(column, y)`` in descending order.  Tied
    scores are resolved in favour of the left-most column, and every column
    is selected at most once.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; one column per candidate feature.
    y : array-like
        Target values used in ``np.dot(column, y)``.
    sel_col_num : int
        Number of columns to select (``0 <= sel_col_num <= X.shape[1]``).

    Returns
    -------
    select_cols : list
        Labels of the selected columns, best score first.
    else_cols : list
        Labels of the remaining columns, in their original order.

    Raises
    ------
    ValueError
        If ``sel_col_num`` is negative or exceeds the number of columns.
    """
    n_cols = len(X.columns)
    if not 0 <= sel_col_num <= n_cols:
        raise ValueError(
            f"sel_col_num should be >=0, <= n_features = {n_cols}; "
            f"got {sel_col_num!r}."
        )

    # Address columns by position so duplicated labels cannot confuse the lookup.
    scores = [np.dot(X.iloc[:, pos], y) for pos in range(n_cols)]

    # Rank column *positions* rather than score values: looking a score value
    # up with list.index() returns the same position for every tied score,
    # which selects one column several times and then fails in list.remove().
    # sorted() is stable even with reverse=True, so ties keep left-to-right order.
    ranked = sorted(range(n_cols), key=scores.__getitem__, reverse=True)
    chosen = ranked[:sel_col_num]
    chosen_set = set(chosen)

    select_cols = [X.columns[pos] for pos in chosen]
    else_cols = [
        label for pos, label in enumerate(X.columns) if pos not in chosen_set
    ]
    return select_cols, else_cols

In [None]:
#scikit-learn/blob/master/sklearn/feature_selection/_univariate_selection.py
#https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_selection/_univariate_selection.py
class SelectKBest(_BaseFilter):
    """Select features according to the k highest scores.
    Read more in the :ref:`User Guide <univariate_feature_selection>`.
    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See also"). The default function only
        works with classification tasks.
        .. versionadded:: 0.18
    k : int or "all", default=10
        Number of top features to select.
        The "all" option bypasses selection, for use in a parameter search.
    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.
    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned only scores.
    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.feature_selection import SelectKBest, chi2
    >>> X, y = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
    >>> X_new.shape
    (1797, 20)
    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.
    See also
    --------
    f_classif: ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif: Mutual information for a discrete target.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    mutual_info_regression: Mutual information for a continuous target.
    SelectPercentile: Select features based on percentile of the highest
        scores.
    SelectFpr: Select features based on a false positive rate test.
    SelectFdr: Select features based on an estimated false discovery rate.
    SelectFwe: Select features based on family-wise error rate.
    GenericUnivariateSelect: Univariate feature selector with configurable
        mode.
    """
    # NOTE(review): _BaseFilter, f_classif, _deprecate_positional_args,
    # check_is_fitted and _clean_nans are not imported by the visible notebook
    # cells, so this looks like a read-only reference copy of the upstream
    # scikit-learn source. If the cell were executed with those names in scope
    # it would rebind the SelectKBest name imported from
    # sklearn.feature_selection -- confirm the intent before running it.
    @_deprecate_positional_args
    def __init__(self, score_func=f_classif, *, k=10):
        """Pass ``score_func`` to the base class and store ``k`` (keyword-only)."""
        super().__init__(score_func=score_func)
        self.k = k

    def _check_params(self, X, y):
        """Raise ValueError unless ``k`` is "all" or 0 <= k <= X.shape[1].

        ``y`` is accepted but not used by this check.
        """
        if not (self.k == "all" or 0 <= self.k <= X.shape[1]):
            raise ValueError("k should be >=0, <= n_features = %d; got %r. "
                             "Use k='all' to return all features."
                             % (X.shape[1], self.k))

    def _get_support_mask(self):
        """Return a boolean mask, shaped like ``scores_``, of the selected features.

        ``k == 'all'`` selects everything, ``k == 0`` selects nothing;
        otherwise the mask is True at the positions of the ``k`` largest
        (cleaned) scores.
        """
        check_is_fitted(self)

        if self.k == 'all':
            return np.ones(self.scores_.shape, dtype=bool)
        elif self.k == 0:
            return np.zeros(self.scores_.shape, dtype=bool)
        else:
            # _clean_nans is defined upstream and not visible here; presumably
            # it makes NaN scores rank lowest -- TODO confirm.
            scores = _clean_nans(self.scores_)
            mask = np.zeros(scores.shape, dtype=bool)

            # argsort is ascending, so the last k positions are the k highest
            # scores; assigning 1 into the bool mask marks them True.
            # Request a stable sort. Mergesort takes more memory (~40MB per
            # megafeature on x86-64).
            mask[np.argsort(scores, kind="mergesort")[-self.k:]] = 1
            return mask