### 方法
```
while 調べるべき特徴量がある限り :
    for 調べる特徴量 in 調べる特徴量の数 :
        ある特徴量ベクトルxiと元のy(y^)の内積を求める(t^i)
        for 100程度 :
            y^をもとに、{t0 = t^0かつt1 = t^1...tj = t^j}という条件の下でサンプリングし（nealを使う）、yを生成する。
            ある特徴量ベクトルxiとyの内積を求める(ti)
        複数のtiを求めることによって得られた分布に対するt^iの位置から、p値を求める。
    p値の一番小さいXsを知る。
    条件{}にts = t^sを足す（更新する）
    調べる特徴量からXsを除外（更新する）
```

#### 条件について
- t0 = t^0 : yの要素の総和（1の個数）が、元のy(y^)の総和と同じ
- tj = t^j : j番目の特徴量ベクトルxjとyの内積が、xjと元のy(y^)の内積と同じ

### データ
mushroomの50サンプル分

### yのサンプリング
- コンパイルを行うまでは`pyqubo`
- SAは`SimulatedAnnealingSampler()`

### 改訂(08/03)
- ori_tiのkeyがなかった場合に1とする必要がない
- sum([j[1] for j in ti_list])は必ず100である

### 改訂（08/04）
- samplingでnum=100とする。

In [1]:
#!pip3 install dwave-neal

In [2]:
import pandas as pd 
import numpy as np
from neal import SimulatedAnnealingSampler
from pyqubo import Array, Constraint, Placeholder, solve_qubo
import sys

In [3]:
# Feature matrix: one row per sample (50 samples), one column per feature.
X_mushroom = pd.read_csv("../input/X_l_mushroom.csv", index_col=0)
# Label vector: column '21' of the label file.
y_mushroom = pd.read_csv('../input/y_l_mushroom.csv', index_col=0)['21']

# Dimensions and column labels of the feature matrix.
num_samples, num_cols = X_mushroom.shape
columns = X_mushroom.columns.tolist()

# Sum of the observed labels; this is the target of the t^0 condition.
y_sum = sum(y_mushroom)

In [4]:
# Number of rows (samples) in X_mushroom.
num_samples

50

In [5]:
# Sum of the observed labels in y_mushroom.
y_sum

26

## t^0だけでサンプリングをしてみる

In [23]:
# num_reads is the number of samples to draw (duplicates are expected).
# Two candidate grids for it: powers of ten (np.logspace) or a linear grid.

# Powers of ten from 10^1 to 10^4 (4 values).
numreads_10 = np.logspace(1, 4, num=4, base=10, dtype=int)
print(numreads_10)

# 100, followed by 500..2000 in steps of 500 (5 values).
numreads_5 = [100] + [100 * step for step in range(5, 21, 5)]
print(numreads_5)

[   10   100  1000 10000]
[100, 500, 1000, 1500, 2000]


In [31]:
# For every feature column, estimate the upper-tail p-value of the observed
# inner product t^i, using samples of y that are drawn under the t^0 condition
# only (sum of y equal to y_sum). The whole estimate is repeated n_repeats
# times so that the mean and variance of the p-values can be examined.

# Choose the number of SA reads in ONE place; the sampler call, the p-value
# denominator and the output file name are all derived from it.
# Alternatives: numreads_10[0..3] or numreads_5[0..4].
num_reads = int(numreads_10[3])
n_repeats = 10

# The model is identical for every repeat, so build and compile it once.
y = Array.create('y', shape=num_samples, vartype='BINARY')
H = (sum(y) - y_sum)**2  # penalty term for the t^0 condition
qb = H.compile().to_qubo()
sa_sampler = SimulatedAnnealingSampler()

# Feature matrix as a plain array (num_samples x num_cols) and the observed
# statistic t^i for every column, computed in a single matrix product.
X_values = X_mushroom[columns].values
ori_ti_all = np.dot(y_mushroom.values, X_values).astype(int)

qbl_p_list = []
for k in range(n_repeats):
    res = sa_sampler.sample_qubo(qb[0], num_reads=num_reads)

    # The columns of res.record.sample follow res.variables, whose order is
    # not guaranteed to be y[0], y[1], ...; reorder them so that sample
    # position j lines up with row j of X_mushroom.
    col_of_label = {label: pos for pos, label in enumerate(res.variables)}
    col_order = [col_of_label['y[{}]'.format(j)] for j in range(num_samples)]
    samples = res.record.sample[:, col_order]

    # t_all[r, c] = inner product of sampled vector r with feature column c.
    t_all = np.dot(samples, X_values).astype(int)

    # p-value = fraction of sampled statistics that are >= the observed one.
    p_values = (t_all >= ori_ti_all).sum(axis=0) / num_reads
    qbl_p_list.append(dict(zip(columns, p_values)))

# One row per repeat, one column per feature.
to_qubol_p_10times = pd.DataFrame(qbl_p_list)
print(to_qubol_p_10times)

# The file name encodes the number of reads and the number of repeats.
to_qubol_p_10times.to_csv(
    '../output/mushrooml/to_qubol_{}samples_p_{}time.csv'.format(num_reads, n_repeats)
)

     0    1       2    3    4       5       6    7       8       9  ...  \
0  1.0  0.0  0.5729  1.0  1.0  0.7108  0.0041  1.0  0.9797  0.9147  ...   
1  1.0  0.0  0.5759  1.0  1.0  0.7121  0.0061  1.0  0.9764  0.9189  ...   
2  1.0  0.0  0.5750  1.0  1.0  0.7091  0.0063  1.0  0.9781  0.9182  ...   
3  1.0  0.0  0.5750  1.0  1.0  0.7157  0.0058  1.0  0.9789  0.9188  ...   
4  1.0  0.0  0.5788  1.0  1.0  0.7096  0.0052  1.0  0.9777  0.9163  ...   
5  1.0  0.0  0.5691  1.0  1.0  0.7190  0.0059  1.0  0.9780  0.9196  ...   
6  1.0  0.0  0.5798  1.0  1.0  0.7147  0.0045  1.0  0.9771  0.9231  ...   
7  1.0  0.0  0.5768  1.0  1.0  0.6999  0.0044  1.0  0.9758  0.9226  ...   
8  1.0  0.0  0.5786  1.0  1.0  0.7068  0.0051  1.0  0.9780  0.9219  ...   
9  1.0  0.0  0.5688  1.0  1.0  0.7082  0.0051  1.0  0.9775  0.9229  ...   

      109  110     111     112     113     114     115     116     117     118  
0  0.7091  1.0  0.0508  0.1464  0.1986  0.8229  0.5273  0.9998  0.7821  0.2585  
1  0.7072  1

In [6]:
def make_y(minP_cols):
    """Draw one binary vector y with simulated annealing.

    The energy that is minimised is a sum of squared penalties:
    one term asking sum(y) to equal y_sum (the t^0 condition), plus one term
    per column in ``minP_cols`` asking the inner product of that column of
    X_mushroom with y to equal its inner product with y_mushroom.

    Parameters
    ----------
    minP_cols : list
        Column labels of X_mushroom whose inner products are conditioned on.
        An empty list means only the t^0 condition is applied.

    Returns
    -------
    pandas.Series
        The first sample returned by the sampler, ordered so that position j
        holds the value of variable ``y[j]``. The conditions are penalty
        terms, so the sample is not checked for actually satisfying them.
    """
    # Binary decision variables, one per sample.
    y = Array.create('y', shape=num_samples, vartype='BINARY')

    # QUBO formulation: start with the t^0 penalty ...
    H = (sum(y) - y_sum)**2

    # ... and add one penalty per conditioned feature (no-op for an empty list).
    for i in minP_cols:
        H_plus = (np.dot(X_mushroom[i], y) - np.dot(X_mushroom[i], y_mushroom))**2
        H = H + H_plus

    bqm = H.compile().to_dimod_bqm()

    sa_sampler = SimulatedAnnealingSampler()

    res = sa_sampler.sample(bqm)

    # The columns of the sample record follow res.variables, which is not
    # guaranteed to be in y[0], y[1], ... order. Callers take a positional dot
    # product with X_mushroom columns, so align the values by variable label.
    value_of = dict(zip(res.variables, res.record.sample[0]))
    return pd.Series([value_of['y[{}]'.format(j)] for j in range(num_samples)])

In [7]:
# Greedy selection of features by p-value.
# In each round, every remaining feature gets an empirical upper-tail p-value
# from distri_element_num sampled y vectors (conditioned on the features
# selected so far). All features that tie for the smallest p-value are then
# added to the condition set and removed from the candidates.
minP_cols = []
servey_num_col_list = columns
distri_element_num = 100

while len(servey_num_col_list) > 0:
    p = {}
    for i in servey_num_col_list:
        # Observed statistic t^i for feature i.
        ori_ti = int(np.dot(X_mushroom[i], y_mushroom))

        ti_dic = {}  # {inner-product value: number of samples with that value}

        for _ in range(distri_element_num):
            y = make_y(minP_cols)

            # Compute the sampled statistic once and tally it.
            ti = int(np.dot(X_mushroom[i], y))
            ti_dic[ti] = ti_dic.get(ti, 0) + 1

        # Fraction of sampled statistics that are >= the observed one.
        count_better_t = sum(v for k, v in ti_dic.items() if k >= ori_ti)
        p[i] = count_better_t / distri_element_num

    # Find the minimum once instead of once per candidate.
    min_p = min(p.values())
    minP_keys = [k for k, v in p.items() if v == min_p]
    minP_cols += minP_keys
    servey_num_col_list = [i for i in servey_num_col_list if i not in minP_keys]

In [8]:
# Feature columns in the order in which they were added to the condition set.
minP_cols

['1',
 '21',
 '27',
 '38',
 '56',
 '94',
 '100',
 '2',
 '6',
 '14',
 '25',
 '31',
 '37',
 '44',
 '45',
 '49',
 '50',
 '51',
 '60',
 '65',
 '67',
 '74',
 '76',
 '78',
 '83',
 '88',
 '97',
 '99',
 '101',
 '107',
 '108',
 '111',
 '112',
 '115',
 '18',
 '113',
 '52',
 '117',
 '118',
 '0',
 '3',
 '4',
 '5',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '15',
 '16',
 '17',
 '19',
 '20',
 '22',
 '23',
 '24',
 '26',
 '28',
 '29',
 '30',
 '32',
 '33',
 '34',
 '35',
 '36',
 '39',
 '40',
 '41',
 '42',
 '43',
 '46',
 '47',
 '48',
 '53',
 '54',
 '55',
 '57',
 '58',
 '59',
 '61',
 '62',
 '63',
 '64',
 '66',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '75',
 '77',
 '79',
 '80',
 '81',
 '82',
 '84',
 '85',
 '86',
 '87',
 '89',
 '90',
 '91',
 '92',
 '93',
 '95',
 '96',
 '98',
 '102',
 '103',
 '104',
 '105',
 '106',
 '109',
 '110',
 '114',
 '116']