**完了確認済み2020/07/24**

# 不等式のQUBO変換を用いて基底エネルギー以下のサンプルを得る


#### **このファイルのミッション**
今までの誤解を取り払って、条件を整えて分析する


#### **データ、条件**
small, binary data

### **調整したこと**
- XはSNPを想定。SNPの発生する箇所は95%ほとんどある塩基、5%ある別の塩基としか出現しないので0, 1で置換可能。
- Xは標準化しない
- 特徴量選択は内積が大きいものを取り出す
- ay-byは内積のままでok
- 場合によってはイジングモデルを採用しよう

### import

In [1]:
!pwd

/data/202007/scripts


In [2]:
import itertools
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
import random
from pyqubo import Array, OneHotEncInteger, solve_qubo
import scipy.stats

In [3]:
#標準偏差が0である列をなくす
def drop_str0_X(X):
    for each_col in X.columns:
        if X[each_col].std()==0.0:
            X = X.drop(each_col, axis=1)
    return X

In [4]:
X_ori55 = pd.read_csv("../input/SNP_df55.csv", sep=',', index_col=0)
X_ori55_ad = drop_str0_X(X_ori55)

In [5]:
#ori_y5 = pd.Series([1, 0, 1, 1, 0], name='y')
#ori_y5.to_csv('../input/y5.csv')
#ori_y5

In [64]:
ori_y5 = pd.read_csv('../input/y5.csv', sep=',', index_col=0)['y']
ori_y5

0    1
1    0
2    1
3    1
4    0
Name: y, dtype: int64

In [65]:
ori_y5.iloc[0]

1

In [68]:
y1_sum = sum(ori_y5)

In [7]:
selected_col_num1 = 1
selected_col_num2 = 2
selected_col_num3 = 3

## 特徴量選択

<br/>

#### **ピアソンの相関係数**
$
r_{xy} = \frac{{\displaystyle \sum_{i = 1}^n (x_i - \overline{x})
(y_i - \overline{y})}}{\sqrt{{\displaystyle \sum_{i = 1}^n 
(x_i - \overline{x})^2}} \sqrt{{\displaystyle \sum_{i = 1}^n 
(y_i - \overline{y})^2}}} = \frac{s_{xy}}{s_xs_y} = \frac{x'・y'}{||x'||  ||y'||} = cosθ
$

- [２つのデータをベクトルと捉えると、n次元空間上の2本のベクトルの間の角度と捉える事ができる](https://www.slideshare.net/matsukenbook/5-55326268?next_slideshow=1)
- どちらか一方でも標準偏差が0、分散が0のとき分母が0となり計算できないことに注意

#### 最適な関数と出力

In [8]:
def featrure_InnerProduct_sel_mul(X, y, sel_col_num):
    inner_product_list = [np.dot(X[each_col], y) for each_col in X.columns] #内積のリスト
    sort_inner_product_list = sorted(inner_product_list ,reverse=True)
    
    select_cols = []
    for i in range(sel_col_num):
        iindex = inner_product_list.index(sort_inner_product_list[i])
        select_cols.append(X.columns[iindex])
    
    else_cols = list(X.columns)
    for col in select_cols:
        else_cols.remove(col)
    
    return select_cols, else_cols

In [9]:
select_cols, else_cols = featrure_InnerProduct_sel_mul(X_ori55_ad, ori_y5, selected_col_num2)

#### 試行錯誤過程

In [10]:
from decimal import *

In [11]:
#標準偏差が0である列をなくす
def drop_str0_X(X):
    for each_col in X.columns:
        if X[each_col].std()==0.0:
            X = X.drop(each_col, axis=1)
    return X

In [13]:
drop_str0_X(X_ori55_ad)

Unnamed: 0,A,B,C,D,E
0,0,1,0,0,1
1,1,1,1,1,0
2,0,1,0,1,0
3,1,0,1,0,0
4,0,0,0,1,1


In [16]:
max_X_element = X_ori55_ad.max(axis=1).max() 
min_X_element = X_ori55_ad.min(axis=1).min() 

num_samples = X_ori55_ad.shape[0]
num_cols = X_ori55_ad.shape[1]

In [18]:
#特徴量選択(内積で取り出す)
inner_product_list = [np.dot(X_ori55_ad[each_col], ori_y5) for each_col in X_ori55_ad.columns]
inner_product_list

[array([1]), array([2]), array([1]), array([1]), array([1])]

In [19]:
max_index = inner_product_list.index(max(inner_product_list))
max_index

1

In [20]:
select_cols = [X_ori55_ad.columns[max_index]]
select_cols

['B']

In [21]:
else_cols = list(X_ori55_ad.columns)
del else_cols[max_index]
else_cols

['A', 'C', 'D', 'E']

In [22]:
def featrure_InnerProduct_sel(X, y, sel_col_num):
    inner_product_list = [np.dot(X[each_col], y) for each_col in X.columns] #内積のリスト
    sort_inner_product_list = sorted(inner_product_list ,reverse=True)
    
    select_cols = []
    for i in range(sel_col_num):
        iindex = inner_product_list.index(sort_inner_product_list[i])
        select_cols.append(X.columns[iindex])
    
    else_cols = list(X.columns)
    for col in select_cols:
        else_cols.remove(col)
    
    return select_cols, else_cols

In [25]:
#内積での様子を知る

ij = []
ax_by = []
for i in range(selected_col_num1):
    for j in range(num_cols - selected_col_num1):
        row_each_axby = np.dot(X_ori55_ad[select_cols[i]], ori_y5) - np.dot(X_ori55_ad[else_cols[j]], ori_y5)
        print(np.dot(X_ori55_ad[select_cols[i]], ori_y5))
        print(np.dot(X_ori55_ad[else_cols[j]], ori_y5))
        #print(row_each_axby)
        print('正になっている必要があります👉{}'.format(int(row_each_axby)))
        print((i, j))
        ax_by.append(int(row_each_axby))
        ij.append((i, j))
        print('----------------')

[2]
[1]
正になっている必要があります👉1
(0, 0)
----------------
[2]
[1]
正になっている必要があります👉1
(0, 1)
----------------
[2]
[1]
正になっている必要があります👉1
(0, 2)
----------------
[2]
[1]
正になっている必要があります👉1
(0, 3)
----------------


## すべての不等式が成り立つ：条件
したいこと→ハミルトニアンを最小化させる（xを最適化）

In [26]:
# W : ay - byの最大値

#xの要素で最大値とxの要素の最小値を知れば推定はできる
W = max_X_element *1*num_samples - min_X_element*1*num_samples

In [27]:
type(W)

numpy.int64

In [28]:
#while not バージョン
def make_y1(y_after):
    select_index = random.randrange(num_samples)
    y_after[select_index] = 1 - y_after[select_index]
    while not y_after.std()>0:
        select_index = random.randrange(num_samples)
        y_after[select_index] = 1 - y_after[select_index]
    return y_after

In [73]:
# while True & if break バージョン
def make_y(y_after):
    while True:
        select_index = random.randrange(num_samples)
        y_after[select_index] = 1 - y_after[select_index]
        if np.dot(np.array([1]*num_samples).T, y_after) == y1_sum:
            if y_after.std():
                #print(y_after.std())
                break
    return y_after 

In [34]:
def make_ax_by(X, y, select_cols, else_cols):
    ij = []
    ax_by = []
    for i in select_cols:
        for j in else_cols:
            row_each_axby = np.dot(X[i], y) - np.dot(X[j], y)
            ax_by.append(row_each_axby)
    return ax_by

In [39]:
def H(ax_by, index_ax_by):
    z = [0]*W
        
    each_axby = int(ax_by[index_ax_by])
    sum_w_z = 0
    
    if each_axby >= 0: #変えた
        z[each_axby] = 1
        sum_w_z = sum([each_axby*z[w] for w in range(W)])
    else:
        z[0] = 1
        sum_w_z = sum([1*z[w] for w in range(W)])
        
    sum_z = sum(z)

    ans_h =  (1-sum_z)**2 + (sum_w_z - each_axby)**2
    return ans_h

In [36]:
def H_sum(ax_by):
    H_sum =  sum([H(ax_by, index_ax_by) for index_ax_by in range(len(ax_by))])
    return H_sum

In [37]:
ax_by = make_ax_by(X_ori55_ad, ori_y5, select_cols, else_cols)
ax_by

[array([1]), array([1]), array([1]), array([1])]

In [40]:
ori_H_sum = H_sum(ax_by)
ori_H_sum

0

In [41]:
#ori_yからはめでたいことに一つも出力されなくなった
for index_ax_by in range(len(ax_by)):
    if H(ax_by, index_ax_by) > 0:
        print(H(ax_by, index_ax_by)) #Hの値を知る
        print(ij[index_ax_by]) #どの特徴量同士から発生している問題か知る
        print(ax_by[index_ax_by]) #マイナスになっているのか確認
        print('--------')

In [75]:
#yを変えてみて、どれだけダミーyが得られるのか調べてみる
appropriate_y_list = []
y_after = make_y(ori_y5)
for ii in range(100):
    y_after = make_y(y_after)
    ax_by = make_ax_by(X_ori55_ad, y_after, select_cols, else_cols)
    each_H_sum = H_sum(ax_by)
    if each_H_sum == 0:
        print(y_after)
        print(ij[index_ax_by])
        print(ax_by[index_ax_by])
        print('--------')
        if not list(y_after) in appropriate_y_list:
            #appropriate_y_list.append(list(y_after))
            appropriate_y_list.append(list(y_after))
appropriate_y_list

0    1
1    0
2    1
3    1
4    0
Name: y, dtype: int64
(0, 3)
1
--------
0    1
1    1
2    1
3    0
4    0
Name: y, dtype: int64
(0, 3)
2
--------
0    1
1    1
2    1
3    0
4    0
Name: y, dtype: int64
(0, 3)
2
--------
0    1
1    1
2    1
3    0
4    0
Name: y, dtype: int64
(0, 3)
2
--------
0    1
1    0
2    1
3    1
4    0
Name: y, dtype: int64
(0, 3)
1
--------
0    1
1    1
2    1
3    0
4    0
Name: y, dtype: int64
(0, 3)
2
--------
0    1
1    1
2    1
3    0
4    0
Name: y, dtype: int64
(0, 3)
2
--------
0    1
1    1
2    1
3    0
4    0
Name: y, dtype: int64
(0, 3)
2
--------
0    1
1    0
2    1
3    1
4    0
Name: y, dtype: int64
(0, 3)
1
--------
0    1
1    1
2    1
3    0
4    0
Name: y, dtype: int64
(0, 3)
2
--------
0    1
1    0
2    1
3    1
4    0
Name: y, dtype: int64
(0, 3)
1
--------
0    1
1    0
2    1
3    1
4    0
Name: y, dtype: int64
(0, 3)
1
--------
0    1
1    0
2    1
3    1
4    0
Name: y, dtype: int64
(0, 3)
1
--------
0    1
1    0
2    1
3   

[[1, 0, 1, 1, 0], [1, 1, 1, 0, 0]]

In [None]:
#本当に選ばれたyを用いるとBが特徴量として選ばれるのか確認する。

selector.fit(X_ori, pd.Series(appropriate_y_list[0]))
mask = selector.get_support()
mask #たしかにA, DのみがTrue

In [None]:
selector.fit(X_ori, pd.Series(appropriate_y_list[1]))
mask = selector.get_support()
mask #たしかにA, DのみがTrue

In [None]:
#得られるyは他にないのか確認
appropriate_y_list = []
y_after = make_y(ori_y)
for ii in range(200):
    y_after = make_y(y_after)
    ij, ax_by = make_ax_by(y_after)
    each_H_sum = H_sum(ax_by)
    if each_H_sum == 0:
        print(y_after)
        print(ij[index_ax_by])
        print(ax_by[index_ax_by])
        print('--------')
        if not list(y_after) in appropriate_y_list:
            #appropriate_y_list.append(list(y_after))
            appropriate_y_list.append(list(y_after))
appropriate_y_list #3個以上にはならなかった