# 不等式のQUBO変換を用いて基底エネルギー以下のサンプルを得る


#### **このファイルのミッション**
基底状態のエネルギー（ハミルトニアンの最小値）を0にする


#### **データ、条件**
small data

### **調整してみたこと**
- Xの標準化
- 内積の絶対値を用いたこと

### import

In [1]:
#!pip3 install pyqubo

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
import random
from pyqubo import Array, OneHotEncInteger, solve_qubo
import scipy.stats

### 各変数

In [3]:
# X
#matrix = np.random.randn(5,5)
#X_ori= pd.DataFrame(matrix, columns=list('ABCDE'))
#X_ori

# CSV ファイル (df1.csv) として出力
#X_ori.to_csv("../input/df1.csv")

In [4]:
# Load the sample feature matrix; the first CSV column is used as the row index.
X_ori = pd.read_csv("../input/df1.csv", sep=',', index_col=0)
X_ori

Unnamed: 0,A,B,C,D,E
0,-1.297303,-0.049925,0.185726,-0.580797,0.926729
1,1.129216,0.263337,-2.072408,0.269691,-0.565672
2,-1.140225,0.447902,-0.641875,-0.032109,0.415254
3,-1.004241,0.700738,-1.250194,-1.82718,1.972915
4,1.119464,0.248735,1.328148,0.29528,1.007603


In [5]:
# y: the original target vector of 0/1 labels (five entries, matching the five rows shown for X_ori)
ori_y = pd.Series([1, 0, 1, 1, 0])
ori_y

0    1
1    0
2    1
3    1
4    0
dtype: int64

In [6]:
# Number of features to keep; passed as k to SelectKBest further down.
selected_col_num = 2

## 特徴量選択

<br/>

#### **ピアソンの相関係数**
$
r_{xy} = \frac{\displaystyle \sum_{i = 1}^n (x_i - \overline{x})(y_i - \overline{y})}{\sqrt{\displaystyle \sum_{i = 1}^n (x_i - \overline{x})^2} \sqrt{\displaystyle \sum_{i = 1}^n (y_i - \overline{y})^2}} = \frac{s_{xy}}{s_x s_y} = \frac{x' \cdot y'}{\lVert x' \rVert \, \lVert y' \rVert} = \cos\theta
$

ここで $x' = x - \overline{x}$、$y' = y - \overline{y}$（各要素から平均を引いたベクトル）である。

- [２つのデータをベクトルと捉えると、n次元空間上の2本のベクトルの間の角度と捉える事ができる](https://www.slideshare.net/matsukenbook/5-55326268?next_slideshow=1)
- どちらか一方でも標準偏差が0、分散が0のとき分母が0となり計算できないことに注意

In [7]:
from decimal import *

In [8]:
# Remove every column whose standard deviation is not strictly positive
# (zero or NaN): the Pearson correlation is undefined for such a column.
# DataFrame.drop returns a new frame instead of modifying X_ori, so the
# result has to be assigned back for the columns to actually disappear.
constant_cols = [col for col in X_ori.columns if not X_ori[col].std() > 0]
X_ori = X_ori.drop(columns=constant_cols)

In [10]:
# Standardize the elements of X column by column (z-score: mean 0, variance 1).
# NOTE(review): only the column labels are passed on to the new frame, not X_ori's row index.
X_standardization = pd.DataFrame(scipy.stats.zscore(X_ori), columns=X_ori.columns)
X_standardization

Unnamed: 0,A,B,C,D,E
0,-0.948036,-1.502995,0.576689,-0.259926,0.211537
1,1.224875,-0.237599,-1.350141,0.814379,-1.588719
2,-0.807376,0.507934,-0.12949,0.433156,-0.405446
3,-0.685604,1.529245,-0.648559,-1.834311,1.473533
4,1.216142,-0.296585,1.5515,0.846703,0.309095


In [11]:
# Score every standardized column against ori_y with f_regression and keep
# the selected_col_num best-scoring columns.
selector = SelectKBest(score_func=f_regression, k=selected_col_num) 
selector.fit(X_standardization, ori_y)
# Boolean mask over the columns: True where the column was selected.
mask = selector.get_support() 

In [12]:
# Split the column names by the selector mask: the selected features go to
# select_cols, every other column goes to else_cols (column order is kept).
select_cols = [col for col, kept in zip(X_standardization.columns, mask) if kept == 1]
else_cols = [col for col, kept in zip(X_standardization.columns, mask) if kept != 1]

select_cols

['A', 'D']

In [14]:
# Largest and smallest single entry of the whole standardized matrix
# (reduce per column first, then over the per-column results).
max_X_element = X_standardization.max().max()
min_X_element = X_standardization.min().min()

# shape is (number of rows, number of columns): rows are samples, columns are features.
num_samples, num_cols = X_standardization.shape

In [17]:
# Inspect the inner products <x, y> for every (selected, unselected) column pair.
# The margin of a pair is |<x_selected, y>| - |<x_other, y>|.

ij = []
ax_by = []
for i, selected_col in enumerate(select_cols):
    # The selected column's inner product does not depend on j: compute it once.
    selected_dot = np.dot(X_standardization[selected_col], ori_y)
    for j, other_col in enumerate(else_cols):
        other_dot = np.dot(X_standardization[other_col], ori_y)
        row_each_axby = abs(selected_dot) - abs(other_dot)
        # Margin shifted by +1 and truncated to an int, the same value make_ax_by_dot stores.
        shifted_margin = int(row_each_axby + 1)
        print(selected_dot)
        print(other_dot)
        print(shifted_margin)
        print((i, j))
        # Store exactly the value that was printed (previously the unshifted
        # int(row_each_axby) was stored while the shifted one was printed).
        ax_by.append(shifted_margin)
        ij.append((i, j))
        print('--------------------')

-2.4410162734247844
0.5341841098653275
2
(0, 0)
--------------------
-2.4410162734247844
-0.20135913539440736
3
(0, 1)
--------------------
-2.4410162734247844
1.2796242236271855
2
(0, 2)
--------------------
-1.6610813048025694
0.5341841098653275
2
(1, 0)
--------------------
-1.6610813048025694
-0.20135913539440736
2
(1, 1)
--------------------
-1.6610813048025694
1.2796242236271855
1
(1, 2)
--------------------


In [18]:
# Changed in this revision: this comparison originally used the inner product.
# The correlation coefficient is inner product / norm / norm, and because the
# norm of x differs from column to column, raw inner products were assumed
# not to be directly comparable.

ij = []
ax_by = []
for i, selected_col in enumerate(select_cols):
    # The selected column's correlation does not depend on j: compute it once.
    selected_corr = np.corrcoef(X_standardization[selected_col], ori_y)[0, 1]
    for j, other_col in enumerate(else_cols):
        other_corr = np.corrcoef(X_standardization[other_col], ori_y)[0, 1]
        row_each_axby = abs(selected_corr) - abs(other_corr)
        # Margin shifted by +1 and truncated to an int, the same convention as make_ax_by_dot.
        shifted_margin = int(row_each_axby + 1)
        print(selected_corr)
        print(other_corr)
        print(shifted_margin)
        print((i, j))
        # Store exactly the value that was printed (previously the unshifted
        # int(row_each_axby) was stored while the shifted one was printed).
        ax_by.append(shifted_margin)
        ij.append((i, j))
        print('--------------------')

-0.9965407206201372
0.21807974964548024
1
(0, 0)
--------------------
-0.9965407206201372
-0.08220452279404826
1
(0, 1)
--------------------
-0.9965407206201372
0.5224044017319465
1
(0, 2)
--------------------
-0.678133603007132
0.21807974964548024
1
(1, 0)
--------------------
-0.678133603007132
-0.08220452279404826
1
(1, 1)
--------------------
-0.678133603007132
0.5224044017319465
1
(1, 2)
--------------------


## すべての不等式が成り立つ：条件
したいこと→ハミルトニアンを最小化させる（xを最適化）

In [19]:
# Although this column was selected as a feature, its correlation coefficient is negative, very close to -1.
# It seems to be selected because it is (negatively) correlated, so taking the absolute value looks necessary.

#np.corrcoef(X_standardization[select_cols[0]], ori_y)[0, 1]
abs(np.corrcoef(X_standardization[select_cols[0]], ori_y)[0, 1])

0.9965407206201372

In [20]:
# W: (estimated) maximum value of ay - by; H uses it as the length of its slack vector z.

# It can be estimated from the largest and the smallest element of X:
# (max element * number of samples) - (min element * number of samples), floored to an int.
W = int((max_X_element *1*num_samples - min_X_element*1*num_samples)//1)

In [21]:
# "while not" variant
def make_y1(y_after):
    """Return a randomly perturbed copy of ``y_after`` that is not constant.

    One randomly chosen entry is flipped (``v -> 1 - v``); further random
    entries are flipped until the standard deviation is strictly positive.

    Args:
        y_after: label vector supporting ``.copy()``, ``len()``, integer item
            access and ``.std()`` (a pandas Series in this notebook;
            presumably holding 0/1 values).

    Returns:
        A new vector of the same type; the argument itself is left untouched.

    Raises:
        ValueError: if ``y_after`` has fewer than two entries, since such a
            vector can never reach a strictly positive standard deviation.
    """
    # Work on a copy so the caller's vector (e.g. ori_y) is never modified.
    flipped = y_after.copy()
    # Use the vector's own length instead of a module-level sample count.
    size = len(flipped)
    if size < 2:
        raise ValueError("make_y1 needs at least two entries to produce a non-constant vector")
    select_index = random.randrange(size)
    flipped[select_index] = 1 - flipped[select_index]
    while not flipped.std() > 0:
        select_index = random.randrange(size)
        flipped[select_index] = 1 - flipped[select_index]
    return flipped

In [22]:
# "while True" + "if ... break" variant
def make_y(y_after):
    """Return a randomly perturbed copy of ``y_after``.

    Random entries are flipped (``v -> 1 - v``) one at a time until the
    standard deviation of the vector is truthy, i.e. non-zero.

    Args:
        y_after: label vector supporting ``.copy()``, ``len()``, integer item
            access and ``.std()`` (a pandas Series in this notebook;
            presumably holding 0/1 values).

    Returns:
        A new vector of the same type; the argument itself is left untouched.
    """
    # Work on a copy so the caller's vector (e.g. ori_y) is never modified.
    flipped = y_after.copy()
    # Use the vector's own length instead of a module-level sample count.
    size = len(flipped)
    while True:
        select_index = random.randrange(size)
        flipped[select_index] = 1 - flipped[select_index]
        # Stop as soon as the vector is no longer constant (std != 0).
        if flipped.std():
            break
    return flipped

In [23]:
def make_ax_by_dot(y_after):
    """Compute the shifted inner-product margin of every column pair.

    For each pair (i, j), with i indexing ``select_cols`` and j indexing
    ``else_cols``, the margin is
    ``|<X[select_cols[i]], y_after>| - |<X[else_cols[j]], y_after>| + 1``,
    truncated to an int.

    Args:
        y_after: label vector to take the inner products with.

    Returns:
        A tuple ``(ij, ax_by)``: the list of (i, j) index pairs and the list
        of the corresponding integer margins, in the same order.
    """
    pairs = [
        (i, j)
        for i in range(selected_col_num)
        for j in range(num_cols - selected_col_num)
    ]
    margins = []
    for i, j in pairs:
        selected_abs = abs(np.dot(X_standardization[select_cols[i]], y_after))
        other_abs = abs(np.dot(X_standardization[else_cols[j]], y_after))
        margins.append(int(selected_abs - other_abs + 1))
    return pairs, margins

In [24]:
def H(ax_by, index_ax_by):
    """Return the penalty of one margin under a one-hot slack encoding.

    The margin ``ax_by[index_ax_by]`` is floored. If the floored value is at
    least 1, the slack bit at that position is set and the penalty is 0.
    Otherwise slack bit 0 is set and the penalty is ``(1 - floored) ** 2``.

    Args:
        ax_by: sequence of margins.
        index_ax_by: position in ``ax_by`` of the margin to evaluate.

    Returns:
        The sum of the one-hot term ``(1 - sum(z)) ** 2`` and the value term
        ``(sum_w_z - floored) ** 2``.

    Raises:
        IndexError: if the slack vector of length ``W`` has no slot for the
            floored margin (floored value >= W, or W <= 0).
    """
    floored = ax_by[index_ax_by] // 1
    slack = [0] * W

    if floored >= 1:
        # Mark the slot of the floored margin; every set bit is weighted by that margin.
        slack[int(floored)] = 1
        weight = floored
    else:
        # Margins below 1 always use slot 0 with weight 1.
        slack[0] = 1
        weight = 1

    weighted_total = sum(weight * bit for bit in slack)
    one_hot_term = (1 - sum(slack)) ** 2
    value_term = (weighted_total - int(floored)) ** 2
    return one_hot_term + value_term

In [25]:
def H_sum(ax_by):
    """Return the total penalty: H summed over every position of ``ax_by``."""
    total = 0
    for position in range(len(ax_by)):
        total += H(ax_by, position)
    return total

In [26]:
# Evaluate the total penalty for the original labels ori_y.
ij, ax_by = make_ax_by_dot(ori_y)
ori_H_sum = H_sum(ax_by)
ori_H_sum

0

In [27]:
# Happily, nothing is printed for ori_y any more (no pair has a positive penalty).
for index_ax_by, margin in enumerate(ax_by):
    penalty = H(ax_by, index_ax_by)
    if penalty <= 0:
        continue
    print(penalty)  # the value of H
    print(ij[index_ax_by])  # which pair of features causes the problem
    print(margin)  # check whether the margin is negative
    print('--------')

In [28]:
# Perturb y at random and see how many dummy y vectors (total penalty 0) can be found.
appropriate_y_list = []
# Start from a copy: make_y flips entries of the vector it is given, and the
# original labels in ori_y must stay intact for the later cells.
y_after = make_y(ori_y.copy())
for _ in range(100):
    y_after = make_y(y_after)
    ij, ax_by = make_ax_by_dot(y_after)
    each_H_sum = H_sum(ax_by)
    if each_H_sum == 0:
        print(y_after)
        # Show the last column pair and its margin. This used to be indexed by
        # a loop variable left over from an earlier cell, which pointed at the
        # last pair; [-1] gives the same output without the hidden dependency.
        print(ij[-1])
        print(ax_by[-1])
        print('--------')
        candidate = list(y_after)
        if candidate not in appropriate_y_list:
            appropriate_y_list.append(candidate)
appropriate_y_list

0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------


[[0, 1, 0, 0, 1]]

In [31]:
# Check that A and D are really chosen as features when the found y is used.

selector.fit(X_standardization, pd.Series(appropriate_y_list[0]))
mask = selector.get_support()
mask # indeed only A and D are True

array([ True, False, False,  True, False])

In [32]:
# Look at [1, 0, 1, 1, 0], which was found with the correlation-coefficient computation but not with |inner product|.
selector.fit(X_standardization, pd.Series([1, 0, 1, 1, 0]))
mask = selector.get_support()
mask # same result: only A and D are True

array([ True, False, False,  True, False])

In [33]:
# Total penalty of [1, 0, 1, 1, 0] under the inner-product margins.
ij, ax_by = make_ax_by_dot(pd.Series([1, 0, 1, 1, 0]))
each_H_sum = H_sum(ax_by)
each_H_sum  # it was 0; apparently it simply was not reached within 100 trials

0

In [34]:
# Perturb y at random and see how many dummy y vectors (total penalty 0) can be found.
appropriate_y_list = []
# Start from a copy: make_y flips entries of the vector it is given, and the
# original labels in ori_y must stay intact.
y_after = make_y(ori_y.copy())
for _ in range(200):
    y_after = make_y(y_after)
    ij, ax_by = make_ax_by_dot(y_after)
    each_H_sum = H_sum(ax_by)
    if each_H_sum == 0:
        print(y_after)
        # Show the last column pair and its margin. This used to be indexed by
        # a loop variable left over from an earlier cell, which pointed at the
        # last pair; [-1] gives the same output without the hidden dependency.
        print(ij[-1])
        print(ax_by[-1])
        print('--------')
        candidate = list(y_after)
        if candidate not in appropriate_y_list:
            appropriate_y_list.append(candidate)
appropriate_y_list # the same list of y vectors as with the correlation coefficient was obtained

0    1
1    0
2    1
3    1
4    0
dtype: int64
(1, 2)
1
--------
0    1
1    0
2    1
3    1
4    0
dtype: int64
(1, 2)
1
--------
0    1
1    0
2    1
3    1
4    0
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    1
1    0
2    1
3    1
4    0
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    1
1    0
2    1
3    1
4    0
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    0
1    1
2    0
3    0
4    1
dtype: int64
(1, 2)
1
--------
0    1
1  

[[1, 0, 1, 1, 0], [0, 1, 0, 0, 1]]