In [26]:
import pandas as pd 
import numpy as np
from scipy.special import comb
import math
from neal import SimulatedAnnealingSampler
from pyqubo import Array, Constraint, Placeholder, solve_qubo

### 解析用データ作成

（3年間骨肉腫に罹患しない）事象と各特徴量とのP値を計算する。
- Osteogenic sarcoma data:骨肉腫
- LI:リンパ球浸潤（Lymphocytic Infiltration）
- AOP:類骨病理所見（Any Osteoid Pathology）

In [2]:
# Load the osteogenic sarcoma table; the first CSV column is used as the index.
ost = pd.read_csv("../input/ost.csv", index_col=0, sep=",")
ost  # already cross-checked against the paper

Unnamed: 0_level_0,LI,SEX,AOP,COUNT
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,0,0
0,0,0,1,0
0,0,1,0,0
0,0,1,1,0
0,1,0,0,0
0,1,0,1,2
0,1,1,0,4
0,1,1,1,11
1,0,0,0,3
1,0,0,1,2


#### 論文記載のTable1を再現

In [3]:
# Every (LI, SEX, AOP) combination of binary values, in lexicographic order.
# Used below to reproduce Table 1 of the paper.
ost_original_LISEXAOP = [
    [li_flag, sex_flag, aop_flag]
    for li_flag in (0, 1)
    for sex_flag in (0, 1)
    for aop_flag in (0, 1)
]
len(ost_original_LISEXAOP)

8

In [4]:
# Placeholder frame (8 patterns x 4 columns); its cells are overwritten in a later cell.
ost_original = pd.DataFrame(
    np.arange(32).reshape(8, 4),
    columns=['LI', 'SEX', 'AOP', 'Propotion DFI3'],
)

# Per-pattern COUNT totals: pro_pra collects rows whose index (Y) is 1,
# pro_mina collects all remaining rows.
pro_pra = [0] * 8
pro_mina = [0] * 8
for row_pos, y_value in enumerate(ost.index):
    row_pattern = list(ost.iloc[row_pos, 0:3])
    row_count = ost.iloc[row_pos, 3]
    bucket = pro_pra if y_value == 1 else pro_mina
    for pattern_pos, pattern in enumerate(ost_original_LISEXAOP):
        if row_pattern == pattern:
            bucket[pattern_pos] += row_count

print(pro_pra)
print(pro_mina)

[3, 2, 4, 1, 5, 3, 5, 6]
[0, 0, 0, 0, 0, 2, 4, 11]


In [5]:
# Share of Y == 1 samples for each (LI, SEX, AOP) pattern, rounded to two decimals.
# Computed once up front; the ratios do not change between loop iterations.
proportions = [
    round(pro_pra[k] / (pro_pra[k] + pro_mina[k]), 2)
    for k in range(len(pro_pra))
]

# Write the pattern itself into the first three columns of each row.
for i in range(len(ost_original_LISEXAOP)):  # one row per pattern
    ost_original.iloc[i, :3] = ost_original_LISEXAOP[i]

# Replace the whole column in one assignment so it becomes a float column,
# instead of writing floats one by one into the integer placeholder column.
# The column label (including its spelling) is kept as-is because it is data.
ost_original['Propotion DFI3'] = proportions
ost_original

Unnamed: 0,LI,SEX,AOP,Propotion DFI3
0,0,0,0,1.0
1,0,0,1,1.0
2,0,1,0,1.0
3,0,1,1,1.0
4,1,0,0,1.0
5,1,0,1,0.6
6,1,1,0,0.56
7,1,1,1,0.35


In [21]:
# Preview the first rows of the raw table (the index holds Y).
ost.head()

Unnamed: 0_level_0,LI,SEX,AOP,COUNT
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,0,0
0,0,0,1,0
0,0,1,0,0
0,0,1,1,0
0,1,0,0,0


### (t_0, t_1, t_2, t_3)を表示
- t_0 = 「3年後に骨肉腫に罹患していない(Y=1)」サンプル数
- t_i (i=1,2,3) = 「（3年後に骨肉腫に罹患していない(Y=1)）かつ（特徴量iが1である）」サンプル数（i=1: LI, i=2: SEX, i=3: AOP）

In [6]:
# Build t_list = [t_0, t_1, t_2, t_3] from the COUNT column (column 3).
n_rows = ost.shape[0]

# t_0: index value (Y) times COUNT, summed over all rows,
# i.e. the number of samples with Y == 1.
count_y = sum(ost.index[row] * ost.iat[row, 3] for row in range(n_rows))
t_list = [count_y]

# t_1..t_3: COUNT-weighted total of each of the first three columns,
# restricted to rows whose index (Y) equals 1.
for j in range(3):
    count = sum(
        ost.iat[row, j] * ost.iat[row, 3]
        for row in range(n_rows)
        if ost.index[row] == 1
    )
    t_list.append(count)

In [7]:
t_list  # order of entries: Y, LI, SEX, AOP

[29, 19, 16, 12]

### t_1(LI)以外固定してサンプリング

In [31]:
# Total of the last column of ost (COUNT), i.e. the overall number of samples.
pacient_num = sum(ost.iloc[:, -1])

In [38]:
# Binary decision variables for LI.
# NOTE(review): pacient_num variables are created, but H below only uses
# li[0] .. li[ost.shape[0] - 1] (one per table row, weighted by COUNT).
# Confirm whether per-row or per-patient variables are intended.
li = Array.create('LI', shape=pacient_num, vartype='BINARY')

# Quadratic penalty: the COUNT-weighted sum of the LI variables must equal
# t_1 = t_list[1] (pinned to 19 for this data).
H = (sum(li[i] * ost.iloc[i, -1] for i in range(ost.shape[0])) - t_list[1]) ** 2
qb = H.compile().to_qubo()  # qb[0] is the QUBO passed to the sampler
sa_sampler = SimulatedAnnealingSampler()

# Fixed seed so that a fresh kernel run reproduces the same 100 reads.
res = sa_sampler.sample_qubo(qb[0], num_reads=100, seed=42)

In [41]:
# Number of rows in the sample set returned by the sampler.
print(len(res))

100


In [39]:
# Show the first field of every record in the sample set, one per line.
for record in res.record:
    print(record[0])

[1 1 0 0 0 1 1 0 1 1 1 1 0 0 0 1]
[0 1 0 0 0 1 1 1 1 0 0 1 0 0 0 1]
[1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0]
[0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 1]
[0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 1]
[1 0 1 1 0 0 0 0 0 0 0 1 0 1 0 0]
[1 0 1 0 1 0 0 0 1 0 0 1 0 1 0 1]
[0 0 1 0 0 0 0 1 1 1 1 1 0 1 1 1]
[0 1 0 0 1 0 1 1 0 0 1 1 1 0 0 0]
[1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 0]
[0 1 1 1 0 1 0 1 0 0 1 0 1 0 0 0]
[1 0 0 0 0 1 1 1 1 0 1 1 1 0 0 1]
[1 0 0 1 1 0 1 1 0 1 1 0 0 0 1 1]
[1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 1]
[0 0 0 0 1 1 1 0 1 0 0 1 0 0 1 0]
[0 1 0 1 0 1 0 1 0 0 1 0 0 0 1 1]
[1 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0]
[1 1 1 0 0 1 1 0 0 1 1 0 0 0 1 0]
[1 0 1 0 1 0 1 0 1 0 1 1 1 0 1 0]
[0 0 0 1 1 0 0 1 1 1 0 1 1 0 1 1]
[0 0 0 0 1 1 0 1 1 1 0 1 1 0 1 1]
[0 0 1 0 1 0 1 0 0 1 1 1 1 0 1 0]
[0 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1]
[0 0 1 1 0 0 0 0 1 1 1 1 0 1 0 0]
[1 1 0 1 1 0 0 0 1 0 1 1 0 0 1 1]
[1 1 0 1 0 1 0 1 0 0 1 1 0 0 1 0]
[0 0 1 0 0 1 1 1 0 1 0 1 0 0 1 1]
[0 0 1 1 0 1 0 1 0 1 0 1 1 0 0 1]
[1 0 1 0 1 0 1 0 1 1 0 0 1 0 1 1]
[1 1 1 0 0 1 1

In [30]:
# pacient_num binary variables for the outcome and for each feature.
y = Array.create('Y', shape=pacient_num, vartype='BINARY')
sex = Array.create('SEX', shape=pacient_num, vartype='BINARY')
aop = Array.create('AOP', shape=pacient_num, vartype='BINARY')
# NOTE(review): li is created here but does not appear in H below.
li = Array.create('LI', shape=pacient_num, vartype='BINARY')

# Quadratic penalties pinning the totals of Y, SEX and AOP to
# t_list[0], t_list[2] and t_list[3] respectively.
H = (sum(y) - t_list[0])**2 + (sum(sex) - t_list[2])**2 + (sum(aop) - t_list[3])**2
qb = H.compile().to_qubo()  # qb[0] is the QUBO passed to the sampler
sa_sampler = SimulatedAnnealingSampler()

# Fixed seed so that a fresh kernel run reproduces the same 100 reads.
res = sa_sampler.sample_qubo(qb[0], num_reads=100, seed=42)

In [None]:
# Inspect the raw records of the most recent sampling result.
res.record

In [9]:
# Display the raw table again for reference.
ost

Unnamed: 0_level_0,LI,SEX,AOP,COUNT
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,0,0
0,0,0,1,0
0,0,1,0,0
0,0,1,1,0
0,1,0,0,0
0,1,0,1,2
0,1,1,0,4
0,1,1,1,11
1,0,0,0,3
1,0,0,1,2


In [22]:
#超幾何分布の見本
feature_Y = pd.DataFrame(columns=['feature1', 'feature0', 'feature_sum'],index=['Y1', 'Y0', 'Y_sum'])
feature_Y.iloc[0, 0] = 'k'
feature_Y.iloc[0, 2] = 'K'
feature_Y.iloc[2, 0] = 'n'
feature_Y.iloc[2, 2] = 'N'

feature_Y

Unnamed: 0,feature1,feature0,feature_sum
Y1,k,,K
Y0,,,
Y_sum,n,,N


In [11]:
# 3x3 contingency table (Y x LI, plus margins) initialised to integer zeros.
# Built directly with a scalar fill value instead of creating an all-NaN
# object frame and filling it in place, so the cells are integers from the start.
LI_Y = pd.DataFrame(
    0,
    index=['Y1', 'Y0', 'Ysum'],
    columns=['LI1', 'LI0', 'LIsum'],
)
LI_Y

Unnamed: 0,LI1,LI0,LIsum
Y1,0,0,0
Y0,0,0,0
Ysum,0,0,0


In [12]:
# Tally COUNT (last column of ost) into the 2x2 (Y x LI) cells.
# A local accumulator is used so that re-running this cell overwrites LI_Y
# instead of adding the same counts on top of the previous run.
cell_counts = [[0, 0], [0, 0]]  # rows: Y1, Y0 / columns: LI1, LI0
for i in range(ost.shape[0]):
    y_value = ost.index[i]
    li_value = ost.iloc[i, 0]
    n_samples = ost.iloc[i, -1]
    if y_value == 1 and li_value == 1:
        cell_counts[0][0] += n_samples
    elif y_value == 1 and li_value == 0:
        cell_counts[0][1] += n_samples
    elif y_value == 0 and li_value == 1:
        cell_counts[1][0] += n_samples
    else:
        # Every remaining row is counted as (Y0, LI0).
        cell_counts[1][1] += n_samples

for r in range(2):
    for c in range(2):
        LI_Y.iloc[r, c] = cell_counts[r][c]

# Row margins, then column margins.
LI_Y.iloc[0, 2] = LI_Y.iloc[0, 0] + LI_Y.iloc[0, 1]
LI_Y.iloc[1, 2] = LI_Y.iloc[1, 0] + LI_Y.iloc[1, 1]
LI_Y.iloc[2, 0] = LI_Y.iloc[0, 0] + LI_Y.iloc[1, 0]
LI_Y.iloc[2, 1] = LI_Y.iloc[0, 1] + LI_Y.iloc[1, 1]

# Grand total: the row margins and the column margins must add up to the same value.
# Fail loudly rather than leave the grand total at 0, which later cells divide by.
if LI_Y.iloc[0, 2] + LI_Y.iloc[1, 2] != LI_Y.iloc[2, 0] + LI_Y.iloc[2, 1]:
    raise ValueError('row and column margins of LI_Y do not add up to the same total')
LI_Y.iloc[2, 2] = LI_Y.iloc[2, 0] + LI_Y.iloc[2, 1]

In [13]:
# Display the filled Y x LI contingency table with its margins.
LI_Y

Unnamed: 0,LI1,LI0,LIsum
Y1,19,10,29
Y0,17,0,17
Ysum,36,10,46


In [14]:
# Same table as nested plain-Python lists (one inner list per table row).
LI_Y_list = [table_row.tolist() for table_row in LI_Y.values]
LI_Y_list

[[19, 10, 29], [17, 0, 17], [36, 10, 46]]

超幾何分布を考えると、$K=29, n=36, k=19, N=46$

マルコフ基底において固定する十分統計量は$29, 17, 46, 10, 36$（周辺和）

In [15]:
#NCn
print(comb(46, 36, exact=True))
print(comb(LI_Y.iloc[2, 2], LI_Y.iloc[2, 0], exact=True))

4076350421
4076350421


In [16]:
# Hypergeometric probability of the current table:
# C(K, k) * C(N - K, n - k) / C(N, n), read from the cells and margins of LI_Y.
y1_total, y1_li1 = LI_Y.iloc[0, 2], LI_Y.iloc[0, 0]
y0_total, y0_li1 = LI_Y.iloc[1, 2], LI_Y.iloc[1, 0]
all_total, li1_sum = LI_Y.iloc[2, 2], LI_Y.iloc[2, 0]

KCk = comb(y1_total, y1_li1, exact=True)
NKCnk = comb(y0_total, y0_li1, exact=True)
NCn = comb(all_total, li1_sum, exact=True)
p_ori = (KCk * NKCnk) / NCn
p_ori

0.004913711514302612

In [17]:
# Same probability written with factorials:
# (product of the four margin factorials) / (N! times the four cell factorials).
p_ori_chi = 1
for margin in (LI_Y.iloc[0, 2], LI_Y.iloc[1, 2], LI_Y.iloc[2, 1], LI_Y.iloc[2, 0]):
    p_ori_chi *= math.factorial(margin)

p_ori_mom = math.factorial(LI_Y.iloc[2, 2])
for cell in (LI_Y.iloc[0, 0], LI_Y.iloc[0, 1], LI_Y.iloc[1, 0], LI_Y.iloc[1, 1]):
    p_ori_mom *= math.factorial(cell)

p_ori = p_ori_chi / p_ori_mom
p_ori

0.004913711514302612

In [18]:
# Factorial form once more, this time reading the values from the nested list.
(
    (y1_li1_cell, y1_li0_cell, y1_row_total),
    (y0_li1_cell, y0_li0_cell, y0_row_total),
    (li1_col_total, li0_col_total, table_total),
) = LI_Y_list

fact = math.factorial
p_ori_chi = fact(y1_row_total) * fact(y0_row_total) * fact(li0_col_total) * fact(li1_col_total)
p_ori_mom = (
    fact(table_total)
    * fact(y1_li1_cell) * fact(y1_li0_cell)
    * fact(y0_li1_cell) * fact(y0_li0_cell)
)
p_ori = p_ori_chi / p_ori_mom
p_ori


0.004913711514302612

In [19]:
# Enumerate every 2x2 table that shares the margins of LI_Y and add up the
# hypergeometric probability of each one (k plays the role of the Y1/LI1 cell).
# The enumeration works on a copy, so the observed table stored in LI_Y is not
# overwritten and earlier cells still give the same answer when re-run.
table = LI_Y.copy()
y1_margin = table.iloc[0, 2]      # K: row total of Y1
li1_margin = table.iloc[2, 0]     # n: column total of LI1
total_margin = table.iloc[2, 2]   # N: grand total

p_sum = 0
range_num = min(y1_margin, li1_margin) + 1
# The denominator C(N, n) depends only on the margins, so compute it once.
NCn = comb(total_margin, li1_margin, exact=True)
for k in range(0, range_num):
    table.iloc[0, 0] = k
    table.iloc[0, 1] = y1_margin - k
    table.iloc[1, 0] = li1_margin - k
    table.iloc[1, 1] = total_margin - y1_margin - (li1_margin - k)
    # Skip values of k that would make any cell negative.
    if min(k, table.iloc[0, 1], table.iloc[1, 0], table.iloc[1, 1]) >= 0:
        print(table)
        KCk = comb(y1_margin, k, exact=True)
        NKCnk = comb(table.iloc[1, 2], table.iloc[1, 0], exact=True)
        p = KCk*NKCnk/NCn
        print('p =',p)

        p_sum += p
        print('p_sum =', p_sum)

p_sum

      LI1  LI0  LIsum
Y1     19   10     29
Y0     17    0     17
Ysum   36   10     46
p = 0.004913711514302612
p_sum = 0.004913711514302612
      LI1  LI0  LIsum
Y1     20    9     29
Y0     16    1     17
Ysum   36   10     46
p = 0.041766547871572204
p_sum = 0.04668025938587482
      LI1  LI0  LIsum
Y1     21    8     29
Y0     15    2     17
Ysum   36   10     46
p = 0.14319959270253327
p_sum = 0.18987985208840807
      LI1  LI0  LIsum
Y1     22    7     29
Y0     14    3     17
Ysum   36   10     46
p = 0.26036289582278777
p_sum = 0.45024274791119584
      LI1  LI0  LIsum
Y1     23    6     29
Y0     13    4     17
Ysum   36   10     46
p = 0.27734308468079566
p_sum = 0.7275858325919915
      LI1  LI0  LIsum
Y1     24    5     29
Y0     12    5     17
Ysum   36   10     46
p = 0.18027300504251717
p_sum = 0.9078588376345087
      LI1  LI0  LIsum
Y1     25    4     29
Y0     11    6     17
Ysum   36   10     46
p = 0.07210920201700687
p_sum = 0.9799680396515156
      LI1  LI0  LIsu

1.0