In [1]:
import pandas as pd                                         
import numpy as np                                              
from scipy.special import comb                                      
import math
from operator import mul
import neal 
import dimod
#from pyqubo import Array, Constraint, Placeholder, solve_qubo           
import itertools                                                        
import random                                                           
import matplotlib.pyplot as plt                                         
import timeit
from itertools import combinations

In [2]:
def calc_marginals(df):
    """Compute the four marginal totals associated with column ``Y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table providing the columns ``Y``, ``LI``, ``SEX`` and ``AOP``.

    Returns
    -------
    numpy.ndarray
        Four entries, in this order: the sum of ``Y``, then the dot product
        of ``Y`` with ``LI``, with ``SEX`` and with ``AOP``.
    """
    y = df['Y']
    # The plain total comes first; the weighted totals follow in column order.
    totals = [sum(y)]
    for column in ('LI', 'SEX', 'AOP'):
        totals.append(np.dot(y, df[column]))
    return np.array(totals)

In [3]:
def make_Hamiltonian(df, t1, weight):
    """Build the binary quadratic model that penalises marginal mismatches.

    With one binary variable ``y_i`` per row of ``df``, the model is

        weight * [ (sum_i y_i        - T0)**2
                 + (sum_i LI_i  y_i  - t1)**2
                 + (sum_i SEX_i y_i  - T2)**2
                 + (sum_i AOP_i y_i  - T3)**2 ]

    where ``T0``, ``T2`` and ``T3`` are entries 0, 2 and 3 of
    ``calc_marginals(df)``.  Entry 1 of ``calc_marginals(df)`` is not used:
    the ``LI`` target is the caller-supplied ``t1``.

    Each squared term ``(sum_i a_i y_i - t)**2`` expands, using
    ``y_i**2 == y_i`` for binary variables, into

        sum_i (a_i - 2 t) a_i y_i  +  2 sum_{i<j} a_i a_j y_i y_j  +  t**2

    which gives the linear, quadratic and constant parts below.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with columns ``Y``, ``LI``, ``SEX`` and ``AOP`` (``Y`` is read
        only through ``calc_marginals``).
    t1 : number
        Target value for the ``LI`` constraint.
    weight : number
        Multiplier applied to the whole penalty (linear, quadratic and
        constant parts alike), so that an assignment meeting all four
        constraints has energy 0 for every ``weight``.

    Returns
    -------
    dimod.BinaryQuadraticModel
        BINARY-vartype model over the variables ``0 .. len(df) - 1``.
    """
    t_list = calc_marginals(df)
    # The size is df.shape[0]; keeping it in a short local name reads better.
    N = len(df)

    # (coefficient vector a, target t) for each constraint (sum_i a_i*y_i - t)**2.
    # Columns are taken as plain arrays so that indexing is positional, as it
    # was with ``.iloc``.
    constraints = [
        (np.ones(N, dtype=int), t_list[0]),   # plain count of selected rows
        (np.asarray(df['LI']), t1),
        (np.asarray(df['SEX']), t_list[2]),
        (np.asarray(df['AOP']), t_list[3]),
    ]

    # Sum the expansion of every constraint.
    lin_vec = sum((a - 2 * t) * a for a, t in constraints)       # same-variable terms
    quad_mat = sum(2 * np.outer(a, a) for a, t in constraints)   # distinct-variable terms
    num = sum(t ** 2 for _, t in constraints)                    # constant terms

    # The weight must scale every part of the expansion; scaling only the
    # constant would shift the energy of a feasible assignment away from 0
    # whenever weight != 1.
    lin = {i: weight * lin_vec[i] for i in range(N)}
    quad = {(i, j): weight * quad_mat[i, j]
            for i in range(N) for j in range(i + 1, N)}

    return dimod.BinaryQuadraticModel(lin, quad, weight * num, dimod.Vartype.BINARY)

In [4]:
def find_valid_y(df, num_reads, weight):
    """Collect zero-energy samples for every candidate ``LI`` target.

    For each integer ``t1`` from 0 to ``sum(df['LI'])`` inclusive, the model
    returned by ``make_Hamiltonian(df, t1, weight)`` is sampled with neal's
    simulated-annealing sampler, and every returned sample whose reported
    energy is exactly 0 is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table handed to ``make_Hamiltonian``; its ``LI`` column also fixes the
        range of ``t1`` values that is scanned.
    num_reads : int
        Number of reads requested from the sampler for each ``t1``.
    weight : number
        Passed through to ``make_Hamiltonian``.

    Returns
    -------
    valid_y_list : dict
        Maps each ``t1`` to the list of zero-energy samples, each converted
        to a plain ``list``.
    valid_y_num : dict
        Maps each ``t1`` to the number of entries in ``valid_y_list[t1]``.
        A sample returned by several reads is counted once per read.
    """
    sa_sampler = neal.sampler.SimulatedAnnealingSampler()

    valid_y_list = {}
    valid_y_num = {}
    for t1 in range(0, sum(df['LI']) + 1):
        bqm = make_Hamiltonian(df, t1, weight)
        # Forward num_reads: without it the sampler falls back to its default
        # and the caller's setting has no effect.
        res = sa_sampler.sample(bqm, num_reads=num_reads)

        record = res.record
        zero_energy = [
            list(sample)
            for sample, energy in zip(record.sample, record.energy)
            if energy == 0.
        ]
        valid_y_list[t1] = zero_energy
        valid_y_num[t1] = len(zero_energy)

    return valid_y_list, valid_y_num

In [5]:
#==========
#テストコード
#==========
def test_find_valid_y():
    """Smoke test for ``find_valid_y`` on the ``ost20.csv`` input.

    Loads the table, computes the ``Y``-weighted ``LI`` total it actually
    has, runs ``find_valid_y`` with ``num_reads=10`` and ``weight=10``,
    prints both returned dictionaries, and asserts that at least one
    zero-energy sample was recorded for that total.
    """
    table = pd.read_csv('../../input/ost20.csv', sep=',', index_col=0)
    # The LI total realised by the data itself; used as the lookup key below.
    observed_t1 = sum(table['Y'] * table['LI'])
    samples_by_t1, counts_by_t1 = find_valid_y(table, num_reads=10, weight=10)
    print(samples_by_t1, counts_by_t1)
    found = counts_by_t1[observed_t1]
    assert found > 0
    
#test_find_valid_y()

In [6]:
def test_validity(canditate_list):
    """Check that a candidate ``Y`` vector keeps three of the four marginals.

    Loads ``ost20.csv``, substitutes ``canditate_list`` for the ``Y`` column
    in a copy of the table, prints the marginals of the original and of the
    modified table, and asserts that entries 0, 2 and 3 of
    ``calc_marginals`` (the ``Y`` total and the ``SEX`` / ``AOP`` weighted
    totals) are identical.  Entry 1 (the ``LI`` weighted total) is not
    compared.

    Parameters
    ----------
    canditate_list : sequence
        Replacement values for the ``Y`` column, one per row of the table.

    Raises
    ------
    AssertionError
        If any of the three compared marginals differs.
    """
    original = pd.read_csv('../../input/ost20.csv', sep=',', index_col=0)
    # Copy the loaded frame rather than parsing the same file a second time.
    candidate = original.copy()
    candidate['Y'] = np.array(canditate_list)

    t_list1 = calc_marginals(original)
    t_list2 = calc_marginals(candidate)
    print(t_list1)
    print(t_list2)

    compared = [0, 2, 3]  # index 1 (LI) is left out of the comparison
    assert np.all(t_list1[compared] == t_list2[compared]), \
        'candidate Y changes the Y, SEX or AOP marginal'

#test_validity()