In [106]:
from scipy.optimize import minimize
from scipy.stats import entropy
import math
import numpy as np
import copy

In [219]:
def info_entropy(p, num_list):
    """Negated, frequency-weighted binary entropy for one shared probability ``p``.

    With ``S = sum(num_list)`` and ``N = len(num_list)``, every count ``n``
    is turned into a two-way split

        n_pos = n * (1 - p)
        n_neg = (S - n) * p / (N - 1)

    The base-2 entropy of the proportions ``n_pos : n_neg`` is weighted by
    ``n / S`` and accumulated over all counts.

    Args:
        p: A scalar, or a one-element sequence/array (the form
            ``scipy.optimize.minimize`` passes for a single variable).
        num_list: Sequence of counts.

    Returns:
        float: The negated weighted entropy, so that minimising the return
        value maximises the entropy.

    Raises:
        ValueError: If ``p`` holds more than one element, or if a split
            proportion is negative (``math.log`` domain error).
    """
    # Collapse a one-element array to a plain float up front: handing an
    # ndim > 0 array to math.log relies on an implicit conversion that NumPy
    # has deprecated, and it made the result a shape-(1,) array.
    p = np.array(p, dtype=float).item()
    S = sum(num_list)
    N = len(num_list)

    def plog2(q):
        # Entropy convention 0 * log2(0) == 0; math.log(0) would raise instead.
        return 0.0 if q == 0 else q * math.log(q, 2)

    Entropy = 0.0
    for n in num_list:
        n_pos = n * (1 - p)
        # With a single count there is nothing else to draw from, so the
        # second part of the split is empty (avoids dividing by N - 1 == 0).
        n_neg = (S - n) * p / (N - 1) if N > 1 else 0.0
        n_tol = n_pos + n_neg
        if n_tol == 0:
            # An empty split carries no entropy.
            continue
        Ei = -plog2(n_pos / n_tol) - plog2(n_neg / n_tol)
        Entropy += (n / S) * Ei

    return -Entropy

def info_entropy_v2(P_list, Proba_matrix, num_list):
    """Objective with one probability per attribute: KL penalty minus summed split entropy.

    For attribute ``i`` with count ``n_i`` and probability ``p_i``:

        n_pos = n_i * (1 - p_i)
        n_neg = sum over j != i of n_j * p_j * Proba_matrix[j, i]

    The base-2 entropy of every ``n_pos : n_neg`` split is summed, and the KL
    divergence between the original counts and the ``n_pos + n_neg`` totals
    is added as a penalty so the proportions stay close to the originals.

    NOTE(review): unlike ``info_entropy``, the per-attribute entropies are
    summed without the ``n / S`` weight (the weight used to be computed here
    but was never applied) -- confirm that this is intended.

    Args:
        P_list: Sequence of per-attribute probabilities, one per count.
        Proba_matrix: 2-D array indexed as ``Proba_matrix[j, i]``; used as the
            probability that attribute ``j`` generates attribute ``i``.
        num_list: Sequence of per-attribute counts.

    Returns:
        ``-Entropy + KL_div`` (a value to be minimised).
    """
    S = sum(num_list)
    N = len(num_list)

    def plog2(q):
        # Entropy convention 0 * log2(0) == 0; math.log(0) would raise instead.
        return 0.0 if q == 0 else q * math.log(q, 2)

    num_list_aft = []
    Entropy = 0.0
    for i in range(N):  # one pass per attribute
        # Number of positives kept for attribute i.
        n_pos = num_list[i] * (1 - P_list[i])
        # Every other attribute j contributes the negatives it generates
        # (n_j * p_j) times the probability that they land on attribute i.
        n_neg = sum(
            num_list[j] * P_list[j] * Proba_matrix[j, i]
            for j in range(N)
            if j != i
        )
        n_tol = n_pos + n_neg
        num_list_aft.append(n_tol)
        if n_tol == 0:
            # An empty split carries no entropy.
            continue
        Entropy += -plog2(n_pos / n_tol) - plog2(n_neg / n_tol)

    # Keep the attribute proportions as close as possible before and after;
    # the KL divergence measures how far the two distributions drift apart.
    p_ori = np.array(num_list) / S
    p_aft = np.array(num_list_aft) / S
    KL_div = entropy(p_ori, p_aft)

    return -Entropy + KL_div
    # Alternative weighting that was tried: -Entropy + 0.5 * KL_div

In [229]:
# Attribute counts used for this experiment
# (alternative sample that was tried: [843, 1738, 633, 913]).
num_list = [6122, 34906, 418, 747]
N = len(num_list)

# Lower / upper limits for each probability.
pmin, pmax = 0.3, 0.7

# Every probability starts at 0.5; every matrix entry equals 1 / (N - 1).
P_list = [0.5 for _ in range(N)]
Proba_matrix = np.ones((N, N)) / (N - 1)

def makefunmin(i, pmin):
    """Return a callable computing ``x[i] - pmin`` (non-negative when ``x[i] >= pmin``).

    A factory is used so that ``i`` and ``pmin`` are bound per call rather
    than looked up when the returned callable finally runs.
    """
    def lower_margin(x):
        return x[i] - pmin

    return lower_margin
def makefunmax(i, pmax):
    """Return a callable computing ``pmax - x[i]`` (non-negative when ``x[i] <= pmax``).

    A factory is used so that ``i`` and ``pmax`` are bound per call rather
    than looked up when the returned callable finally runs.
    """
    def upper_margin(x):
        return pmax - x[i]

    return upper_margin
# Two 'ineq' constraints per probability: one lower limit, one upper limit.
cons = ()
for i in range(N):
    cons += (
        {'type': 'ineq', 'fun': makefunmin(i, pmin)},
        {'type': 'ineq', 'fun': makefunmax(i, pmax)},
    )

# Minimise the per-attribute objective with SLSQP and show the solution vector.
p = minimize(
    info_entropy_v2,
    P_list,
    args=(Proba_matrix, num_list),
    constraints=cons,
    method='SLSQP',
)
print(p.x)

[0.4625358 0.3       0.3       0.3      ]


In [16]:
# Scenario with three equally frequent attributes and one rare attribute
# (balanced alternative that was tried: [100, 100, 100, 100]).
num_list = [6122] * 3 + [747]
N = len(num_list)
P_list = [0.5] * N
Proba_matrix = np.ones((N, N)) / (N - 1)

In [209]:
# Optimise a single shared probability for all attributes.
p0 = [0.5]
num_list = [6122, 34906, 418, 747]
# Keep the probability within [0.01, 0.99] ('ineq' means fun(x) >= 0).
cons = ({'type': 'ineq', 'fun': lambda x: np.array([x[0] - 0.01])},
        {'type': 'ineq', 'fun': lambda x: np.array([0.99 - x[0]])})
# `args` must be a tuple: `(num_list)` is just the list itself and only
# worked because SciPy wraps non-tuple values; `(num_list,)` is explicit.
p = minimize(info_entropy, p0, args=(num_list,), constraints=cons, method='SLSQP')
print(p.x)

In [241]:
import json 

attr_dict_file = 'data/equal_processed_data/attr_to_attrvals.json'
vocab_dict_file = 'dataset/vocab/vocab_dict.json'
# JSON text is UTF-8; pin the encoding instead of relying on the platform
# default so non-ASCII keys/values load identically on every machine.
with open(attr_dict_file, 'r', encoding='utf-8') as f:
    attr_dict = json.load(f)
with open(vocab_dict_file, 'r', encoding='utf-8') as f:
    vocab_dict = json.load(f)
def get_negative_dict(attr_dict, vocab_dict):
    """Pair every query's attribute list with the values looked up in ``vocab_dict``.

    Args:
        attr_dict: Mapping ``query -> list of attributes``.
        vocab_dict: Mapping ``attribute -> value``; each attribute of every
            query is looked up here (a missing attribute raises ``KeyError``).

    Returns:
        dict: ``{query: {'attr_list': <the query's list>,
        'attr_freq': [vocab_dict[a] for each attribute a]}}``.
    """
    return {
        query: {
            'attr_list': attrs,
            'attr_freq': [vocab_dict[name] for name in attrs],
        }
        for query, attrs in attr_dict.items()
    }

# Per-query record ('attr_list' + 'attr_freq') that the later cells extend and save.
proba_negative_dict = get_negative_dict(attr_dict, vocab_dict)

In [242]:
import copy 
from scipy.optimize import minimize
import math
import numpy as np

# 每个query统一一个概率p
# dict_copy = copy.deepcopy(proba_negative_dict)
# for query, attr_dict in dict_copy.items():
#      num_list = attr_dict['attr_freq']
#      p0 = [0.5] # 注意函数p是负例生成概率
#      cons = ({'type': 'ineq',
#           'fun': lambda x: np.array([x[0]-0.01]),
#           'jac': lambda x: np.array([1.0])},
#           {'type': 'ineq',
#           'fun': lambda x: np.array([0.99-x[0]]),
#           'jac': lambda x: np.array([-1.0])})
#      p = minimize(info_entropy, p0, args=(num_list), constraints=cons, method='SLSQP')
#      proba_negative_dict[query]['entropy_proba'] = p.x.item()


# One probability p per attribute.
def makefunmin(i, pmin):
    """Return a callable computing ``x[i] - pmin`` for an 'ineq' constraint."""
    return lambda x: x[i] - pmin


def makefunmax(i, pmax):
    """Return a callable computing ``pmax - x[i]`` for an 'ineq' constraint."""
    return lambda x: pmax - x[i]


# Lower / upper limits for every per-attribute probability.
pmin, pmax = 0.3, 0.7

# Iterate over a snapshot while the results are written back into
# proba_negative_dict. The loop variable is deliberately NOT called
# `attr_dict`: that name holds the mapping loaded from JSON, and rebinding
# it here would break a re-run of the cell that builds proba_negative_dict.
dict_copy = copy.deepcopy(proba_negative_dict)
for query, query_record in dict_copy.items():
    num_list = query_record['attr_freq']
    N = len(num_list)
    P_list = [0.5] * N
    Proba_matrix = np.ones([N, N]) / (N - 1)

    # Two 'ineq' constraints per probability: lower and upper limit.
    cons = ()
    for i in range(N):
        cons += (
            {'type': 'ineq', 'fun': makefunmin(i, pmin)},
            {'type': 'ineq', 'fun': makefunmax(i, pmax)},
        )

    p = minimize(info_entropy_v2, P_list, args=(Proba_matrix, num_list),
                 constraints=cons, method='SLSQP')
    if not p.success:
        # Surface optimiser failures instead of silently storing the result.
        print(f'[warn] SLSQP did not converge for {query!r}: {p.message}')
    proba_negative_dict[query]['entropy_proba'] = p.x.tolist()
     

In [244]:
attr_save_file = 'data/equal_processed_data/proba_negative_dict_independent.json'
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding must be pinned to UTF-8 rather than left to the platform default.
with open(attr_save_file, 'w', encoding='utf-8') as f:
    json.dump(proba_negative_dict, f, ensure_ascii=False, indent=4)