In [1]:
import numpy as np
import pandas as pd
import sklearn
import itertools

from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split

In [2]:
# load tatanic dataset
def load_titanic():
    titanic_data = pd.read_csv('titanic/train_after.csv')
    print(titanic_data.head())
    y = titanic_data['Survived']
    print(y)

    # feature_names = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Has_Cabin', 'FamilySize', 'Title']
    feature_names = ['Pclass', 'Sex', 'Age']
    x = titanic_data[feature_names]
    print(x)
    
    return x, y, feature_names

In [3]:
# load dataset
def load_dataset(name, feature_num, discret_cat=5):
    datafile = './{}/{}.csv'.format(name, name)
    data_pd = pd.read_csv(datafile)
    feature_names = []
    for i in range (1,feature_num+1):
        feature_name = 'f'+str(i)
        if discret_cat > 0: # need discretization
            data_pd[feature_name+'_c'] = pd.cut(data_pd[feature_name], discret_cat, labels = list(range(discret_cat)))
            feature_name += '_c'
        feature_names.append(feature_name)
    data_pd.head()
    y = data_pd['y']
    x = data_pd[feature_names]
    return x, y, feature_names, name

In [4]:
# for calculating mutual information
from collections import Counter
def entropy(labels): # H(A)
    pro_dict = Counter(labels) #计数
    s = sum(pro_dict.values())#总数
    probs = np.array([i/s for i in pro_dict.values()])#概率
    return - probs.dot(np.log(probs))

def MI_(s1,s2):# 互信息
    s_s_1=["%s%s"%(i,j) for i,j in zip(s1,s2)]
    MI_1=entropy(s1)+entropy(s2)-entropy(s_s_1)
    return MI_1


def features_to_keys (features):
    features_to_keys = features.copy()
    features_to_keys.sort()
    return '*'.join(str(i) for i in features_to_keys)
    
cache_approx_MIs = {}
def approx_MI_(y, data_party_features, data_feature_count, task_party_features, X_train):
    data_party_num = len(data_party_features)
    data_party_combinations = itertools.product(range(data_feature_count), repeat=data_party_num)
    mi = 0
    if data_party_num == 0: # no data parties
        for task_party_feature in task_party_features:
            x_new = X_train[[task_party_feature,]]
            mi += MI_(y,list(x_new.itertuples(index=False)))
    else:
        for data_party_feature_indices in data_party_combinations:
            all_features = []
            for i in range(data_party_num):
                all_features.append(data_party_features[i][data_party_feature_indices[i]])
            if len(task_party_features) == 0:
                feature_keys = features_to_keys(all_features)
                if feature_keys not in cache_approx_MIs:
                    x_new = X_train[all_features]
                    cache_approx_MIs[feature_keys] = MI_(y,list(x_new.itertuples(index=False)))
                mi += cache_approx_MIs[feature_keys]
            else:
                for task_party_feature in task_party_features:
                    all_features.append(task_party_feature)
                    feature_keys = features_to_keys(all_features)
                    if feature_keys not in cache_approx_MIs:
                        x_new = X_train[all_features]
                        cache_approx_MIs[feature_keys] = MI_(y,list(x_new.itertuples(index=False)))
                    mi += cache_approx_MIs[feature_keys]
    return mi

#take all the task party's X features as a whole
def approx_MI_2(y, data_party_features, data_feature_count, task_party_features, X_train):
    data_party_num = len(data_party_features)
    data_party_combinations = itertools.product(range(data_feature_count), repeat=data_party_num)
    mi = 0
    if data_party_num == 0: # no data parties
        x_new = X_train[task_party_features]
        mi = MI_(y,list(x_new.itertuples(index=False)))
    else:
        for data_party_feature_indices in data_party_combinations:
            all_features = []
            for i in range(data_party_num):
                all_features.append(data_party_features[i][data_party_feature_indices[i]])
            all_features.extend(task_party_features)
            x_new = X_train[all_features]
            mi += MI_(y,list(x_new.itertuples(index=False)))
    return mi
        

In [5]:
# get all the permutations of the features and then calculate conditional mutual information regarding Y

#x, y, feature_names = load_titanic()

#x, y, feature_names, dataset_name = load_dataset('winequality-red', 11)
#x, y, feature_names, dataset_name = load_dataset('winequality-white', 11)
#x, y, feature_names, dataset_name = load_dataset('wine', 13) 
#x, y, feature_names, dataset_name = load_dataset('parkinsons', 22)
x, y, feature_names, dataset_name = load_dataset('spect', 22, 0)
#x, y, feature_names, dataset_name = load_dataset('breast', 30)


#x, y, feature_names, dataset_name = load_dataset('ionosphere', 34)
#x, y, feature_names, dataset_name = load_dataset('congress', 16, 0)
#x, y, feature_names, dataset_name = load_dataset('landsat', 36)


X_train,X_test,Y_train,Y_test = train_test_split(x, y, test_size=0.2, random_state=0)

Y_value_list = Y_train.values.tolist()

# multiple parties' approximation performance:
data_party_num = 5
random_permutation_times = 20
orignal_cont = []
approx_cont = []
corrs = []
corrs_v2 = []
corrs_v3 = []
aggndcgs = []
aggndcgs_v2 = []
aggndcgs_v3 = []

print("dataset: {}, data party num: {}".format(dataset_name, data_party_num))

for i in range(random_permutation_times): 
    print("====Permutation {}====".format(i))
    data_parties_features = []
    each_feature_permutation = np.random.permutation(feature_names)
    
    # initial each parties' features
    split_num = int(len(feature_names)/(data_party_num+1)) # one task party + multi data parties
    start_i = 0
    for data_i in range(data_party_num):
        data_parties_features.append(list(each_feature_permutation[start_i:start_i+split_num]))
        start_i += split_num
    task_party_feature = list(each_feature_permutation[start_i:]) 
    
    #print(task_party_feature)
    #print(data_parties_features)
    
    # all party permutations
    all_party_permutations = list(itertools.permutations(range(data_party_num)))
    Y_value_list = Y_train.values.tolist()
    
    # original Shapley-CMI calculation
    contribution = [0] * data_party_num
    
    for each_permutation in all_party_permutations:
        current_feature_set = task_party_feature.copy()
        if len(current_feature_set) > 0:
            x_new = X_train[current_feature_set]
            current_MI = MI_(Y_value_list, list(x_new.itertuples(index=False)))
        else:
            current_MI = 0
        for each_party in each_permutation:
            current_feature_set.extend(data_parties_features[each_party])
            x_new = X_train[current_feature_set]
            new_MI = MI_(Y_value_list, list(x_new.itertuples(index=False)))
            contr = new_MI - current_MI # conditional CMI of the current feature in the specific permutation
            contribution[each_party] += contr/len(all_party_permutations) # add the CMI together in all the permutations
            current_MI = new_MI

    print('original', contribution)
    
    # Shapley-CMI approxmiation
    contribution_approx = [0] * data_party_num
    
    for each_permutation in all_party_permutations:
        current_feature_set = task_party_feature.copy()
        if len(current_feature_set) > 0:
            x_new = X_train[current_feature_set]
            current_MI = MI_(Y_value_list, list(x_new.itertuples(index=False)))
        else:
            current_MI = 0
        for each_party in each_permutation:
            new_MI = 0
            for each_feature in data_parties_features[each_party]:
                x_new = X_train[current_feature_set + [each_feature,]]
                new_MI += MI_(Y_value_list, list(x_new.itertuples(index=False)))
            contr = new_MI - current_MI # conditional CMI of the current feature in the specific permutation
            contribution_approx[each_party] += contr/len(all_party_permutations) # add the CMI together in all the permutations
            current_MI = new_MI
            current_feature_set.extend(data_parties_features[each_party])
            
    print('approx', contribution_approx)
    corr = np.corrcoef(contribution, contribution_approx)[0,1]
    corrs.append(corr)
    print('corr', corr)
    aggndcg = (ndcg_score([contribution], [contribution_approx])+ndcg_score([contribution_approx], [contribution]))/2
    aggndcgs.append(aggndcg)
    print('AggNDCG', aggndcg)
    
    # Shapley-CMI approximation v2 (feature independence on each party)
    contribution_approx2 = [0] * data_party_num
    for each_permutation in all_party_permutations:
        current_MI = approx_MI_(Y_value_list, [], 0, task_party_feature, X_train)
        new_data_parties_features = []
        for each_party in each_permutation:
            new_data_parties_features.append(data_parties_features[each_party])
            new_MI = approx_MI_(Y_value_list, new_data_parties_features, split_num, task_party_feature, X_train)
            contr = new_MI - current_MI # conditional CMI of the current feature in the specific permutation
            contribution_approx2[each_party] += contr/len(all_party_permutations) # add the CMI together in all the permutations
            current_MI = new_MI
            
    print('approx_v2', contribution_approx2)
    corr = np.corrcoef(contribution, contribution_approx2)[0,1]
    corrs_v2.append(corr)
    print('corr_v2', corr)
    aggndcg = (ndcg_score([contribution], [contribution_approx2])+ndcg_score([contribution_approx2], [contribution]))/2
    aggndcgs_v2.append(aggndcg)
    print('AggNDCG_v2', aggndcg) 
    
    # Shapley-CMI approximation v3 (feature independence on each data party, task party features as a whole)
    contribution_approx3 = [0] * data_party_num
    for each_permutation in all_party_permutations:
        current_MI = approx_MI_2(Y_value_list, [], 0, task_party_feature, X_train)
        new_data_parties_features = []
        for each_party in each_permutation:
            new_data_parties_features.append(data_parties_features[each_party])
            new_MI = approx_MI_2(Y_value_list, new_data_parties_features, split_num, task_party_feature, X_train)
            contr = new_MI - current_MI # conditional CMI of the current feature in the specific permutation
            contribution_approx3[each_party] += contr/len(all_party_permutations) # add the CMI together in all the permutations
            current_MI = new_MI
            
    print('approx_v3', contribution_approx3)
    corr = np.corrcoef(contribution, contribution_approx3)[0,1]
    corrs_v3.append(corr)
    print('corr_v3', corr)
    aggndcg = (ndcg_score([contribution], [contribution_approx3])+ndcg_score([contribution_approx3], [contribution]))/2
    aggndcgs_v3.append(aggndcg)
    print('AggNDCG_v3', aggndcg) 

print("avg corr", np.nanmean(corrs))
print("avg aggndcg", np.nanmean(aggndcgs))
print("avg corr_v2", np.nanmean(corrs_v2))
print("avg aggndcg_v2", np.nanmean(aggndcgs_v2))
print("avg corr_v3", np.nanmean(corrs_v3))
print("avg aggndcg_v3", np.nanmean(aggndcgs_v3))

dataset: spect, data party num: 5
====Permutation 0====
original [0.02015248146269425, 0.033194162884565115, 0.038241438153938795, 0.0501818461052096, 0.0309537704702448]
approx [0.17673977268852162, 0.19145605855582312, 0.1867905232883198, 0.20440357764625885, 0.19883317068011014]
corr 0.7883071670403569
AggNDCG 0.9913876602881493
approx_v2 [87.65670296707916, 85.86597845071175, 86.59900510871401, 88.21361930422398, 87.42408678796552]
corr_v2 0.1988668587200572
AggNDCG_v2 0.9798170271587727
approx_v3 [15.682498888396045, 15.861253647006102, 15.842625963021634, 16.097402224759126, 16.0623384661676]
corr_v3 0.7304885583016134
AggNDCG_v3 0.9930193289411475
====Permutation 1====
original [0.03097661565704189, 0.07425197713588756, 0.04190374547942296, 0.02455677691286821, 0.03324997405555948]
approx [0.19000219109229596, 0.21797685032224703, 0.19500739185887084, 0.19167623881034626, 0.18322801073713757]
corr 0.9248652161108449
AggNDCG 0.9956803352873003
approx_v2 [84.92435858682578, 87.486

approx [0.20191112372466175, 0.2097181162416931, 0.21426540240640565, 0.20075684007745206, 0.18079042531551645]
corr 0.7664156091734806
AggNDCG 0.9812049997327708
approx_v2 [78.97817018726884, 79.27581145240227, 80.21575694130597, 78.99941262091103, 77.52603614017646]
corr_v2 0.761395151832453
AggNDCG_v2 0.9863206791705768
approx_v3 [14.439934351571925, 14.287466860068626, 14.448359524741544, 14.26971072202445, 13.943132125877359]
corr_v3 0.9059945410632647
AggNDCG_v3 0.9914287553707504
====Permutation 14====
original [0.0486304324813929, 0.02263385706280337, 0.03262944671896313, 0.046925813328518674, 0.04657892905864554]
approx [0.20373880176655026, 0.18672519250320813, 0.19092604463261514, 0.19743093131309264, 0.20325572455549104]
corr 0.9458809508879653
AggNDCG 0.9991657059678482
approx_v2 [85.93184656519675, 84.01723795302765, 84.85793446339082, 85.34166826705587, 85.67045546669003]
corr_v2 0.967214824971813
AggNDCG_v2 0.9997318457725821
approx_v3 [15.36671331247772, 15.10613153602

In [6]:
# get all the permutations of the features and then calculate conditional mutual information regarding Y
import itertools

#x, y, feature_names = load_titanic()
#x, y, feature_names = load_dataset('wine', 13) 
#x, y, feature_names = load_dataset('parkinsons', 22)
#x, y, feature_names = load_dataset('breast', 30)
#x, y, feature_names = load_dataset('ionosphere', 34)
#x, y, feature_names = load_dataset('landsat', 36)
x, y, feature_names = load_dataset('spect', 22, 0)
#x, y, feature_names = load_dataset('congress', 16, 0)
#x, y, feature_names = load_dataset('winequality-red', 11)
#x, y, feature_names = load_dataset('winequality-white', 11)


X_train,X_test,Y_train,Y_test = train_test_split(x, y, test_size=0.2, random_state=0)

contribution = {}
for feature_name in feature_names:
    contribution[feature_name] = 0

Y_value_list = Y_train.values.tolist()

# two parties' approximation performance:
random_permutation_times = 10
orignal_cont = []
approx_cont = []
for i in range(random_permutation_times): # random sample permutations for 10000 times
    if i%100 == 0:
        print(i)
    each_permutation = np.random.permutation(feature_names)
    
    split_num = int(len(feature_names)/3)
    task_party_features = list(each_permutation[0:split_num])
    data_party_features = list(each_permutation[split_num:len(feature_names)])
    x_task_party = X_train[task_party_features]
    # x_data_party = X_train[data_party_features]
    
    # original computation
    current_MI = MI_(Y_value_list, list(x_task_party.itertuples(index=False)))
    new_MI = MI_(Y_value_list, list(X_train[feature_names].itertuples(index=False)))
    contribution_data_party = new_MI - current_MI
    orignal_cont.append(contribution_data_party)
    
    # approximation 1
    contribution_data_party_app_1 = 0
    current_MI = MI_(Y_value_list, list(x_task_party.itertuples(index=False)))
    for data_paty_feature_name in data_party_features:
        tmp_features = task_party_features.copy()
        tmp_features.append(data_paty_feature_name)
        x_new = X_train[tmp_features]
        new_MI = MI_(Y_value_list, list(x_new.itertuples(index=False)))
        contribution_data_party_app_1 += new_MI - current_MI
        
    approx_cont.append(contribution_data_party_app_1)
    
print(orignal_cont)
print(approx_cont)

ValueError: too many values to unpack (expected 3)