This code generates Shapley-CMI valuation results (used to output Table 2), which are then recorded in {dataset}/{dataset}_data_values.csv.
This code also includes the entropy calculation reported in Figure 5.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import entropy
from sklearn.utils import resample

In [2]:
# load dataset
def load_dataset(name, feature_num, discret_cat=5):
    """Load ``./<name>/<name>.csv`` and return its features and label column.

    The CSV must contain feature columns named ``f1`` .. ``f<feature_num>``
    and a label column named ``y``.

    Args:
        name: Dataset name; used as both the directory and the file stem.
        feature_num: Number of feature columns (``f1`` .. ``f<feature_num>``) to use.
        discret_cat: Number of equal-width bins used to discretize every
            feature with ``pd.cut``. A value <= 0 disables discretization and
            the raw columns are returned instead.

    Returns:
        A tuple ``(x, y, feature_names)`` where ``x`` is the DataFrame of the
        selected feature columns, ``y`` is the ``y`` column, and
        ``feature_names`` lists the column names of ``x`` (``f<i>_c`` for
        discretized features, ``f<i>`` otherwise).
    """
    datafile = './{}/{}.csv'.format(name, name)
    data_pd = pd.read_csv(datafile)
    feature_names = []
    for i in range(1, feature_num + 1):
        feature_name = 'f' + str(i)
        if discret_cat > 0:  # need discretization
            binned_name = feature_name + '_c'
            # Bins are labelled 0 .. discret_cat-1 so values compare as small integers.
            data_pd[binned_name] = pd.cut(
                data_pd[feature_name], discret_cat, labels=list(range(discret_cat))
            )
            feature_name = binned_name
        feature_names.append(feature_name)
    y = data_pd['y']
    x = data_pd[feature_names]
    return x, y, feature_names

In [3]:
# for calculating mutual information
from collections import Counter
def our_entropy(labels):  # H(A)
    """Return the empirical Shannon entropy of ``labels`` in nats.

    Args:
        labels: Iterable of hashable symbols (one per observation).

    Returns:
        ``-sum(p * log(p))`` over the relative frequency ``p`` of each
        distinct symbol, using the natural logarithm.
    """
    pro_dict = Counter(labels)  # occurrences of each distinct symbol
    s = sum(pro_dict.values())  # total number of observations
    probs = np.array([i / s for i in pro_dict.values()])  # empirical probabilities
    return -probs.dot(np.log(probs))


def MI_(s1, s2):  # mutual information
    """Return the empirical mutual information I(s1; s2) in nats.

    Computed as ``H(s1) + H(s2) - H(s1, s2)``.

    Args:
        s1: Iterable of hashable symbols.
        s2: Iterable of hashable symbols, aligned element-wise with ``s1``.

    Returns:
        The mutual information between the two sequences.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    # Materialize first: each input is traversed twice (marginal and joint),
    # which would silently exhaust a one-shot iterator.
    s1 = list(s1)
    s2 = list(s2)
    if len(s1) != len(s2):
        raise ValueError(
            "s1 and s2 must have the same length, got {} and {}".format(len(s1), len(s2))
        )
    # Joint symbols are tuples rather than concatenated strings: string
    # concatenation maps distinct pairs such as (1, 12) and (11, 2) to the
    # same symbol "112", which underestimates the joint entropy.
    joint = list(zip(s1, s2))
    return our_entropy(s1) + our_entropy(s2) - our_entropy(joint)


def N_MI(s1, s2):  # normalized mutual information
    """Return I(s1; s2) normalized by ``sqrt(H(s1) * H(s2))``.

    Args:
        s1: Iterable of hashable symbols.
        s2: Iterable of hashable symbols, aligned element-wise with ``s1``.

    Returns:
        The normalized mutual information, or ``0.0`` when either sequence
        has zero entropy (the normalization is undefined there, and a
        constant sequence shares no information with anything).

    Raises:
        ValueError: If the two sequences differ in length.
    """
    s1 = list(s1)
    s2 = list(s2)
    h1 = our_entropy(s1)
    h2 = our_entropy(s2)
    if h1 <= 0 or h2 <= 0:
        # Avoid 0/0 (nan plus a RuntimeWarning) for constant or empty input.
        return 0.0
    return MI_(s1, s2) / (h1 * h2) ** 0.5

In [4]:
# Estimate Shapley-CMI feature values by Monte Carlo sampling: draw random
# feature permutations and, for every feature, average its marginal gain in
# mutual information with Y given the features that precede it in the permutation.
import itertools

x, y, feature_names = load_dataset('wine', 13)
#x, y, feature_names = load_dataset('parkinsons', 22)
#x, y, feature_names = load_dataset('breast', 30)
#x, y, feature_names = load_dataset('spect', 22, 0)
#x, y, feature_names = load_dataset('winequality-red', 11)
#x, y, feature_names = load_dataset('winequality-white', 11)


X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Exact alternative (enumerate every permutation); time-consuming if feature number is large:
# all_feature_permutations = list(itertools.permutations(feature_names))

# Per-feature list of marginal contributions, one entry per sampled permutation.
contribution = {feature_name: [] for feature_name in feature_names}

Y_value_list = Y_train.values.tolist()

# The column values do not depend on the permutation, so extract them once
# instead of re-slicing the DataFrame for every feature of every permutation.
train_columns = {feature_name: X_train[feature_name].tolist() for feature_name in feature_names}


def _normalized_feature_values():
    """Return each feature's mean contribution so far, scaled to sum to 1."""
    mean_values = np.array([np.mean(contribution[feature_name]) for feature_name in feature_names])
    return mean_values / np.sum(mean_values)  # normalization, not necessary, for ease of presentation


random_permutation_times = 10000
report_every = 100  # print intermediate values every this many permutations
# NOTE(review): the permutations are drawn from NumPy's unseeded global RNG, so
# results vary slightly between runs even though the train/test split is fixed.
for i in range(1, random_permutation_times + 1):  # random sample permutations for certain times
    each_permutation = np.random.permutation(feature_names)
    # One tuple per training row holding the values of the features added so far.
    current_rows = [()] * len(Y_value_list)
    current_MI = 0
    for feature_name in each_permutation:
        current_rows = [
            row + (value,) for row, value in zip(current_rows, train_columns[feature_name])
        ]
        new_MI = MI_(Y_value_list, current_rows)
        contr = new_MI - current_MI  # conditional MI of the current feature in this permutation
        contribution[feature_name].append(contr)  # collected per feature, averaged when reporting
        current_MI = new_MI

    if i % report_every == 0:
        features_values = _normalized_feature_values()
        # entropy to quantify the change of feature values
        print('permutation', i, features_values, 'entropy', entropy(features_values))

# Recompute after the loop so the final report covers every sampled permutation,
# even when random_permutation_times is not a multiple of report_every.
features_values = _normalized_feature_values()
print("final Shapley-CMI after {} samplings".format(random_permutation_times))
for feature_value in features_values:
    print(feature_value)


permutation 100 [0.08564944 0.05293587 0.04758506 0.05235356 0.06561247 0.05813009
 0.11465224 0.06065011 0.05453388 0.12540637 0.08615864 0.09240606
 0.10392621]
permutation 200 [0.08490263 0.05470527 0.03914239 0.05770483 0.06762958 0.06884168
 0.11069329 0.056607   0.05495289 0.12355806 0.08162996 0.10415274
 0.09547968]
permutation 300 [0.09264334 0.0537921  0.0361083  0.0563754  0.06311805 0.07287325
 0.12605215 0.05653659 0.05207367 0.1174566  0.08202785 0.09448799
 0.0964547 ]
permutation 400 [0.08861215 0.05975976 0.03539796 0.05804635 0.05758536 0.07635959
 0.12846901 0.05563872 0.05126009 0.114055   0.07892097 0.09869819
 0.09719686]
permutation 500 [0.08859909 0.06011338 0.03487779 0.05675044 0.06005632 0.07656682
 0.13044708 0.05350806 0.04957781 0.11634333 0.08068554 0.09636722
 0.09610711]
permutation 600 [0.0874045  0.05815788 0.03513215 0.05873506 0.05882381 0.07501749
 0.12867778 0.0533113  0.0500917  0.11754306 0.08352073 0.09778339
 0.09580114]
permutation 700 [0.086