In [86]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

In [87]:
from collections.abc import Iterable


def merge(database):
    """Turn a column, a table, or a list into a list of per-record keys.

    Parameters
    ----------
    database : list, pandas.Series or pandas.DataFrame
        * An empty list, or a list whose first item is a string/bytes or a
          non-iterable scalar, is returned unchanged (the same object).
        * A list whose first item is any other iterable (e.g. a tuple):
          only the FIRST element of each item is kept, converted with
          ``str``.
        * Anything else must expose ``.values``; every entry of
          ``database.values`` is converted with ``str``. For a DataFrame
          each entry is a whole row array, so the key is the string form
          of that row.

    Returns
    -------
    list
        One key per record.
    """
    if isinstance(database, list):
        # Guard the empty case: there is no first element to inspect.
        if not database:
            return database
        first = database[0]
        # Strings are Iterable, but they are already usable as keys;
        # indexing them would keep only their first character.
        if isinstance(first, (str, bytes)) or not isinstance(first, Iterable):
            return database
        return [str(item[0]) for item in database]

    return [str(record) for record in database.values]


def init_data(path, names, use_cols, savePath):
    """Load a header-less CSV, keep selected columns, drop incomplete rows, save.

    Parameters
    ----------
    path : str
        CSV file to read; it is read with ``header=None``.
    names : list
        Column names assigned to the file's columns.
    use_cols : list of str
        Columns to keep. They are lower-cased before selection, so they
        must match ``names`` after lower-casing.
    savePath : str
        Where the cleaned frame is written with ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        The selected columns, with every row containing the literal
        ``' ?'`` marker (or any other missing value) removed.
    """
    wanted = [column.lower() for column in use_cols]
    frame = pd.read_csv(path, header=None, names=names, index_col=False)
    # ' ?' (with the leading space) is treated as a missing-value marker.
    selected = frame[wanted].replace(' ?', np.nan)
    cleaned = selected.dropna()
    cleaned.to_csv(savePath)
    return cleaned


# Read a delimited text file into a DataFrame of strings
def read_file(path, sep=','):
    """Read a delimited file with every value kept as ``str``.

    Parameters
    ----------
    path : str
        File to read.
    sep : str, default ','
        Field separator. ``';'`` files are read with ``index_col=False``
        (no column is used as the index); for every other separator the
        first column becomes the index.

    Returns
    -------
    pandas.DataFrame
    """
    if sep == ';':
        return pd.read_csv(path, sep=';', index_col=False, dtype=str)
    # Forward the separator: previously any value other than ';' was
    # ignored and the file was parsed as comma-separated.
    return pd.read_csv(path, sep=sep, index_col=0, dtype=str)

In [88]:
###########################################
'''统计数组中的不同数据及其出现次数'''
'''返回值：diffData，存储data中出现的不同数据'''
'''       diffDataNum, 存储不同数据的出现次数'''


###########################################
def StatDataInf(data):
    """Collect the distinct values of ``data`` and how often each occurs.

    Parameters
    ----------
    data : sequence
        Values to tally. The input is not modified.

    Returns
    -------
    (diffData, diffDataNum) : tuple of two lists
        ``diffData`` holds the distinct values in order of first
        appearance; ``diffDataNum[i]`` is the number of occurrences of
        ``diffData[i]``.

    Notes
    -----
    Values are tallied with a dict, so no in-band marker is needed and a
    value equal to ``'/'`` is counted like any other. Every value that is
    present gets a count of at least 1 (this includes NaN).
    """
    tally = {}
    try:
        for item in data:
            tally[item] = tally.get(item, 0) + 1
    except TypeError:
        # Unhashable items (e.g. lists): fall back to an equality scan so
        # such inputs keep working.
        diffData = []
        diffDataNum = []
        for item in data:
            for idx, seen in enumerate(diffData):
                if seen == item:
                    diffDataNum[idx] += 1
                    break
            else:
                diffData.append(item)
                diffDataNum.append(1)
        return diffData, diffDataNum

    # dict preserves insertion order, i.e. first-appearance order.
    return list(tally.keys()), list(tally.values())


#########################################################
'''生成（X,Y）的联合分布律'''
'''返回值：distributionXY，二维随机变量（X,Y）的联合分布律'''


#########################################################
def CreateDistribution(X, Y):
    """Build the empirical joint distribution table of the paired samples.

    Parameters
    ----------
    X, Y : sequences of equal length
        ``X[k]`` and ``Y[k]`` are the two observations of sample ``k``.
        Values must be hashable.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(distinct Y values, distinct X values)``. Rows
        follow the first-appearance order of the values in ``Y``, columns
        that of ``X``. Cell ``[i][j]`` is the fraction of samples with
        that ``(Y, X)`` combination. Empty input yields a ``(0, 0)`` array.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` differ in length.
    """
    sampleCount = len(X)
    if len(Y) != sampleCount:
        raise ValueError('X and Y must have the same number of samples')

    # One pass: give each distinct value the next free column/row index
    # and remember which cell every sample falls into.
    column_of = {}
    row_of = {}
    cells = []
    for x_value, y_value in zip(X, Y):
        col = column_of.setdefault(x_value, len(column_of))
        row = row_of.setdefault(y_value, len(row_of))
        cells.append((row, col))

    distributionXY = np.zeros((len(row_of), len(column_of)))
    for row, col in cells:
        distributionXY[row, col] += 1

    # Turn counts into relative frequencies (nothing to scale when empty).
    if sampleCount:
        distributionXY /= sampleCount

    return distributionXY


#############################################
'''计算联合熵'''


#############################################
def JointEntropy(distributionXY):
    """Return the joint entropy, in bits, of a 2-D probability table.

    Sums ``-p * log2(p)`` over every non-zero cell of ``distributionXY``,
    visiting the cells row by row. Zero cells are skipped; an all-zero
    table yields ``0``.
    """
    row_count, col_count = np.shape(distributionXY)
    total = 0
    for r in range(row_count):
        row = distributionXY[r]
        for c in range(col_count):
            p = row[c]
            if p == 0:
                continue
            total = total - p * math.log2(p)
    return total


###########################################
'''计算条件熵'''
'''返回值：HX_Y, Y发生条件下X发生的条件熵'''
'''       HY_X, X发生条件下Y发生的条件熵'''


###########################################
def ConditionEntropy(distributionXY):
    """Compute both conditional entropies (in bits) of a joint table.

    Parameters
    ----------
    distributionXY : 2-D array-like
        Joint probability table. Rows are summed to obtain the marginal of
        Y and columns to obtain the marginal of X.

    Returns
    -------
    (HX_Y, HY_X) : tuple
        ``HX_Y`` is H(X|Y) = -sum p(x,y) * log2(p(x,y) / p(y));
        ``HY_X`` is H(Y|X) = -sum p(x,y) * log2(p(x,y) / p(x)).
        Both are ``0`` when the table has no non-zero cell.
    """
    HX_Y = 0
    HY_X = 0
    lenY, lenX = np.shape(distributionXY)

    # Marginal probabilities.
    pY = np.sum(distributionXY, axis=1)
    pX = np.sum(distributionXY, axis=0)

    for i in range(lenY):
        for j in range(lenX):
            joint = distributionXY[i][j]
            # Test the joint probability, not the conditional one: a cell
            # with zero mass contributes nothing, and skipping it here also
            # avoids 0/0 = NaN when a whole row or column is empty.
            if joint == 0:
                continue
            # Contribution to H(X|Y): p(x|y) = p(x,y) / p(y)
            HX_Y = HX_Y - joint * math.log2(joint / pY[i])
            # Contribution to H(Y|X): p(y|x) = p(x,y) / p(x)
            HY_X = HY_X - joint * math.log2(joint / pX[j])

    return HX_Y, HY_X


###########################################
'''计算已知数据的熵'''


###########################################
def DataEntropy(data, diffData, diffDataNum):
    """Return the Shannon entropy, in bits, of ``data`` from its value counts.

    Parameters
    ----------
    data : sequence
        The samples; only ``len(data)`` is used, as the denominator.
    diffData : sequence
        Distinct values. Accepted for symmetry with ``diffDataNum`` but not
        read by this function.
    diffDataNum : sequence of int
        Occurrence count of each distinct value.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the non-zero relative frequencies
        ``p = count / len(data)``; ``0`` when there are no counts.
    """
    sample_total = len(data)
    result = 0
    for occurrences in diffDataNum:
        probability = occurrences / sample_total
        if probability == 0:
            continue
        result = result - probability * math.log2(probability)
    return result

In [89]:
from collections.abc import Iterable


def get_value(X):
    """Unwrap pandas containers to their underlying ``.values``.

    A ``pandas.DataFrame`` or ``pandas.Series`` is replaced by its
    ``.values`` attribute; any other object is returned as-is.
    """
    is_pandas = isinstance(X, (pd.DataFrame, pd.Series))
    return X.values if is_pandas else X


def entropy(X):
    """Return the Shannon entropy (bits) of the samples in ``X``.

    ``X`` may be a plain sequence or a pandas object; pandas objects are
    unwrapped with ``get_value`` before the values are tallied with
    ``StatDataInf`` and scored with ``DataEntropy``.
    """
    samples = get_value(X)
    distinct_values, occurrence_counts = StatDataInf(samples)
    return DataEntropy(samples, distinct_values, occurrence_counts)


def condition_entropy(X, Y):
    """Return H(X|Y), the conditional entropy of ``X`` given ``Y``, in bits.

    Both inputs are unwrapped with ``get_value``, turned into a joint
    distribution table by ``CreateDistribution``, and the first value
    returned by ``ConditionEntropy`` (``HX_Y``) is handed back.
    """
    joint_table = CreateDistribution(get_value(X), get_value(Y))
    hx_given_y, _hy_given_x = ConditionEntropy(joint_table)
    return hx_given_y


def mutual_information(X, Y):
    """Return the mutual information I(X;Y) = H(X) - H(X|Y), in bits."""
    marginal_entropy = entropy(X)
    conditional_entropy = condition_entropy(X, Y)
    return marginal_entropy - conditional_entropy


# def joint_entropy(X, Y):
#     return entropy(Y) + condition_entropy(X, Y)


def joint_entropy(X, Y, Z=None):
    """Return the joint entropy (bits) of two or three aligned sequences.

    Parameters
    ----------
    X, Y : sequences
        Indexed positionally with ``0 .. len(X) - 1``.
    Z : sequence, optional
        Third variable. When omitted, the joint entropy of ``X`` and ``Y``
        alone is returned.

    Returns
    -------
    number
        ``entropy`` of the per-sample combined keys.

    Notes
    -----
    Each sample is keyed by a tuple of the stringified components rather
    than by their concatenation, so that e.g. ``('a', 'bc')`` and
    ``('ab', 'c')`` stay distinct.
    """
    sample_range = range(len(X))
    if Z is None:
        # The default Z=None previously crashed on Z[i].
        keys = [(str(X[i]), str(Y[i])) for i in sample_range]
    else:
        keys = [(str(X[i]), str(Y[i]), str(Z[i])) for i in sample_range]
    return entropy(keys)


# Mutual information among three variables
def mutual_information_3(X, Y, Z):
    """Combine single, pairwise and triple entropies of ``X``, ``Y``, ``Z``.

    Returns
    ``-(H(X) + H(Y) + H(Z) - I(X;Y) - I(X;Z) - I(Z;Y) - H(X,Y,Z))``
    where ``H`` comes from ``entropy`` / ``joint_entropy`` and ``I`` from
    ``mutual_information``.
    """
    h_x = entropy(X)
    h_y = entropy(Y)
    h_z = entropy(Z)
    i_xy = mutual_information(X, Y)
    i_xz = mutual_information(X, Z)
    i_zy = mutual_information(Z, Y)
    h_xyz = joint_entropy(X, Y, Z)
    return -(h_x + h_y + h_z - i_xy - i_xz - i_zy - h_xyz)

In [90]:
class calculate_my:
    """Entropy-based scores comparing an original table with a sanitized one.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Original table.
    disinfection_data : pandas.DataFrame
        Sanitized version of the table, with the same column names.
    prv : list of str
        Column names treated as private attributes.
    pub : list of str
        Column names treated as public attributes.
    others : optional
        Accepted but not used.
    """

    def __init__(self, raw_data, disinfection_data, prv, pub, others=None):
        self.prv = raw_data[prv]
        self.pub = raw_data[pub]
        self.sprv = disinfection_data[prv]
        self.spub = disinfection_data[pub]
        # Private column names, iterated by both score methods.
        self.prv_row = prv

    def calculate_np(self):
        """Return one score per private column.

        For private column ``c`` the score is
        ``1 - (I(S_c; S_pub) / H(S_c)) / (I(R_c; R_pub) / H(R_c))``
        where ``S`` is the sanitized table and ``R`` the original one.
        (Presumably a privacy measure -- confirm with the author.)
        """
        # The merged public keys do not depend on the private column, so
        # build them once instead of once per loop iteration.
        pub_keys = merge(self.pub)
        spub_keys = merge(self.spub)

        p = []
        for i in self.prv_row:
            sanitized_ratio = (mutual_information(merge(self.sprv[i]), spub_keys)
                               / entropy(self.sprv[i]))
            raw_ratio = (mutual_information(merge(self.prv[i]), pub_keys)
                         / entropy(self.prv[i]))
            p.append(1 - sanitized_ratio / raw_ratio)
        return p

    def calculate_nu(self):
        """Return one score per private column.

        For private column ``c`` the score is
        ``(H(S_c) + H(S_pub)) / (H(R_c) + H(R_pub))``
        where ``S`` is the sanitized table and ``R`` the original one.
        (Presumably a utility measure -- confirm with the author.)
        """
        # Entropies of the public parts are loop-invariant; compute once.
        pub_entropy = entropy(merge(self.pub))
        spub_entropy = entropy(merge(self.spub))

        p = []
        for i in self.prv_row:
            p.append((entropy(merge(self.sprv[i])) + spub_entropy) / (
                    entropy(merge(self.prv[i])) + pub_entropy))
        return p

In [96]:
# Score the `adult2-anonymized.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult2-anonymized.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())
# Display the sanitized table as the cell's output.
disinfection_data

[0.26586005894097564, 0.2303791037381827] [0.9323300449433686, 0.9369201317967215]


Unnamed: 0_level_0,age,workclass,education,marital-status,occupation,race,sex,native-country,salary
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
*,"[1, 20]",*,*,Never-married,*,Amer-Indian-Eskimo,Female,*,<=50K
*,"[1, 20]",*,*,Never-married,*,Amer-Indian-Eskimo,Female,*,<=50K
*,"[1, 20]",*,*,Married-spouse-absent,*,Amer-Indian-Eskimo,Female,*,<=50K
*,"[1, 20]",*,*,Never-married,*,Amer-Indian-Eskimo,Female,*,<=50K
*,"[21, 40]",*,*,Never-married,*,Amer-Indian-Eskimo,Female,*,<=50K
...,...,...,...,...,...,...,...,...,...
*,"[21, 40]",State-gov,Undergraduate,Married-civ-spouse,Transport-moving,White,Male,United-States,<=50K
*,"[41, 60]",State-gov,Undergraduate,Divorced,Transport-moving,White,Male,United-States,<=50K
*,"[41, 60]",State-gov,Undergraduate,Married-civ-spouse,Transport-moving,White,Male,United-States,<=50K
*,19,Without-pay,HS-grad,Never-married,Farming-fishing,White,Male,United-States,<=50K


In [97]:
# Score the `adult4-anonymized.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult4-anonymized.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Disabled alternative call using different column names, kept for reference:
# cm = calculate_my(raw_data, disinfection_data, ['Salary', ], ['ZIP Code', 'Age'], ['Disease', ])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())

[0.3969048319374098, 0.3700634945242798] [0.9015925506745844, 0.908267577117894]


In [98]:
# Score the `adult9-anonymized.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult9-anonymized.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Disabled alternative call using different column names, kept for reference:
# cm = calculate_my(raw_data, disinfection_data, ['Salary', ], ['ZIP Code', 'Age'], ['Disease', ])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())

[0.5116686246273989, 0.499626323480875] [0.8254275231116667, 0.8372688614198095]


In [99]:
# Score the `adult2-closeness.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult2-closeness.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Disabled alternative call using different column names, kept for reference:
# cm = calculate_my(raw_data, disinfection_data, ['Salary', ], ['ZIP Code', 'Age'], ['Disease', ])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())

[0.9033190370582617, 0.9240246720244067] [0.5023032605209907, 0.536062278964662]


In [100]:
# Score the `adult4-closeness.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult4-closeness.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Disabled alternative call using different column names, kept for reference:
# cm = calculate_my(raw_data, disinfection_data, ['Salary', ], ['ZIP Code', 'Age'], ['Disease', ])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())

[0.6600314826669976, 0.6854930140736482] [0.7469910132780279, 0.7641527392280566]


In [101]:
# Score the `adult9-closeness.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult9-closeness.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Disabled alternative call using different column names, kept for reference:
# cm = calculate_my(raw_data, disinfection_data, ['Salary', ], ['ZIP Code', 'Age'], ['Disease', ])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())

[0.027868302301904735, 0.04516767498528429] [0.9718847128832027, 0.9737917868522197]


[6.661338147750939e-16, -1.1102230246251565e-15] [0.9999999999999628, 0.9999999999999653]


In [106]:
# Score the `adult2-diversity.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult2-diversity.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Disabled alternative call using different column names, kept for reference:
# cm = calculate_my(raw_data, disinfection_data, ['Salary', ], ['ZIP Code', 'Age'], ['Disease', ])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())

[0.6622029651163528, 0.6399408023110831] [0.5828291125607753, 0.611126022237069]


In [107]:
# Score the `adult4-diversity.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult4-diversity.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Disabled alternative call using different column names, kept for reference:
# cm = calculate_my(raw_data, disinfection_data, ['Salary', ], ['ZIP Code', 'Age'], ['Disease', ])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())

[0.7427011249870465, 0.7015606829571901] [0.4324005398752139, 0.47090109477743985]


In [108]:
# Score the `adult6-diversity.csv` table against the original `adult.csv`.
database_path = {
    'pre_database_path': './data2/adult.csv',  # loaded as raw_data
    'mod_database_path': './data2/adult6-diversity.csv'  # loaded as disinfection_data
}
raw_data = read_file(database_path['pre_database_path'])
disinfection_data = read_file(database_path['mod_database_path'])
# Private columns: salary, marital-status. Public columns: the remaining seven.
cm = calculate_my(raw_data, disinfection_data, ['salary', 'marital-status', ],
                  ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'native-country'])
# Disabled alternative call using different column names, kept for reference:
# cm = calculate_my(raw_data, disinfection_data, ['Salary', ], ['ZIP Code', 'Age'], ['Disease', ])
# Two lists, each with one value per private column (salary first).
print(cm.calculate_np(), cm.calculate_nu())

[0.8127183947094324, 0.8577081249078726] [0.36125557473640163, 0.4045819281616593]
