In [1]:
from helpers.read_dataset import *
from scipy.stats import entropy
import numpy as np

In [2]:
# Load the working dataset through the project helper `read_dataset`.
# Presumably this is the UCI "Adult" census table — TODO confirm; later cells rely on
# it being a pandas DataFrame with an 'education' column.
adult = read_dataset()

In [3]:
def score(data, option):
    """Utility of ``option``: how often it occurs in ``data``, divided by 1000.

    Args:
        data: pandas Series of categorical values.
        option: The candidate value to score.

    Returns:
        ``count(option) / 1000``. An option that never occurs in ``data`` has a
        count of zero and therefore scores 0 instead of raising ``KeyError``.
    """
    # The division by 1000 presumably keeps the exponent in the mechanism small — confirm.
    return data.value_counts().get(option, 0) / 1000

In [4]:
def exponential(x, R, u, sensitivity, epsilon):  # per https://programming-dp.com/ch9.html#the-exponential-mechanism-for-finite-sets
    """Pick one element of ``R`` with the exponential mechanism.

    Option ``r`` is drawn with probability proportional to
    ``exp(epsilon * u(x, r) / (2 * sensitivity))``.

    Args:
        x: Data handed to ``u`` as its first argument.
        R: 1-D sequence of candidate outputs.
        u: Scoring function, called as ``u(x, r)``; must return a number.
        sensitivity: Divisor of the exponent (the sensitivity of ``u``).
        epsilon: Multiplier of the exponent (the privacy parameter).

    Returns:
        One element drawn from ``R``.

    Raises:
        ValueError: If ``R`` is empty.
    """
    # Exponent for each element of R, based on its score:
    exponents = np.asarray([epsilon * u(x, r) / (2 * sensitivity) for r in R], dtype=float)
    if exponents.size == 0:
        raise ValueError("R must contain at least one option")

    # Shift so the largest exponent is 0. The common factor cancels in the
    # normalisation below, so the distribution is unchanged, but np.exp can no
    # longer overflow to inf (which would turn the probabilities into NaN).
    weights = np.exp(exponents - exponents.max())

    # Normalize the weights so they sum to 1:
    probabilities = weights / weights.sum()

    # Choose an element from R based on the probabilities:
    return np.random.choice(R, 1, p=probabilities)[0]

Generate 1,000 results for the query over the original dataset and each of three neighboring datasets, with $\epsilon=1$.

In [5]:
# NOTE(review): `pd` is not imported explicitly in the visible cells; presumably it is
# re-exported by the star import from helpers.read_dataset — confirm.

N_RESULTS = 1000  # number of mechanism draws per dataset
SENSITIVITY = 1   # sensitivity passed to the exponential mechanism
EPSILON = 1       # privacy parameter used when sampling

# Education categories ordered from most to least frequent (value_counts sorts
# descending); computed once and shared by the three derived datasets below.
education_by_freq = adult['education'].value_counts().index

groups = {
    'original': adult,
    # Drops only the index label of the first row whose education is the most frequent category.
    'sans_1st_freq': adult.drop(
        index=adult[adult['education'] == education_by_freq[0]].first_valid_index()
    ),
    # Drops every row whose education is the second most frequent category.
    'sans_2nd_freq': adult.drop(
        index=adult[adult['education'] == education_by_freq[1]].index
    ),
    # Drops every row whose education is the least frequent category.
    'sans_1st_infr': adult.drop(
        index=adult[adult['education'] == education_by_freq[-1]].index
    ),
}

# Draw N_RESULTS outputs of the mechanism for each group and tally how often each
# category is returned. The option set R is taken from each group's own column, so a
# category that was removed from a group is never a candidate for that group.
results = {
    group_name: pd.Series([
        exponential(group_data['education'], group_data['education'].unique(), score, SENSITIVITY, EPSILON)
        for _ in range(N_RESULTS)
    ]).value_counts()
    for group_name, group_data in groups.items()
}

Save the results into a csv file:

In [6]:
# Combine the per-group tallies into one table (one column per group, one row per
# category) and write it to disk. A category never drawn for a group shows up as NaN
# in that group's column.
pd.DataFrame(results).to_csv('hw1-ii-7.csv')

Define the validation measure:

In [7]:
def validate_group(group_name, epsilon):
    """Check that a group's sampled distribution stays close to the original's.

    Reads the tallies stored in the module-level ``results`` dict and compares
    ``results['original']`` with ``results[group_name]`` using
    ``scipy.stats.entropy(orig, nebr)``, i.e. the relative entropy (KL
    divergence) of the two normalised count vectors.

    Args:
        group_name: Key of the neighbouring dataset in ``results``.
        epsilon: Threshold the divergence must stay below.

    Returns:
        True when the divergence over the shared categories is below ``epsilon``.
    """
    orig_freq = results['original']
    nebr_freq = results[group_name]

    # Keep only the categories present in BOTH tallies. A category drawn for the
    # neighbour but never for the original would otherwise raise KeyError when
    # the original counts are indexed with it.
    nebr_keys = nebr_freq[np.nan_to_num(nebr_freq) != 0].index
    common_keys = nebr_keys.intersection(orig_freq.index)
    orig_v = orig_freq[common_keys]
    nebr_v = nebr_freq[common_keys]

    return entropy(orig_v, nebr_v) < epsilon

Run the check for all 3 groups:

In [8]:
# Compare each derived dataset's sampled distribution with the original's, using 0.5 as the threshold.
# NOTE(review): the samples were drawn with epsilon=1 but are checked against 0.5 — confirm this is intended.
print({name: validate_group(name, 0.5) for name in ['sans_1st_freq', 'sans_2nd_freq', 'sans_1st_infr']})

{'sans_1st_freq': True, 'sans_2nd_freq': True, 'sans_1st_infr': True}
