In [1]:
from helpers.read_dataset import *
from scipy.stats import entropy
import numpy as np

In [2]:
# Load the dataset through `read_dataset` (presumably supplied by the star import
# from helpers.read_dataset above -- confirm). Later cells index the result with
# ['age'] and call .drop() on it, i.e. they use it as a pandas DataFrame that has
# an 'age' column.
original = read_dataset()

In [3]:
def query_average(table, epsilon, times):
    """Return `times` noisy estimates of the average age among rows with age > 25.

    Each estimate is a Laplace-perturbed sum divided by a Laplace-perturbed
    count. The budget `epsilon` is split evenly, so each of the two
    sub-queries is perturbed with scale = sensitivity / (epsilon / 2).

    Args:
        table: DataFrame-like object with an 'age' column.
        epsilon: total privacy budget for the averaged query.
        times: number of independent noisy answers to draw.

    Returns:
        Array of length `times` holding noisy_sum / noisy_count.
    """
    # Sensitivities used for the two sub-queries. The value 125 for the sum is
    # presumably an assumed upper bound on a single age -- confirm.
    count_sensitivity = 1
    sum_sensitivity = 125
    half_budget = epsilon / 2.0

    # Ages of the individuals the query is about (age strictly above 25).
    ages_above25 = table.loc[table['age'] > 25, 'age']

    # Noise for the count is drawn before noise for the sum, so a seeded
    # global RNG yields the same stream as before.
    noisy_counts = len(ages_above25) + np.random.laplace(
        loc=0, scale=count_sensitivity / half_budget, size=times)
    noisy_sums = ages_above25.sum() + np.random.laplace(
        loc=0, scale=sum_sensitivity / half_budget, size=times)

    return noisy_sums / noisy_counts

Generate 1,000 results for the query over the original dataset and over each of three neighboring datasets, with $\epsilon=1$:

In [4]:
# Define neighboring groups. A neighboring dataset differs from `original` in
# exactly ONE record, so every variant below removes a single row only.
def _without_first_match(table, mask):
    """Return `table` without the first row for which `mask` is True.

    The row is removed by position rather than by index label, so a non-unique
    index cannot cause more than one row to disappear. If no row matches, the
    table is returned unchanged.
    """
    hits = np.flatnonzero(np.asarray(mask))
    if hits.size == 0:
        return table
    keep = np.ones(len(table), dtype=bool)
    keep[hits[0]] = False
    return table[keep]


groups = {
    'original': original,
    # removing one record with the oldest age.
    'sans_oldest': _without_first_match(original, original['age'] == original['age'].max()),
    # removing one record with age 26.
    'sans_26': _without_first_match(original, original['age'] == 26),
    # removing one record with the youngest age.
    'sans_youngest': _without_first_match(original, original['age'] == original['age'].min()),
}

# Generate 1,000 results of the query for each group:
results = {group_name: query_average(group_data, 1.0, 1000) for group_name, group_data in groups.items()}

Save the results to a CSV file:

In [5]:
# Write the noisy results to 'hw1-ii-3.csv' without the row index; each key of
# `results` becomes one column.
# NOTE(review): `pd` is never imported explicitly in this notebook; it presumably
# leaks in through `from helpers.read_dataset import *` -- confirm, or import
# pandas explicitly in the first cell.
pd.DataFrame(results).to_csv('hw1-ii-3.csv', index=False)

Round each number to two decimal places:

In [6]:
# Round every noisy answer to two decimals so equal values fall into the same bin.
# The loop variable is not called `results`, which keeps the outer dict unshadowed.
results_rounded = {
    group_name: np.array(samples).round(2)
    for group_name, samples in results.items()
}

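Define the validation measure: over the rounded results (bins) that occur in both the original and a neighboring group, the KL divergence between the two frequency vectors must be below $\epsilon$: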

In [7]:
def validate_group(group_name, epsilon, rounded=None):
    """Check a neighboring group's results against the original group's results.

    Both result sets are binned by their rounded value. Over the bins that
    occur in both sets, the KL divergence of the original frequencies from the
    neighbor frequencies is computed with `scipy.stats.entropy` (which
    normalizes both vectors) and compared with `epsilon`.

    Args:
        group_name: key of the neighboring group to compare with 'original'.
        epsilon: threshold the divergence must stay below.
        rounded: optional mapping from group name to an array of rounded
            results; must contain 'original' and `group_name`. Defaults to the
            notebook-level `results_rounded`.

    Returns:
        bool: True if the divergence over the common bins is below `epsilon`.
        False otherwise, including when the two groups share no bin at all.
    """
    if rounded is None:
        rounded = results_rounded

    # Calculate frequency of each %.2f result (bin):
    orig_bins, orig_freq = np.unique(rounded['original'], return_counts=True)
    nebr_bins, nebr_freq = np.unique(rounded[group_name], return_counts=True)

    # Calculate vector of frequencies over common %.2f results (bins).
    # np.unique returns sorted bins, so both masked vectors list the common
    # bins in the same order. np.isin replaces the deprecated np.in1d.
    orig_v = orig_freq[np.isin(orig_bins, nebr_bins)]
    nebr_v = nebr_freq[np.isin(nebr_bins, orig_bins)]

    # With no common bin there is nothing to compare; handing empty vectors to
    # entropy() must not be allowed to count as a successful validation.
    if orig_v.size == 0:
        return False

    return bool(entropy(orig_v, nebr_v) < epsilon)

Run the check for all 3 groups:

In [8]:
# Validate each neighboring group against the original with epsilon = 1.0.
print({
    name: validate_group(name, 1.0)
    for name in ('sans_oldest', 'sans_26', 'sans_youngest')
})

{'sans_oldest': True, 'sans_26': True, 'sans_youngest': True}
