In [1]:
import numpy as np
import pandas as pd
import math
from src.hdmm.error import expected_error, strategy_supports_workload
from src.hdmm.matrix import EkteloMatrix

from matplotlib import pyplot as plt
from collections import OrderedDict
from src.hdmm.workload import AllRange

In [2]:
# Toy two-analyst workloads over an 8-column domain: the same random 0/1 block is
# placed in the left half for Alice and in the right half for Bob, the other half
# being all zeros.
s1random_array = np.random.randint(2, size=(5, 4))
s1zero_array = np.zeros((5, 4))
s1alice_q = np.concatenate([s1random_array, s1zero_array], axis=1)
s1bob_q = np.concatenate([s1zero_array, s1random_array], axis=1)

In [3]:
# write a function that makes workloads not repeat

In [4]:
# Demo inputs for the cache helpers below: one random 0/1 query of length 8
# (row 0 of an 11x8 draw) and an empty cache dictionary.
query = np.random.randint(2, size=(11, 8))[0, :]
storage = dict()

def cache(query, storage, ans, error):
    """Record the result of answering `query` in `storage` and return `storage`.

    The dictionary key is the text form of the array, ``np.array2string(query)``;
    the value is the pair ``(ans, error)``. An existing entry under the same key
    is overwritten. `storage` is modified in place.

    NOTE(review): array2string abbreviates large arrays (numpy's print threshold,
    1000 elements by default), so very long queries could end up sharing a key --
    confirm that queries stay small.
    """
    key = np.array2string(query)
    entry = (ans, error)
    storage[key] = entry
    return storage
    
def is_reusable(query, storage):
    """Tell whether `query` already has an entry in the cache `storage`.

    The lookup key is ``np.array2string(query)``, i.e. the same text form that
    is used when an entry is written.
    """
    key = np.array2string(query)
    return key in storage

def reuse(query, storage):
    """Look up what was stored for `query` in the cache `storage`.

    The key is ``np.array2string(query)``. Whatever value was stored under that
    key is returned unchanged; a KeyError is raised if the query is not cached.
    """
    key = np.array2string(query)
    return storage[key]
    

# Smoke test of the three helpers: store a dummy (answer, error) pair for `query`,
# check that it is found, then read it back. Only the last expression is displayed.
cache(query, storage, 0.5, 0.5)
is_reusable(query, storage)
reuse(query, storage)

(0.5, 0.5)

In [5]:
def cache_and_reuse(workload, x, eps=0.01, k=0, analyst_labels=[]):
    """
    Answer a stream of queries with the Laplace mechanism, reusing cached answers.

    Each query (one row of `workload`) is handled by the first rule that applies:
      1. reuse  - the query is already in the cache: its stored noisy answer and
                  error are returned and no update step is spent;
      2. update - the submitting analyst still has an update step: the query is
                  answered on the normalised database with Laplace noise of scale
                  k / (n * eps), the result is cached and one step is spent;
      3. otherwise the query is left unanswered (answer and error are None).

    Parameters:
    - workload - numpy array with one query per row
    - x - database counts as a numpy array (not normalised); n = x.sum()
    - eps - privacy budget
    - k - number of update steps PER ANALYST
    - analyst_labels - analyst_labels[i] is the analyst who submitted workload[i]

    Returns a pandas DataFrame with one row per query and the columns
    'queries', 'abs_error', 'ans', 'updated', 'used_cache', 'analyst' and 'isNa'
    (True where the query was not answered). Numeric columns are rounded to
    3 decimals.
    """
    # Each distinct analyst starts with k update steps.
    budgets = {analyst: k for analyst in set(analyst_labels)}

    error_list = []
    ans_list = []
    updated_list = []
    used_cache_list = []

    n = x.sum()
    x_norm = x / n  # counts -> fractions of the population
    storage = {}
    for i, query in enumerate(workload):
        analyst = analyst_labels[i]
        if is_reusable(query, storage):  # reuse step
            noisy_ans, abs_error = reuse(query, storage)
            updated, used_cache = False, True
        elif budgets[analyst] > 0:  # not reusable and analyst has update steps left
            true_ans = np.dot(query, x_norm)
            noise = np.random.laplace(0, k / (n * eps), 1)[0]
            noisy_ans = true_ans + noise
            abs_error = np.abs(noisy_ans - true_ans)

            budgets[analyst] -= 1
            storage = cache(query, storage, noisy_ans, abs_error)
            updated, used_cache = True, False
        else:  # this analyst has run out of update steps
            noisy_ans, abs_error = None, None
            updated, used_cache = False, False

        # Exactly one entry per query in every list keeps the columns aligned.
        error_list.append(abs_error)
        ans_list.append(noisy_ans)
        updated_list.append(updated)
        used_cache_list.append(used_cache)

    d = {'queries': workload.tolist(),
        'abs_error': error_list,
        'ans': ans_list,
        'updated': updated_list,
        'used_cache': used_cache_list,
        'analyst': analyst_labels,
    }
    test_data = pd.DataFrame(data=d)
    test_data = test_data.round(3)
    test_data['isNa'] = np.where(test_data.abs_error.isnull(), True, False)
    return test_data

In [6]:
def add_to_strategy(query, strategy):
    """Return `strategy` with the rows of `query` stacked underneath it.

    The two arrays are joined along axis 0 with np.concatenate, which returns a
    new array; neither argument is modified.
    """
    stacked = np.concatenate([strategy, query], axis=0)
    return stacked


def cache_and_reconstruct(workload, x, eps=0.01, k=0, analyst_labels=[]):
    """
    Takes in workload, database, eps (privacy budget), k (number of total update steps PER ANALYST). 
    
    Returns list of error per query.
    """
    budgets = {}
    for analyst in list(set(analyst_labels)): 
        budgets[analyst] = k # each analyst starts with k update steps
    
    numAnalysts = len(budgets)
    error_list = []
    laplace_list = [] # if the algorithm simply added noise to the answer
    used_reconstruct_list = []
    used_reuse_list = []
    
    storage = {} # storage dictionary for reuse, k: query,v: error
    strategy = workload[0:0] # workload matrix for reconstruction, 
    # make strategy = empty workload to create matrix of same dtype to be used in reconstruct step (avoid error)
    
    n = x.sum()
    x_norm = x/sum(x) # normalize database
    
    for i, query in enumerate(workload): 
        query = np.expand_dims(query, axis = 0)
        analyst = analyst_labels[i]
        
        # If query has answered before, then use old query answer
        if is_reusable(query, storage): 
            abs_error = reuse(query, storage)
            
            error_list.append(abs_error)
            used_reconstruct_list.append(False) 
            laplace_list.append(False)
            used_reuse_list.append(True)
            strategy = add_to_strategy(query, strategy)
            
        # If analyst still has update steps left
        elif budgets[analyst] > 0: 
            noise = np.random.laplace(0, (k * numAnalysts) / (n * eps), 1)[0]
            noisy_ans = (np.dot(query, x_norm)) + noise
            true_ans = np.matmul(query, x_norm)
            abs_error = np.abs(noisy_ans - true_ans)[0]
            
            error_list.append(abs_error) # *n
            
            storage[np.array2string(query)] = abs_error
            budgets[analyst] -= 1 
            strategy = add_to_strategy(query, strategy)
            laplace_list.append(True)
            used_reconstruct_list.append(False)
            used_reuse_list.append(False)
        
        # If query is reconstructable, then reconstruct
        elif strategy_supports_workload(EkteloMatrix(query), EkteloMatrix(strategy)): # how to convert numpy array to ektelo matrix https://github.com/yikai-wu/Multi-Analyst-DP/blob/fadc7ac1d20199e8b31914f44323e51a05ed072d/src/hdmm/matrix.py#L34
            
            squared_error = expected_error(query, strategy, len(strategy) / (k * numAnalysts) * eps) # do i mult by 100
            abs_error = math.sqrt(squared_error) / n #
            
            storage[np.array2string(query)] = abs_error
            error_list.append(abs_error)
            laplace_list.append(False)
            used_reconstruct_list.append(True) 
            used_reuse_list.append(False)
            strategy = add_to_strategy(query, strategy)
            
        # If analyst ran out of update steps
        else: # this analyst has run out of update steps
            error_list.append(None)
            laplace_list.append(False)
            used_reconstruct_list.append(False)
            used_reuse_list.append(False)
        
    d = {'queries': workload.tolist(), 
        'abs_error': error_list,
        'used_reconstruct': used_reconstruct_list,
        'used_reuse': used_reuse_list,
        'laplace': laplace_list,
        'analyst': analyst_labels,
    }
    test_data = pd.DataFrame(data=d)
    test_data = test_data.round(3)
    test_data['isNa'] = np.where(test_data.abs_error.isnull(), True, False)
    return test_data

In [7]:
# Database histogram (4 bins) and two random toy workloads over it.
x_exp = np.array([20, 160, 20, 160])

# The same random 0/1 block fills Alice's first two columns and Bob's last two;
# the remaining columns are zero.
s1random_array = np.random.randint(2, size=(4, 2))
s1zero_array = np.zeros((4, 2))
s1alice_q = np.concatenate([s1random_array, s1zero_array], axis=1)
s1bob_q = np.concatenate([s1zero_array, s1random_array], axis=1)

print('Database:\n', x_exp, '\n')
print('Alice Workload:\n', s1alice_q, '\n')
print('Bob Workload: \n', s1bob_q)

Database:
 [ 20 160  20 160] 

Alice Workload:
 [[0. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]] 

Bob Workload: 
 [[0. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# 2/11 Experiments
In these experiments, I will be testing whether Cache and Reconstruct satisfies our desiderata

- In 2/25, I am making fixes to the c&r algorithm

In [8]:
# Database histogram for the experiments below (4 bins, 360 records in total).
x_exp = np.array([20, 160, 20, 160])

In [9]:
# Alice's and Bob's fixed queries from the 2/11 updates. Each analyst asks the same
# point query three times (rows 0, 1 and 3) and one different point query (row 2).
s1alice_q = np.array([
    [1, 0, 0, 0],
    [1, 0, 0, 0],
    [0, 1, 0, 0],
    [1, 0, 0, 0],
])

s1bob_q = np.array([
    [0, 0, 1, 0],
    [0, 0, 1, 0],
    [0, 0, 0, 1],
    [0, 0, 1, 0],
])

In [10]:
# Alice alone: her 4 queries with eps=2 and a single update step (k=1).
# NOTE: no random seed is set, so the Laplace noise (and the errors shown) change on every run.
rec_alice_df = cache_and_reconstruct(s1alice_q, x_exp, eps=2, k=1, analyst_labels=['Alice'] * 4)
rec_alice_df

Unnamed: 0,queries,abs_error,used_reconstruct,used_reuse,laplace,analyst,isNa
0,"[1, 0, 0, 0]",0.006,False,False,True,Alice,False
1,"[1, 0, 0, 0]",0.006,False,True,False,Alice,False
2,"[0, 1, 0, 0]",,False,False,False,Alice,True
3,"[1, 0, 0, 0]",0.006,False,True,False,Alice,False


In [11]:
# Bob alone: his 4 queries with eps=2 and a single update step (k=1).
# NOTE: no random seed is set, so the Laplace noise (and the errors shown) change on every run.
rec_bob_df = cache_and_reconstruct(s1bob_q, x_exp, eps=2, k=1, analyst_labels=['Bob'] * 4)
rec_bob_df

Unnamed: 0,queries,abs_error,used_reconstruct,used_reuse,laplace,analyst,isNa
0,"[0, 0, 1, 0]",0.001,False,False,True,Bob,False
1,"[0, 0, 1, 0]",0.001,False,True,False,Bob,False
2,"[0, 0, 0, 1]",,False,False,False,Bob,True
3,"[0, 0, 1, 0]",0.001,False,True,False,Bob,False


In [12]:
# joint
# Both analysts in one run: Alice's 4 queries first, then Bob's 4, with k=1 each.
# NOTE: no random seed is set, so the Laplace noise (and the errors shown) change on every run.

rec_joint_df = cache_and_reconstruct(np.vstack((s1alice_q, s1bob_q)), x_exp, eps=2, k=1, 
                      analyst_labels=['Alice'] * 4 + ['Bob'] * 4)
rec_joint_df

Unnamed: 0,queries,abs_error,used_reconstruct,used_reuse,laplace,analyst,isNa
0,"[1, 0, 0, 0]",0.001,False,False,True,Alice,False
1,"[1, 0, 0, 0]",0.001,False,True,False,Alice,False
2,"[0, 1, 0, 0]",,False,False,False,Alice,True
3,"[1, 0, 0, 0]",0.001,False,True,False,Alice,False
4,"[0, 0, 1, 0]",0.0,False,False,True,Bob,False
5,"[0, 0, 1, 0]",0.0,False,True,False,Bob,False
6,"[0, 0, 0, 1]",,False,False,False,Bob,True
7,"[0, 0, 1, 0]",0.0,False,True,False,Bob,False


# 2/25 Experiments

In [13]:
# 2/25 workloads: two different point queries, then their sum (row 2), then a
# repeat of the first point query (row 3).
s2alice_q = np.array([
    [0, 1, 0, 0],
    [1, 0, 0, 0],
    [1, 1, 0, 0],
    [0, 1, 0, 0],
])

s2bob_q = np.array([
    [0, 0, 0, 1],
    [0, 0, 1, 0],
    [0, 0, 1, 1],
    [0, 0, 0, 1],
])



In [14]:
# Alice alone on the 2/25 workload with two update steps (k=2).
# NOTE: this overwrites the rec_alice_df produced by the 2/11 experiment above.
rec_alice_df = cache_and_reconstruct(s2alice_q, x_exp, eps=2, k=2, analyst_labels=['Alice'] * 4)
rec_alice_df

Unnamed: 0,queries,abs_error,used_reconstruct,used_reuse,laplace,analyst,isNa
0,"[0, 1, 0, 0]",0.004,False,False,True,Alice,False
1,"[1, 0, 0, 0]",0.0,False,False,True,Alice,False
2,"[1, 1, 0, 0]",0.003,True,False,False,Alice,False
3,"[0, 1, 0, 0]",0.004,False,True,False,Alice,False


In [15]:
# Bob alone on the 2/25 workload with two update steps (k=2).
# NOTE: this overwrites the rec_bob_df produced by the 2/11 experiment above.
rec_bob_df = cache_and_reconstruct(s2bob_q, x_exp, eps=2, k=2, analyst_labels=['Bob'] * 4)
rec_bob_df

Unnamed: 0,queries,abs_error,used_reconstruct,used_reuse,laplace,analyst,isNa
0,"[0, 0, 0, 1]",0.006,False,False,True,Bob,False
1,"[0, 0, 1, 0]",0.002,False,False,True,Bob,False
2,"[0, 0, 1, 1]",0.003,True,False,False,Bob,False
3,"[0, 0, 0, 1]",0.006,False,True,False,Bob,False


In [16]:
# Both analysts in one run on the 2/25 workloads (Alice's rows first), k=2 each.
# NOTE: this overwrites the rec_joint_df produced by the 2/11 experiment above.
rec_joint_df = cache_and_reconstruct(np.vstack((s2alice_q, s2bob_q)), x_exp, eps=2, k=2, 
                      analyst_labels=['Alice'] * 4 + ['Bob'] * 4)
rec_joint_df

Unnamed: 0,queries,abs_error,used_reconstruct,used_reuse,laplace,analyst,isNa
0,"[0, 1, 0, 0]",0.022,False,False,True,Alice,False
1,"[1, 0, 0, 0]",0.007,False,False,True,Alice,False
2,"[1, 1, 0, 0]",0.006,True,False,False,Alice,False
3,"[0, 1, 0, 0]",0.022,False,True,False,Alice,False
4,"[0, 0, 0, 1]",0.004,False,False,True,Bob,False
5,"[0, 0, 1, 0]",0.002,False,False,True,Bob,False
6,"[0, 0, 1, 1]",0.006,True,False,False,Bob,False
7,"[0, 0, 0, 1]",0.004,False,True,False,Bob,False


# Seeded Cache and Reconstruct
May 13, 2022

In [21]:
# Joint workload: Alice's queries stacked on top of Bob's (displayed for reference only).
np.vstack((s2alice_q, s2bob_q))

array([[0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 1, 1],
       [0, 0, 0, 1]])

In [247]:
def proved_cache_and_reconstruct(workload, x, eps=0.01, k=1, analyst_labels=[], seed="identity",
                                seed_pct=0.1):
    """
    Seeded Cache and Reconstruct Algorithm.

    Stage 1 seeds the strategy matrix with a basis; stages 2-3 then handle each
    query (one row of `workload`) with the first rule that applies:
      - reuse       - the query was answered before: its stored error is returned;
      - laplace     - the submitting analyst still has an update step: Laplace noise
                      with scale (k * number of analysts) / (n * eps) is drawn and
                      one step is spent;
      - reconstruct - `strategy_supports_workload` accepts the query against the
                      current strategy: the error is derived from `expected_error`;
      - otherwise the query is left unanswered (error None).
    Every answered query is appended to the strategy matrix.

    Parameters:
    - workload - numpy array with one query per row
    - x - database (works best with large databases where n > 1000); must hold the
      raw counts, not a normalised distribution, because n = x.sum()
    - eps - privacy budget
    - k - number of update steps per analyst (ability to access the database via Laplace mechanism)
    - analyst_labels - list where analyst_labels[i] is the name (string) of the analyst who submitted a
    query at time i
    - seed - type of basis for the seed, types: "identity". Any other value adds no
      seed rows, but seed_pct is still deducted from eps.
    - seed_pct - percent of the privacy budget to be used to create the seed of the workload

    Returns a pandas DataFrame with one row per query and the columns 'queries',
    'abs_error', 'used_reconstruct', 'used_reuse', 'laplace', 'analyst' and 'isNa'
    (True where the query was not answered). Numeric columns are rounded to
    3 decimals. The budget left after seeding is printed as a side effect.

    Last edited: May 18-2022
    """
    budgets = {}
    for analyst in set(analyst_labels):
        budgets[analyst] = k  # each analyst starts with k update steps

    numAnalysts = len(budgets)
    error_list = []
    laplace_list = []  # if the algorithm simply added noise to the answer
    used_reconstruct_list = []
    used_reuse_list = []

    storage = {}  # storage dictionary for reuse, k: query, v: error
    # Zero-row slice of the workload: an empty matrix with the workload's dtype and
    # column count, to be grown by add_to_strategy.
    strategy = workload[0:0]

    n = x.sum()
    x_norm = x/sum(x)  # normalize database

    # Stage 1: Seed using basis matrix
    if seed == "identity":
        # Identity over the database domain, scaled by the number of analysts.
        strategy = add_to_strategy(np.identity(len(x)) * numAnalysts, strategy)
    eps = eps - eps * seed_pct  # what is left of the budget after seeding
    print(eps)

    for i, query in enumerate(workload):
        query = np.expand_dims(query, axis=0)
        analyst = analyst_labels[i]

        # Stage 2: Reuse or use privacy budget
        # 2a. If query has been answered before, then use old query answer
        if is_reusable(query, storage):
            abs_error = reuse(query, storage)

            error_list.append(abs_error)
            used_reconstruct_list.append(False)
            laplace_list.append(False)
            used_reuse_list.append(True)
            strategy = add_to_strategy(query, strategy)

        # 2b. If analyst still has update steps left, use the update step
        elif budgets[analyst] > 0:
            noise = np.random.laplace(0, (k * numAnalysts) / (n * eps), 1)[0]
            noisy_ans = (np.dot(query, x_norm)) + noise
            true_ans = np.matmul(query, x_norm)
            abs_error = np.abs(noisy_ans - true_ans)[0]

            error_list.append(abs_error)
            storage[np.array2string(query)] = abs_error
            budgets[analyst] -= 1
            strategy = add_to_strategy(query, strategy)
            laplace_list.append(True)
            used_reconstruct_list.append(False)
            used_reuse_list.append(False)

        # Stage 3: Using Reconstructed Answers
        # If query is reconstructable, then reconstruct
        elif strategy_supports_workload(EkteloMatrix(query), EkteloMatrix(strategy)):
            squared_error = expected_error(query, strategy, len(strategy) / (k * numAnalysts) * eps)
            abs_error = math.sqrt(squared_error) / n

            storage[np.array2string(query)] = abs_error
            error_list.append(abs_error)
            laplace_list.append(False)
            used_reconstruct_list.append(True)
            used_reuse_list.append(False)
            strategy = add_to_strategy(query, strategy)

        # Analyst has no update steps left and the query cannot be reconstructed.
        # Record an unanswered row so that every list keeps one entry per query;
        # otherwise the DataFrame below cannot be built from ragged lists.
        else:
            error_list.append(None)
            laplace_list.append(False)
            used_reconstruct_list.append(False)
            used_reuse_list.append(False)

    d = {'queries': workload.tolist(),
        'abs_error': error_list,
        'used_reconstruct': used_reconstruct_list,
        'used_reuse': used_reuse_list,
        'laplace': laplace_list,
        'analyst': analyst_labels,
    }
    test_data = pd.DataFrame(data=d)
    test_data = test_data.round(3)
    test_data['isNa'] = np.where(test_data.abs_error.isnull(), True, False)
    return test_data

FAQ: 
- Why is my noise too large when doing the Laplace mechanism?
    - It likely means that your epsilon is too low for the number of people in your database: the noise scale is k * num / (n * eps), so it grows as eps or n shrinks.
    - Also, check to make sure that x isn't normalized; we need the true x in order to know n, the number of people in the dataset

In [251]:
# Seeded run with two analysts (Alice, Chris), one update step each, on a uniform
# database of 1000 people. The call also prints the budget left after seeding.
proved_cache_and_reconstruct(workload=np.array([[1, 1, 1, 1], [0, 1, 1, 1], [1, 1, 1, 1]]), 
                             x= np.array([250, 250, 250, 250]), eps=10, k=1, 
                             analyst_labels=['Alice', 'Alice', 'Chris'])

9.0


Unnamed: 0,queries,abs_error,used_reconstruct,used_reuse,laplace,analyst,isNa
0,"[1, 1, 1, 1]",0.0,False,False,True,Alice,False
1,"[0, 1, 1, 1]",0.0,True,False,False,Alice,False
2,"[1, 1, 1, 1]",0.0,False,True,False,Chris,False


Let's try a test case where each analyst has one update step. We want to test to make sure that 

- the strategy matrix is being added on correctly (i.e., with three analysts in the analyst labels, the strategy matrix is I * 3)
- queries that are being reused are being reused correctly
- expected error is returning the right amount of error


In [203]:
# Sanity check of the Laplace noise magnitude for a single update step.
# NOTE: these assignments create (or overwrite) the notebook-level names
# k, numAnalysts, n and eps.
k = 1
numAnalysts = 3
n = 1000
eps = 10
# Scale (k * numAnalysts) / (n * eps) is the same expression used in the update
# step of the cache-and-reconstruct functions; one sample is drawn and displayed.
np.random.laplace(0, (k * numAnalysts) / (n * eps), 1)[0]

0.00018588845422625526

# How to compute noise scale for one query: 
Parameters: 
- total budget = eps
- k = update per analyst
- num = number of analysts
- n = individuals in the database
- eps = total privacy budget for the entire system
- epsilon = privacy budget for one query
- sensitivity = 1/n 
- number of update steps = k * num
- budget per update step: epsilon = eps / (k * num)

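Noise scale = sensitivity / epsilon

$$
\text{noise scale}
= \frac{1}{n}\cdot\frac{1}{\text{epsilon}}
= \frac{1}{n}\cdot\frac{1}{\text{eps}/(k\cdot\text{num})}
= \frac{1}{n}\cdot\frac{k\cdot\text{num}}{\text{eps}}
= \frac{k\cdot\text{num}}{n\cdot\text{eps}}
$$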


In [None]:
# Seeded run for two analysts on a uniform 4-bin database.
# x has to be a 1-D vector of counts with one entry per workload column, and
# analyst_labels needs one entry per workload row (Alice's 4 rows, then Bob's 4).
workload = np.vstack((s2alice_q, s2bob_q))  # built here so the cell runs top-to-bottom
proved_cache_and_reconstruct(workload, x=np.array([1, 1, 1, 1]), eps=0.01, k=1,
                             analyst_labels=['Alice'] * 4 + ['Bob'] * 4)

In [27]:
# Joint workload used by the seeded runs: Alice's 2/25 queries followed by Bob's.
workload = np.vstack((s2alice_q, s2bob_q))
workload

array([[0, 1, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 1, 1],
       [0, 0, 0, 1]])

In [None]:
# Seeded joint run over both analysts' queries on the x_exp database, with one
# update step per analyst and one label per workload row.
proved_cache_and_reconstruct(workload, x_exp, eps=0.01, k=1,
                             analyst_labels=['Alice'] * 4 + ['Bob'] * 4)

- PMW (no adaptions), Seeded C&R, Randomized scheduler PMW, Round Robin Scheduler PMW
- vary on:
    - 3+ values of p 
    - type of workload (a differing seed): 
        - randomized workload - identity seed
        - point queries - identity seed
        - range queries - HB workload seed (branching factor = 2)
        - (marginal queries - matrix mechanism to decide a workload)
- measure: 
    - total utility
    - max ratio
    - emp interference
- number of analysts: 
    - 5
- use same counting threshold

In [None]:
emp interference = choose random analyst instead of max. 

- Add C&R Implementation
- Implement random scheduler for more than 2 multi-analyst case (p = probability of Alice, 1-p = other analysts uniform, should be similar implementation, just need to implement a random selection of other analysts that still have queries)
- Implement round robin scheduler (evenly shuffle the analysts' workloads, come up with a list of analysts)
- Output values for visualization: Utility
- Output values for visualization: Empirical/Max Ratio
- Output boxplots using the utility/empirical max ratio
