# Simple Examples With Duet

This notebook contains simple examples of using Duet to run basic differentially private queries on a dataset.

Contents:
- [Laplace Examples:](#laplace)
    - [Counting Query](#counting)
    - [Sum Query](#sum)
    - [Average Query](#average)
    - [Range Query](#range)
    - [Parallel Composition](#parallel)
- [Other Examples:](#other)
    - [Counting Query using Rényi-Gauss Differential Privacy](#counting-rdp)
    - [Average Query using the Sparse Vector Technique](#average-svt)

### Setup

In [2]:
import duet

from duet import pandas as pd
from duet import numpy as np

import matplotlib.pyplot as plt

# Load the example dataset through duet's pandas wrapper (imported above as `pd`);
# every query below runs against this dataframe.
adult_data = pd.read_csv("adult_with_pii.csv")

## Laplace Examples <a name="laplace"></a>

### Counting Query <a name="counting"></a>

In [3]:
def dp_counting_query(df, col, val, epsilon):
    """ Returns a differentially private count of the rows whose
    value in `col` equals `val`, using the laplace function

    Args:
    df -- dataframe to query
    col -- name of the column compared against val
    val -- value a row must hold in col to be counted
    epsilon -- privacy cost, forwarded to duet.laplace

    Returns:
    the result of duet.laplace applied to the matching-row count
    """

    matching_rows = df[df[col] == val]
    true_count = matching_rows.shape[0]
    return duet.laplace(true_count, ε=epsilon)

# Demo: count the rows whose 'Marital Status' is 'Never-married' with epsilon = 0.01.
# The query runs inside an EDOdometer context, and the odometer is printed
# afterwards so the privacy cost appears next to the noisy result.
epsilon = 0.01
with duet.EDOdometer() as odo:
    print('Query result:', dp_counting_query(adult_data, 'Marital Status', 'Never-married', epsilon))
    print('Privacy cost:', odo)

Query result: 10683.147075430106
Privacy cost: ([DataSource(adult_with_pii.csv): (0.01, 0)])


### Sum Query <a name="sum"></a>

In [4]:
def dp_sum_query(df, col, clip_lower, clip_upper, epsilon):
    """ Returns a differentially private sum of a column, clipping
    each value to [clip_lower, clip_upper] before summing and then
    applying the laplace function

    Args:
    df -- dataframe to query
    col -- name of the column to sum
    clip_lower -- lower clipping parameter
    clip_upper -- upper clipping parameter
    epsilon -- privacy cost, forwarded to duet.laplace

    Returns:
    the result of duet.laplace applied to the clipped sum
    """

    clipped_values = df[col].clip(clip_lower, clip_upper)
    clipped_total = clipped_values.sum()
    return duet.laplace(clipped_total, ε=epsilon)

# Demo: sum the 'Age' column, clipped to [0, 100], with epsilon = 0.01.
# The odometer is printed after the query to show the privacy cost.
epsilon = 0.01
with duet.EDOdometer() as odo:
    print('Query result:', dp_sum_query(adult_data, 'Age', 0, 100, epsilon))
    print('Privacy cost:', odo)

Query result: 1265330.1599048548
Privacy cost: ([DataSource(adult_with_pii.csv): (0.01, 0)])


### Average Query <a name="average"></a>

In [5]:
def dp_avg_query(df, col, clip_lower, clip_upper, epsilon):
    """ Returns a differentially private answer to an average query,
    computed as a noisy clipped sum divided by a noisy row count,
    both produced with the laplace function

    Args:
    df -- dataframe to query
    col -- name of the column to average
    clip_lower -- lower clipping parameter
    clip_upper -- upper clipping parameter
    epsilon -- total privacy cost; half is passed to each laplace call

    Returns:
    noisy sum divided by noisy count
    """

    # The sum and the count each receive half of the budget.
    half_budget = epsilon / 2

    clipped_total = df[col].clip(clip_lower, clip_upper).sum()
    noisy_sum = duet.laplace(clipped_total, ε=half_budget)

    row_count = df.shape[0]
    noisy_count = duet.laplace(row_count, ε=half_budget)

    return noisy_sum / noisy_count

# Demo: average of the 'Age' column, clipped to [0, 100], with a total epsilon of 0.01.
# The odometer is printed after the query to show the privacy cost.
epsilon = 0.01
with duet.EDOdometer() as odo:
    print('Query result:', dp_avg_query(adult_data, 'Age', 0, 100, epsilon))
    print('Privacy cost:', odo)

Query result: 38.674806527974866
Privacy cost: ([DataSource(adult_with_pii.csv): (0.01, 0)])


### Range Query <a name="range"></a>

In [6]:
def dp_range_query(df, col, lower, upper, epsilon):
    """ Returns a differentially private count of the rows whose value
    in `col` lies in the half-open range [lower, upper), using the
    laplace function

    Args:
    df -- dataframe to query
    col -- name of the column tested against the range
    lower -- lower bound of the range (inclusive)
    upper -- upper bound of the range (exclusive)
    epsilon -- privacy cost, forwarded to duet.laplace

    Returns:
    the result of duet.laplace applied to the in-range row count
    """
    at_least_lower = df[col] >= lower
    below_upper = df[col] < upper
    rows_in_range = df[at_least_lower & below_upper]
    return duet.laplace(rows_in_range.shape[0], ε=epsilon)

# Demo: count the rows with 20 <= Age < 32 using epsilon = 0.01.
# The odometer is printed after the query to show the privacy cost.
epsilon = 0.01
with duet.EDOdometer() as odo:
    print('Query result:', dp_range_query(adult_data, 'Age', 20, 32, epsilon))
    print('Privacy cost:', odo)

Query result: 9851.34126151757
Privacy cost: ([DataSource(adult_with_pii.csv): (0.01, 0)])


### Parallel Composition <a name="parallel"></a>

In [7]:
def dp_histogram_parallel(df, col, epsilon):
    """ Returns a differentially private histogram of a column: the
    per-group row counts of `col`, passed through the laplace function
    in a single call (parallel composition)

    Args:
    df -- dataframe to query
    col -- name of the column whose distinct values form the groups
    epsilon -- privacy cost, forwarded to duet.laplace

    Returns:
    the result of duet.laplace applied to the group sizes
    """
    group_sizes = df.groupby(col).size()
    return duet.laplace(group_sizes, epsilon=epsilon)

# Demo: noisy histogram of the 'Education' column with epsilon = 1.
# Only the first five rows of the result are printed, followed by the
# odometer to show the privacy cost.
epsilon = 1
with duet.EDOdometer() as odo:
    hist = dp_histogram_parallel(adult_data, 'Education', epsilon) 
    print(hist.to_frame().head(5))
    print('Privacy cost:', odo)

                     0
Education             
10th        933.204570
11th       1174.588546
12th        431.594554
1st-4th     167.199422
5th-6th     333.359135
Privacy cost: ([DataSource(adult_with_pii.csv): (1, 0)])


## Other Examples <a name="other"></a>

### Rényi-Gauss Counting Query <a name="counting-rdp"></a>

In [10]:
def count_RDP(df, alpha, epsilon):
    """ Returns a differentially private row count of the dataframe
    using duet's renyi_gauss function (Renyi differential privacy)

    Args:
    df -- dataframe whose rows are counted
    alpha -- alpha value for RDP, forwarded to duet.renyi_gauss
    epsilon -- epsilon value for RDP, forwarded to duet.renyi_gauss

    Returns:
    the result of duet.renyi_gauss applied to the row count
    """

    row_count = df.shape[0]
    return duet.renyi_gauss(row_count, alpha=alpha, epsilon=epsilon)

# Demo: noisy row count under Renyi differential privacy with alpha = 5, epsilon = 0.01.
alpha = 5
epsilon = 0.01
# NOTE(review): despite its name, `delta` holds the (alpha, epsilon) pair handed to
# RenyiOdometer, not a DP delta value -- presumably the odometer's RDP parameters;
# confirm against the duet documentation.
delta = (alpha, epsilon)
with duet.RenyiOdometer(delta) as odo:
    print('Query result:', count_RDP(adult_data, alpha, epsilon))
    print('Privacy cost:', odo)

Query result: 32575.49906350958
Privacy cost: (5, 0.02)


### Average using Sparse Vector Technique <a name="average-svt"></a>

In [11]:
def auto_avg_svt(df, col, bs, epsilon):
    """ Returns a differentially private answer to an average query
    whose upper clipping bound is picked from `bs` with
    duet.above_threshold (Sparse Vector Technique)

    Args:
    df -- dataframe to query
    col -- name of the column to average
    bs -- indexable sequence of candidate upper clipping bounds; the
          index returned by duet.above_threshold selects the one used
    epsilon -- total privacy cost, split in three equal parts between
               the bound selection, the noisy sum and the noisy count

    Returns:
    noisy clipped sum divided by noisy row count
    """

    def create_query(b):
        """ Builds the query for candidate bound b: the sum clipped to
        [0, b] minus the sum clipped to [0, b+1]"""
        def query(df):
            return df[col].clip(0, b).sum() - df[col].clip(0, b+1).sum()
        return query

    # One third of the privacy budget goes to each of the three steps.
    budget_share = epsilon / 3

    candidate_queries = [create_query(bound) for bound in bs]
    chosen_index = duet.above_threshold(candidate_queries, df, 0, budget_share)
    final_b = bs[chosen_index]

    clipped_total = df[col].clip(0, final_b).sum()
    noisy_sum = duet.laplace(clipped_total, epsilon=budget_share)
    noisy_count = duet.laplace(df.shape[0], epsilon=budget_share)

    return noisy_sum / noisy_count

# Demo: average 'Age' with an automatically chosen clipping bound and a total epsilon of 1.
# Candidate upper clipping bounds are 1, 6, 11, ... (step 5, all below 150000).
bs = range(1,150000,5)
epsilon = 1
with duet.EDOdometer() as odo:
    print('Query result:', auto_avg_svt(adult_data, 'Age', bs, epsilon))
    print('Privacy cost:', odo)

Query result: 38.59107411862699
Privacy cost: ([DataSource(adult_with_pii.csv): (1.0, 0)])
