# CS211 Data Privacy - Final Project
## Vincent Moeykens

In [2]:
# Load the data and libraries
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8 removed
# the old names. Prefer the new name when this installation provides it; otherwise fall
# back to the original name so older matplotlib versions keep working.
plt.style.use('seaborn-v0_8-whitegrid' if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

# Number of columns to include in dataset
NUM_COLUMNS = 21

def laplace_mech(v, sensitivity, epsilon):
    """Return `v` plus one Laplace noise sample.

    The sample is drawn from a zero-centred Laplace distribution whose scale
    is `sensitivity / epsilon`.
    """
    noise_scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0, scale=noise_scale)
    return v + noise

def gaussian_mech(v, sensitivity, epsilon, delta):
    """Return `v` plus one Gaussian noise sample.

    The sample is zero-mean with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    """
    sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
    noise = np.random.normal(loc=0, scale=sigma)
    return v + noise

def gaussian_mech_vec(vec, sensitivity, epsilon, delta):
    """Return a list with independent Gaussian noise added to each element of `vec`.

    Every element gets its own zero-mean sample with standard deviation
    `sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon`.
    """
    def perturb(entry):
        # The scale is evaluated per element, so an empty `vec` never evaluates it.
        sigma = sensitivity * np.sqrt(2*np.log(1.25/delta)) / epsilon
        return entry + np.random.normal(loc=0, scale=sigma)

    return list(map(perturb, vec))

def pct_error(orig, priv):
    """Return the percent error of `priv` relative to the true value `orig`.

    Computed as `|orig - priv| / |orig| * 100`. Dividing by the magnitude of
    `orig` (rather than its signed value) keeps the result non-negative when
    the true value is negative. `orig == 0` is not handled specially.
    """
    return np.abs(orig - priv) / np.abs(orig) * 100.0

# Read the BankChurners CSV from GitHub, keeping only the first NUM_COLUMNS columns
cc_data = pd.read_csv(
    'https://raw.githubusercontent.com/vmoeykens/cs211-final-project/main/data/BankChurners.csv',
    usecols=list(range(NUM_COLUMNS)),
)

In [3]:
cc_data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


### Privacy Budget
We set our master privacy budget, $\epsilon$.

In [4]:
# Master privacy budget; later cells pass fractions of this value to the DP routines.
epsilon = 1.0

### Strategy
First, we use the sparse vector technique to determine a clipping parameter for our data. Then we compute a differentially private sum and a differentially private count, and obtain a differentially private average from them by post-processing.

In [5]:
# Here we use the above_threshold from the textbook and preserve the generality of the method
def above_threshold(queries, df, T, epsilon):
    """Return the index of the first query whose noisy answer reaches the noisy threshold.

    The threshold `T` is perturbed once with Laplace noise of scale `2 / epsilon`.
    Each query is called as `q(df)`, and its answer gets fresh Laplace noise of
    scale `4 / epsilon` before the comparison. Queries are evaluated lazily, in
    order, and evaluation stops at the first hit.

    Returns -1 when no query qualifies, so a caller indexing a sequence with
    the result lands on its last element.
    """
    noisy_threshold = T + np.random.laplace(loc=0, scale=2 / epsilon)

    def reaches_threshold(query):
        # Draw the noise before evaluating the query, one sample per query.
        answer_noise = np.random.laplace(loc=0, scale=4 / epsilon)
        return query(df) + answer_noise >= noisy_threshold

    hits = (position for position, query in enumerate(queries) if reaches_threshold(query))
    return next(hits, -1)

def calculate_average(df, epsilon):
    """Return a noisy average of `df`, splitting `epsilon` into three equal parts.

    Steps, each given `epsilon / 3`:
      1. `above_threshold` picks an upper clipping bound from the candidates
         1, 6, 11, ... (below 150000).
      2. `laplace_mech` adds noise to the sum of `df` clipped to [0, bound],
         using the bound as the sensitivity.
      3. `laplace_mech` adds noise to `len(df)` with sensitivity 1.

    The result is the noisy sum divided by the noisy count.
    """
    budget_share = epsilon / 3

    def clipping_gap(bound):
        """Build a query returning clipped-sum(bound) minus clipped-sum(bound + 1)."""
        def query(data):
            at_bound = data.clip(lower=0, upper=bound).sum()
            at_next = data.clip(lower=0, upper=bound + 1).sum()
            return at_bound - at_next
        return query

    # Candidate clipping bounds and one query per candidate
    candidate_bounds = range(1, 150000, 5)
    gap_queries = list(map(clipping_gap, candidate_bounds))

    # First third of the budget: choose the clipping bound.
    # An index of -1 (no query reached the threshold) selects the largest candidate.
    chosen_idx = above_threshold(gap_queries, df, 0, budget_share)
    final_b = candidate_bounds[chosen_idx]

    # Second and third thirds: noisy clipped sum, then noisy count
    clipped_total = df.clip(lower=0, upper=final_b).sum()
    noisy_sum = laplace_mech(clipped_total, final_b, budget_share)
    noisy_count = laplace_mech(len(df), 1, budget_share)

    return noisy_sum / noisy_count

### Final Statistics to be Produced
#### Overall Averages
- Average Customer Age
- Average Months on Book
- Average Credit Limit

#### Counts
- Count of most common Income Ranges
- Count of most common Education Status

#### Averages by demographic
- Average Credit Limit of Customers <= 33y/o vs Average Credit Limit of Customers >35y/o
- Most Common Income Range of Customers with a College Degree vs Customers Without


In [15]:
# Here we define a general method for computing an epsilon-DP average of a field,
# its actual value, and the percent error between the two

def calc_avg_and_acc(df, epsilon):
    """Compute the exact and noisy averages of `df` and compare them.

    Returns a tuple `(dp_avg, orig_avg, error)`: `dp_avg` comes from
    `calculate_average(df, epsilon)`, `orig_avg` is `sum(df) / len(df)`, and
    `error` is `pct_error(orig_avg, dp_avg)`.
    """
    true_mean = sum(df) / len(df)
    private_mean = calculate_average(df, epsilon)
    return private_mean, true_mean, pct_error(true_mean, private_mean)

# Compute the three overall averages. Each call is passed epsilon / 3, so the three
# shares add up to the master budget `epsilon`.
# Each result is a (dp_avg, orig_avg, error) tuple.
dp_age_avg = calc_avg_and_acc(cc_data['Customer_Age'], epsilon / 3)
dp_months_avg = calc_avg_and_acc(cc_data['Months_on_book'], epsilon / 3)
dp_credit_limit_avg = calc_avg_and_acc(cc_data['Credit_Limit'], epsilon / 3)

averages = [dp_age_avg, dp_months_avg, dp_credit_limit_avg]
# Tuple layout: avg[0] = DP average, avg[1] = actual average, avg[2] = percent error
for avg in averages:
    print(f'Actual Average: {avg[1]}, DP Average: {avg[0]}, Percent Error: {avg[2]}')


Actual Average: 46.32596030413745, DP Average: 46.579212916197996, Percent Error: 0.5466753638735128
Actual Average: 35.928409203120374, DP Average: 35.99442809580968, Percent Error: 0.18375122682463027
Actual Average: 8631.953698034848, DP Average: 8736.452705296093, Percent Error: 1.2106066704810488


## Synthetic Representation