In [76]:
from sklearn.datasets._samples_generator import make_blobs
import pandas as pd
from sklearn.preprocessing import StandardScaler
X, y_true = make_blobs(n_samples=50, centers=4, n_features=2,
                       cluster_std=0.60, random_state=0)

X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=['X', 'Y'])


In [77]:
'''Calculates the empirical root mean squared error. algorithm is a function
that takes in X and y and returns a model.'''
from math import sqrt
from sklearn.metrics import mean_squared_error


def get_empirical_error(algorithm, X, y):
    model = algorithm.fit(X)
    mse = mean_squared_error(y, model.predict(X))
    return sqrt(mse)


In [78]:
import warnings
import numpy as np
from scipy import sparse
from sklearn.model_selection import train_test_split
from scipy.sparse import issparse, vstack
from sklearn.base import clone


TRAIN = 100
TEST = 200

def get_multipliers(errors, rmse):
    zs = errors / rmse
    return np.array([Logprob(-z**2 / 2, True) for z in zs])

def gaussian_pdf(sd, x):
    if sd <= 0:
        raise ValueError('standard deviation must be positive but is {}'.format(sd))
    else: #sd > 0
        return np.e ** (-0.5*(x/sd)**2) / sd
    
def sklearn_invert(model, dist, X, y, target_cols, rmse):
    assert X.shape[0] == y.shape[0] #check that X and y have compatible dimensions
    
    if issparse(X): #deal with sparse matrices correctly
        stack = vstack
    else:
        stack = np.stack
    
    guesses = []
    
    assert len(target_cols) > 0
    one_hot = (len(target_cols) > 1) #whether the target attribute was one-hot encoded (binary otherwise)
    num_variants = len(target_cols) if one_hot else 2 #number of possible values of the target
    
    for i in range(X.shape[0]): #iterate over the rows of X and y
        row_X = stack([X[i] for _ in range(num_variants)]) #create copies of X[i]
        if one_hot:
            row_X[:, target_cols] = np.eye(num_variants) #fill in with all possible values of target (one-hot encoded)
        else: #fill in with all possible values of target (binary)
            row_X[0, target_cols] = 0
            row_X[1, target_cols] = 1
        row_y = np.repeat(y[i], num_variants)
        
        errors = row_y - model.predict(row_X)
        likelihood_scores = dist * get_multipliers(errors, rmse)
        guess = np.where(likelihood_scores == max(likelihood_scores))[0][0] #an integer in range(num_variants)
        guesses.append(guess)
    
    return np.array(guesses)
def sklearn_do_inversion(model, dist, X, y, t, target_cols, rmse):
    assert X.shape[0] == y.shape[0] == t.shape[0]
    num_rows = X.shape[0]
    
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', sparse.SparseEfficiencyWarning)
        results = sklearn_invert(model, dist, X, y, target_cols, rmse)
    num_correct = np.count_nonzero(results == t)
    
    return num_correct / num_rows
def gaussian_pdf(sd, x):
    if sd <= 0:
        raise ValueError('standard deviation must be positive but is {}'.format(sd))
    else: #sd > 0
        return np.e ** (-0.5*(x/sd)**2) / sd
    
def sklearn_decide(errors, r_emp):
    return np.where(abs(errors) < r_emp, TRAIN, TEST)

def sklearn_inclusion(model, X, y, r_emp):
    pred_vals = model.predict(X)
    actual_vals = y
    errors = actual_vals - pred_vals
    return sklearn_decide(errors, r_emp)

def sklearn_do_inclusion(model, X, y, r_emp):
    assert X.shape[0] == y.shape[0]
    num_rows = X.shape[0]
    
    results = sklearn_inclusion(model, X, y, r_emp)
    num_train = np.count_nonzero(results == TRAIN)
    
    return num_train / num_rows

def iterate_inclusion_and_write(algorithm, X, X_perturbed, y, r_emp):
  
  model = algorithm.fit(X)
  model_p = clone(algorithm).fit(X_perturbed)
  
  #Calculate attribute inference accuracy
  train_correct = sklearn_inclusion(model, X, y, r_emp)
  test_correct = sklearn_inclusion(model_p, X_perturbed, y, r_emp)
  
  print(r_emp, train_correct, test_correct)

In [79]:
from sklearn.cluster import KMeans

algorithm = KMeans(n_clusters=8)  
r_emp = get_empirical_error(algorithm, X, y_true) #root mean squared
r_emp

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fc7d5fe0e50>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/opt/conda/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/opt/conda/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/opt/conda/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


2.9086079144497976

In [80]:
import math
import random

from sympy import LambertW



def addVectorToPoint(point, distance, angle):
	x1, y1 = point
	x2 = x1 + (distance * np.cos(angle))
	y2 = y1 + (distance * np.sin(angle))
	return x2, y2
    

def inverseCumulativeGamma (eps, p): 
    x = (p - 1) / np.e
    return -(LambertW(x) + 1)/eps

def generate_laplace_noise(eps, x, y): 
    theta = np.random.rand()*np.pi*2
    p = random.random()
    r = inverseCumulativeGamma(eps, p) # draw radius distance
    return addVectorToPoint([x, y], r, theta)


def calculate_radius_with_noise(x0, n, epsilon): 
    """
        x0: Point to perturb
        n: amount of points to generate
        epsilon: privacy budget
    """
    Z = []
    total_dis = 0
    for nm in range(0, n):
        x1, y1 = x0
        noise = generate_laplace_noise(epsilon, x1, y1)
        x2, y2 = noise
        total_dis = total_dis + math.dist(x0, noise)
        Z.append(noise)

    R = total_dis / n
    return np.array(Z), R

def generate_truncated_laplace_noise(X, epsilon): 
    Z = []
    x_max = [np.max(X[:, 0]), np.max(X[:, 1])]
    x_min = [np.min(X[:, 0]), np.min(X[:, 1])]
    for x0 in X:
        z, R = calculate_radius_with_noise(x0, 1, epsilon)
        z = truncate(x_max, x_min, x0, z[0], epsilon)
        Z.append(z)
    return x_max, x_min, Z


def truncate(x_max, x_min, x0, z, epsilon): 
    """
    x_max: max domain point (x, y)
    x_min: min domain point (x, y)
    x0: point to truncate (radius centre)
    z: x0 + noise
    epsilon: privacy budget
    """
    x2, y2 = x_max
    x1, y1 = x_min

    zx, zy = z
    if(x1 < zx < x2 and y1 < zy < y2): 
        # print('inside', x, y)
        return z
    else:
        x, y = x0
        z2 = generate_laplace_noise(epsilon, x, y)
        return truncate(x_max, x_min, x0, z2, epsilon)

In [81]:
eps = 0.5
_, _, Z = generate_truncated_laplace_noise(X, eps)
Z_scaled = pd.DataFrame(StandardScaler().fit_transform(Z), columns=['X', 'Y'])
Z_scaled.head()

Unnamed: 0,X,Y
0,-0.270071,0.293512
1,-0.351979,-0.510719
2,1.487741,-0.318715
3,-0.819864,0.187774
4,-0.341477,1.457624


In [82]:

iterate_inclusion_and_write(algorithm, X_scaled, Z_scaled, y_true, r_emp)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fc7d5fe0e50>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/opt/conda/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/opt/conda/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/opt/conda/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fc7d5fe0e50>
Traceback 

2.9086079144497976 [200 100 100 100 100 200 100 200 100 100 100 200 200 100 200 200 200 100
 100 200 200 200 200 200 200 200 200 200 100 100 200 100 200 200 200 200
 200 100 200 200 100 200 200 100 200 100 200 200 100 100] [100 200 100 100 100 200 100 100 100 200 100 200 200 100 200 100 100 100
 100 200 100 200 100 100 100 200 100 200 100 100 200 100 100 100 200 100
 200 100 100 200 100 200 200 100 200 100 100 100 100 100]
