In [16]:
# nbi:hide_in
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.display import display, HTML
from ipywidgets import Layout, interact, IntSlider, widgets
import PIL
from skimage import io


In [17]:
# nbi:hide_in
# Made-up data set: the name is a direct identifier, age and gender are
# quasi-identifiers, and the remaining column (political party) plays the
# role of the sensitive attribute.
data = {
    'Name (ID)': [
        'Bill', 'Julia', 'Rose', 'Gwen', 'Peter', 'James', 'Anita',
        'Rob', 'Jess', 'Edith', 'Catherin', 'Ellie', 'Andrew', 'Ruth',
        'Barry', 'Hope', 'Ruby', 'Sian', 'Edward', 'James', 'Amanda',
    ],
    'Age (QID)': [
        23, 19, 22, 25, 30, 18, 18,
        16, 28, 29, 21, 25, 18, 27,
        19, 13, 25, 19, 18, 19, 23,
    ],
    'Gender (QID)': [
        'M', 'F', 'F', 'F', 'M', 'M', 'F',
        'M', 'F', 'M', 'M', 'F', 'M', 'F',
        'F', 'M', 'F', 'F', 'M', 'F', 'M',
    ],
    'Political Party': [
        'Green Party', 'Red Party', 'Red Party', 'Green Party', 'Green Party',
        'Red Party', 'Red Party', 'Green Party', 'Red Party', 'Green Party',
        'Green Party', 'Green Party', 'Green Party', 'Red Party', 'Green Party',
        'Red Party', 'Green Party', 'Red Party', 'Red Party', 'Red Party',
        'Green Party',
    ],
}

# Build the frame and blank out the direct identifier in one step: every
# name is replaced by '*', while the raw `data` dict keeps the real names.
df = pd.DataFrame(data).assign(**{'Name (ID)': '*'})


In [18]:
# nbi:hide_in
def k_annonymity(df, qid):
    """Compute the k-anonymity of a table.

    Rows are grouped on the quasi-identifier columns and the size of the
    smallest non-empty group is reported.

    Parameters
    ----------
    df : DataFrame
        table to measure
    qid : list
        names of the quasi-identifier columns to group on

    Returns
    -------
    int
        size of the smallest non-empty quasi-identifier group
    """
    group_sizes = df.groupby(qid).size()
    # Ignore zero-sized groups so they cannot drag the minimum down to 0.
    is_populated = group_sizes > 0
    return group_sizes[is_populated].min()


def l_diversity(df, qid, sa):
    """Compute the l-diversity of a table.

    Rows are grouped on the quasi-identifier columns, the distinct values of
    the sensitive attribute are counted per group, and the smallest of those
    counts is reported.

    Parameters
    ----------
    df : DataFrame
        table for which the l value is to be calculated
    qid : list
        names of the quasi-identifier columns to group on
    sa : list
        names of the sensitive attribute columns (a single column is assumed)

    Returns
    -------
    int
        l value
    """
    distinct_per_group = df.groupby(qid)[sa].nunique()
    populated = distinct_per_group[distinct_per_group > 0]
    # squeeze() collapses the one-column result before taking the minimum,
    # which is why exactly one sensitive attribute is assumed here;
    # l-diversity is generally only applied to a single sensitive attribute.
    return populated.squeeze().min()
    
def GIL(uq, lq, u, l):
    """Return the generalisation information loss of a generalised range.

    The width of the generalised interval ``uq - lq`` is divided by the
    width of the original value range ``u - l``.

    Parameters
    ----------
    uq : number
        upper bound of the generalised interval
    lq : number
        lower bound of the generalised interval
    u : number
        upper bound of the original values
    l : number
        lower bound of the original values

    Returns
    -------
    float
        ratio of the generalised width to the original width
    """
    generalised_width = uq - lq
    original_width = u - l
    return generalised_width / original_width
    

In [19]:
# nbi:hide_in
# Slider for the minimum group size k (1 to 5, starting at 1).
k = IntSlider(min=1, max=5, value=1, layout=Layout(width='400px'))
# Slider for the age generalisation step (1 to 40, starting at 1).
step_size = IntSlider(min=1, max=40, value=1, layout=Layout(width='400px'))

# Module-level copy of the table; the functions in this cell work on their
# own local copies.
dfs = df.copy()


def generalise_supress(k, step_size):
    """Display the table with ages generalised and small groups suppressed.

    Ages from the module-level ``df`` are floored to multiples of
    ``step_size`` and shown as ``"<lower>-<lower + step_size>"`` labels.
    Every row whose (age range, gender) group has fewer than ``k`` members
    is overwritten with a dash placeholder, and the result is rendered as
    an HTML table without the index column.

    Parameters
    ----------
    k : int
        minimum size a row's quasi-identifier group needs to stay visible
    step_size : int
        width of the age bins

    Returns
    -------
    None
    """
    QID = ['Age (QID)', 'Gender (QID)']
    dfs = df.copy()

    # Lower edge of the age bin each row falls into.
    lower = (np.floor(dfs['Age (QID)'] / step_size) * step_size).astype(int)
    dfs['Age (QID)'] = lower.astype(str) + '-' + (lower + step_size).astype(str)

    too_small = dfs.groupby(QID).filter(lambda group: len(group) < k).index
    # `filter` hands back index *labels*, so the rows must be selected with
    # the label-based .loc; the positional .iloc only happened to work while
    # the frame carried a default 0..n-1 index.
    dfs.loc[too_small] = '----------'
    display(HTML(dfs.to_html(index=False)))

    
def label_age(step_size):
    """Print a one-line caption stating the current age step size."""
    caption = 'Ages changed to %d year steps' % step_size
    print(caption)
    

def meassure_k(k, step_size):
    """Print the privacy and utility figures for the current slider values.

    Ages from the module-level ``df`` are floored to multiples of
    ``step_size``; groups of (age range, gender) with fewer than ``k`` rows
    are dropped. The function then prints the k-anonymity of the surviving
    rows (0 when nothing survives), the percentage of rows that were
    dropped, and the generalisation information loss returned by ``GIL``.

    Parameters
    ----------
    k : int
        minimum group size required for a group to be kept
    step_size : int
        width of the age bins

    Returns
    -------
    None
    """
    QID = ['Age (QID)', 'Gender (QID)']  # quasi-identifier columns
    dfs = df.copy()

    # Lower edge of the age bin each row falls into.
    lower = (np.floor(dfs['Age (QID)'] / step_size) * step_size).astype(int)
    # Width of the generalised range relative to the width of the raw ages.
    gil = GIL((lower + step_size).max(), lower.min(),
              df['Age (QID)'].max(), df['Age (QID)'].min())
    dfs['Age (QID)'] = lower.astype(str) + '-' + (lower + step_size).astype(str)

    # Filter once and reuse the result for both the row count and k.
    kept = dfs.groupby(QID).filter(lambda group: len(group) >= k)
    orig_length = len(df)
    new_length = len(kept)

    k_val = k_annonymity(kept, QID)
    # With every group suppressed the minimum of an empty series is NaN.
    if np.isnan(k_val):
        k_val = 0

    print('PRIVACY -> k-annonymity = %d' %k_val)
    print('\n')
    print("UTILITY -> percentage suppressed %d%%" %(np.round(((orig_length - new_length) / orig_length)* 100)))
    print("UTILITY -> generalisation information loss = %.3f" %gil)

    
def image(k, step_size):
    """Draw a two-bar chart summarising utility and privacy.

    Ages from the module-level ``df`` are floored to multiples of
    ``step_size``; groups of (age range, gender) with fewer than ``k`` rows
    are dropped. The "Utility" bar is 100 minus the sum of the percentage
    of dropped rows and ``step_size`` expressed as a percentage of 40. The
    "Privacy" bar is the k-anonymity of the surviving rows scaled so that
    k = 10 gives 100, or 100 when no rows survive. Both bars are clipped to
    the 0-100 range of the axis.

    Parameters
    ----------
    k : int
        minimum group size required for a group to be kept
    step_size : int
        width of the age bins

    Returns
    -------
    None
    """
    QID = ['Age (QID)', 'Gender (QID)']
    dfs = df.copy()

    # Lower edge of the age bin each row falls into.
    lower = (np.floor(dfs['Age (QID)'] / step_size) * step_size).astype(int)
    dfs['Age (QID)'] = lower.astype(str) + '-' + (lower + step_size).astype(str)

    # Filter once and reuse the result for both the row count and k.
    kept = dfs.groupby(QID).filter(lambda group: len(group) >= k)
    orig_length = len(df)
    new_length = len(kept)
    k_val = k_annonymity(kept, QID)

    perc_loss = (orig_length - new_length) / orig_length * 100
    # 40 is the maximum of the step_size slider defined in this cell.
    perc_step = step_size / 40 * 100
    utility_index = np.clip(100 - (perc_loss + perc_step), 0, 100)

    # k_val is NaN when every group was suppressed; NaN > 0 is False, so
    # that case falls through to the full-height privacy bar.
    if k_val > 0:
        privacy_index = (k_val / 10) * 100
    else:
        privacy_index = 100
    # Keep the privacy bar inside the axis range, like the utility bar.
    privacy_index = np.clip(privacy_index, 0, 100)

    fig, ax = plt.subplots()
    ax.bar(1, utility_index, width=0.6, color='#2ca7d7')
    ax.bar(2, privacy_index, width=0.6, color='#002f91')
    ax.set_ylim(-2, 102)
    ax.set_xlim(0.5, 2.5)
    ax.set_xticks([1, 2])
    ax.set_xticklabels(["Utility", "Privacy"], fontsize=12)
    ax.set_yticks([0, 100])
    ax.set_yticklabels(["Bad", "Good"], fontsize=12)
    plt.show()



# Bind each callback to the sliders; every Output re-renders when a bound
# slider value changes.
# out2: the generalised / suppressed table.
out2 = widgets.interactive_output(generalise_supress, {'k': k, 'step_size': step_size})

# out3: caption for the age step; out4: printed privacy / utility figures.
out3 = widgets.interactive_output(label_age, {'step_size': step_size})
out4 = widgets.interactive_output(meassure_k, {'k': k, 'step_size': step_size})
# NOTE(review): this rebinds the name `image` from the plotting function to
# its Output widget; the function is only reachable again after the `def`
# above is re-executed.
image = widgets.interactive_output(image, {'k': k, 'step_size': step_size})

# Layout, top to bottom: k slider, step-size slider with its caption, the
# printed metrics, the bar chart, and finally the table.
print('Set minimum privacy value with k-anonymity:')
display(k)

print('\n')

print('Set age generalisation step size')
display(step_size)
display(out3)


display(out4)


display(image)

display(out2)


Set minimum privacy value with k-anonymity:


IntSlider(value=1, layout=Layout(width='400px'), max=5, min=1)



Set age generalisation step size


IntSlider(value=1, layout=Layout(width='400px'), max=40, min=1)

Output()

Output()

Output()

Output()