In [1]:
import random as rdm
import pandas as pd
import numpy as np

In [23]:
df_disease = pd.read_csv("doencas.csv")
df_disease.head()

Unnamed: 0,id,genero,data,cidade,estado,doenca
0,1,Female,31/01/2000,Brooklyn,New York,Virose
1,2,Female,16/10/1999,Topeka,Kansas,Zika
2,3,Female,10/10/1999,Schenectady,New York,Virose
3,4,Female,09/01/2000,New York City,New York,Conjuntivite
4,5,Male,03/11/1999,Syracuse,New York,Conjuntivite


In [24]:
df_public = pd.read_csv("background.csv")
df_public.reset_index(inplace=True)
df_public.rename(columns={'index': 'id'}, inplace=True)
df_public.head()

Unnamed: 0,id,nome,genero,data,cidade,estado
0,0,Alessandro Cowterd,Male,27/12/1999,New York City,New York
1,1,Cathie Bernhardi,Female,07/10/1999,Brooklyn,New York
2,2,Dorothee McCluskey,Female,14/12/1999,Buffalo,New York
3,3,Mace Scrine,Male,13/03/2000,New York City,New York
4,4,Ethyl Suttie,Female,18/11/1999,New York City,New York


# K-Anonimato

## Joining the locality attributes

In [25]:
ids_disease = df_disease.id
cities_disease = df_disease.cidade
states_disease = df_disease.estado

In [26]:
ids_public = df_public.id
cities_public = df_public.cidade
states_public = df_public.estado

In [27]:
locality_disease = []
for i in range(len(cities_disease)):
    locality_disease.append([ids_disease[i], str('{}, {}'.format(cities_disease[i], states_disease[i]))])

In [28]:
locality_public = []
for i in range(len(cities_public)):
    locality_public.append([ids_public[i], str('{}, {}'.format(cities_public[i], states_public[i]))])

In [29]:
df_locality_disease = pd.DataFrame(locality_disease, columns=['id', 'localidade'])
df_locality_disease.head()

Unnamed: 0,id,localidade
0,1,"Brooklyn, New York"
1,2,"Topeka, Kansas"
2,3,"Schenectady, New York"
3,4,"New York City, New York"
4,5,"Syracuse, New York"


In [30]:
df_locality_public = pd.DataFrame(locality_public, columns=['id', 'localidade'])
df_locality_public.head()

Unnamed: 0,id,localidade
0,0,"New York City, New York"
1,1,"Brooklyn, New York"
2,2,"Buffalo, New York"
3,3,"New York City, New York"
4,4,"New York City, New York"


## Joining the dataframes

In [31]:
join_dataset_disease = pd.concat([df_disease, df_locality_disease], axis=1, join='inner')
join_dataset_disease.head()

Unnamed: 0,id,genero,data,cidade,estado,doenca,id.1,localidade
0,1,Female,31/01/2000,Brooklyn,New York,Virose,1,"Brooklyn, New York"
1,2,Female,16/10/1999,Topeka,Kansas,Zika,2,"Topeka, Kansas"
2,3,Female,10/10/1999,Schenectady,New York,Virose,3,"Schenectady, New York"
3,4,Female,09/01/2000,New York City,New York,Conjuntivite,4,"New York City, New York"
4,5,Male,03/11/1999,Syracuse,New York,Conjuntivite,5,"Syracuse, New York"


In [32]:
join_dataset_public = pd.concat([df_public, df_locality_public], axis=1, join='inner')
join_dataset_public.head()

Unnamed: 0,id,nome,genero,data,cidade,estado,id.1,localidade
0,0,Alessandro Cowterd,Male,27/12/1999,New York City,New York,0,"New York City, New York"
1,1,Cathie Bernhardi,Female,07/10/1999,Brooklyn,New York,1,"Brooklyn, New York"
2,2,Dorothee McCluskey,Female,14/12/1999,Buffalo,New York,2,"Buffalo, New York"
3,3,Mace Scrine,Male,13/03/2000,New York City,New York,3,"New York City, New York"
4,4,Ethyl Suttie,Female,18/11/1999,New York City,New York,4,"New York City, New York"


## Removing attributes

In [33]:
dataset_disease = join_dataset_disease.drop(columns = ['id', 'estado', 'cidade'])
dataset_disease.describe()

Unnamed: 0,genero,data,doenca,localidade
count,400,400,400,400
unique,2,160,5,24
top,Male,30/03/2000,Dengue,"New York City, New York"
freq,204,6,87,103


In [34]:
dataset_public = join_dataset_public.drop(columns = ['id', 'estado', 'cidade'])
dataset_public.describe()

Unnamed: 0,nome,genero,data,localidade
count,2000,2000,2000,2000
unique,2000,2,182,24
top,Hayward Blackmoor,Male,13/01/2000,"New York City, New York"
freq,1,1032,22,519


## Date generalization function

In [35]:
def date_generalization(dataset, level):
    if level == 1:
        # date '**/MM/YYYY'
        for data in range(len(dataset)):
            dataset.iloc[data]['data'] = str('**' + dataset.iloc[data]['data'][2:10])
    elif level == 2:
        # date '**/**/YYYY'
        for data in range(len(dataset)):
            dataset.iloc[data]['data'] = str('**/**' + dataset.iloc[data]['data'][5:10])
    elif level == 3:
        # date'**/**/****'
        for data in range(len(dataset)):
            dataset.iloc[data]['data'] = str('**/**/****')
        
    return dataset

In [36]:
def locality_generalization(dataset, level):
    if level == 0:
        # locality 'city'
        for data in range(len(dataset)):
            place = dataset.iloc[data]['localidade'].split(',')
            dataset.iloc[data]['localidade'] = place[0] + "," + place[1]
    elif level == 1:
        # locality 'state,'
        for data in range(len(dataset)):
            place = dataset.iloc[data]['localidade'].split(',')
            dataset.iloc[data]['localidade'] = "**," + place[1]
    elif level == 2:
        # locality '*'
        for data in range(len(dataset)):
            dataset.iloc[data]['localidade'] = '*'
        
    return dataset

In [37]:
def gender_generalization(dataset, level):
    if level == 1:
        # gender '*'
        for data in range(len(dataset)):
            dataset.iloc[data]['genero'] = '*'
            
    return dataset

## Precision metric

$precision (D, Na, h, HGVA) = 1 - \frac{\sum_{i=1}^{Na} \sum_{j=1}^{|D|} \frac{h}{HGVA_i}}{|D| * Na}
$

- D: dataset
- Na: number of semi-identifier attributes
- h: height of attribute value generalization hierarchy Ai after generalization
- |HGVAi|: maximum height of hierarchy

In [38]:
def precision(D, Na, HGVA):
    h = []
    summation = 0
    for j in range(len(D)):
        data = D.iloc[j]["data"].count('*')
        localidade = D.iloc[j]["localidade"].count('*')
        genero = D.iloc[j]["genero"].count('*')
        
        h_data = 0
        if data == 2:
            h_data = 1
        elif data == 4:
            h_data = 2
        elif data == 8:
            h_data = 3
            
        h_localidade = 0
        if   localidade == 1:
            h_localidade = 2
        elif localidade == 2:
            h_localidade = 1
            
        h_genero = 0
        if genero == 1:
            h_genero = 1
            
        h.append([h_data, h_genero, h_localidade])
    
    for i in range(Na):
        for j in range(len(D)):
            summation += (h[j][i] / (HGVA[i]))
    
    return 1 - (summation / (len(D) * Na))

# Tree generalization

In [39]:
def treeGeneralization():
    return [(date, gender, locality) for date in range(4) for gender in range(2) for locality in range(3)]

# Verification Delta Presence

In [40]:
def verifyDeltaPresence(dataset, public, minimum, maximum, level_anonymization):
    new_dataset = dataset.copy()
    new_dataset.drop(columns=["doenca"])
    
    new_public_dataset = public.copy()
    new_public_dataset.drop(columns=["nome"])
    
    current_tree_generalization = treeGeneralization()[level_anonymization]
    new_public_dataset = date_generalization(new_public_dataset.copy(), current_tree_generalization[0])
    new_public_dataset = gender_generalization(new_public_dataset, current_tree_generalization[1])
    new_public_dataset = locality_generalization(new_public_dataset, current_tree_generalization[2])
    
    grouped_public_dataset = new_public_dataset.groupby(['data','genero','localidade'])
    grouped_dataset = new_dataset.groupby(['data','genero','localidade'])
        
    for i in range(len(new_public_dataset)):
        registry = (new_public_dataset.iloc[i]['data'], 
                    new_public_dataset.iloc[i]['genero'], 
                    new_public_dataset.iloc[i]['localidade'])
        
        anonymized_amount = 0
        public_amount = len(grouped_public_dataset.groups[registry])
        
        if registry in grouped_dataset.groups.keys():
            anonymized_amount = len(grouped_dataset.groups[registry])
     
        delta = anonymized_amount / public_amount

        if delta > maximum or delta < minimum:
            return False # passe para o proximo nivel de generalização
                
    return True

# Generating anonymization

In [41]:
def deltaPresence(dataset, public, minimum, maximum):
    
    current_level_generalization = 0
    maximum_precision = 0
    level_maximum_precision = 0
    
    for current_tree_generalization in treeGeneralization():
        
        current_dataset_generalization = date_generalization(dataset.copy(), 
                                                             current_tree_generalization[0])
        
        current_dataset_generalization = gender_generalization(current_dataset_generalization, 
                                                               current_tree_generalization[1])
        
        current_dataset_generalization = locality_generalization(current_dataset_generalization, 
                                                                 current_tree_generalization[2])
             
        return_verification = verifyDeltaPresence(current_dataset_generalization, 
                                                  public, 
                                                  minimum, 
                                                  maximum, 
                                                  current_level_generalization)
        
        if (return_verification):
            
            calculate_precision = float("{0:.4f}"
                                .format(precision(current_dataset_generalization, 3, [3, 1, 2])))
            
            if calculate_precision > maximum_precision:
                maximum_precision = calculate_precision
                level_maximum_precision = current_level_generalization
                
        current_level_generalization += 1
        
    tree_generalization_maximum_precision = treeGeneralization()[level_maximum_precision]

    dataset_generalization_maximum_precision = date_generalization(dataset.copy(), 
                                                         tree_generalization_maximum_precision[0])

    dataset_generalization_maximum_precision = gender_generalization(dataset_generalization_maximum_precision, 
                                                           tree_generalization_maximum_precision[1])

    dataset_generalization_maximum_precision = locality_generalization(dataset_generalization_maximum_precision, 
                                                             tree_generalization_maximum_precision[2])
            
    print("The best result is: {} => {}"
          .format(treeGeneralization()[level_maximum_precision], maximum_precision))

    return dataset_generalization_maximum_precision

In [42]:
def main():
    minimum = 0.10
    maxims = [0.5, 0.4, 0.3, 0.2]

    for maximum in maxims:
        generalized_dataset = deltaPresence(dataset_disease.copy(), dataset_public.copy(), minimum, maximum)
        generalized_dataset.to_csv (r'output/doencas_' + str(round((maximum - minimum) * 10)) +'.csv', 
                                    index = None, 
                                    header=True)

In [43]:
main()

The best result is: (2, 0, 1) => 0.6111
The best result is: (2, 0, 1) => 0.6111
The best result is: (2, 0, 1) => 0.6111
The best result is: (0, 0, 0) => 0
