In [2]:
import random as rdm
import pandas as pd
import numpy as np

In [3]:
# Load the disease records from the CSV file and preview the first rows.
df = pd.read_csv("doencas.csv")
df.head()

Unnamed: 0,id,genero,data,cidade,estado,doenca
0,1,Female,31/01/2000,Brooklyn,New York,Virose
1,2,Female,16/10/1999,Topeka,Kansas,Zika
2,3,Female,10/10/1999,Schenectady,New York,Virose
3,4,Female,09/01/2000,New York City,New York,Conjuntivite
4,5,Male,03/11/1999,Syracuse,New York,Conjuntivite


# K-Anonymity

## Joining the locality attributes

In [10]:
# Pull the id, city and state columns out of df as separate Series.
ids = df.id
cidades = df.cidade
estados = df.estado

In [13]:
# Build one [id, "city, state"] pair per record. The three Series are walked in
# parallel by position, so the pairing does not depend on the index labels
# being exactly 0..n-1 (which label lookups such as ids[i] would require).
locality = [
    [record_id, '{}, {}'.format(city, state)]
    for record_id, city, state in zip(ids, cidades, estados)
]

In [14]:
# Turn the [id, "city, state"] pairs into a two-column frame and preview it.
df_locality = pd.DataFrame(locality, columns=['id', 'localidade'])
df_locality.head()

Unnamed: 0,id,localidade
0,1,"Brooklyn, New York"
1,2,"Topeka, Kansas"
2,3,"Schenectady, New York"
3,4,"New York City, New York"
4,5,"Syracuse, New York"


## Joining the dataframes

In [15]:
# Place df and df_locality side by side (axis=1), keeping only the index labels
# present in both frames (join='inner').
# NOTE: both frames have an 'id' column, so the result carries 'id' twice.
join_dataset = pd.concat([df, df_locality], axis=1, join='inner')
join_dataset.head()

Unnamed: 0,id,genero,data,cidade,estado,doenca,id.1,localidade
0,1,Female,31/01/2000,Brooklyn,New York,Virose,1,"Brooklyn, New York"
1,2,Female,16/10/1999,Topeka,Kansas,Zika,2,"Topeka, Kansas"
2,3,Female,10/10/1999,Schenectady,New York,Virose,3,"Schenectady, New York"
3,4,Female,09/01/2000,New York City,New York,Conjuntivite,4,"New York City, New York"
4,5,Male,03/11/1999,Syracuse,New York,Conjuntivite,5,"Syracuse, New York"


## Removing attributes

In [17]:
# Drop the id, state and city columns (the combined 'localidade' column stays),
# then summarize the remaining columns.
dataset = join_dataset.drop(columns = ['id', 'estado', 'cidade'])
dataset.describe()

Unnamed: 0,genero,data,doenca,localidade
count,400,400,400,400
unique,2,160,5,24
top,Male,22/10/1999,Dengue,"New York City, New York"
freq,204,6,87,103


## Generalization functions

In [19]:
def date_generalization(dataset, level):
    """Generalize the 'data' (date) column of ``dataset``.

    The slicing assumes date strings laid out as 'DD/MM/YYYY'.

    Args:
        dataset: DataFrame with a string column named 'data'.
        level: generalization level.
            1 -> '**/MM/YYYY' (day hidden)
            2 -> '**/**/YYYY' (day and month hidden)
            3 -> '**/**/****' (whole date hidden)
            Any other value leaves the column untouched.

    Returns:
        The same ``dataset`` object, with its 'data' column overwritten.
    """
    # The whole column is assigned at once. Writing through
    # dataset.iloc[row]['data'] = ... is chained assignment: it targets a
    # temporary row object and is not guaranteed to reach `dataset`.
    if level == 1:
        dataset['data'] = '**' + dataset['data'].str[2:10]
    elif level == 2:
        dataset['data'] = '**/**' + dataset['data'].str[5:10]
    elif level == 3:
        dataset['data'] = '**/**/****'

    return dataset

In [20]:
def locality_generalization(dataset, level):
    """Generalize the 'localidade' (locality) column of ``dataset``.

    Values are split on ',' and the first two pieces are treated as city and
    state, in that order.

    Args:
        dataset: DataFrame with a string column named 'localidade'.
        level: generalization level.
            0 -> city and state kept ('<city>,<state>')
            1 -> city hidden ('**,<state>')
            2 -> everything hidden ('*')
            Any other value leaves the column untouched.

    Returns:
        The same ``dataset`` object, with its 'localidade' column overwritten.

    Raises:
        IndexError: for levels 0 and 1, when a value contains no ','.
    """

    def city_and_state(place):
        parts = place.split(',')
        return parts[0] + "," + parts[1]

    def state_only(place):
        return "**," + place.split(',')[1]

    # The whole column is assigned at once. Writing through
    # dataset.iloc[row]['localidade'] = ... is chained assignment: it targets a
    # temporary row object and is not guaranteed to reach `dataset`.
    if level == 0:
        dataset['localidade'] = dataset['localidade'].map(city_and_state)
    elif level == 1:
        dataset['localidade'] = dataset['localidade'].map(state_only)
    elif level == 2:
        dataset['localidade'] = '*'

    return dataset

In [21]:
def gender_generalization(dataset, level):
    """Generalize the 'genero' (gender) column of ``dataset``.

    Args:
        dataset: DataFrame with a column named 'genero'.
        level: generalization level. 1 replaces every value with '*';
            any other value leaves the column untouched.

    Returns:
        The same ``dataset`` object, with its 'genero' column overwritten
        when ``level`` is 1.
    """
    if level == 1:
        # Whole-column assignment. dataset.iloc[row]['genero'] = '*' is
        # chained assignment and is not guaranteed to reach `dataset`.
        dataset['genero'] = '*'

    return dataset

## Precision metric

$$precision(D, Na, h, HGVA) = 1 - \frac{\sum_{i=1}^{Na} \sum_{j=1}^{|D|} \frac{h}{|HGVA_i|}}{|D| \cdot Na}$$

- D: dataset
- Na: number of quasi-identifier attributes
- h: height reached in the value generalization hierarchy of attribute Ai after generalization
- |HGVAi|: maximum height of the value generalization hierarchy of attribute Ai

In [None]:
def precision(D, Na, HGVA):
    """Compute the precision metric of a generalized dataset.

    The generalization height of each value is inferred from how many '*'
    characters it contains. Heights are stored per row in the order
    [data, genero, localidade], so ``HGVA`` must list the maximum hierarchy
    heights in that same order.

    Args:
        D: DataFrame with string columns 'data', 'localidade' and 'genero'.
        Na: number of attributes to include, taken from the front of the
            [data, genero, localidade] order.
        HGVA: indexable of maximum hierarchy heights, one per attribute.

    Returns:
        1 - (sum of height / max height over the Na attributes and all
        rows) / (len(D) * Na).
    """
    # '*' count -> hierarchy height; any count not listed means height 0.
    date_height_by_stars = {2: 1, 4: 2, 8: 3}
    locality_height_by_stars = {1: 2, 2: 1}
    gender_height_by_stars = {1: 1}

    row_total = len(D)
    heights = []
    for position in range(row_total):
        date_stars = D.iloc[position]["data"].count('*')
        locality_stars = D.iloc[position]["localidade"].count('*')
        gender_stars = D.iloc[position]["genero"].count('*')

        heights.append([
            date_height_by_stars.get(date_stars, 0),
            gender_height_by_stars.get(gender_stars, 0),
            locality_height_by_stars.get(locality_stars, 0),
        ])

    # Accumulate attribute by attribute, row by row.
    accumulated = 0
    for attribute in range(Na):
        for row_heights in heights:
            accumulated += row_heights[attribute] / HGVA[attribute]

    return 1 - (accumulated / (row_total * Na))