In [1]:
import random as rdm
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("salario.csv")
df.head()

Unnamed: 0,id,pais,estado,cidade,genero,data,salario
0,1,United States,Alabama,Mobile,Male,10/01/1987,$5407.86
1,2,Portugal,Porto,Rego,Male,29/11/2007,$7833.82
2,3,Sweden,Kalmar,Emmaboda,Male,23/06/1984,$5530.85
3,4,Sweden,Västmanland,Sala,Female,04/11/1983,$2803.76
4,5,France,Bretagne,Quimperlé,Female,18/11/1990,$2201.87


## Unindo os atributos de localidade

In [3]:
ids = df.id
cities = df.cidade
states = df.estado
countries = df.pais

In [4]:
locality = []
for i in range(len(cities)):
    locality.append([ids[i], str('{}, {}, {}'.format(cities[i], states[i], countries[i]))])

In [5]:
df_locality = pd.DataFrame(locality, columns=['id', 'localidade'])
df_locality.head()

Unnamed: 0,id,localidade
0,1,"Mobile, Alabama, United States"
1,2,"Rego, Porto, Portugal"
2,3,"Emmaboda, Kalmar, Sweden"
3,4,"Sala, Västmanland, Sweden"
4,5,"Quimperlé, Bretagne, France"


## Unindo os dataframes

In [6]:
join_dataset = pd.concat([df, df_locality], axis=1, join='inner')
join_dataset.head()

Unnamed: 0,id,pais,estado,cidade,genero,data,salario,id.1,localidade
0,1,United States,Alabama,Mobile,Male,10/01/1987,$5407.86,1,"Mobile, Alabama, United States"
1,2,Portugal,Porto,Rego,Male,29/11/2007,$7833.82,2,"Rego, Porto, Portugal"
2,3,Sweden,Kalmar,Emmaboda,Male,23/06/1984,$5530.85,3,"Emmaboda, Kalmar, Sweden"
3,4,Sweden,Västmanland,Sala,Female,04/11/1983,$2803.76,4,"Sala, Västmanland, Sweden"
4,5,France,Bretagne,Quimperlé,Female,18/11/1990,$2201.87,5,"Quimperlé, Bretagne, France"


## Removendo atributos

In [12]:
dataset = join_dataset.drop(columns = ['id', 'pais', 'estado', 'cidade', 'salario'])
dataset.head()

Unnamed: 0,genero,data,localidade
0,Male,10/01/1987,"Mobile, Alabama, United States"
1,Male,29/11/2007,"Rego, Porto, Portugal"
2,Male,23/06/1984,"Emmaboda, Kalmar, Sweden"
3,Female,04/11/1983,"Sala, Västmanland, Sweden"
4,Female,18/11/1990,"Quimperlé, Bretagne, France"


## Função de generalização da data

In [29]:
def date_generalization(dataset, level):
    if level == 1:
        date = '**/MM/YYYY'
        for data in range(len(dataset)):
            dataset.iloc[data]['data'] = str('**' + dataset.iloc[data]['data'][2:10])
    elif level == 2:
        date = '**/**/YYYY'
        for data in range(len(dataset)):
            dataset.iloc[data]['data'] = str('**/**' + dataset.iloc[data]['data'][5:10])
    elif level == 3:
        date = '**/**/****'
        for data in range(len(dataset)):
            dataset.iloc[data]['data'] = str('**/**/****')
        
    return dataset

In [30]:
new_dataset_test = []

for data in range(len(dataset)):
    if dataset.iloc[data]['localidade'] == "Mobile, Alabama, United States":
        new_dataset_test.append(dataset.iloc[data])
        
new_dataset_test = pd.DataFrame(new_dataset_test, columns=['genero', 'data', 'localidade'])
new_dataset_test                         

Unnamed: 0,genero,data,localidade
0,Male,10/01/1987,"Mobile, Alabama, United States"
17,Female,06/05/1983,"Mobile, Alabama, United States"
973,Female,24/01/2007,"Mobile, Alabama, United States"


In [32]:
print(date_generalization(new_dataset_test, 3))

     genero        data                      localidade
0      Male  **/**/****  Mobile, Alabama, United States
17   Female  **/**/****  Mobile, Alabama, United States
973  Female  **/**/****  Mobile, Alabama, United States


## Função de Precisão

(sum i -> Na (sum j -> |D|(h/(HGV Ai))))/ (|D| * Na)

- D: conjunto de dados
- Na: número de atributos semi-identificadores
- h: altura da hierarquia de generalização de valor do atributo Ai após anonimização
- |HGVAi|: altura máxima da hierarquia

In [10]:
def precision(D, Na, h, HGVA):
    summation = 0
    for i in range(Na):
        for j in range (D):
            summation += (h(j) / (HGVA(i)))
    
    return 1 - (summation / (D * Na))
            