In [1]:
import random as rdm
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("salario.csv")
df.head()

Unnamed: 0,id,pais,estado,cidade,genero,data,salario
0,1,United States,Alabama,Mobile,Male,10/01/1987,$5407.86
1,2,Portugal,Porto,Rego,Male,29/11/2007,$7833.82
2,3,Sweden,Kalmar,Emmaboda,Male,23/06/1984,$5530.85
3,4,Sweden,Västmanland,Sala,Female,04/11/1983,$2803.76
4,5,France,Bretagne,Quimperlé,Female,18/11/1990,$2201.87


In [3]:
df.describe()

Unnamed: 0,id
count,1000.0
mean,500.5
std,288.819436
min,1.0
25%,250.75
50%,500.5
75%,750.25
max,1000.0


## Unindo os atributos de localidade

In [4]:
ids = df.id
cities = df.cidade
states = df.estado
countries = df.pais

In [5]:
locality = []
for i in range(len(cities)):
    locality.append([ids[i], str('{}, {}, {}'.format(cities[i], states[i], countries[i]))])

In [6]:
df_locality = pd.DataFrame(locality, columns=['id', 'localidade'])
df_locality.head()

Unnamed: 0,id,localidade
0,1,"Mobile, Alabama, United States"
1,2,"Rego, Porto, Portugal"
2,3,"Emmaboda, Kalmar, Sweden"
3,4,"Sala, Västmanland, Sweden"
4,5,"Quimperlé, Bretagne, France"


## Unindo os dataframes

In [7]:
join_dataset = pd.concat([df, df_locality], axis=1, join='inner')
join_dataset.head()

Unnamed: 0,id,pais,estado,cidade,genero,data,salario,id.1,localidade
0,1,United States,Alabama,Mobile,Male,10/01/1987,$5407.86,1,"Mobile, Alabama, United States"
1,2,Portugal,Porto,Rego,Male,29/11/2007,$7833.82,2,"Rego, Porto, Portugal"
2,3,Sweden,Kalmar,Emmaboda,Male,23/06/1984,$5530.85,3,"Emmaboda, Kalmar, Sweden"
3,4,Sweden,Västmanland,Sala,Female,04/11/1983,$2803.76,4,"Sala, Västmanland, Sweden"
4,5,France,Bretagne,Quimperlé,Female,18/11/1990,$2201.87,5,"Quimperlé, Bretagne, France"


## Removendo atributos

In [8]:
dataset = join_dataset.drop(columns = ['id', 'pais', 'estado', 'cidade', 'salario'])
dataset.head()

Unnamed: 0,genero,data,localidade
0,Male,10/01/1987,"Mobile, Alabama, United States"
1,Male,29/11/2007,"Rego, Porto, Portugal"
2,Male,23/06/1984,"Emmaboda, Kalmar, Sweden"
3,Female,04/11/1983,"Sala, Västmanland, Sweden"
4,Female,18/11/1990,"Quimperlé, Bretagne, France"


## Funções de generalização

In [9]:
def date_generalization(dataset, level):
    new_dataset = dataset.copy()
    if level == 1:
        # date '**/MM/YYYY'
        for data in range(len(new_dataset)):
            new_dataset.iloc[data]['data'] = str('**' + new_dataset.iloc[data]['data'][2:10])
    elif level == 2:
        # date '**/**/YYYY'
        for data in range(len(new_dataset)):
            new_dataset.iloc[data]['data'] = str('**/**' + new_dataset.iloc[data]['data'][5:10])
    elif level == 3:
        # date'**/**/****'
        for data in range(len(new_dataset)):
            new_dataset.iloc[data]['data'] = str('**/**/****')
        
    return new_dataset

In [53]:
def locality_generalization(dataset, level):
    new_dataset = dataset.copy()
    if level == 0:
        # locality 'city'
        for data in range(len(new_dataset)):
            locais = new_dataset.iloc[data]['localidade'].split(',')
            new_dataset.iloc[data]['localidade'] = locais[0]+","+locais[1]+","+locais[2]
    elif level == 1:
        # locality 'state,'
        for data in range(len(new_dataset)):
            locais = new_dataset.iloc[data]['localidade'].split(',')
            new_dataset.iloc[data]['localidade'] = "**,"+locais[1]+","+locais[2]
    elif level == 2:
        # locality 'country'
        for data in range(len(new_dataset)):
            locais = new_dataset.iloc[data]['localidade'].split(',')
            new_dataset.iloc[data]['localidade'] = "**,**,"+locais[2]
    elif level == 3:
        # locality '*'
        for data in range(len(new_dataset)):
            new_dataset.iloc[data]['localidade'] = '**'
       
    return new_dataset

In [11]:
def gender_generalization(dataset, level):
    new_dataset = dataset.copy()
    if level == 1:
        # gender '*'
        for data in range(len(new_dataset)):
            new_dataset.iloc[data]['genero'] = '*'
            
    return new_dataset

## Gerando as Classes de Equivalência

In [59]:
dataset_date_gen = date_generalization(dataset, 0)
dataset_gender_gen = gender_generalization(dataset_date_gen, 0)
dataset_loc_gen = locality_generalization(dataset_gender_gen, 0)

dataset_loc_gen.head()

Unnamed: 0,genero,data,localidade
0,Male,10/01/1987,"Mobile, Alabama, United States"
1,Male,29/11/2007,"Rego, Porto, Portugal"
2,Male,23/06/1984,"Emmaboda, Kalmar, Sweden"
3,Female,04/11/1983,"Sala, Västmanland, Sweden"
4,Female,18/11/1990,"Quimperlé, Bretagne, France"


In [55]:
def define_groups(dataset_dict, k):
    flag = False
    for value, level in dataset_dict.values():
        if value < k:
            flag = True
    
    if not(flag):
        return dataset_dict
    
    without_group = dict()
    with_group = dict()
    
    for data in dataset_dict:
        if dataset_dict[data][0] < k:
            without_group[data] = dataset_dict[data]
        else:
            with_group[data] = dataset_dict[data]
        
    with_group.update(generate_equivalent_classes(without_group, k)) # concatenate dictionaries
    return define_groups(with_group, k)



In [56]:
def generate_equivalent_classes(dataset_dict, k):
    
    locality_level = list()
    date_level = list()
    gender_level = list()
    
    for value, level in dataset_dict.values():
            locality_level.append(level[0])
            date_level.append(level[1])
            gender_level.append(level[2])
    
    avgs = [np.average(locality_level), np.average(date_level), np.average(gender_level)]
    attr = 0
    i = 0
    min_avg = np.max(avgs)
    
    for avg in avgs:
        if avg < min_avg:
            min_avg = avg
            attr = i
        i += 1

    new_dataset = pd.DataFrame(list(dataset_dict.keys()), columns=['localidade', 'data', 'genero'])
    level = [np.floor(avgs[0]), np.floor(avgs[1]), np.floor(avgs[2])]
    if attr == 0:    
        new_group = locality_generalization(new_dataset, level[0] + 1)
    elif attr == 1:
        new_group = date_generalization(new_dataset, level[1] + 1)
    elif attr == 2:
        new_group = gender_generalization(new_dataset, level[2] + 1)
        
    level[attr] += 1
    
    
    return generate_group(new_group, level)

## Estudo sobre os dados

In [15]:
def sort_dict(unordered_dict):
    sorted_dict = dict()
    for key in unordered_dict:
        sorted_dict[key] = unordered_dict[key]

    return sorted(sorted_dict.items(), key=lambda kv: kv[1][0])

In [111]:
def generate_group(dataset, levels):
    # levels: [locality, date, gender]
    
    a = dataset.groupby(["localidade", "data", "genero"])["localidade"].count()
    group_dict = dict()

    for data in range(len(dataset)):
        register = dataset.iloc[data]
        triple = (register['localidade'], register['data'], register['genero'])
#         if triple in group_dict.keys():
#             group_dict[(register['localidade'], register['data'], register['genero'])][0] += 1
#         else:
        group_dict[(register['localidade'], register['data'], register['genero'])] = [a[triple], levels]
#             group_dict[(register['localidade'], register['data'], register['genero'])] = [1, levels]
    return group_dict

In [112]:
groups = generate_group(dataset_loc_gen, [0, 0, 0])
# sort_dict(groups)

## 2-anonimato

In [93]:
dataset_test1 = pd.DataFrame([['a', 'b', 'c'],
                 ['a', 'b', 'c'],
                 ['a', 'b', 'c'],
                 ['a', 'b', 'c'],
                 ['d', 'e', 'f']], columns=["localidade", "data", "genero"])

a = dataset_test1.groupby(["localidade", "data", "genero"])["localidade"].count()
a[('a', 'b', 'c')]

4

In [100]:
def sum_register(dataset_dict):
    total_registers = 0
    for value, level in dataset_dict.values():
        total_registers += value
        
    return total_registers

In [114]:
sum_register(anonimato_2)

1000

In [113]:
anonimato_2 = define_groups(groups, 2)
# sort_dict(anonimato_2)

## 16-anonimato

In [115]:
anonimato_16 = define_groups(groups, 16)
sort_dict(anonimato_16)

[(('**', '**/**/****', '*'), [31, [3.0, 3.0, 2.0]])]

In [116]:
sum_register(anonimato_16)

31

## Função de Precisão

$precision (D, Na, h, HGVA) = 1 - \frac{\sum_{i=1}^{Na} \sum_{j=1}^{|D|} \frac{h}{HGVA_i}}{|D| * Na}
$

- D: conjunto de dados
- Na: número de atributos semi-identificadores
- h: altura da hierarquia de generalização de valor do atributo Ai após anonimização
- |HGVAi|: altura máxima da hierarquia

In [None]:
def precision(D, Na, h, HGVA):
    summation = 0
    for i in range(Na):
        for j in range(len(D)):
            summation += (h[i] / (HGVA[i]))
    
    return 1 - (summation / (len(D) * Na))
#     return 1 - (summation / ((len(D) * Na)
            

In [None]:
precision(groups, 3, [2, 4, 0], [3, 4, 1])