In [None]:
import zipfile
import os
import sys
import math

from collections import defaultdict

import pandas as pd
import numpy as np
from bitarray import bitarray
import ngram
from tqdm.notebook import trange, tqdm

sys.path.append(".."+os.sep+"libs"+os.sep)
from datasetutil import open_processed_ds , gerar_gabarito 

sys.path.append(".."+os.sep+".."+os.sep)
from atuc.utils.file import open_ds
from pprl_utils.encrypt import compare_ds , compare_ds_based_on_blk

# Methods

In [None]:
from dateutil.parser import parse

def converter_data(date):
    """Turn a date value into a ``"<month>/<year>"`` string.

    Parameters
    ----------
    date
        Value handed to ``parse`` (imported from ``dateutil.parser`` in this
        cell), e.g. ``'October 18th 2010'``.

    Returns
    -------
    str
        Month and year joined by ``/`` (no zero padding), or ``''`` when
        any exception is raised while parsing or formatting.
    """
    try:
        parsed = parse(date)
        return "/".join([str(parsed.month), str(parsed.year)])
    except Exception:
        # Best effort: anything that cannot be parsed collapses to ''.
        return ''

In [None]:
def get_chave(df,blk_key1=1,blk_key2=None,n_char=2):
    """Build the blocking key of every row of ``df``.

    The key is the first ``n_char`` characters of the column at position
    ``blk_key1`` (cast to ``str``). When ``blk_key2`` is given, the first
    ``n_char`` characters of that column are appended to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are addressed by position (``iloc``).
    blk_key1 : int
        Position of the first key column.
    blk_key2 : int or None
        Position of the optional second key column; ``None`` disables it.
    n_char : int
        Number of leading characters taken from each key column.

    Returns
    -------
    pandas.Series
        One string key per row, aligned with ``df``'s index.
    """
    chave = df.iloc[:, blk_key1].astype(str).str[0:n_char]
    # Identity test: position 0 is a valid second key, only None means "absent".
    if blk_key2 is not None:
        chave = chave + df.iloc[:, blk_key2].astype(str).str[0:n_char]
    return chave

In [None]:
def blocar(df1,df2):
    """List every candidate pair of records that share a blocking key.

    Both frames must have a ``blk_key`` column; record ids are read from
    the first column (position 0) of each frame.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to be blocked against each other.

    Returns
    -------
    pandas.DataFrame
        Columns ``id1`` (id from ``df1``) and ``id2`` (id from ``df2``),
        one row per pair. Pairs are ordered by first appearance of the key
        in ``df1``, then by row order in ``df1`` and in ``df2``. The two
        columns are present even when no pair is found.
    """
    # Index df2's ids by key once, instead of re-filtering df2 for every key.
    ids_b_por_chave = {
        chave: grupo.iloc[:, 0].tolist()
        for chave, grupo in df2.groupby('blk_key', sort=False)
    }

    pares = []
    # sort=False keeps the keys in order of first appearance in df1.
    for chave, grupo in df1.groupby('blk_key', sort=False):
        ids_b = ids_b_por_chave.get(chave)
        if not ids_b:
            # Key absent from df2: nothing to compare in this block.
            continue
        for id_a in grupo.iloc[:, 0]:
            pares.extend((id_a, id_b) for id_b in ids_b)

    # Explicit columns so that an empty result still exposes id1/id2.
    return pd.DataFrame(pares, columns=['id1', 'id2'])

### Salva as comparações

In [None]:
def salvar_comparacoes(bf_len,dfa,dfb,gs,list_atts,names_atts,outdir,filename,baseado_no_gabarito=True):
    """Run one comparison per attribute set and store everything in one zip.

    The archive ``outdir + filename + '.zip'`` receives one
    ``<filename>-atts-<name>.csv`` entry per attribute set, plus a final
    ``estatiscas.txt`` entry with the statistics of all runs. All entries
    are ``;``-separated CSV text.

    Parameters
    ----------
    bf_len, dfa, dfb, gs
        Forwarded unchanged to the comparison helper.
    list_atts : list
        Attribute sets, one comparison run per element.
    names_atts : list of str
        Label of each attribute set; must have the same length as
        ``list_atts``.
    outdir : str
        Output folder, expected to end with a path separator since it is
        concatenated directly with ``filename``.
    filename : str
        Base name of the archive and prefix of its CSV entries.
    baseado_no_gabarito : bool
        ``True`` uses ``compare_ds_based_on_blk``; ``False`` uses
        ``compare_ds``.
    """
    assert len(list_atts) == len(names_atts)

    zip_path = outdir + filename + '.zip'
    collected_stats = []

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for pos in trange(0, len(list_atts), leave=False):
            if baseado_no_gabarito:
                result, stat = compare_ds_based_on_blk(dfa, dfb, list_atts[pos], gs, bf_len)
            else:
                result, stat = compare_ds(dfa, dfb, list_atts[pos], gs, bf_len)

            entry_name = filename + '-atts-' + names_atts[pos] + '.csv'
            archive.writestr(entry_name, result.to_csv(sep=';', index=False))

            # Drop the comparison result before the next run starts.
            del result
            collected_stats.append(stat)

        # One row per attribute set.
        archive.writestr('estatiscas.txt', pd.DataFrame(collected_stats).to_csv(sep=';', index=False))

In [None]:
def salvar_ds_blocado(a,b,gold,
                      list_atts,names_atts,
                      out_dir,filename,
                      nchar_key_list,blk_key1=1,blk_key2=None,bigrams=2,use_comps_in_gold=False,
                      bf_len=None):
    """Block two datasets, compare the candidate pairs and zip the results.

    For every prefix length in ``nchar_key_list`` a ``blk_key`` column is
    computed on both frames (``get_chave``), candidate pairs are generated
    (``blocar``) and, for every attribute set, the pairs are compared with
    ``compare_ds_based_on_blk``. The archive ``out_dir + filename + '.zip'``
    receives, per (attribute set, prefix length), one
    ``<filename>-atts-<name>-blk-<n>.csv`` entry and one
    ``estatiscas-atts-<name>-blk-<n>.txt`` entry, both ``;``-separated.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Datasets to link. NOTE: both are modified in place, a ``blk_key``
        column is added to or overwritten on each of them.
    gold
        Gold standard, forwarded to ``compare_ds_based_on_blk``.
    list_atts : list
        Attribute sets, one comparison per element.
    names_atts : list
        Label of each attribute set; same length as ``list_atts``.
    out_dir : str
        Output folder, expected to end with a path separator.
    filename : str
        Base name of the archive and prefix of its CSV entries.
    nchar_key_list : iterable of int
        Prefix lengths used to build the blocking keys.
    blk_key1, blk_key2 : int, int or None
        Column positions used by ``get_chave``.
    bigrams, use_comps_in_gold
        Forwarded to ``compare_ds_based_on_blk``.
    bf_len
        Forwarded to ``compare_ds_based_on_blk``. When omitted, the
        module-level ``bf_len`` variable is used, as earlier versions of
        this function did implicitly.

    Raises
    ------
    NameError
        If ``bf_len`` is neither passed nor defined at module level. This is
        raised before the archive is created.
    """
    assert len(list_atts) == len(names_atts)

    if bf_len is None:
        # Backward compatibility: older callers rely on the notebook-level
        # `bf_len`. Resolve it up front so a missing value fails fast instead
        # of leaving a half-written archive behind.
        if 'bf_len' not in globals():
            raise NameError("name 'bf_len' is not defined: pass bf_len=... "
                            "or define it before calling salvar_ds_blocado")
        bf_len = globals()['bf_len']

    outfile = out_dir + filename + '.zip'
    with zipfile.ZipFile(outfile, 'w' , zipfile.ZIP_DEFLATED) as csv_zip:
        for nchar_key in tqdm(nchar_key_list,leave=False):
            # Recompute the blocking key for this prefix length (in place).
            a['blk_key'] = get_chave(a,n_char=nchar_key,blk_key1=blk_key1,blk_key2=blk_key2)
            b['blk_key'] = get_chave(b,n_char=nchar_key,blk_key1=blk_key1,blk_key2=blk_key2)
            blocagem = blocar(a,b)

            for i in trange(0,len(list_atts),leave=False):
                atts = list_atts[i]
                # str() on the label for both entry names, so that
                # non-string labels work consistently.
                sufixo = '-atts-' + str(names_atts[i]) + '-blk-' + str(nchar_key)

                ds, stat = compare_ds_based_on_blk(a, b, atts, gold, bf_len, blocagem, bigrams=bigrams,use_comps_in_gold=use_comps_in_gold)
                csv_zip.writestr(filename + sufixo + '.csv', ds.to_csv(sep=';',index=False))

                # Drop the comparison result before the next run starts.
                del ds
                csv_zip.writestr('estatiscas' + sufixo + '.txt', pd.DataFrame([stat]).to_csv(sep=';',index=False))

# Global Variables

In [None]:
# Root folders, relative to this notebook. The trailing '' makes each path
# end with a separator, because later cells build paths by concatenation.
datasets_dir = os.path.join('..', '..', 'datasets', '')
dataset_out_put_dir = os.path.join('..', '..', 'datasets_comps', '')

# Dataset Pessoas

## Census

In [None]:
# --- Census: configuration ---
ds_dir = "census" + os.sep
ds_file = "processed_census.zip"
out_filename = 'comparison_census'

bf_len = 100     # forwarded as `bf_len` to the comparison helpers
blk_keys = [2]   # key prefix lengths passed to salvar_ds_blocado

# Attribute sets (column positions) and the label used for each in file names.
zatts_names = ['1', '2', '3', '4']
zatts_list = [[1, 2], [1, 2, 4], [1, 2, 4, 5], [1, 3, 2, 4, 5]]

out_dir = dataset_out_put_dir + ds_dir
os.makedirs(out_dir, exist_ok=True)

# --- Census: load ---
zdf = datasets_dir + ds_dir + ds_file
a, b, gold = open_processed_ds(zdf)

# Compare zip codes as text.
a['zip_code'] = a['zip_code'].astype(str)
b['zip_code'] = b['zip_code'].astype(str)
gabarito = gerar_gabarito(gold)

Salvar os dados completos (todas as comparações, sem blocagem)

In [None]:
# Full (unblocked) comparison, using compare_ds.
# NOTE(review): this archive goes to dataset_out_put_dir (the root), while the
# blocked archive goes to out_dir (the census sub-folder) - confirm intended.
salvar_comparacoes(
    bf_len=bf_len,
    dfa=a,
    dfb=b,
    gs=gabarito,
    list_atts=zatts_list,
    names_atts=zatts_names,
    outdir=dataset_out_put_dir,
    filename=out_filename,
    baseado_no_gabarito=False,
)

Blocando (comparações restritas aos pares com a mesma chave de blocagem)

In [None]:
# Blocked comparison: keys come from column 1, one run per length in blk_keys.
salvar_ds_blocado(
    a=a,
    b=b,
    gold=gabarito,
    list_atts=zatts_list,
    names_atts=zatts_names,
    out_dir=out_dir,
    filename=out_filename + '_blk',
    nchar_key_list=blk_keys,
    blk_key1=1,
    bigrams=2,
)

## NC Voters

In [None]:
# --- NC Voters: configuration ---
ds_dir = "ncvr" + os.sep
out_dir = dataset_out_put_dir + ds_dir
os.makedirs(out_dir,exist_ok=True)
bf_len = 200     # forwarded as `bf_len` to the comparison helpers

blk_keys = [ 2, 3 ]   # key prefix lengths passed to salvar_ds_blocado

out_filename = 'comparison_'

# Attribute sets (column positions) and their labels.
# Set '2' ([2,3,4,5]) is currently not run, hence the gap in the labels.
zatts_list  = [ [2,3,4] , [2,3,4,5,8] , [2,3,4,5,8,7] ]
zatts_names = [ '1' ,  '3' , '4' ]

# Only real .zip archives, in a deterministic order (os.listdir order is arbitrary).
files = sorted(f for f in os.listdir(datasets_dir+ds_dir) if f.endswith('.zip'))

In [None]:
for ds_file in tqdm(files, leave=False):
    # One pass per NCVR archive: load, normalise, block and compare.
    zdf = datasets_dir + ds_dir + ds_file
    out_filename_ = out_filename + ds_file.split('.zip')[0] + '_blk'

    a, b, gold = open_processed_ds(zdf)

    # Compare zip codes as text.
    a['zip_code'] = a['zip_code'].astype(str)
    b['zip_code'] = b['zip_code'].astype(str)
    gabarito = gerar_gabarito(gold)

    # Archives whose `a` has 100000+ rows only use the 3-character key
    # prefix; smaller ones try every length in blk_keys.
    salvar_ds_blocado(
        a, b, gabarito,
        zatts_list, zatts_names,
        out_dir, out_filename_,
        [3] if len(a) >= 100000 else blk_keys,
        blk_key1=2, blk_key2=3, bigrams=2,
    )

    # Release the frames before the next archive is loaded.
    del a, b, gold, gabarito

## Michigan Voters

In [None]:
# --- Michigan Voters: configuration ---
ds_dir = "mvr" + os.sep
out_dir = dataset_out_put_dir + ds_dir
os.makedirs(out_dir,exist_ok=True)
bf_len = 200     # forwarded as `bf_len` to the comparison helpers

blk_keys = [ 2,3 ]   # key prefix lengths passed to salvar_ds_blocado

out_filename = 'comparison_'

# Attribute sets (column positions) and their labels, roughly:
# name, (name + sex), (name + sex + birth), (name + sex + birth + address).
# Set '2' ([1,3,2,5]) is currently not run, hence the gap in the labels.
zatts_list  = [ [1,3,2] , [1,3,2,5,4] , [1,3,2,5,6,8,9,10] ]
zatts_names = [ '1' , '3' , '4' ]

# Only real .zip archives, in a deterministic order (os.listdir order is arbitrary).
files = sorted(f for f in os.listdir(datasets_dir+ds_dir) if f.endswith('.zip'))


In [None]:
for ds_file in tqdm(files, leave=False):
    # One pass per MVR archive: load, normalise, block and compare.
    zdf = datasets_dir + ds_dir + ds_file
    out_filename_ = out_filename + ds_file.split('.zip')[0] + '_blk'

    a, b, gold = open_processed_ds(zdf)

    # Compare birth year and zip as text.
    a['birthyear'] = a['birthyear'].astype(str)
    b['birthyear'] = b['birthyear'].astype(str)
    a['zip'] = a['zip'].astype(str)
    b['zip'] = b['zip'].astype(str)

    gabarito = gerar_gabarito(gold)

    # NOTE(review): the large-file branch blocks on columns 2 and 3 while the
    # other branch uses columns 1 and 2 - confirm this difference is intended.
    if len(a) < 100000:
        salvar_ds_blocado(
            a, b, gabarito,
            zatts_list, zatts_names,
            out_dir, out_filename_,
            blk_keys, blk_key1=1, blk_key2=2, bigrams=2,
        )
    else:
        # Large archives only use the 3-character key prefix.
        salvar_ds_blocado(
            a, b, gabarito,
            zatts_list, zatts_names,
            out_dir, out_filename_,
            [3], blk_key1=2, blk_key2=3, bigrams=2,
        )

    # Release the frames before the next archive is loaded.
    del a, b, gold, gabarito

## TSE

In [None]:
# --- TSE: configuration ---
ds_dir = "tse" + os.sep
out_dir = dataset_out_put_dir + ds_dir
os.makedirs(out_dir,exist_ok=True)
bf_len = 400     # forwarded as `bf_len` to the comparison helpers

blk_keys = [ 2,3,4 ]   # key prefix lengths passed to salvar_ds_blocado

out_filename = 'comparison_'

# Attribute sets (column positions) and their labels: sets 1, 2, 3, 4 (roughly).
zatts_list  = [ [1] , [1,5] , [1,5,6] , [1,5,6,3] ]
zatts_names = [ '1' , '2' , '3' , '4' ]

# Only real .zip archives, in a deterministic order (os.listdir order is arbitrary).
files = sorted(f for f in os.listdir(datasets_dir+ds_dir) if f.endswith('.zip'))

In [None]:
for ds_file in tqdm(files):
    # One pass per TSE archive: load, block on column 1 and compare.
    zdf = datasets_dir + ds_dir + ds_file
    out_filename_ = out_filename + ds_file.split('.zip')[0]

    a, b, gold = open_processed_ds(zdf)
    gabarito = gerar_gabarito(gold)

    salvar_ds_blocado(
        a, b, gabarito,
        zatts_list, zatts_names,
        out_dir, out_filename_ + '_blk',
        blk_keys, blk_key1=1, bigrams=2,
    )

    # Release the frames before the next archive is loaded.
    del a, b, gold, gabarito

## YV-ER

In [None]:
# --- YV-ER: configuration ---
ds_dir = "yv-er" + os.sep
out_dir = dataset_out_put_dir + ds_dir
os.makedirs(out_dir,exist_ok=True)
bf_len = 600     # forwarded as `bf_len` to the comparison helpers

# Key prefix lengths passed to salvar_ds_blocado; only length 2 is run
# (length 3 is currently disabled).
blk_keys = [ 2 ]

out_filename = 'comparison_'

# Attribute sets (column positions) and their labels.
zatts_list  = [ [2,3] , [2,3,4] , [2,3,4,7] , [2,3,4,7,8,10] ]
zatts_names = [ '1' , '2' , '3' , '4' ]

# Only real .zip archives, in a deterministic order (os.listdir order is arbitrary).
files = sorted(f for f in os.listdir(datasets_dir+ds_dir) if f.endswith('.zip'))

In [None]:
for ds_file in tqdm(files):
    # One pass per YV-ER archive: load, cast to text, block and compare.
    zdf = datasets_dir + ds_dir + ds_file
    out_filename_ = out_filename + ds_file.split('.zip')[0]

    a, b, gold = open_processed_ds(zdf)

    # Every column of both datasets, and the gold-standard ids, are handled as text.
    a = a.astype(str)
    b = b.astype(str)
    gold['id1'] = gold['id1'].astype(str)
    gold['id2'] = gold['id2'].astype(str)

    gabarito = gerar_gabarito(gold)

    salvar_ds_blocado(
        a, b, gabarito,
        zatts_list, zatts_names,
        out_dir, out_filename_ + '_blk',
        blk_keys, blk_key1=2, bigrams=2, use_comps_in_gold=True,
    )

# Produtos

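Não portados: as células abaixo ainda chamam `blocar_ds_preprocessado`, que não está definida nas células visíveis deste notebook.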

##  Abt-buy

In [None]:
# --- Abt-Buy: load ---
# os.sep instead of a hard-coded backslash, so the path also works outside Windows.
ds_dir = "abt-buy" + os.sep
ds_file = "processed_abt-buy.zip"

zdf = datasets_dir + ds_dir + ds_file
a,b,gold = open_processed_ds(zdf)

In [None]:
# --- Abt-Buy: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_abt-buy_full.zip'
n_atts = 3
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

## DBLP-ACM

In [None]:
# --- DBLP-ACM: load ---
ds_dir = "dblp-acm" + os.sep
ds_file = "processed_DBLP-ACM.zip"
zdf = "".join([datasets_dir, ds_dir, ds_file])

a, b, gold = open_processed_ds(zdf)

In [None]:
# --- DBLP-ACM: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_dblp-acm_full.zip'
n_atts = 4
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

# Books

In [None]:
# Sub-folder shared by every books dataset below.
context = "/books"
ds_dir = "".join([context, os.sep])

## amazon-barnesnobel

In [None]:
# --- amazon-barnesnobel: load ---
ds_file = "processed_amazon-barnesnobel.zip"
zdf = "".join([datasets_dir, ds_dir, ds_file])

a, b, gold = open_processed_ds(zdf)

In [None]:
# --- amazon-barnesnobel: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_books_amazon-barnesnobel.zip'
n_atts = 3
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

## amazon-barnesnobel-small

In [None]:
# --- amazon-barnesnobel-small: load ---
ds_file = "processed_amazon-barnesnobel-small.zip"
zdf = "".join([datasets_dir, ds_dir, ds_file])

a, b, gold = open_processed_ds(zdf)

In [None]:
# --- amazon-barnesnobel-small: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_books_amazon-barnesnobel-small.zip'
n_atts = 3
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

## goodreads-barnesnobel

In [None]:
# --- goodreads-barnesnobel: load ---
ds_file = "processed_goodreads-barnesnobel.zip"
zdf = "".join([datasets_dir, ds_dir, ds_file])

a, b, gold = open_processed_ds(zdf)

In [None]:
# --- goodreads-barnesnobel: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_books_goodreads-barnesnobel.zip'
n_atts = 4
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

# Movies

In [None]:
# Sub-folder shared by every movies dataset below.
context = "/movies"
ds_dir = "".join([context, os.sep])

## imdb-rottentomatos

In [None]:
# --- imdb-rottentomatos: load ---
ds_file = "processed_imdb-rottentomatos.zip"
zdf = "".join([datasets_dir, ds_dir, ds_file])

a, b, gold = open_processed_ds(zdf)

In [None]:
# --- imdb-rottentomatos: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_movies_imdb-rottentomatos.zip'
n_atts = 3
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

## imdb-tmd

In [None]:
# --- imdb-tmd: load ---
ds_file = "processed_imdb-tmd.zip"
zdf = "".join([datasets_dir, ds_dir, ds_file])

a, b, gold = open_processed_ds(zdf)

In [None]:
# --- imdb-tmd: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_movies_imdb-tmd.zip'
n_atts = 2
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

# Restaurants

In [None]:
# Sub-folder shared by every restaurants dataset below.
context = "/restaurants"
ds_dir = "".join([context, os.sep])

## fodors-zagats

In [None]:
# --- fodors-zagats: load ---
ds_file = "processed_fodors-zagats.zip"
zdf = "".join([datasets_dir, ds_dir, ds_file])

a, b, gold = open_processed_ds(zdf)

In [None]:
# --- fodors-zagats: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_restaurants_fodors-zagats.zip'
n_atts = 4
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

## yelp-yellowpages

In [None]:
# --- yelp-yellowpages: load ---
ds_file = "processed_yelp-yellowpages.zip"

# Load this dataset explicitly, as every other dataset section does. Without
# it, the following cell would run on the a/b frames left over from the
# previously loaded dataset.
zdf = datasets_dir + ds_dir + ds_file
a,b,gold = open_processed_ds(zdf)

In [None]:
# --- yelp-yellowpages: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_restaurants_yelp-yellowpages.zip'
n_atts = 5
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)

## yelp-zomato

In [None]:
# --- yelp-zomato: load ---
ds_file = "processed_yelp-zomato.zip"

# Load this dataset explicitly, as every other dataset section does. Without
# it, the following cell would run on the a/b frames left over from the
# previously loaded dataset.
zdf = datasets_dir + ds_dir + ds_file
a,b,gold = open_processed_ds(zdf)

In [None]:
# --- yelp-zomato: blocking ---
# NOTE(review): blocar_ds_preprocessado is not defined in the visible part of
# this notebook - confirm where it comes from before running this cell.
comp_full_file = 'comparison_restaurants_yelp-zomato.zip'
n_atts = 3
dedup = False
out_dir = datasets_dir + ds_dir
non_blk_file = out_dir + comp_full_file

# Third argument takes 3, 2 and 1 in turn (4 is currently disabled).
for blk_size in (3, 2, 1):
    blocar_ds_preprocessado(a, b, blk_size, non_blk_file, n_atts, dedup, out_dir, comp_full_file)