In [None]:
import sys
import zipfile
import os
from random import sample 

import numpy as np
import pandas as pd
from IPython.display import display, HTML

sys.path.append(".")

from datasetutil import gerar_estatiscas_df, substituir_valores_nulos , verify_gg4cc , save_zip2

## Métodos

In [None]:
def gerar_ds(set_a,set_b,n=5000,m=.5):
    '''
    Draw the entity ids that will be used for dataset A, dataset B and the
    gold standard.

    @param set_a set with the ids available in source A
    @param set_b set with the ids available in source B
    @param n     number of ids placed in each of the two output lists
    @param m     fraction (0..1) of those n ids that must exist in both lists

    @return list of ids for dataset A (matches first, then ids only in A)
    @return list of ids for dataset B (matches first, then ids only in B)
    @return list of matching ids (gold standard)

    Raises ValueError (from random.sample) when one of the pools holds
    fewer ids than requested.
    '''
    total_m = int(n * m)
    # The non-match count is derived from n rather than from int(n * (1 - m)):
    # the float complement can fall just below an integer (1 - .9 evaluates to
    # 0.09999999999999998) and truncating it would silently drop one record.
    total_um = n - total_m

    # random.sample() needs a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11 on, so every pool is
    # materialised as a list before sampling.
    inter_ab = list(set_a & set_b)
    only_a = list(set_a.difference(inter_ab))
    only_b = list(set_b.difference(inter_ab))

    matches = sample(inter_ab, total_m)
    unmatch_a = sample(only_a, total_um)
    unmatch_b = sample(only_b, total_um)

    return matches + unmatch_a, matches + unmatch_b , matches

In [None]:
def popular_ds(dsa,dsb,list_a,list_b,gab):
    '''
        Build the datasets from the id lists.

        Rows of dsa / dsb are kept when their voter_id is in list_a /
        list_b. The gold standard has one row per id in gab, with the same
        id repeated in columns id1 and id2.

        @return dataset a
        @return dataset b
        @return gold standard
    '''
    mask_a = dsa['voter_id'].isin(list_a)
    mask_b = dsb['voter_id'].isin(list_b)

    # One single-field record per matching id.
    registros = [(ident,) for ident in gab]
    gs = pd.DataFrame(registros, columns=['id1'])
    gs = gs.assign(id2=gs['id1'])

    return dsa[mask_a], dsb[mask_b], gs

def save_all(dsa,dsb,gs,outfile):
    '''
    Hand both datasets, the gold standard and the per-dataset statistics to
    save_zip2 so that everything is stored under outfile.

    The statistics (null count per column plus the describe() summary of the
    object columns) are computed on the frames as received, before
    substituir_valores_nulos is applied to them.
    '''

    def _estatisticas(df):
        # Null counts first, then the summary of the object columns.
        return gerar_estatiscas_df(df.isnull().sum(), df.describe(include='object'))

    stats_a = _estatisticas(dsa)
    stats_b = _estatisticas(dsb)

    save_zip2(substituir_valores_nulos(dsa),
              substituir_valores_nulos(dsb),
              gs, stats_a, stats_b, outfile)

In [None]:
def read(zinput):
    '''
        Read a zip with the datasets - layout of the 2014 and 2017 files.

        Every file inside the archive is parsed as a fixed-width text file
        (iso-8859-1, first line treated as a header and replaced by the
        column names below). The three filler columns are dropped and the
        per-file frames are concatenated, each keeping its own row index.

        @param zinput path or file-like object accepted by zipfile.ZipFile
        @return DataFrame with the kept columns; empty (but with the same
                columns) when the archive holds no files
    '''
    fwidths = [35,20,20,3,
               4,1,8,1+7+4+2, # mambojambo_1 (items 8 to 11)
               30,6+2,13,35, # mambojambo_2 (items 13 to 14)
               2,5,50+50+50+50+50 , # mambojambo_3 (items 19 to 23)
               13
              ]
    colunas = ['lastname','firstname','middlename','namesufix',
               'birthyear','gender','date_registration','mambojambo_1',
               'street_name','mambojambo_2','extension','city',
               'state','zip','mambojambo_3',
               'voter_id'
              ]
    # Filler blocks that only exist to keep the field offsets aligned.
    descartar = ['mambojambo_1','mambojambo_2','mambojambo_3']

    partes = []
    # Context managers make sure the archive and each member are closed even
    # when parsing fails.
    with zipfile.ZipFile(zinput) as zf:
        for info in zf.infolist():
            if info.is_dir():
                # Directory entries carry no data to parse.
                continue
            with zf.open(info) as fh:
                df = pd.read_fwf(fh,
                                 header=0,encoding="iso-8859-1",
                                 widths = fwidths,
                                 names = colunas)
            partes.append(df.drop(columns=descartar))

    if not partes:
        # Keep the return type stable for callers that access columns.
        return pd.DataFrame(columns=[c for c in colunas if c not in descartar])

    # Single concat at the end instead of growing the frame file by file.
    return pd.concat(partes)

In [None]:
def alterar_colunas(df):
    '''
    Return a frame whose last column has been moved to the first position
    and renamed to 'id'; the remaining columns keep their relative order.

    The frame passed in is left untouched.
    '''
    nomes = list(df.columns)
    # Rotate right by one: [..., last] -> [last, ...]
    rotacionadas = [*nomes[-1:], *nomes[:-1]]
    reordenado = df[rotacionadas]
    rotacionadas[0] = 'id'
    reordenado.columns = rotacionadas
    return reordenado

## Leitura dos dados

In [None]:
# Root folder of the raw datasets. Segments are joined with os.sep and a
# trailing empty segment keeps the final separator, so names can be appended
# directly.
base_dir = os.sep.join(['D:', 'Dados', 'OneDrive', 'Doutorado', 'Datasets', ''])
# Folder holding the michiganvoters.info downloads.
ds_files = base_dir + os.sep.join(['MVR', 'michiganvoters.info', 'download', ''])
# Archives read as source A and source B, relative to ds_files.
fa = os.sep.join(['20140901', 'foia_voters.zip'])
fb = os.sep.join(['20171031', 'foia_voters.zip'])
# Alternative archive for source B, currently disabled:
# fb = '20200302'+os.sep+'EntireStateVoter.zip'


In [None]:
# Load both archives: `a` comes from the fa file, `b` from the fb file.
a, b = [read(ds_files + nome) for nome in (fa, fb)]

In [None]:
# Distinct voter ids found in each loaded frame.
sa, sb = (set(quadro['voter_id'].unique()) for quadro in (a, b))
# Row count of `a` next to its number of distinct ids.
len(a), len(sa)

In [None]:
# Destination folder of the generated archives. The trailing empty segment
# keeps the final separator so the file name can be appended directly.
out_files = os.sep.join(['D:', 'Dados', 'OneDrive', 'Doutorado', 'workspace', 'tl@pprl', 'datasets', 'mvr', ''])

prefix = 'michiganvoters_'
sufix = '.zip'

# One archive per (dataset size, match fraction) pair. The sizes come only
# from the list below: a former stand-alone `n = 1500` assignment was
# overwritten by the loop before ever being read and has been removed.
for n in [2000,10000,100000,500000]:
    for i in [.1]:
        # Draw the ids, then slice the loaded frames accordingly.
        la,lb , gab = gerar_ds(sa,sb,m=i,n=n)
        dsa , dsb , gs = popular_ds(a,b,la,lb,gab)
        # Move the id column to the front and rename it before saving.
        dsa = alterar_colunas(dsa)
        dsb = alterar_colunas(dsb)
        of = out_files + prefix + str(n) +"_"+ str(i)+sufix
        save_all(dsa,dsb,gs,of)
        print(of)
    

## Scratch

In [None]:
# Inspect dataset B as left by the last iteration of the generation loop.
dsb

In [None]:
# Look up a single voter in dataset B. alterar_colunas renamed the voter_id
# column to 'id', so the filter has to use the new column name.
dsb[dsb['id'] == 108341437]

In [None]:
# Inspect dataset A as left by the last iteration of the generation loop.
dsa

In [None]:
# Same inspection of dataset A as the previous cell (duplicate scratch cell).
dsa