In [None]:
import os
import sys
import shutil

import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns

import textdistance
import missingno as msno

from tqdm.notebook import trange, tqdm

sys.path.append("../libs/")

from datasetutil import open_processed_ds , gerar_gabarito

# Metodos

## Faltantes

In [None]:
def plotar_dados_faltantes(a,b,dsname,outdir,custom_out=False,custom_id=False):
    """Save a missing-value (nullity) matrix for each of the two data sources.

    For source ``a`` and source ``b`` the identifier column is dropped, the
    placeholders ``"nan"`` and ``''`` are converted to real NaN, and a
    ``missingno`` matrix is written to disk as PDF and PNG.

    Parameters
    ----------
    a, b : pandas.DataFrame
        The two data sources. Neither frame is modified.
    dsname : str
        Data set name, used as prefix of the output file names.
    outdir : str
        Output directory. It is concatenated as-is, so it must already end
        with a path separator.
    custom_out : str or False
        Optional label appended to ``dsname`` in the file names.
    custom_id : str or False
        Name of the identifier column; ``'id'`` is used when falsy.

    Raises
    ------
    KeyError
        If the identifier column is missing from ``a`` or ``b``.

    Output files: ``<outdir><dsname>[<custom_out>]_<a|b>_missing.{pdf,png}``.
    """
    id_col = custom_id if custom_id else 'id'

    # drop() returns new frames, so the caller's data is left untouched; both
    # drops happen up front so a missing id column fails before anything is written
    fontes = (('a', a.drop(columns=[id_col])),
              ('b', b.drop(columns=[id_col])))

    if not custom_out:
        prefixo = outdir + dsname
    else:
        prefixo = outdir + dsname + str(custom_out)

    for dsid, df in fontes:
        df = df.replace("nan", np.nan)
        df = df.replace('', np.nan)

        plt.ioff()
        p = msno.matrix(df,figsize=(10, 6))
        # This is the nullity matrix: the previous title was copied from the
        # unique-values plot and described the wrong figure.
        p.set_title('Missing values per attribute')
        fig = p.get_figure()

        outf = prefixo + '_' + dsid + '_missing'
        fig.savefig(outf + '.pdf', bbox_inches = 'tight')
        fig.savefig(outf + '.png', bbox_inches = 'tight')
        # Close exactly the figure that was saved, not whichever is current.
        plt.close(fig)



In [None]:
# sdf = msno.nullity_filter(df, filter='bottom', n=15, p=0.999) # or filter='top'
# sdf = msno.nullity_filter(df, filter='top', n=15, p=0.999) # or filter='top'

# sdf = msno.nullity_sort(df, sort='descending') 
# sdf = msno.nullity_sort(df, sort='ascending')

## Valores Unicos

In [None]:
def plotar_valores_unicos(a,b,dsname,outdir,
                          custom_out=False,custom_id=False):
    """Save a bar plot with the number of distinct values of every attribute.

    One figure is produced for source ``a`` and one for source ``b``. The left
    axis shows the absolute number of distinct values per column (NaN, ``''``
    and ``"nan"`` are not counted); the right axis shows the same ticks as a
    percentage of the number of rows.

    Parameters
    ----------
    a, b : pandas.DataFrame
        The two data sources. Neither frame is modified.
    dsname : str
        Data set name, used as prefix of the output file names.
    outdir : str
        Output directory. It is concatenated as-is, so it must already end
        with a path separator.
    custom_out : str or False
        Optional label appended to ``dsname`` in the file names.
    custom_id : str or False
        Name of the identifier column; ``'id'`` is used when falsy.

    Raises
    ------
    KeyError
        If the identifier column is missing from ``a`` or ``b``.

    Output files: ``<outdir><dsname>[<custom_out>]_<a|b>_unique.{pdf,png}``.
    """
    id_col = custom_id if custom_id else 'id'
    fontes = (('a', a.drop(columns=[id_col])),
              ('b', b.drop(columns=[id_col])))

    if not custom_out:
        prefixo = outdir + dsname
    else:
        prefixo = outdir + dsname + str(custom_out)

    # The style only affects axes created afterwards, so set it before subplots().
    sns.set_style("whitegrid")

    for dsid, df in fontes:
        df = df.replace("nan", np.nan)
        df = df.replace('', np.nan)

        contagem = df.nunique()   # NaN is excluded from the count
        tamanho = len(df)

        plt.ioff()
        fig, ax1 = plt.subplots(figsize=(10, 6))

        sns.barplot(x=list(contagem.index), y=list(contagem.values),
                    color=(.25,.25,.25), ax=ax1)
        ax1.set_title('Number of unique values per attribute')
        ax1.set_ylabel('counting')

        # Secondary axis: same tick positions as the count axis, relabelled as
        # a share of the rows. The positions must be fixed explicitly before
        # the labels are replaced, otherwise the labels are attached to
        # whatever ticks the locator picks for the twin axis.
        ax2 = ax1.twinx()
        ticks = ax1.get_yticks()
        ax2.set_yticks(ticks)
        ax2.set_ylim(ax1.get_ylim())   # set_yticks may widen the limits
        escala = 100.0 / tamanho if tamanho else 0.0   # empty frame: avoid 0/0
        ax2.set_yticklabels(['{:.0f}%'.format(t * escala) for t in ticks])
        ax2.set_ylabel('Percentage of unique values')

        plt.setp(ax1.get_xticklabels(), rotation=30)

        outf = prefixo + '_' + dsid + '_unique'
        fig.savefig(outf + '.pdf', bbox_inches = 'tight')
        fig.savefig(outf + '.png', bbox_inches = 'tight')
        plt.close(fig)


## Similarity study

In [None]:
def _concatenar_atributos(df, id_col, valor, cols):
    """Join, as one string, the ``cols`` values of the first row where ``id_col == valor``."""
    linha = df.loc[df[id_col] == valor, cols]
    if linha.empty:
        raise IndexError("no record with %s == %r" % (id_col, valor))
    # astype(str) is applied to the frame (column by column) before taking the
    # row, so that mixed numeric columns are not upcast to a common dtype.
    return ''.join(linha.astype(str).iloc[0])


def calcular_similaridade_gabarido(a,b,gold,
                                   id_col_a = 'id',id_col_b = 'id'):
    """Compute string similarities for every pair listed in the gold standard.

    For each row of ``gold`` the first column is looked up in ``a`` and the
    second column in ``b``. All non-id attributes of each record are
    concatenated into one string and the two strings are compared with
    several ``textdistance`` normalized similarities.

    Parameters
    ----------
    a, b : pandas.DataFrame
        The two data sources.
    gold : pandas.DataFrame
        Gold standard; column 0 holds ids of ``a`` and column 1 ids of ``b``.
        Any further column is ignored here.
    id_col_a, id_col_b : str
        Name of the identifier column in ``a`` and in ``b``.

    Returns
    -------
    pandas.DataFrame
        One row per gold pair with the columns ``id`` (``"<id_a>-<id_b>"``),
        ``hamming``, ``jaccard``, ``entropy``, ``overlap`` and
        ``sorensen_dice``. The columns are present even when ``gold`` is empty.

    Raises
    ------
    ValueError
        If an id column is not a column of its frame.
    IndexError
        If an id of the gold standard has no record in its data source.
    """
    cols_a = list(a.columns)
    cols_b = list(b.columns)

    cols_a.remove(id_col_a)
    cols_b.remove(id_col_b)

    # (output column, textdistance algorithm). Levenshtein used to be computed
    # and then discarded; it is left out because it is never reported.
    metricas = (('hamming', 'hamming'),
                ('jaccard', 'jaccard'),
                ('entropy', 'entropy_ncd'),
                ('overlap', 'overlap'),
                ('sorensen_dice', 'sorensen_dice'))
    colunas = ['id'] + [coluna for coluna, _ in metricas]

    similaridades = []

    # Iterating the two columns positionally keeps each id in its own dtype;
    # iterrows() would upcast a whole row (e.g. ints next to a float label).
    for id_a, id_b in zip(gold.iloc[:, 0], gold.iloc[:, 1]):
        e1 = _concatenar_atributos(a, id_col_a, id_a, cols_a)
        e2 = _concatenar_atributos(b, id_col_b, id_b, cols_b)

        registro = {'id': str(id_a) +'-'+ str(id_b)}
        for coluna, algoritmo in metricas:
            medida = getattr(textdistance, algoritmo)
            registro[coluna] = medida.normalized_similarity(e1,e2)
        similaridades.append(registro)

    return pd.DataFrame(similaridades, columns=colunas)
    
    

In [None]:
def plotar_similaridade_gabarito(a,b,gold,dsname,outdir,
                                 idcola = 'id',idcolb = 'id',
                                 custom_out=False):
    """Save the distribution of the similarities of the gold-standard pairs.

    The similarities come from ``calcular_similaridade_gabarido``; every
    metric is drawn as its own polygon histogram on a shared axis.

    Parameters
    ----------
    a, b : pandas.DataFrame
        The two data sources.
    gold : pandas.DataFrame
        Gold standard (ids of ``a`` in column 0, ids of ``b`` in column 1).
    dsname : str
        Data set name, used as prefix of the output file names.
    outdir : str
        Output directory. It is concatenated as-is, so it must already end
        with a path separator.
    idcola, idcolb : str
        Name of the identifier column in ``a`` and in ``b``.
    custom_out : str or False
        Optional label appended to ``dsname`` in the file names.

    Output files: ``<outdir><dsname>[<custom_out>]_gsim.{pdf,png}``.
    """
    plt.ioff()
    sdf = calcular_similaridade_gabarido(a,b,gold,id_col_a=idcola, id_col_b=idcolb)

    # Long format: one row per (pair, metric); no in-place mutation.
    mdf = sdf.melt(id_vars='id').rename(columns={'variable': 'metric'})

    sns.set(style="whitegrid",rc={'figure.figsize':(10,4)})

    # displot is figure-level and creates its own figure. The extra
    # plt.figure() that used to be opened here was never drawn on nor closed,
    # so one empty figure leaked on every call.
    g = sns.displot(mdf,x='value', hue='metric', element="poly",
                    kind="hist",legend=True ,
                    height=5, aspect=2)
    g.set_xlabels("similarity")

    ax = g.axes.flat[0]
    ax.set_title("Similarity of the duplicated entities")
    fig = ax.figure

    if not custom_out:
        outf = outdir + dsname + '_gsim'
    else:
        outf = outdir + dsname + str(custom_out) + '_gsim'

    fig.savefig(outf + '.pdf', bbox_inches = 'tight')
    fig.savefig(outf + '.png', bbox_inches = 'tight')
    plt.close(fig)


## Salvar arquivos

In [None]:
def salvar_mdfile(ds_name,stats,outdir,filename=False,custom_flag=False):
    """Write the data set README: summary table plus links to the saved plots.

    Parameters
    ----------
    ds_name : str
        Data set name; part of the image URLs and of the image file names.
    stats : object with ``to_markdown()``
        Summary table (a pandas DataFrame in this notebook).
    outdir : str
        Directory of the README. It is concatenated as-is, so it must already
        end with a path separator.
    filename : str or False
        When given, it is shown in parentheses in the title.
    custom_flag : str or False
        Label of the data source when a folder holds more than one. When
        truthy the section is *appended* to the README (otherwise the file is
        overwritten) and the label is inserted into the image names.

    The image names follow what the plotting functions of this notebook write:
    ``<ds_name>[<label>]_<a|b>_missing.png``, ``<ds_name>[<label>]_<a|b>_unique.png``
    and ``<ds_name>[<label>]_gsim.png``. The previous links
    (``<ds_name>_missing.png`` / ``<ds_name>_unique.png``) pointed to files that
    are never generated.
    """
    titulo = '# Details of the data source sample'
    if filename:
        titulo += ' (' + filename + ')'

    prefixo = ds_name + str(custom_flag) if custom_flag else ds_name
    base_url = ('https://github.com/thiagonobrega/ds_utils/blob/master/datasets/'
                + ds_name + '/stats/')

    def _imagem(sufixo):
        # Markdown image link for one of the generated PNG files.
        return '![image](' + base_url + prefixo + sufixo + '.png "Sim")'

    texto = [
        titulo, '', '',
        'This dataset sample has the following characteristics.', '',
        '## Data source sample summary', '',
        stats.to_markdown(), '',
        '## Data source missing values', '',
        _imagem('_a_missing'), '',
        _imagem('_b_missing'), '',
        '## Data source unique values', '',
        _imagem('_a_unique'), '',
        _imagem('_b_unique'), '',
        '## Data source duplicated similarities', '',
        _imagem('_gsim'), '',
    ]

    modo = 'a' if custom_flag else 'w'
    # Explicit encoding: the table may hold non-ASCII text and the platform
    # default encoding is not the same everywhere.
    with open(outdir+'README.md', modo, encoding='utf-8') as the_file:
        the_file.write("\n".join(texto) + "\n")

In [None]:
def salvar_textable(ds_name,stats,outdir):
    """Write ``stats.to_latex()`` followed by a newline to ``<outdir><ds_name>.tex``.

    ``outdir`` is concatenated as-is, so it must already end with a path
    separator. An existing file is overwritten.
    """
    destino = outdir + ds_name + '.tex'
    with open(destino, 'w') as saida:
        # print() appends the trailing newline
        print(stats.to_latex(), file=saida)

# Executar

In [None]:
# Root folder of the data sets, relative to this notebook. The empty last
# component makes the path end with a separator, which the string
# concatenations below rely on.
datasets_dir = os.path.join("..", "datasets", "")

In [None]:
def plotar_sumario(ds_name,ds_file,datasets_dir,custom_out=False,custom_id=False):
    """Generate every plot and report file for one processed data set.

    Opens ``<datasets_dir><ds_name>/<ds_file>`` with ``open_processed_ds``,
    writes the missing-value, unique-value and gold-similarity plots plus the
    LaTeX table into the ``stats`` sub-folder, and writes the README into the
    data set folder.

    Parameters
    ----------
    ds_name : str
        Folder name of the data set under ``datasets_dir``.
    ds_file : str
        Name of the processed archive inside that folder.
    datasets_dir : str
        Root folder of the data sets, ending with a path separator.
    custom_out : str or False
        Label appended to the output names when the folder holds more than
        one data source.
    custom_id : str or False
        Identifier column name; ``'id'`` is used when falsy.
    """
    ds_loc = datasets_dir + ds_name + os.sep
    zdf = ds_loc + ds_file
    out_dir = ds_loc + os.sep + 'stats' + os.sep

    a,b,gold,stats = open_processed_ds(zdf,get_stats=True)

    # With a third column present, keep only the rows where it equals 1.
    tem_rotulo = len(gold.columns) > 2
    if tem_rotulo:
        gold = gold[gold.iloc[:, 2]==1]

    os.makedirs(out_dir,exist_ok=True)

    plotar_dados_faltantes(a,b,ds_name,out_dir,
                           custom_out=custom_out,custom_id=custom_id)
    plotar_valores_unicos(a,b,ds_name,out_dir,
                          custom_out=custom_out,custom_id=custom_id)

    # 'id' is the default of plotar_similaridade_gabarito, so passing it
    # explicitly is the same as omitting the argument.
    coluna_id = custom_id if custom_id else 'id'
    plotar_similaridade_gabarito(a,b,gold,ds_name,out_dir,
                                 custom_out=custom_out,
                                 idcola=coluna_id,idcolb=coluna_id)

    salvar_mdfile(ds_name,stats,ds_loc,filename=zdf,custom_flag=custom_out)

    nome_tabela = ds_name+str(custom_out) if custom_out else ds_name
    salvar_textable(nome_tabela,stats,out_dir)



In [None]:
def rmdir(dir_path):
    """Delete ``dir_path`` and everything below it.

    An ``OSError`` (for instance when the directory does not exist) is
    reported on standard output instead of being raised.
    """
    try:
        shutil.rmtree(dir_path)
    except OSError as erro:
        motivo = erro.strerror
        print("Error: %s : %s" % (dir_path, motivo))

## census

In [None]:
# Each entry: (data set folder name, processed archive, custom label, custom id column).
# - custom label: suffix for the output names when a folder holds more than
#   one data source; False otherwise.
# - custom id column: name of the identifier column when it is not 'id';
#   False otherwise.

ds = [ 
        ('census','processed_census.zip',False,False),
        ('yv-er','processed_yver.zip',False,False),
        ('abt-buy','processed_abt-buy.zip',False,False),
        ('books','processed_amazon-barnesnobel.zip','1',False),
        ('books','processed_amazon-barnesnobel-small.zip','2',False),
        ('books','processed_goodreads-barnesnobel.zip','3',False),
        ('dblp-acm','processed_DBLP-ACM.zip',False,False),
        ('movies','processed_imdb-rottentomatos.zip','1',False),
        ('movies','processed_imdb-tmd.zip','2',False),
        ('MVR','michiganvoters_500_0.1.zip',False,False),
        ('NCVR','processed_ncvoters_1700_0.1.zip',False,'ncid'),
        ('restaurants','processed_fodors-zagats.zip','1',False),
        ('restaurants','processed_yelp-yellowpages.zip','2',False),
        ('restaurants','processed_yelp-zomato.zip','3',False),
        ('tse','processed_tse-2k-8.zip',False,'NR_CPF_CANDIDATO'),
]

Clean all: remove the previously generated `stats` folders and `README.md` files.

In [None]:
# Remove the generated artefacts (the 'stats' folder and the README) of every
# listed data set. Failures are reported and the loop moves on.
for ds_name, *_ in ds:
    base = datasets_dir + ds_name + os.sep
    rmdir(base + 'stats')

    readme = base + 'README.md'
    try:
        os.remove(readme)
    except OSError as erro:
        print("Error: %s : %s" % (readme, erro.strerror))

Gerar estatísticas individuais

In [None]:
# Generate plots, README and LaTeX table for every listed data source.
# plotar_sumario only tests custom_out for truthiness, so passing a falsy
# label is the same as leaving the argument at its default.
for d in tqdm(ds):
    ds_name, ds_file, custom_out, custom_id = d
    plotar_sumario(ds_name,ds_file,datasets_dir,
                   custom_out=custom_out,custom_id=custom_id)

## Gerar motivacao

In [None]:
# Collect the gold-pair similarities of every data source into one frame,
# tagged with the source name in the 'ds' column.
partes = []
for d in tqdm(ds):
    ds_name, ds_file, custom_out, custom_id = d

    ds_dir = ds_name+os.sep
    ds_loc = datasets_dir + ds_dir
    zdf =  ds_loc + ds_file
    a,b,gold,stats = open_processed_ds(zdf,get_stats=True)

    # Same filter as plotar_sumario: when the gold standard carries a third
    # column, only the rows where it equals 1 are kept, so that the aggregate
    # plot covers the same pairs as the per-data-set plots.
    if len(gold.columns) > 2:
        gold = gold[gold.iloc[:, 2]==1]

    coluna_id = custom_id if custom_id else 'id'
    ldf = calcular_similaridade_gabarido(a,b,gold,
                                         id_col_a = coluna_id,id_col_b = coluna_id)
    ldf['ds'] = ds_name + custom_out if custom_out else ds_name
    partes.append(ldf)

# A single concat instead of growing the frame inside the loop;
# ignore_index avoids repeated index labels across the sources.
df = pd.concat(partes, ignore_index=True) if partes else pd.DataFrame()

In [None]:
# The pair identifier is not needed for the aggregate plots.
# errors='ignore' keeps the cell re-runnable: `del df['id']` raised KeyError
# the second time the cell was executed.
df = df.drop(columns=['id'], errors='ignore')
df

In [None]:
mdf = df.copy()
# Average of the five similarity metrics. The columns are selected by name so
# the result does not depend on their position or on 'id' having been dropped.
metricas = ['hamming', 'jaccard', 'entropy', 'overlap', 'sorensen_dice']
mdf['media'] = mdf[metricas].astype(float).mean(axis=1)

# Groups of data sources (values of the 'ds' column) plotted together.
# pessoal= ['census', 'yv-er', 'MVR', 'NCVR' ,'tse']
pessoal1 = ['census', 'yv-er' , 'tse' ,'NCVR']
set1= ['abt-buy', 'dblp-acm', 'movies2','books3']


In [None]:
def plot_gen1(mdf,desc,log=False):
    """Save the histogram of the mean similarity, one polygon per data source.

    Parameters
    ----------
    mdf : pandas.DataFrame
        Must contain the columns ``media`` (value that is plotted) and ``ds``
        (used as hue). The frame is not modified.
    desc : str
        Prefix of the output file name.
    log : bool
        When true the count axis is log-scaled, with plain-number tick labels.

    Output files: ``<datasets_dir><desc>_original_dss_sim.{pdf,png}``, where
    ``datasets_dir`` is the notebook-level variable.
    """
    sns.set(style="whitegrid",rc={'figure.figsize':(10,4)})

    # rename() without inplace: the caller usually passes a filtered slice, and
    # renaming that in place both raised SettingWithCopyWarning and could
    # change the column names of a frame the caller still uses.
    dados = mdf.rename(columns = {'ds': 'data source','media': 'similarity'})

    fig,ax1 = plt.subplots(figsize=(8,5.5))
    sns.despine(ax=ax1, left=True)
    sns.histplot(dados,x='similarity', hue='data source', element="poly",
                 stat='count', fill=True, ax=ax1)

    if log:
        ax1.set_yscale('log')
        ax1.get_yaxis().set_major_formatter(mtick.ScalarFormatter())

    ax1.set_title("Similarity of the duplicated entities")

    outf = datasets_dir +desc+'_original_dss_sim'

    fig.savefig(outf + '.pdf', bbox_inches = 'tight')
    fig.savefig(outf + '.png', bbox_inches = 'tight')

    plt.close(fig)

In [None]:
# Histogram restricted to the sources listed in `pessoal1`, log-scaled count axis.
plot_gen1(mdf[mdf.ds.isin(pessoal1)],'personal',log=True)

In [None]:
# Histogram restricted to the sources listed in `set1`, saved with the 'all' file prefix.
plot_gen1(mdf[mdf.ds.isin(set1)],'all',log=False)

# Debug

In [None]:

# Debug: load the abt-buy sample by hand for interactive inspection.
ds_dir = 'abt-buy' + os.sep
ds_loc = datasets_dir + ds_dir
# same string as ds_loc + os.sep + 'stats' + os.sep
out_dir = os.sep.join((ds_loc, 'stats', ''))

zdf = ds_loc + 'processed_abt-buy.zip'

a, b, gold, stats = open_processed_ds(zdf, get_stats=True)

# if len(gold.columns) > 2:
#         gold = gold[gold.iloc[:, 2]==1]

In [None]:
    # Debug: read the members of the archive `zdf` by hand.
    # NOTE(review): presumably mirrors what open_processed_ds does - confirm
    # against datasetutil.
    # NOTE(review): the gold standard is stored in `gs` here, while the next
    # cell uses `gold` (loaded by open_processed_ds) - confirm this is intended.
    import zipfile
    get_stats =  True

    # Start from a clean slate: without this, frames left in the kernel by an
    # earlier run were silently reused when a member was missing.
    stats_a = None
    stats_b = None
    is_one = False

    # The context managers close the archive and every member handle.
    with zipfile.ZipFile(zdf) as zf:
        nl = zf.namelist()

        def _ler_csv(nome):
            # Parse one ';'-separated member with a header row.
            with zf.open(nome) as membro:
                return pd.read_csv(membro, header=0, sep=";",
                                   index_col=False)

        for fn in nl:

            if ('a.csv' == fn):
                a = _ler_csv(fn).fillna('')

            if ('b.csv' == fn):
                b = _ler_csv(fn).fillna('')

            if ('gold.csv' in fn):
                gs = _ler_csv(fn)

            if get_stats:
                if ('stats.csv' in fn):
                    stats = _ler_csv(fn)
                    is_one = True

                if ('stats_a.csv' == fn):
                    stats_a = _ler_csv(fn)

                if ('stats_b.csv' == fn):
                    stats_b = _ler_csv(fn)

    if get_stats and not is_one:
        # No single stats.csv: combine whichever per-source tables exist.
        partes = [s for s in (stats_a, stats_b) if s is not None]
        if not partes:
            raise FileNotFoundError("no stats*.csv member found in " + str(zdf))
        stats = pd.concat(partes)

In [None]:
# Debug: similarities of the gold pairs for the frames currently loaded; the
# returned frame is displayed as the cell output.
calcular_similaridade_gabarido(a,b,gold)

In [None]:
# NOTE(review): scratch variable - no later use is visible in this part of the
# notebook; candidate for removal.
z = []