In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from os.path import dirname, join
from hashlib import md5
from unidecode import unidecode

from sys import exit

pd.set_option('display.max_columns',None)

In [2]:
from bulletin import __file__ as __root__
from bulletin.commom.normalize import trim_overspace, normalize_text, normalize_number, normalize_date, normalize_municipios, normalize_igbe
from bulletin.data import municipios, regionais

mun_list = municipios.load()
mun_list = mun_list.rename(columns={'mun_nome':'municipio'}) 

rs_list = regionais.load()
rs_list = rs_list.rename(columns={'reg':'regional'}) 


def normalize_nome(text):
    x = str(text).replace("'"," ").replace(".","").replace("\n","").replace(",","").upper()
    x = trim_overspace(x)
    x = unidecode(x)
    return x
    
def normalize_email(text):
    x = str(text).replace("'","").replace("\n","").replace(",","").upper()
    x = trim_overspace(x)
    x = unidecode(x)
    return x

def normalize_do(do):
    try:
        do = str(do).replace("-","")
        return int(do)
    except:
        return None


In [3]:
nt_dtypes = {
    "id":"int64",
    "uf_residencia":"int64",
    "ibge_residencia":"int64",
    "telefone_notificador":"object"
}
nt_converters = {
    "paciente": normalize_nome,
    "nome_mae": normalize_nome,
    "idade":lambda x: normalize_number(x,fill=0),
    "sexo":lambda x: normalize_number(x,fill=3),
    "etnia":normalize_number,
    "raca_cor":lambda x: normalize_number(x,fill=6),
    "nome_notificador":normalize_nome,
    "email_notificador":normalize_email,
    "nome_unidade_notifica":normalize_nome,
    "uf_unidade_notifica":normalize_number,
    "ibge_unidade_notifica":normalize_number,
    "evolucao":lambda x: normalize_number(x,fill=3),
    "resultado":normalize_number
}
nt_dates = [
    "data_notificacao",
    "data_nascimento",
    "data_liberacao",
    "data_cura_obito",
    "updated_at"

]
notifica = pd.read_csv(join(dirname(__root__),'tmp','notifica.csv'),
                       dtype=nt_dtypes, 
                       converters=nt_converters,
                       parse_dates=nt_dates,
                       date_parser=lambda x: pd.to_datetime(x, format='%d/%m/%Y', errors='coerce'))



In [None]:
notifica = pd.merge(left=notifica, right=mun_list[['ibge','municipio']], left_on='ibge_residencia', right_on='ibge', how='left')
notifica = notifica.drop(columns=['ibge'])
notifica = pd.merge(left=notifica, right=rs_list[['ibge','regional']], left_on='ibge_residencia', right_on='ibge', how='left')
notifica = notifica.drop(columns=['ibge'])
notifica = pd.merge(left=notifica, right=mun_list[['ibge','municipio']], left_on='ibge_unidade_notifica', right_on='ibge', how='left', suffixes=['_residencia','_notifica'])
notifica = notifica.drop(columns=['ibge'])

notifica['municipio_residencia'] = notifica['municipio_residencia'].apply(normalize_nome)
notifica['municipio_notifica'] = notifica['municipio_notifica'].apply(normalize_nome)
notifica['regional'] = notifica['regional'].apply(normalize_number)

notifica['hash'] = notifica.apply(lambda row: md5(str.encode(" ".join([row['paciente'],str(row['idade']),row['municipio_residencia']]))).hexdigest(), axis=1)
notifica

In [None]:
casos_dtypes = {
    "Laboratório":"object",
    "Sexo":"object"
}
casos_converters = {
    "Nome": normalize_nome,
    "Idade":lambda x: normalize_number(x,fill=-9),
    "Mun Resid":normalize_nome,
    "Mun atend":normalize_nome,
    "RS":normalize_number
}
casos_dates = [
    "Dt diag",
    "Comunicação",
    "IS"
]


casos = pd.read_excel('Casos confirmados PR.xlsx',
                      'Casos confirmados', 
                      usecols='D:M', 
                      dtype=casos_dtypes, 
                      converters=casos_converters,
                      parse_dates=casos_dates)

casos.columns = [ normalize_text(x) for x in casos.columns ]
casos['hash'] = casos.apply(lambda row: md5(str.encode(" ".join([row['nome'],str(row['idade']),row['mun_resid']]))).hexdigest(), axis=1)
casos['hash_less'] = casos.apply(lambda row: md5(str.encode(" ".join([row['nome'],str(row['idade']-1),row['mun_resid']]))).hexdigest(), axis=1)
casos['hash_more'] = casos.apply(lambda row: md5(str.encode(" ".join([row['nome'],str(row['idade']+1),row['mun_resid']]))).hexdigest(), axis=1)

casos

In [None]:
obitos_dtypes = {
    "Sexo":"object"
}
obitos_converters = {
    "Nome": normalize_nome,
    "Idade":lambda x: normalize_number(x,fill=-9),
    "Município":normalize_nome,
    "RS":normalize_number
}
obitos_dates = [
    "Data do óbito",
]

obitos = pd.read_excel('Casos confirmados PR.xlsx',
                      'Obitos', 
                      usecols='D:I', 
                      dtype=casos_dtypes, 
                      converters=casos_converters,
                      parse_dates=obitos_dates)


obitos.columns = [ normalize_text(x) for x in obitos.columns ]
obitos['hash'] = obitos.apply(lambda row: md5(str.encode(" ".join([row['nome'],str(row['idade']),row['municipio']]))).hexdigest(), axis=1)
obitos['hash_less'] = obitos.apply(lambda row: md5(str.encode(" ".join([row['nome'],str(row['idade']-1),row['municipio']]))).hexdigest(), axis=1)
obitos['hash_more'] = obitos.apply(lambda row: md5(str.encode(" ".join([row['nome'],str(row['idade']+1),row['municipio']]))).hexdigest(), axis=1)

obitos

In [20]:
casos.loc[casos.duplicated(subset='hash')].shape

(370, 13)

In [21]:
casos.loc[casos['hash'].isin(casos['hash_less'])].shape

(1730, 13)

In [22]:
casos.loc[casos['hash'].isin(casos['hash_more'])].shape

(1727, 13)

In [23]:
obitos.loc[obitos.duplicated(subset='hash')].shape

(0, 9)

In [24]:
obitos.loc[obitos['hash'].isin(obitos['hash_less'])].shape

(6, 9)

In [25]:
obitos.loc[obitos['hash'].isin(obitos['hash_more'])].shape

(6, 9)

In [26]:
notifica.loc[notifica['hash'].isin(casos['hash'])].shape

(155080, 26)

In [27]:
notifica.loc[notifica['hash'].isin(casos['hash_less'])].shape

(319, 26)

In [28]:
notifica.loc[notifica['hash'].isin(casos['hash_more'])].shape

(8082, 26)

In [29]:
notifica.loc[notifica['hash'].isin(obitos['hash'])].shape

(3445, 26)

In [30]:
notifica.loc[notifica['hash'].isin(obitos['hash_less'])].shape

(11, 26)

In [31]:
notifica.loc[notifica['hash'].isin(obitos['hash_more'])].shape

(564, 26)