In [1]:
import numpy as np
import pandas as pd
from unidecode import unidecode


def normalize_number(num, cast=int, error='fill', fill='-1'):
    try:
        return cast(num)
    except ValueError:
        if error == 'raise':
            raise Exception(ValueError)
        elif error == 'fill':
            return normalize_number(fill, cast, 'raise')


def normalize_hash(string: str):
    return "".join(filter(lambda char: 'A' <= char <= 'Z', str(string).upper()))


def trim_overspace(string):
    parts = filter(lambda part: len(part) > 0, string.split(" "))
    return " ".join(parts)


def normalize_labels(label):
    label = str(label).replace("'", " ").replace(".", "").replace("\n", "").replace(",", "").lower()
    label = trim_overspace(label).replace(" ", "_")
    label = unidecode(label)

    return label


def normalize_cpf(cpf):
    cpf = ''.join(filter(lambda x: '0' <= x <= '9', str(cpf)))
    cpf = str(cpf).zfill(11)
    digitos = list(map(int, cpf))

    if max(digitos) == min(digitos):
        return None

    validacao = sum(np.array(digitos[:9]) * np.array([10, 9, 8, 7, 6, 5, 4, 3, 2])) * 10 % 11

    if validacao != digitos[-2]:
        return None

    return cpf[:3] + '.' + cpf[3:6] + '.' + cpf[6:9] + '-' + cpf[9:]


def date_hash(date_):
    try:
        date = pd.to_datetime(date_)
        return date.strftime("%d%m%Y")
    except ValueError:
        return '9999999'


def replace_all(txt, replace_list):
    for replace_tuple in replace_list:
        txt = txt.replace(*replace_tuple)
    return txt

def normalize_text(txt):
    if not txt is None:
        txt = txt.upper()
        txt = unidecode(txt)
        txt = trim_overspace(txt)
        txt =  "".join([x if ('A' <= x <= 'Z') or (x in [' ','-','_','/','\'']) else '' for x in txt])
    else:
        txt = ''
    
    return txt


def normalize_municipios(mun):
    mun = normalize_text(mun)
    est = 'PR'
    if '/' in mun:
        mun, est = mun.split('/')

    mun = trim_overspace(mun)

    return mun, est


def normalize_ibge(ibge):
    ibge = str(ibge)
    if len(ibge) > 6:
        ibge = ibge[:len(ibge) - 1]

    return ibge

['/home/vinnylg/projects/covid19datascience/notebooks', '/home/vinnylg/.vscode/extensions/ms-toolsai.jupyter-2021.5.745244803/pythonFiles', '/home/vinnylg/.vscode/extensions/ms-toolsai.jupyter-2021.5.745244803/pythonFiles/lib/python', '/home/vinnylg/anaconda3/envs/boletim/lib/python38.zip', '/home/vinnylg/anaconda3/envs/boletim/lib/python3.8', '/home/vinnylg/anaconda3/envs/boletim/lib/python3.8/lib-dynload', '', '/home/vinnylg/.local/lib/python3.8/site-packages', '/home/vinnylg/anaconda3/envs/boletim/lib/python3.8/site-packages', '/home/vinnylg/.local/lib/python3.8/site-packages/IPython/extensions', '/home/vinnylg/.ipython', '..']


In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)

meses = ['janeiro','fevereiro','marco','abril','maio','junho','julho','agosto','setembro','outubro','novembro','dezembro']
pd.set_option("display.precision", 5)

In [None]:
casos = pd.read_csv('geral.csv', sep=';', converters={'IDADE_ORIGINAL': lambda x: normalize_number(x,fill=0)}, parse_dates=['DATA_DIAGNOSTICO','DATA_OBITO'], date_parser=lambda x: pd.to_datetime(x, format='%d/%m/%Y'))
casos.columns = [ normalize_labels(x) for x in csv_geral.columns ]
casos

In [3]:
# cc = CasosConfirmados()
# cc.load()
# casos = cc.get_casos()
# casos.columns = [ normalize.normalize_labels(x) for x in casos.columns ]
# casos = casos.rename(columns={'idade':'idade_original','dt_diag':'data_diagnostico','ibge7':'ibge_res_pr'})
# casos = casos.loc[casos['ibge_res_pr'].notna()].copy()

Time elapsed loading Casos Confirmados: ~00:00:01


In [4]:
# faixa_etaria = [60,70,80,90,100,np.inf]
# faixa_etaria_labels = ['<=59','60-69','70-79','80-89','90-99','>=100']

faixa_etaria = [10,20,40,60,np.inf]
faixa_etaria_labels = ['<10','10-19','20-39','40-59','>=60']

faixa_etaria_index = dict(enumerate(faixa_etaria_labels))
index_casos = np.digitize(casos['idade_original'], faixa_etaria, right=False)

casos['faixa_etaria'] = [ index for index in index_casos ]
faixa_etaria_index

{0: '<10', 1: '10-19', 2: '20-39', 3: '40-59', 4: '>=60'}

In [5]:
casos['mes_caso'] = casos.apply(lambda row: row['data_diagnostico'].strftime('%m'), axis=1)
casos['ano_caso'] = casos.apply(lambda row: row['data_diagnostico'].strftime('%Y'), axis=1)

In [6]:
obitos = casos.loc[ casos['data_obito'].notnull() ].copy()
obitos['mes_obito'] = obitos.apply(lambda row: row['data_obito'].strftime('%m'), axis=1)
obitos['ano_obito'] = obitos.apply(lambda row: row['data_obito'].strftime('%Y'), axis=1)

In [7]:
faixa_etaria_casos = casos.groupby(by=['ano_caso','mes_caso','faixa_etaria'])[['sexo','ibge_res_pr']].count().rename(columns={'sexo':'0','ibge_res_pr':'1'})
faixa_etaria_casos = faixa_etaria_casos.unstack().fillna(0).astype(int)
faixa_etaria_casos = faixa_etaria_casos.swaplevel(1,0,1).sort_index(1).rename(columns={'0':'qtde','1':'%'})

faixa_etaria_casos.columns = pd.MultiIndex.from_tuples(faixa_etaria_casos.columns)
faixa_etaria_casos['total'] = faixa_etaria_casos.xs('qtde',level=1, axis=1).sum(axis=1)


# for ano in faixa_etaria_casos.index.get_level_values(0).unique():
#     faixa_etaria_casos = faixa_etaria_casos.append(
#         pd.Series(data=faixa_etaria_casos.loc[ano].sum(0), name=(ano,'total'))
#     )

# faixa_etaria_casos = faixa_etaria_casos.append(
#         pd.Series(data=faixa_etaria_casos.xs('total', level=1).sum(0), name=('total_geral',''))
# )

# faixa_etaria_casos = faixa_etaria_casos.loc[faixa_etaria_casos.sum(1)>0]

faixa_etaria_casos = faixa_etaria_casos.sort_index()

for ano_mes in faixa_etaria_casos.index:
    for faixa_etaria, tipo in faixa_etaria_casos.columns:
        if tipo == '%':
            faixa_etaria_casos.loc[ano_mes,(faixa_etaria,'%')] = faixa_etaria_casos.loc[ano_mes,(faixa_etaria,'%')] / faixa_etaria_casos.loc[ano_mes,'total'].values

faixa_etaria_casos.columns = faixa_etaria_casos.columns.set_levels([ faixa_etaria_index[index] if isinstance(index,int) else index for index in faixa_etaria_casos.columns.levels[0]],0,False)
            
faixa_etaria_casos

  faixa_etaria_casos.columns = faixa_etaria_casos.columns.set_levels([ faixa_etaria_index[index] if isinstance(index,int) else index for index in faixa_etaria_casos.columns.levels[0]],0,False)


Unnamed: 0_level_0,Unnamed: 1_level_0,<10,<10,10-19,10-19,20-39,20-39,40-59,40-59,>=60,>=60,total
Unnamed: 0_level_1,Unnamed: 1_level_1,qtde,%,qtde,%,qtde,%,qtde,%,qtde,%,Unnamed: 12_level_1
ano_caso,mes_caso,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2020,3,5,0.01333,7,0.01867,181,0.48267,122,0.32533,60,0.16,375
2020,4,44,0.02125,61,0.02945,849,0.40995,743,0.35876,374,0.18059,2071
2020,5,177,0.03318,227,0.04256,2377,0.44563,1893,0.35489,660,0.12373,5334
2020,6,884,0.03155,1482,0.05289,12497,0.44599,9742,0.34767,3416,0.12191,28021
2020,7,2456,0.03617,3891,0.0573,30170,0.4443,22576,0.33246,8812,0.12977,67905
2020,8,2585,0.03793,4559,0.0669,30072,0.44127,22143,0.32493,8789,0.12897,68148
2020,9,1983,0.03437,4002,0.06936,25420,0.44058,18815,0.3261,7477,0.12959,57697
2020,10,1302,0.0338,2772,0.07196,17370,0.4509,12004,0.31161,5075,0.13174,38523
2020,11,2649,0.02739,7035,0.07273,44834,0.46352,30335,0.31362,11872,0.12274,96725
2020,12,3398,0.02607,9145,0.07016,56200,0.43113,43141,0.33095,18470,0.14169,130354


In [8]:
faixa_etaria_obitos = obitos.groupby(by=['ano_obito','mes_obito','faixa_etaria'])[['sexo','ibge_res_pr']].count().rename(columns={'sexo':'0','ibge_res_pr':'1'})
faixa_etaria_obitos = faixa_etaria_obitos.unstack().fillna(0).astype(int)
faixa_etaria_obitos = faixa_etaria_obitos.swaplevel(1,0,1).sort_index(1).rename(columns={'0':'qtde','1':'%'})

faixa_etaria_obitos.columns = pd.MultiIndex.from_tuples(faixa_etaria_obitos.columns)
faixa_etaria_obitos['total'] = faixa_etaria_obitos.xs('qtde',level=1, axis=1).sum(axis=1)


# for ano in faixa_etaria_obitos.index.get_level_values(0).unique():
#     faixa_etaria_obitos = faixa_etaria_obitos.append(
#         pd.Series(data=faixa_etaria_obitos.loc[ano].sum(0), name=(ano,'total'))
#     )

# faixa_etaria_obitos = faixa_etaria_obitos.append(
#         pd.Series(data=faixa_etaria_obitos.xs('total', level=1).sum(0), name=('total_geral',''))
# )

# faixa_etaria_obitos = faixa_etaria_obitos.loc[faixa_etaria_obitos.sum(1)>0]
faixa_etaria_obitos = faixa_etaria_obitos.sort_index()

for ano_mes in faixa_etaria_obitos.index:
    for faixa_etaria, tipo in faixa_etaria_obitos.columns:
        if tipo == '%':
            faixa_etaria_obitos.loc[ano_mes,(faixa_etaria,'%')] = faixa_etaria_obitos.loc[ano_mes,(faixa_etaria,'%')] / faixa_etaria_obitos.loc[ano_mes,'total'].values
            
faixa_etaria_obitos.columns = faixa_etaria_obitos.columns.set_levels([ faixa_etaria_index[index] if isinstance(index,int) else index for index in faixa_etaria_obitos.columns.levels[0]],0,False)

faixa_etaria_obitos

  faixa_etaria_obitos.columns = faixa_etaria_obitos.columns.set_levels([ faixa_etaria_index[index] if isinstance(index,int) else index for index in faixa_etaria_obitos.columns.levels[0]],0,False)


Unnamed: 0_level_0,Unnamed: 1_level_0,<10,<10,10-19,10-19,20-39,20-39,40-59,40-59,>=60,>=60,total
Unnamed: 0_level_1,Unnamed: 1_level_1,qtde,%,qtde,%,qtde,%,qtde,%,qtde,%,Unnamed: 12_level_1
ano_obito,mes_obito,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2020,3,0,0.0,0,0.0,0,0.0,3,0.5,3,0.5,6
2020,4,0,0.0,0,0.0,2,0.01905,33,0.31429,70,0.66667,105
2020,5,1,0.00813,1,0.00813,3,0.02439,26,0.21138,92,0.74797,123
2020,6,0,0.0,3,0.00487,28,0.04545,115,0.18669,470,0.76299,616
2020,7,1,0.00069,1,0.00069,64,0.04448,321,0.22307,1052,0.73106,1439
2020,8,3,0.0019,3,0.0019,52,0.03293,323,0.20456,1198,0.75871,1579
2020,9,0,0.0,6,0.0048,60,0.04804,223,0.17854,960,0.76861,1249
2020,10,0,0.0,2,0.00245,28,0.03427,137,0.16769,650,0.79559,817
2020,11,1,0.00098,1,0.00098,33,0.0322,181,0.17659,809,0.78927,1025
2020,12,0,0.0,3,0.00137,74,0.03371,414,0.18861,1704,0.77631,2195


In [9]:
writer = pd.ExcelWriter("faixa_etaria_por_mes_raw.xlsx",
                        engine='xlsxwriter',
                        datetime_format='dd/mm/yyyy',
                        date_format='dd/mm/yyyy')

workbook = writer.book

faixa_etaria_casos.to_excel(writer,sheet_name=f"faixa_etaria_casos")
worksheet = writer.sheets[f"faixa_etaria_casos"]
auto_fit_columns(worksheet,faixa_etaria_casos)


faixa_etaria_obitos.to_excel(writer,sheet_name=f"faixa_etaria_obitos")
worksheet = writer.sheets[f"faixa_etaria_obitos"]
auto_fit_columns(worksheet,faixa_etaria_obitos)

writer.save()
writer.close()

  warn("Calling close() on already closed file.")


In [11]:
faixa_etaria_casos.loc[(slice(None),'04'),()]

Unnamed: 0_level_0,Unnamed: 1_level_0,<10,<10,10-19,10-19,20-39,20-39,40-59,40-59,>=60,>=60,total
Unnamed: 0_level_1,Unnamed: 1_level_1,qtde,%,qtde,%,qtde,%,qtde,%,qtde,%,Unnamed: 12_level_1
ano_caso,mes_caso,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2020,4,44,0.02125,61,0.02945,849,0.40995,743,0.35876,374,0.18059,2071
2021,4,2586,0.0405,5372,0.08414,25188,0.3945,21038,0.3295,9664,0.15136,63848
