# "SRAG Data"
> "SRAG (severe acute respiratory syndrome) hospitalization data from OpenDataSUS"

- toc: true
- branch: master
- badges: true
- comments: false
- categories: [srag, covid, opendata, sus]
- image: images/some_folder/your_image.png
- hide: false
- search_exclude: true
- metadata_key1: metadata_value1
- metadata_key2: metadata_value2

Dicionário de dados:

- [2021](https://opendatasus.saude.gov.br/dataset/9f76e80f-a2f1-4662-9e37-71084eae23e3/resource/b3321e55-24e9-49ab-8651-29cf5c8f3179/download/dicionario-de-dados-srag-hospitalizado-27.07.2020-final.pdf)

In [1]:
#hide

import os.path
import pandas as pd
import ssl
from urllib.request import urlopen
from bs4 import BeautifulSoup

# import matplotlib.pyplot as plt
import altair as alt

In [2]:
#hide

def get_last_bd_srag_csv_url(year=2021):
    """Return the URL of the SRAG csv file published for ``year``.

    The OpenDataSUS dataset page for the year is scraped and the last link
    whose extension is ``.csv`` is returned. When no such link is found, a
    hard-coded fallback URL is returned instead.

    Parameters
    ----------
    year : int
        Dataset year; only 2020 and 2021 are accepted.

    Returns
    -------
    str or None
        The csv URL, or ``None`` (after printing a message) when ``year`` is
        not one of the available years.
    """
    available_years = (2020,2021)
    if year not in available_years:
        print('year not valid. Available years:',available_years)
        return

    # Fallback: if no csv link is found on the page, this last known URL is returned.
    srag_url = f'https://s3-sa-east-1.amazonaws.com/ckan.saude.gov.br/SRAG/{year}/INFLUD-29-03-2021.csv'

    # NOTE(review): certificate verification is disabled to avoid an SSL error;
    # this also removes protection against tampered responses.
    context = ssl._create_unverified_context()
    bd_srag_url = f'https://opendatasus.saude.gov.br/dataset/bd-srag-{year}'
    # Parse inside the ``with`` block so the HTTP response is always closed.
    with urlopen(bd_srag_url, context=context) as html_page:
        soup = BeautifulSoup(html_page, features="lxml")

    # ``href=True`` skips anchors without an href attribute, which would
    # otherwise make os.path.splitext fail on None.
    for link in soup.find_all('a', href=True):
        url = link['href']
        (filename, ext) = os.path.splitext(url)
        if ext.lower() == '.csv':
            srag_url = url
            print(f'\nCsv file found at <{bd_srag_url}>')

    return srag_url



def get_srag_data(years=[2021],update=True,save_local=True,treat=True,selected_columns='BASIC',aditional_columns=[]):
    """Load the SRAG data for the given years into a single DataFrame.

    For each year the local copy ``data/opendatasus/INFLUD{year}.csv`` is read
    when it exists and ``update`` is False; otherwise the csv is downloaded
    from the URL returned by ``get_last_bd_srag_csv_url`` and, if
    ``save_local`` is True, written to that local path.

    Parameters
    ----------
    years : iterable of int
        Years to load; one frame per year is concatenated in this order.
    update : bool
        Download even when a local copy exists.
    save_local : bool
        Save downloaded data to the local path.
    treat : bool
        Pass the concatenated frame through ``treat_srag_data``.
    selected_columns, aditional_columns
        Forwarded to ``treat_srag_data``.

    Returns
    -------
    pandas.DataFrame
        All columns are read as ``object``; the per-year indexes are kept as
        they are, so index labels repeat across years.

    Raises
    ------
    ValueError
        When no download URL is available for a requested year.
    """
    sep = ';'
    quotechar = '"'
    frames = []
    for year in years:
        fname = f'data/opendatasus/INFLUD{year}.csv'
        if os.path.isfile(fname) and not update:
            print(f'\nReading OpenDataSus from local file <{fname}>. If you prefer to download last version, set "update=True".\n')
            # The local copy was written by to_csv with the default separator.
            df = pd.read_csv(fname,dtype=object)
        else:
            url = get_last_bd_srag_csv_url(year)
            if url is None:
                # Fail with a clear message instead of handing None to read_csv.
                raise ValueError(f'No SRAG csv url available for year {year!r}.')
            print(f'\nDownloading from <{url}> ... ', end='')
            df = pd.read_csv(url,sep=sep,quotechar=quotechar,dtype=object)
            if save_local:
                # The target folder may not exist on a fresh checkout.
                os.makedirs(os.path.dirname(fname), exist_ok=True)
                df.to_csv(fname,index=False)
            print('complete!\n')
        frames.append(df)

    df = pd.concat(frames)
    if treat:
        df = treat_srag_data(df,selected_columns,aditional_columns)

    return df

def get_cities_states_dictionaries():
    '''Build the lookup tables used to translate IBGE codes into names.

    Reads the municipalities spreadsheet under ``data/IBGE`` and returns a tuple:
    1. cities_dict - first 6 characters of the full city code -> city name
    2. states_dict - state code (column ``UF``) -> state name
    '''
    ibge = pd.read_excel('data/IBGE/RELATORIO_DTB_BRASIL_MUNICIPIO.ods', dtype=object)

    # The lookups are keyed on the first 6 characters of the full code.
    ibge['cod_municipio'] = ibge['Código Município Completo'].str[:6]
    city_names = ibge.set_index('cod_municipio')['Nome_Município']

    # First non-null state name found for each state code.
    state_names = ibge[['UF', 'Nome_UF']].groupby('UF').first()['Nome_UF']

    return city_names.to_dict(), state_names.to_dict()

def treat_srag_data(df_orig,selected_columns='BASIC',aditional_columns=[]):
    """Select columns, set types and replace coded values by readable labels.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Raw SRAG frame with every column stored as strings (``object``). It is
        not modified.
    selected_columns : str
        ``'ALL'`` keeps every column, ``'BASIC'`` keeps the basic set listed
        below, any other value keeps the basic set plus ``aditional_columns``.
    aditional_columns : iterable of str
        Extra columns to keep when ``selected_columns`` is neither ``'ALL'``
        nor ``'BASIC'``.

    Returns
    -------
    pandas.DataFrame
        A new frame where:
        * the four date columns are parsed (day first, invalid -> NaT);
        * ``SEM_PRI`` and ``NU_IDADE_N`` are numeric;
        * ``idade_anos`` holds the age in years, computed from the birth date
          and, where that is not possible, from ``NU_IDADE_N``/``TP_IDADE``;
        * ``SEM_PRI_ABS`` is ``SEM_PRI`` shifted by 53 for rows whose first
          symptoms are on or after 2021-01-03;
        * ``MUN_*``, ``UF_*`` and ``REGIAO_*`` name columns are derived from
          each city code column;
        * ``EVOLUCAO``, ``CLASSI_FIN`` and ``CS_RACA`` codes are replaced by
          labels; unknown or missing values become ``'nd'``.
    """
    date_cols = ['DT_SIN_PRI','DT_EVOLUCA','DT_NASC','DT_ENTUTI']
    cities_cols = ['CO_MUN_RES','CO_MU_INTE','CO_MUN_NOT']
    numeric_cols = ['SEM_PRI','NU_IDADE_N']
    not_valid_col = 'nd'

    if selected_columns != 'ALL':
        basic_cols = date_cols + cities_cols + [
            'SEM_PRI', 'EVOLUCAO', 'CLASSI_FIN', 'CLASSI_OUT',
            'NU_IDADE_N', 'CS_RACA', 'CS_ESCOL_N', 'CS_SEXO',
            'UTI', 'SUPORT_VEN']
        if selected_columns == 'BASIC':
            cols = basic_cols
        else:
            # Drop repeated names so a column is never selected twice.
            cols = list(dict.fromkeys(basic_cols + list(aditional_columns)))
        # Explicit copy: the columns written below must not be a view of df_orig.
        df = df_orig[cols].copy()
    else:
        df = df_orig.copy()

    # Columns present before any derived column is added.
    df_cols = list(df.columns)

    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce',dayfirst=True)

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Age in years: first from the dates, then from the reported age.
    age_col = 'idade_anos'
    df[age_col] = (df.DT_SIN_PRI - df.DT_NASC).dt.days / 365.25
    # Positional mask and values: df and df_orig share row order, and index
    # labels may repeat when several years were concatenated.
    missing_age = df[age_col].isna().to_numpy()
    if missing_age.any() and 'TP_IDADE' in df_orig.columns:
        # Divisor that converts NU_IDADE_N to years for each TP_IDADE code.
        div_tipo = {'1':365.25,'2':12.,'3':1.}
        reported_age = pd.to_numeric(df_orig.loc[missing_age,'NU_IDADE_N'], errors='coerce')
        divisor = df_orig.loc[missing_age,'TP_IDADE'].map(div_tipo)
        # Missing ages or unknown unit codes yield NaN instead of raising.
        df.loc[missing_age,age_col] = (
            reported_age.to_numpy(dtype=float) / divisor.to_numpy(dtype=float)
        )

    # Continuous week counter: weeks on or after this date are shifted by 53.
    first_date_2021 = pd.to_datetime('2021-01-03')
    mask = (df.DT_SIN_PRI >= first_date_2021)
    df['SEM_PRI_ABS'] = df['SEM_PRI']
    df.loc[mask,'SEM_PRI_ABS'] = df.loc[mask,'SEM_PRI'] + 53

    def _decode(codes, mapping):
        # Replace codes by labels; anything not in the mapping becomes 'nd'.
        return codes.map(mapping).fillna(not_valid_col)

    cities_dict,states_dict = get_cities_states_dictionaries()
    regions_dict = {'1':'Norte',
                    '2':'Nordeste',
                    '3':'Sudeste',
                    '4':'Sul',
                    '5':'Centro-Oeste' }

    for col in cities_cols:
        # 'CO_MUN_RES' -> suffix '_RES'; 'CO_MU_INTE' -> suffix '_INTE'.
        sufix_index = col.index('_',3)
        sufix = col[sufix_index:]
        df['MUN' + sufix] = _decode(df[col], cities_dict)
        # First two characters of the city code select the state, the first one the region.
        df['UF' + sufix] = _decode(df[col].str[:2], states_dict)
        df['REGIAO' + sufix] = _decode(df[col].str[0], regions_dict)

    evolucao_dict = {'1':'cura',
                     '2':'obito',
                     '3':'obito_outras_causas',
                     '9':'ignorado' }
    # CLASSI_FIN is the final case classification, not the outcome, so it
    # needs its own labels (codes as in the SRAG data dictionary -- confirm
    # against the dictionary version in use).
    classi_fin_dict = {'1':'influenza',
                       '2':'outro_virus_respiratorio',
                       '3':'outro_agente_etiologico',
                       '4':'nao_especificado',
                       '5':'covid19' }
    raca_dict = {'1':'branca',
                 '2':'preta',
                 '3':'amarela',
                 '4':'parda',
                 '5':'indigena',
                 '9':'ignorado' }

    df['EVOLUCAO'] = _decode(df['EVOLUCAO'], evolucao_dict)
    df['CLASSI_FIN'] = _decode(df['CLASSI_FIN'], classi_fin_dict)
    df['CS_RACA'] = _decode(df['CS_RACA'], raca_dict)

    dict_cols = ['EVOLUCAO','CLASSI_FIN','CS_RACA']

    # Remaining original columns keep their codes; only missing values are labelled.
    already_treated = set(date_cols) | set(numeric_cols) | set(cities_cols) | set(dict_cols)
    other_cols = [col for col in df_cols if col not in already_treated]
    if other_cols:
        df[other_cols] = df[other_cols].fillna(not_valid_col)

    return df

def get_pivot_data(df,index_cols=[],columns_cols=[],values_cols='',last_week=999):
    """Count rows per group and spread one grouping level into columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows.
    index_cols : list of str
        Columns that identify each output row.
    columns_cols : list of str
        Columns whose values become the output columns.
    values_cols : str
        Column whose non-null values are counted.
    last_week : int
        Accepted for compatibility with existing callers; not used here.

    Returns
    -------
    pandas.DataFrame
        One row per combination of ``index_cols`` with one count column per
        value of ``columns_cols`` (missing combinations are 0) and a ``total``
        column with the row sum. When exactly two index columns are given,
        extra rows labelled ``'--TODOS--'`` in the second index column hold
        the sums over it for each value of the first index column.
    """
    index_cols = list(index_cols)
    columns_cols = list(columns_cols)

    counts = df.groupby(by=index_cols + columns_cols)[values_cols].count().reset_index()
    pivoted = counts.pivot(index=index_cols,columns=columns_cols,values=values_cols).fillna(0)

    # Only these columns hold counts; the sums below must not touch the
    # category column that reset_index() brings back as regular data.
    count_cols = list(pivoted.columns)

    if len(index_cols) == 2:
        index_col_1, index_col_2 = index_cols
        detail = pivoted.reset_index().set_index(index_col_1)
        overall = detail.groupby(level=0)[count_cols].sum()
        overall[index_col_2] = '--TODOS--'
        pivoted = pd.concat([detail,overall])

    pivoted['total'] = pivoted[count_cols].sum(axis=1)
    return pivoted.reset_index()


def get_weekly_data(df,x_col='SEM_PRI_ABS',y_col='tx_obito',cat_col='UF_RES',last_week=999,ns_opacity=0.10):
    """Return the weekly outcome counts and death rate per category.

    Rows are grouped by ``SEM_PRI_ABS`` and ``cat_col``, counted per
    ``EVOLUCAO`` value through ``get_pivot_data``, and a ``tx_obito`` column
    (``obito`` / ``total``) is added.

    ``x_col``, ``y_col`` and ``ns_opacity`` are accepted so the signature
    mirrors ``show_alt_plot``; they are not used here. ``last_week`` is only
    forwarded to ``get_pivot_data``.
    """
    index_cols = ['SEM_PRI_ABS',cat_col]
    columns_cols = ['EVOLUCAO']
    values_cols = 'DT_SIN_PRI'

    df = get_pivot_data(df,index_cols,columns_cols,values_cols,last_week)
    df['tx_obito'] = df['obito'] / df['total']
    # The computed table was previously discarded; hand it back to the caller.
    return df
    


def show_alt_plot(df,x_col='SEM_PRI_ABS',y_col='tx_obito',cat_col='UF_RES',last_week=999,ns_opacity=0.10):
    """Pivot the outcome counts per week and category and return the chart.

    The data is grouped by ``SEM_PRI_ABS`` and ``cat_col`` and counted per
    ``EVOLUCAO`` value. The ``tx_obito`` column (``obito`` / ``total``) is only
    computed when it is the requested ``y_col``.
    """
    pivoted = get_pivot_data(
        df,
        ['SEM_PRI_ABS', cat_col],
        ['EVOLUCAO'],
        'DT_SIN_PRI',
        last_week,
    )

    if y_col == 'tx_obito':
        pivoted['tx_obito'] = pivoted['obito'] / pivoted['total']

    return get_altair_chart(pivoted, x_col, y_col, cat_col, ns_opacity)
    

def get_altair_chart(df,x_col,y_col,cat_col,ns_opacity=0.10):
    """Line chart of ``y_col`` over ``x_col`` with one line per ``cat_col`` value.

    A dropdown bound to ``cat_col`` picks one category; its line is drawn fully
    opaque and every other line with opacity ``ns_opacity``. The first category
    in sorted order is selected initially.
    """
    options_list = sorted(df[cat_col].unique().tolist())
    first_option = options_list[0]

    selection = alt.selection_single(
        name='Selecione',
        fields=[cat_col],
        init={cat_col: first_option},
        bind={cat_col: alt.binding_select(options=options_list)},
    )

    lines = alt.Chart(df).mark_line(point=True, strokeWidth=2).add_selection(selection)
    encoded = lines.encode(
        x=x_col,
        y=y_col,
        color=f'{cat_col}:N',
        tooltip=list(df.columns),
        # Selected category fully visible, the rest faded.
        opacity=alt.condition(selection, alt.value(1.0), alt.value(ns_opacity)),
    )
    return encoded.configure_point(size=20).properties(width=800, height=500)

def get_altair_chart_2_axis(df,x_col,y_col_1,y_col_2,cat_col,ns_opacity=0.10):
    """Layered chart with two independent y axes for a single selected category.

    A dropdown bound to ``cat_col`` filters both lines to the selected
    category: ``y_col_1`` is drawn against the first y axis and ``y_col_2``
    against the second. ``ns_opacity`` is accepted but not used.
    """
    options_list = sorted(df[cat_col].unique().tolist())
    first_option = options_list[0]

    selection = alt.selection_single(
        name='Selecione',
        fields=[cat_col],
        init={cat_col: first_option},
        bind={cat_col: alt.binding_select(options=options_list)},
    )

    tooltip_fields = list(df.columns)
    week_axis = alt.X(x_col, axis=alt.Axis(title='Semana Primeiros sintomas'))
    first_y = alt.Y(y_col_1, axis=alt.Axis(title='Taxa de óbito', titleColor='#57A44C'))
    second_y = alt.Y(y_col_2, axis=alt.Axis(title='Total de casos', titleColor='#5276A7'))

    base = alt.Chart(df).encode(week_axis)

    # Only the first layer registers the selection; both layers filter on it.
    line1 = (
        base.mark_line(stroke='#57A44C', interpolate='monotone')
        .add_selection(selection)
        .encode(first_y, color=cat_col, tooltip=tooltip_fields)
        .transform_filter(selection)
    )
    line2 = (
        base.mark_line(stroke='#5276A7', interpolate='monotone')
        .encode(second_y, color=cat_col, tooltip=tooltip_fields)
        .transform_filter(selection)
    )

    layered = alt.layer(line1, line2).resolve_scale(y='independent')
    return layered.properties(width=800, height=500)

def select_items(df,selection_dict):
    """Return a filtered copy of ``df``.

    Each ``column: condition`` pair of ``selection_dict`` is joined into the
    query expression ``f'{column}{condition}'`` and applied in order, so the
    condition must carry its own operator (e.g. ``' >= 60'``).
    """
    selected = df.copy()
    expressions = (f'{column}{value}' for column, value in selection_dict.items())
    for expression in expressions:
        selected = selected.query(expression)
    return selected

def get_outcome_data(df,index_cols):
    """Count rows per ``EVOLUCAO`` value for each ``index_cols`` group.

    Returns the table produced by ``get_pivot_data`` with an added
    ``tx_obito`` column equal to ``obito`` / ``total``.
    """
    outcomes = get_pivot_data(df, index_cols, ['EVOLUCAO'], 'DT_SIN_PRI')
    outcomes['tx_obito'] = outcomes['obito'].div(outcomes['total'])
    return outcomes

In [3]:
#hide

# Load the raw 2020 and 2021 frames. With update=False the local csv copies are
# reused when they exist; treat=False skips treat_srag_data so every original
# column is kept.
df_original = get_srag_data(years=[2020,2021],update=False,treat=False,save_local=True)


Reading OpenDataSus from local file <data/opendatasus/INFLUD2020.csv>. If you prefer to download last version, set "update=True".


Reading OpenDataSus from local file <data/opendatasus/INFLUD2021.csv>. If you prefer to download last version, set "update=True".



In [4]:
#hide

# Sanity check of the raw frame. The expected sizes are hard-coded for the
# snapshot this notebook was last run on (see the printed shape) and need
# updating when the published files change.
print(df_original.shape)
assert df_original.shape[0]>=1608416
assert df_original.shape[1]==154

(1608416, 154)


In [5]:
#hide

# Build the analysis frame with the default 'BASIC' column selection.
df_srag = treat_srag_data(df_original)

In [6]:
#hide

# The treatment must keep every row. 28 columns = 17 basic columns
# + idade_anos + SEM_PRI_ABS + 9 derived MUN_/UF_/REGIAO_ name columns.
print(df_srag.shape)
assert df_srag.shape[0]==df_original.shape[0]
assert df_srag.shape[1]==28

(1608416, 28)


In [7]:
#collapse_output

# Show the column types of the treated frame.
df_srag.dtypes

DT_SIN_PRI     datetime64[ns]
DT_EVOLUCA     datetime64[ns]
DT_NASC        datetime64[ns]
DT_ENTUTI      datetime64[ns]
CO_MUN_RES             object
CO_MU_INTE             object
CO_MUN_NOT             object
SEM_PRI                 int64
EVOLUCAO               object
CLASSI_FIN             object
CLASSI_OUT             object
NU_IDADE_N              int64
CS_RACA                object
CS_ESCOL_N             object
CS_SEXO                object
UTI                    object
SUPORT_VEN             object
idade_anos            float64
SEM_PRI_ABS             int64
MUN_RES                object
UF_RES                 object
REGIAO_RES             object
MUN_INTE               object
UF_INTE                object
REGIAO_INTE            object
MUN_NOT                object
UF_NOT                 object
REGIAO_NOT             object
dtype: object

## Análises semanais

In [8]:
# Upper bound applied to SEM_PRI_ABS by the query filters in the cells below;
# a value this large presumably keeps every week -- lower it to cut the series.
last_week = 999
# First grouping column of every weekly analysis below.
cat1 = 'SEM_PRI_ABS'

In [9]:
#collapse

# Weekly death rate grouped by state of residence.
cat2 = 'UF_RES'
index_cols = [cat1, cat2]

# Drop rows whose state code could not be resolved ('nd').
# Alternative filters kept for reference:
#   {'EVOLUCAO':"== ['cura','obito']", cat2:'!= "nd"'}
#   {'CS_ESCOL_N':"== ['1','2','3','4']",'idade_anos':' >= 60','EVOLUCAO':"== ['cura','obito']"}
selection_dict = {cat2: '!= "nd"'}

df_sel = select_items(df_srag, selection_dict)
print(f'\n Número de casos selecionados: {df_sel.shape[0]}\n')

df = get_outcome_data(df_sel, index_cols).query('SEM_PRI_ABS <= @last_week')
get_altair_chart(df, x_col=cat1, y_col='tx_obito', cat_col=cat2, ns_opacity=0.10)


 Número de casos selecionados: 1608269



In [10]:
#collapse

# Weekly death rate grouped by schooling level.
cat2 = 'CS_ESCOL_N'
index_cols = [cat1, cat2]

# Schooling codes '1'-'4' only, age of 30 years or more.
# Optional extra filter: 'EVOLUCAO':"== ['cura','obito']"
selection_dict = {
    'CS_ESCOL_N': "== ['1','2','3','4']",
    'idade_anos': ' >= 30',
}

df_sel = select_items(df_srag, selection_dict)
print(f'\n Número de casos selecionados: {df_sel.shape[0]}\n')

df = get_outcome_data(df_sel, index_cols).query('SEM_PRI_ABS <= @last_week')
get_altair_chart(df, x_col=cat1, y_col='tx_obito', cat_col=cat2, ns_opacity=0.30)


 Número de casos selecionados: 453078



In [11]:
#collapse

# Weekly death rate grouped by race/colour.
cat2 = 'CS_RACA'
index_cols = [cat1, cat2]

# Exclude the listed categories and keep ages up to 60 years.
# Optional extra filter: 'EVOLUCAO':"== ['cura','obito']"
selection_dict = {
    'CS_RACA': "!= ['ignorado','nd','indigena','amarela']",
    'idade_anos': ' <= 60',
}

df_sel = select_items(df_srag, selection_dict)
print(f'\n Número de casos selecionados: {df_sel.shape[0]}\n')

df = get_outcome_data(df_sel, index_cols).query('SEM_PRI_ABS <= @last_week')
get_altair_chart(df, x_col=cat1, y_col='tx_obito', cat_col=cat2, ns_opacity=0.30)


 Número de casos selecionados: 614338



In [12]:
#collapse

# Weekly death rate grouped by sex.
cat2 = 'CS_SEXO'
index_cols = [cat1, cat2]

# Exclude the 'I' category; the age bound also drops rows without a computed age.
# Optional extra filter: 'EVOLUCAO':"== ['cura','obito']"
selection_dict = {
    'CS_SEXO': "!= ['I']",
    'idade_anos': ' <= 150',
}

df_sel = select_items(df_srag, selection_dict)
print(f'\n Número de casos selecionados: {df_sel.shape[0]}\n')

df = get_outcome_data(df_sel, index_cols).query('SEM_PRI_ABS <= @last_week')
get_altair_chart(df, x_col=cat1, y_col='tx_obito', cat_col=cat2, ns_opacity=0.30)


 Número de casos selecionados: 1607894



In [13]:
#collapse

# Death rate and total cases per week for one state at a time (two y axes).
cat2 = 'UF_RES'
index_cols = [cat1, cat2]

# Resolved states only, with a computed age and a known cure/death outcome.
selection_dict = {
    'UF_RES': "!= ['nd']",
    'idade_anos': ' <= 150',
    'EVOLUCAO': "== ['cura','obito']",
}

df_sel = select_items(df_srag, selection_dict)
print(f'\n Número de casos selecionados: {df_sel.shape[0]}\n')

df = get_outcome_data(df_sel, index_cols).query('SEM_PRI_ABS <= @last_week')
# Single-axis alternative:
# get_altair_chart(df,x_col=cat1,y_col='tx_obito',cat_col=cat2,ns_opacity=0.30)

# x_col is defined here but the call below passes cat1 (the same column name).
x_col = 'SEM_PRI_ABS'
y_col_1 = 'tx_obito'
y_col_2 = 'total'

get_altair_chart_2_axis(
    df,
    x_col=cat1,
    y_col_1=y_col_1,
    y_col_2=y_col_2,
    cat_col=cat2,
    ns_opacity=0.10,
)


 Número de casos selecionados: 1232427

