#### Important Links (for attribute and database reference):

https://datasus.saude.gov.br/mortalidade-desde-1996-pela-cid-10

https://renastonline.ensp.fiocruz.br/sites/default/files/wiki/dicionario-sim.pdf

https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral+-+Estrutura.pdf

https://svs.aids.gov.br/download/Dicionario_de_Dados_SIM_tabela_DO.pdf

http://tabnet.datasus.gov.br/cgi/sim/Mortalidade_Geral_1996_2012.pdf

http://tabnet.datasus.gov.br/cgi/sim/Consolida_Sim_2011.pdf

ICD10 suicide codes used by DATASUS:

Códigos de suicídio usados pelo DATASUS do CID10:
http://www2.datasus.gov.br/cid10/V2008/WebHelp/v01_y98.htm#Cap20Nota01

#### DATASUS DATA:
```
ftp.datasus.gov.br
    /dissemin/publicos/SIM/CID10/DORES
    /dissemin/publicos/SIM/CID10/DOCS
```
Usual year update date:

![image.jpg](../utils/infos/attributes-desc/imgs/datasus_year_update_date.jpg)

In [18]:
# Importações
from IPython.display import display

import pandas as pd

import numpy as np
import datetime

import matplotlib.pyplot as plt

# if automatic converters are necessary
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Prevent infinite warnings
import warnings
warnings.filterwarnings('ignore')

In [19]:
# OS and File imports
import os
import sys

import importlib

from zipfile import ZipFile
from io import BytesIO

# List of directories to add to sys.path
directories_to_add = [
    os.path.abspath(os.path.join('..')),
    os.path.abspath(os.path.join('../utils')),
    os.path.abspath(os.path.join('../utils', 'functions')),
]

for directory in directories_to_add:
    if directory not in sys.path:
        sys.path.insert(0, directory)
print(sys.path)

['/home/user/coding_env/BrSuicides-dataset/utils/functions', '/home/user/coding_env/BrSuicides-dataset/utils', '/home/user/coding_env/BrSuicides-dataset', '/home/user/coding_env/BrSuicides-dataset/data_creation', '/mnt/d/.MESTRADO/.Orientacao/br-suicides-ts-analysis', '/mnt/d/.MESTRADO/.Orientacao/br-suicides-ts-analysis/data_analysis/utils', '/usr/local/lib/python310.zip', '/usr/local/lib/python3.10', '/usr/local/lib/python3.10/lib-dynload', '', '/home/user/coding_env/venvpysus/lib/python3.10/site-packages']


In [20]:
# Making sure pycache is remade on kernel run since files are tiny
import color_palettes
importlib.reload(color_palettes)
import attributes_values_table_generator
importlib.reload(attributes_values_table_generator)

from color_palettes import my_blue, my_orange
from preprocessing import initial_parse_dates
from attributes_values_table_generator import get_values_table_columns, get_attributes_values_table, get_single_attribute_values, update_attribute_values

In [21]:
# ---------------- USER SET FOLDER PATH ----------------
user_dir_path = ''

root_dir = '..'

final_dataset_dir = '/BrSuicides/'
final_dataset_name = 'BrSuicides'

csv_dir = '/data_storage/'
# csv_dir_datasus = csv_dir + '0_datasus_csvs/'
# csv_dir_dirty = csv_dir + '1_dirty/'
csv_dir_filtered = csv_dir + '2_filtered/'
csv_dir_final = final_dataset_dir + '/'

csv_data_dir = os.path.dirname(root_dir + csv_dir) + '/'
print(csv_data_dir)

# datasus_data_dir = os.path.dirname(root_dir + csv_dir_datasus) + '/'
# print(datasus_data_dir)
# csv_data_dir_dirty = os.path.dirname(root_dir + csv_dir_dirty) + '/'
# print(csv_data_dir_dirty)
csv_data_dir_filtered = os.path.dirname(root_dir + csv_dir_filtered) + '/'
print(csv_data_dir_filtered)
csv_data_dir_final = os.path.dirname(root_dir + csv_dir_final) + '/'
print(csv_data_dir_final)

../data_storage/
../data_storage/2_filtered/
../BrSuicides/


In [22]:
# List all files in the csvs directory
all_files = os.listdir(csv_data_dir_final)

# Filter for files that end with .zip extension
csv_files = [file for file in all_files if file.endswith('.csv')]
print('CSVs: ', '\n', csv_files)

# Filter for files that end with .zip extension
csv_file_name = csv_files[-1]
print('CSV file name: ', '\n', csv_file_name)

years_interval = csv_file_name.rsplit('-', 1)[1][0:-4].split('_')
print('Years interval: ', '\n' , years_interval)

# Years that will be downloaded, [1996,2023) interval
years = [x for x in range(int(years_interval[0]), (int(years_interval[-1]) + 1))]

print('Years:', '\n', years)

CSVs:  
 ['BrSuicides-1996_2022.csv', 'BrSuicides-1996_2023.csv', 'BrSuicides-1996_2024.csv']
CSV file name:  
 BrSuicides-1996_2024.csv
Years interval:  
 ['1996', '2024']
Years: 
 [1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]


### Dataframe:

In [23]:
dataframe = pd.read_csv(csv_data_dir_final + csv_file_name, encoding='utf-8', parse_dates=initial_parse_dates)
display(dataframe.info())
display(dataframe)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299883 entries, 0 to 299882
Data columns (total 17 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ESTADO      299883 non-null  object        
 1   ANO         299883 non-null  int64         
 2   DTOBITO     299883 non-null  datetime64[ns]
 3   NATURAL     254438 non-null  float64       
 4   DTNASC      298631 non-null  object        
 5   IDADE       299883 non-null  float64       
 6   SEXO        299880 non-null  object        
 7   RACACOR     278664 non-null  object        
 8   ESTCIV      291240 non-null  object        
 9   ESC         258235 non-null  object        
 10  OCUP        245283 non-null  object        
 11  CODMUNRES   299883 non-null  object        
 12  LOCOCOR     299507 non-null  object        
 13  CODMUNOCOR  299883 non-null  object        
 14  CAUSABAS    299883 non-null  object        
 15  ESC2010     159836 non-null  object        
 16  ES

None

Unnamed: 0,ESTADO,ANO,DTOBITO,NATURAL,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,OCUP,CODMUNRES,LOCOCOR,CODMUNOCOR,CAUSABAS,ESC2010,ESCFALAGR1
0,PR,1996,1996-10-24,841.0,1963-08-05,33.0,masculino,,Solteiro,Nenhuma,61200.0,Pinhal de São Bento,outros,Pinhal de São Bento,X709,,
1,PR,1996,1996-02-04,152.0,1920-09-21,75.0,masculino,,Casado,Nenhuma,62100.0,Assaí,domicílio,Assaí,X680,,
2,PR,1996,1996-01-17,841.0,1961-08-28,34.0,masculino,,Solteiro,,62100.0,Abatiá,hospital,Abatiá,X680,,
3,PR,1996,1996-02-07,841.0,1977-09-24,18.0,masculino,,Solteiro,Nenhuma,62100.0,Santa Cecília do Pavão,hospital,São Sebastião da Amoreira,X680,,
4,PR,1996,1996-02-09,841.0,1956-08-30,39.0,masculino,,União consensual,,62100.0,Santa Mariana,domicílio,Santa Mariana,X700,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299878,MT,2024,2024-12-30,851.0,1984-01-06,40.0,masculino,Branca,Casado,8 a 11 anos,CABO DA POLICIA MILITAR,Tangará da Serra,domicílio,Tangará da Serra,X740,8 a 11 anos,8 a 11 anos
299879,MT,2024,2024-12-30,821.0,2002-07-31,22.0,masculino,Parda,União consensual,4 a 7 anos,,Serra Nova Dourada,domicílio,Água Boa,X700,4 a 7 anos,4 a 7 anos
299880,MT,2024,2024-12-31,850.0,1969-10-08,55.0,masculino,Parda,União consensual,4 a 7 anos,MOTORISTA DE CAMINHAO (ROTAS REGIONAIS E INTER...,Alto Araguaia,hospital,Rondonópolis,X690,1 a 3 anos,4 a 7 anos
299881,MT,2024,2024-12-31,851.0,1994-11-05,30.0,masculino,Parda,Solteiro,Ignorado,TRABALHADOR DA MANUTENCAO DE EDIFICACOES,Araputanga,domicílio,Araputanga,X700,Ignorado,Ignorado


In [24]:
dataframe_columns = list(dataframe.columns)

print('Dataframe columns:\n', dataframe_columns)
print('\nNº of columns in dataframe: ', len(dataframe_columns))

Dataframe columns:
 ['ESTADO', 'ANO', 'DTOBITO', 'NATURAL', 'DTNASC', 'IDADE', 'SEXO', 'RACACOR', 'ESTCIV', 'ESC', 'OCUP', 'CODMUNRES', 'LOCOCOR', 'CODMUNOCOR', 'CAUSABAS', 'ESC2010', 'ESCFALAGR1']

Nº of columns in dataframe:  17


## Resulting Changes:

### Alterações realizadas por atributo:

In [25]:
dir_to_attributes_info_table = '../brsuicides_info_tables'

file_preprocessing_types = dir_to_attributes_info_table + '/attributes_preprocessing_type.csv'
preprocessing_types = pd.read_csv(file_preprocessing_types)
display(preprocessing_types)

Unnamed: 0,Attribute,Conversion Type,Null value,Remove Missing/Invalid?
0,DTOBITO,ignore,,True
1,ESTADO,ignore,,True
2,ANO,ignore,,True
3,CAUSABAS,ignore,,True
4,SEXO,categorical,ignorado,False
5,RACACOR,categorical,ignorado,False
6,ESTCIV,categorical,ignorado,False
7,ESC,categorical,ignorado,False
8,ESC2010,categorical,ignorado,False
9,ESCFALAGR1,categorical,ignorado,False


In [26]:
# Filter for files that end with .zip extension
original_csv_file_name = [file for file in os.listdir(csv_data_dir_filtered) if file.endswith('.csv')][0]
print('CSV file name: ', '\n', original_csv_file_name)

CSV file name:  
 brazil-suicides-1996_2024.csv


### Quantidade de valores por atributo

In [27]:
original_dataframe_length = len(pd.read_csv(csv_data_dir_filtered + original_csv_file_name, encoding='utf-8', parse_dates=initial_parse_dates))
print(f'Original dataframe length: {original_dataframe_length}')
current_dataframe_length = len(dataframe)
print(f'Current dataframe length: {current_dataframe_length}')

Original dataframe length: 301729
Current dataframe length: 299883


In [28]:
# Hardcoding DTOBITO update because its values are set in the filtering step, 
# an exception to the rest of the attributes
dtobito_values = get_single_attribute_values(dir_to_attributes_info_table, 'DTOBITO')
print(dtobito_values)
dtobito_values[-2] = current_dataframe_length - dtobito_values[-1]
print(dtobito_values)

update_attribute_values(dir_to_attributes_info_table, dtobito_values)

['DTOBITO', 0, 52, 301677, 266175, 52]
['DTOBITO', 0, 52, 301677, 299831, 52]
Updated values for 'DTOBITO' in the datatable.


In [29]:
df_lens = {'original': original_dataframe_length, 'current': current_dataframe_length}

final_attributes_values_table = get_attributes_values_table(dir_to_attributes_info_table, percentage=False)
final_attributes_values_table_percentages = get_attributes_values_table(dir_to_attributes_info_table, percentage=True, dataframe_lens=df_lens)

Cols: (['Faltantes/Nulos', 'Inválidos', 'Válidos'], ['Tratados', 'Removidos'])


## How to interpret this table:

##### The first three columns are the types of data detected before any type of manipulation, while the last column represents how many rows were removed because of the detection of missing/null or invalid data. They don't represent the final volume of each attribute data. For example, "SEXO" is an attribute that has 4 missing/null values and 51 "ignorado" values, but 0 missing/null values and only 28 "ignorado" remain.

In [30]:
final_attributes_values_table

Unnamed: 0,Atributo,Faltantes/Nulos,Inválidos,Válidos,Tratados,Removidos
0,DTOBITO,0,52,301677,299831,52
1,ESTADO,0,0,301729,299883,0
2,ANO,0,0,301729,299883,0
3,CAUSABAS,0,0,301729,299883,0
4,DTNASC,0,2187,299542,299883,0
5,IDADE,157,656,300916,299070,813
6,OCUP,55522,66816,179391,299883,0
7,CODMUNRES,0,1299,300430,298584,1299
8,CODMUNOCOR,0,492,301237,299391,492
9,NATURAL,46055,0,255674,299883,0


In [31]:
final_attributes_values_table_percentages

Unnamed: 0,Atributo,Faltantes/Nulos,Inválidos,Válidos,Tratados,Removidos
0,DTOBITO,0.0,0.02,99.98,99.98,0.02
1,ESTADO,0.0,0.0,100.0,100.0,0.0
2,ANO,0.0,0.0,100.0,100.0,0.0
3,CAUSABAS,0.0,0.0,100.0,100.0,0.0
4,DTNASC,0.0,0.72,99.28,100.0,0.0
5,IDADE,0.05,0.22,99.73,99.73,0.27
6,OCUP,18.4,22.14,59.45,100.0,0.0
7,CODMUNRES,0.0,0.43,99.57,99.57,0.43
8,CODMUNOCOR,0.0,0.16,99.84,99.84,0.16
9,NATURAL,15.26,0.0,84.74,100.0,0.0


In [32]:
def generate_latex_table(attributes_info_values_table):
    # Set the first column as the index to ignore the DataFrame index
    attributes_info_values_table = attributes_info_values_table.set_index(attributes_info_values_table.columns[0])

    # Use the DataFrame.style.to_latex() to generate the LaTeX string
    latex_table = attributes_info_values_table.style.format(precision=2).to_latex(
        column_format="|l|" + "r|" * (len(attributes_info_values_table.columns))
    )
    
    # Replace \toprule, \midrule, and \bottomrule with \hline
    latex_table = latex_table.replace("\\toprule", "\\hline").replace("\\midrule", "\\hline").replace("\\bottomrule", "\\hline")

    # Add \hline after each row
    latex_table = latex_table.replace("\\\\\n", "\\\\ \\hline\n")

    # Return the LaTeX table as a raw string
    return latex_table

In [33]:
latex_info_table = generate_latex_table(final_attributes_values_table_percentages)
print(latex_info_table)

\begin{tabular}{|l|r|r|r|r|r|}
 & Faltantes/Nulos & Inválidos & Válidos & Tratados & Removidos \\ \hline
Atributo &  &  &  &  &  \\ \hline
DTOBITO & 0.00 & 0.02 & 99.98 & 99.98 & 0.02 \\ \hline
ESTADO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
ANO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
CAUSABAS & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
DTNASC & 0.00 & 0.72 & 99.28 & 100.00 & 0.00 \\ \hline
IDADE & 0.05 & 0.22 & 99.73 & 99.73 & 0.27 \\ \hline
OCUP & 18.40 & 22.14 & 59.45 & 100.00 & 0.00 \\ \hline
CODMUNRES & 0.00 & 0.43 & 99.57 & 99.57 & 0.43 \\ \hline
CODMUNOCOR & 0.00 & 0.16 & 99.84 & 99.84 & 0.16 \\ \hline
NATURAL & 15.26 & 0.00 & 84.74 & 100.00 & 0.00 \\ \hline
ESC & 14.08 & 18.13 & 67.79 & 100.00 & 0.00 \\ \hline
ESC2010 & 46.90 & 8.58 & 44.52 & 100.00 & 0.00 \\ \hline
ESCFALAGR1 & 52.91 & 7.46 & 39.63 & 100.00 & 0.00 \\ \hline
ESTCIV & 3.02 & 4.48 & 92.51 & 100.00 & 0.00 \\ \hline
LOCOCOR & 0.13 & 0.74 & 99.13 & 100.00 & 0.00 \\ \hline
RACACOR & 7.23 & 0.00 

In [None]:
'''
\begin{tabular}{|l|r|r|r|r|r|}
\hline
Atributo & Faltantes & Nulos/Inválidos & Válidos & Tratados & Removidos \\ \hline
DTOBITO & 0.00 & 0.02 & 99.98 & 99.98 & 0.02 \\ \hline
ESTADO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
ANO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
CAUSABAS & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
DTNASC & 0.00 & 0.79 & 99.21 & 100.00 & 0.00 \\ \hline
IDADE & 0.06 & 0.23 & 99.71 & 99.71 & 0.29 \\ \hline
OCUP & 18.67 & 24.93 & 56.39 & 100.00 & 0.00 \\ \hline
CODMUNRES & 0.00 & 0.46 & 99.54 & 99.54 & 0.46 \\ \hline
CODMUNOCOR & 0.00 & 0.18 & 99.82 & 99.82 & 0.18 \\ \hline
NATURAL & 16.82 & 0.00 & 83.18 & 100.00 & 0.00 \\ \hline
ESC & 15.35 & 18.79 & 65.86 & 100.00 & 0.00 \\ \hline
ESC2010 & 52.22 & 8.05 & 39.73 & 100.00 & 0.00 \\ \hline
ESCFALAGR1 & 58.99 & 6.78 & 34.22 & 100.00 & 0.00 \\ \hline
ESTCIV & 3.12 & 4.37 & 92.51 & 100.00 & 0.00 \\ \hline
LOCOCOR & 0.15 & 0.81 & 99.03 & 100.00 & 0.00 \\ \hline
RACACOR & 8.04 & 0.00 & 91.96 & 100.00 & 0.00 \\ \hline
SEXO & 0.00 & 0.02 & 99.98 & 100.00 & 0.00 \\ \hline
\end{tabular}
'''

'\n\x08egin{tabular}{|l|r|r|r|r|r|}\n\\hline\nAtributo & Faltantes & Nulos/Inválidos & Válidos & Tratados & Removidos \\ \\hline\nDTOBITO & 0.00 & 0.02 & 99.98 & 99.98 & 0.02 \\ \\hline\nESTADO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \\hline\nANO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \\hline\nCAUSABAS & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \\hline\nDTNASC & 0.00 & 0.79 & 99.21 & 100.00 & 0.00 \\ \\hline\nIDADE & 0.06 & 0.23 & 99.71 & 99.71 & 0.29 \\ \\hline\nOCUP & 18.67 & 24.93 & 56.39 & 100.00 & 0.00 \\ \\hline\nCODMUNRES & 0.00 & 0.46 & 99.54 & 99.54 & 0.46 \\ \\hline\nCODMUNOCOR & 0.00 & 0.18 & 99.82 & 99.82 & 0.18 \\ \\hline\nNATURAL & 16.82 & 0.00 & 83.18 & 100.00 & 0.00 \\ \\hline\nESC & 15.35 & 18.79 & 65.86 & 100.00 & 0.00 \\ \\hline\nESC2010 & 52.22 & 8.05 & 39.73 & 100.00 & 0.00 \\ \\hline\nESCFALAGR1 & 58.99 & 6.78 & 34.22 & 100.00 & 0.00 \\ \\hline\nESTCIV & 3.12 & 4.37 & 92.51 & 100.00 & 0.00 \\ \\hline\nLOCOCOR & 0.15 & 0.81 & 99.03 & 100.00 & 0.00 \\ \\hline\nR