#### Important Links (for attribute and database reference):

https://datasus.saude.gov.br/mortalidade-desde-1996-pela-cid-10

https://renastonline.ensp.fiocruz.br/sites/default/files/wiki/dicionario-sim.pdf

https://diaad.s3.sa-east-1.amazonaws.com/sim/Mortalidade_Geral+-+Estrutura.pdf

https://svs.aids.gov.br/download/Dicionario_de_Dados_SIM_tabela_DO.pdf

http://tabnet.datasus.gov.br/cgi/sim/Mortalidade_Geral_1996_2012.pdf

http://tabnet.datasus.gov.br/cgi/sim/Consolida_Sim_2011.pdf

ICD10 suicide codes used by DATASUS:

Códigos de suicídio usados pelo DATASUS do CID10:
http://www2.datasus.gov.br/cid10/V2008/WebHelp/v01_y98.htm#Cap20Nota01

#### DATASUS DATA:
```
ftp.datasus.gov.br
    /dissemin/publicos/SIM/CID10/DORES
    /dissemin/publicos/SIM/CID10/DOCS
```
Usual year update date:

![image.jpg](../utils/infos/attributes-desc/imgs/datasus_year_update_date.jpg)

In [1]:
# Importações
from IPython.display import display

import pandas as pd

import numpy as np
import datetime

import matplotlib.pyplot as plt

# if automatic converters are necessary
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Prevent infinite warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# OS and File imports
import os
import sys

import importlib

from zipfile import ZipFile
from io import BytesIO

# List of directories to add to sys.path
directories_to_add = [
    os.path.abspath(os.path.join('..')),
    os.path.abspath(os.path.join('../utils')),
    os.path.abspath(os.path.join('../utils', 'functions')),
]

for directory in directories_to_add:
    if directory not in sys.path:
        sys.path.insert(0, directory)
print(sys.path)

['d:\\.MESTRADO\\.Orientacao\\BrSuicides-dataset\\utils\\functions', 'd:\\.MESTRADO\\.Orientacao\\BrSuicides-dataset\\utils', 'd:\\.MESTRADO\\.Orientacao\\BrSuicides-dataset', 'c:\\Python312\\python312.zip', 'c:\\Python312\\DLLs', 'c:\\Python312\\Lib', 'c:\\Python312', '', 'C:\\Users\\pp0l0\\AppData\\Roaming\\Python\\Python312\\site-packages', 'C:\\Users\\pp0l0\\AppData\\Roaming\\Python\\Python312\\site-packages\\win32', 'C:\\Users\\pp0l0\\AppData\\Roaming\\Python\\Python312\\site-packages\\win32\\lib', 'C:\\Users\\pp0l0\\AppData\\Roaming\\Python\\Python312\\site-packages\\Pythonwin', 'c:\\Python312\\Lib\\site-packages']


In [3]:
# Making sure pycache is remade on kernel run since files are tiny
import color_palettes
importlib.reload(color_palettes)
import attributes_values_table_generator
importlib.reload(attributes_values_table_generator)

from color_palettes import my_blue, my_orange
from preprocessing import initial_parse_dates
from attributes_values_table_generator import get_values_table_columns, get_attributes_values_table, get_single_attribute_values, update_attribute_values

In [4]:
# ---------------- USER SET FOLDER PATH ----------------
user_dir_path = ''

root_dir = '..'

final_dataset_dir = '/BrSuicides/'
final_dataset_name = 'BrSuicides'

csv_dir = '/data_storage/'
# csv_dir_datasus = csv_dir + '0_datasus_csvs/'
# csv_dir_dirty = csv_dir + '1_dirty/'
csv_dir_filtered = csv_dir + '2_filtered/'
csv_dir_final = final_dataset_dir + '/'

csv_data_dir = os.path.dirname(root_dir + csv_dir) + '/'
print(csv_data_dir)

# datasus_data_dir = os.path.dirname(root_dir + csv_dir_datasus) + '/'
# print(datasus_data_dir)
# csv_data_dir_dirty = os.path.dirname(root_dir + csv_dir_dirty) + '/'
# print(csv_data_dir_dirty)
csv_data_dir_filtered = os.path.dirname(root_dir + csv_dir_filtered) + '/'
print(csv_data_dir_filtered)
csv_data_dir_final = os.path.dirname(root_dir + csv_dir_final) + '/'
print(csv_data_dir_final)

../data_storage/
../data_storage/2_filtered/
../BrSuicides/


In [5]:
# List all files in the csvs directory
all_files = os.listdir(csv_data_dir_final)

# Filter for files that end with .zip extension
csv_files = [file for file in all_files if file.endswith('.csv')]
print('CSVs: ', '\n', csv_files)

# Filter for files that end with .zip extension
csv_file_name = csv_files[0]
print('CSV file name: ', '\n', csv_file_name)

years_interval = csv_file_name.rsplit('-', 1)[1][0:-4].split('_')
print('Years interval: ', '\n' , years_interval)

# Years that will be downloaded, [1996,2023) interval
years = [x for x in range(int(years_interval[0]), (int(years_interval[-1]) + 1))]

print('Years:', '\n', years)

CSVs:  
 ['BrSuicides-1996_2022.csv', 'BrSuicides-1996_2023.csv']
CSV file name:  
 BrSuicides-1996_2022.csv
Years interval:  
 ['1996', '2022']
Years: 
 [1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]


### Dataframe:

In [6]:
dataframe = pd.read_csv(csv_data_dir_final + csv_file_name, encoding='utf-8', parse_dates=initial_parse_dates)
display(dataframe.info())
display(dataframe)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266227 entries, 0 to 266226
Data columns (total 17 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   ESTADO      266227 non-null  object        
 1   ANO         266227 non-null  int64         
 2   DTOBITO     266227 non-null  datetime64[ns]
 3   NATURAL     221746 non-null  float64       
 4   DTNASC      264993 non-null  object        
 5   IDADE       266227 non-null  float64       
 6   SEXO        266224 non-null  object        
 7   RACACOR     245272 non-null  object        
 8   ESTCIV      258289 non-null  object        
 9   ESC         225885 non-null  object        
 10  OCUP        217058 non-null  object        
 11  CODMUNRES   266227 non-null  object        
 12  LOCOCOR     265851 non-null  object        
 13  CODMUNOCOR  266227 non-null  object        
 14  CAUSABAS    266227 non-null  object        
 15  ESC2010     127700 non-null  object        
 16  ES

None

Unnamed: 0,ESTADO,ANO,DTOBITO,NATURAL,DTNASC,IDADE,SEXO,RACACOR,ESTCIV,ESC,OCUP,CODMUNRES,LOCOCOR,CODMUNOCOR,CAUSABAS,ESC2010,ESCFALAGR1
0,PR,1996,1996-10-24,841.0,1963-08-05,33.0,masculino,,Solteiro,Nenhuma,61200.0,Pinhal de São Bento,outros,Pinhal de São Bento,X709,,
1,PR,1996,1996-02-04,152.0,1920-09-21,75.0,masculino,,Casado,Nenhuma,62100.0,Assaí,domicílio,Assaí,X680,,
2,PR,1996,1996-01-17,841.0,1961-08-28,34.0,masculino,,Solteiro,,62100.0,Abatiá,hospital,Abatiá,X680,,
3,PR,1996,1996-02-07,841.0,1977-09-24,18.0,masculino,,Solteiro,Nenhuma,62100.0,Santa Cecília do Pavão,hospital,São Sebastião da Amoreira,X680,,
4,PR,1996,1996-02-09,841.0,1956-08-30,39.0,masculino,,União consensual,,62100.0,Santa Mariana,domicílio,Santa Mariana,X700,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266222,MT,2022,2022-09-17,851.0,1982-08-28,40.0,masculino,Parda,União consensual,1 a 3 anos,CASEIRO (AGRICULTURA),Glória D'Oeste,domicílio,Glória D'Oeste,X730,1 a 3 anos,1 a 3 anos
266223,MT,2022,2022-08-21,815.0,1998-09-11,23.0,masculino,Parda,Solteiro,8 a 11 anos,PEDREIRO,Cuiabá,outros,Cuiabá,X708,4 a 7 anos,8 a 11 anos
266224,MT,2022,2022-10-14,851.0,1993-06-23,29.0,masculino,Branca,Solteiro,8 a 11 anos,MECANICO DE MANUTENCAO DE AUTOMOVEIS E MOTOCIC...,Rondonópolis,domicílio,Rondonópolis,X700,8 a 11 anos,8 a 11 anos
266225,MT,2022,2022-10-09,851.0,1981-06-12,41.0,masculino,Parda,Solteiro,12 anos e mais,REPRESENTANTE COMERCIAL AUTONOMO,Rondonópolis,domicílio,Rondonópolis,X700,12 anos e mais,12 anos e mais


In [7]:
dataframe_columns = list(dataframe.columns)

print('Dataframe columns:\n', dataframe_columns)
print('\nNº of columns in dataframe: ', len(dataframe_columns))

Dataframe columns:
 ['ESTADO', 'ANO', 'DTOBITO', 'NATURAL', 'DTNASC', 'IDADE', 'SEXO', 'RACACOR', 'ESTCIV', 'ESC', 'OCUP', 'CODMUNRES', 'LOCOCOR', 'CODMUNOCOR', 'CAUSABAS', 'ESC2010', 'ESCFALAGR1']

Nº of columns in dataframe:  17


## Resulting Changes:

### Alterações realizadas por atributo:

In [8]:
dir_to_attributes_info_table = '../brsuicides_info_tables'

file_preprocessing_types = dir_to_attributes_info_table + '/attributes_preprocessing_type.csv'
preprocessing_types = pd.read_csv(file_preprocessing_types)
display(preprocessing_types)

Unnamed: 0,Attribute,Conversion Type,Null value,Remove Missing/Invalid?
0,DTOBITO,ignore,,True
1,ESTADO,ignore,,True
2,ANO,ignore,,True
3,CAUSABAS,ignore,,True
4,SEXO,categorical,ignorado,False
5,RACACOR,categorical,ignorado,False
6,ESTCIV,categorical,ignorado,False
7,ESC,categorical,ignorado,False
8,ESC2010,categorical,ignorado,False
9,ESCFALAGR1,categorical,ignorado,False


In [9]:
# Filter for files that end with .zip extension
original_csv_file_name = [file for file in os.listdir(csv_data_dir_filtered) if file.endswith('.csv')][0]
print('CSV file name: ', '\n', original_csv_file_name)

CSV file name:  
 brazil-suicides-1996_2023.csv


### Quantidade de valores por atributo

In [10]:
original_dataframe_length = len(pd.read_csv(csv_data_dir_filtered + original_csv_file_name, encoding='utf-8', parse_dates=initial_parse_dates))
print(f'Original dataframe length: {original_dataframe_length}')
current_dataframe_length = len(dataframe)
print(f'Current dataframe length: {current_dataframe_length}')

Original dataframe length: 284978
Current dataframe length: 266227


In [11]:
# Hardcoding DTOBITO update because its values are set in the filtering step, 
# an exception to the rest of the attributes
dtobito_values = get_single_attribute_values(dir_to_attributes_info_table, 'DTOBITO')
print(dtobito_values)
dtobito_values[-2] = current_dataframe_length - dtobito_values[-1]
print(dtobito_values)

update_attribute_values(dir_to_attributes_info_table, dtobito_values)

['DTOBITO', 0, 52, 284926, 283182, 52]
['DTOBITO', 0, 52, 284926, 266175, 52]
Updated values for 'DTOBITO' in the datatable.


In [12]:
df_lens = {'original': original_dataframe_length, 'current': current_dataframe_length}

final_attributes_values_table = get_attributes_values_table(dir_to_attributes_info_table, percentage=False)
final_attributes_values_table_percentages = get_attributes_values_table(dir_to_attributes_info_table, percentage=True, dataframe_lens=df_lens)

Cols: (['Faltantes/Nulos', 'Inválidos', 'Válidos'], ['Tratados', 'Removidos'])


## How to interpret this table:

##### The first three columns are the types of data detected before any type of manipulation, while the last column represents how many rows were removed because of the detection of missing/null or invalid data. They don't represent the final volume of each attribute data. For example, "SEXO" is an attribute that has 4 missing/null values and 51 "ignorado" values, but 0 missing/null values and only 28 "ignorado" remain.

In [13]:
final_attributes_values_table

Unnamed: 0,Atributo,Faltantes/Nulos,Inválidos,Válidos,Tratados,Removidos
0,DTOBITO,0,52,284926,266175,52
1,ESTADO,0,0,284978,283182,0
2,ANO,0,0,284978,283182,0
3,CAUSABAS,0,0,284978,283182,0
4,DTNASC,0,2152,282826,283182,0
5,IDADE,157,630,284191,282395,787
6,OCUP,52901,66816,165261,283182,0
7,CODMUNRES,0,1263,283715,281919,1263
8,CODMUNOCOR,0,487,284491,282695,487
9,NATURAL,45623,0,239355,283182,0


In [14]:
final_attributes_values_table_percentages

Unnamed: 0,Atributo,Faltantes/Nulos,Inválidos,Válidos,Tratados,Removidos
0,DTOBITO,0.0,0.02,99.98,99.98,0.02
1,ESTADO,0.0,0.0,100.0,100.0,0.0
2,ANO,0.0,0.0,100.0,100.0,0.0
3,CAUSABAS,0.0,0.0,100.0,100.0,0.0
4,DTNASC,0.0,0.76,99.24,100.0,0.0
5,IDADE,0.06,0.22,99.72,100.0,0.3
6,OCUP,18.56,23.45,57.99,100.0,0.0
7,CODMUNRES,0.0,0.44,99.56,100.0,0.47
8,CODMUNOCOR,0.0,0.17,99.83,100.0,0.18
9,NATURAL,16.01,0.0,83.99,100.0,0.0


In [15]:
def generate_latex_table(attributes_info_values_table):
    # Set the first column as the index to ignore the DataFrame index
    attributes_info_values_table = attributes_info_values_table.set_index(attributes_info_values_table.columns[0])

    # Use the DataFrame.style.to_latex() to generate the LaTeX string
    latex_table = attributes_info_values_table.style.format(precision=2).to_latex(
        column_format="|l|" + "r|" * (len(attributes_info_values_table.columns))
    )
    
    # Replace \toprule, \midrule, and \bottomrule with \hline
    latex_table = latex_table.replace("\\toprule", "\\hline").replace("\\midrule", "\\hline").replace("\\bottomrule", "\\hline")

    # Add \hline after each row
    latex_table = latex_table.replace("\\\\\n", "\\\\ \\hline\n")

    # Return the LaTeX table as a raw string
    return latex_table

In [16]:
latex_info_table = generate_latex_table(final_attributes_values_table_percentages)
print(latex_info_table)

\begin{tabular}{|l|r|r|r|r|r|}
 & Faltantes/Nulos & Inválidos & Válidos & Tratados & Removidos \\ \hline
Atributo &  &  &  &  &  \\ \hline
DTOBITO & 0.00 & 0.02 & 99.98 & 99.98 & 0.02 \\ \hline
ESTADO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
ANO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
CAUSABAS & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
DTNASC & 0.00 & 0.76 & 99.24 & 100.00 & 0.00 \\ \hline
IDADE & 0.06 & 0.22 & 99.72 & 100.00 & 0.30 \\ \hline
OCUP & 18.56 & 23.45 & 57.99 & 100.00 & 0.00 \\ \hline
CODMUNRES & 0.00 & 0.44 & 99.56 & 100.00 & 0.47 \\ \hline
CODMUNOCOR & 0.00 & 0.17 & 99.83 & 100.00 & 0.18 \\ \hline
NATURAL & 16.01 & 0.00 & 83.99 & 100.00 & 0.00 \\ \hline
ESC & 14.70 & 18.47 & 66.83 & 100.00 & 0.00 \\ \hline
ESC2010 & 49.41 & 8.36 & 42.23 & 100.00 & 0.00 \\ \hline
ESCFALAGR1 & 55.77 & 7.17 & 37.05 & 100.00 & 0.00 \\ \hline
ESTCIV & 3.07 & 4.43 & 92.50 & 100.00 & 0.00 \\ \hline
LOCOCOR & 0.14 & 0.77 & 99.09 & 100.00 & 0.00 \\ \hline
RACACOR & 7.61 & 0.

In [17]:
'''
\begin{tabular}{|l|r|r|r|r|r|}
\hline
Atributo & Faltantes & Nulos/Inválidos & Válidos & Tratados & Removidos \\ \hline
DTOBITO & 0.00 & 0.02 & 99.98 & 99.98 & 0.02 \\ \hline
ESTADO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
ANO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
CAUSABAS & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \hline
DTNASC & 0.00 & 0.79 & 99.21 & 100.00 & 0.00 \\ \hline
IDADE & 0.06 & 0.23 & 99.71 & 99.71 & 0.29 \\ \hline
OCUP & 18.67 & 24.93 & 56.39 & 100.00 & 0.00 \\ \hline
CODMUNRES & 0.00 & 0.46 & 99.54 & 99.54 & 0.46 \\ \hline
CODMUNOCOR & 0.00 & 0.18 & 99.82 & 99.82 & 0.18 \\ \hline
NATURAL & 16.82 & 0.00 & 83.18 & 100.00 & 0.00 \\ \hline
ESC & 15.35 & 18.79 & 65.86 & 100.00 & 0.00 \\ \hline
ESC2010 & 52.22 & 8.05 & 39.73 & 100.00 & 0.00 \\ \hline
ESCFALAGR1 & 58.99 & 6.78 & 34.22 & 100.00 & 0.00 \\ \hline
ESTCIV & 3.12 & 4.37 & 92.51 & 100.00 & 0.00 \\ \hline
LOCOCOR & 0.15 & 0.81 & 99.03 & 100.00 & 0.00 \\ \hline
RACACOR & 8.04 & 0.00 & 91.96 & 100.00 & 0.00 \\ \hline
SEXO & 0.00 & 0.02 & 99.98 & 100.00 & 0.00 \\ \hline
\end{tabular}
'''

'\n\x08egin{tabular}{|l|r|r|r|r|r|}\n\\hline\nAtributo & Faltantes & Nulos/Inválidos & Válidos & Tratados & Removidos \\ \\hline\nDTOBITO & 0.00 & 0.02 & 99.98 & 99.98 & 0.02 \\ \\hline\nESTADO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \\hline\nANO & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \\hline\nCAUSABAS & 0.00 & 0.00 & 100.00 & 100.00 & 0.00 \\ \\hline\nDTNASC & 0.00 & 0.79 & 99.21 & 100.00 & 0.00 \\ \\hline\nIDADE & 0.06 & 0.23 & 99.71 & 99.71 & 0.29 \\ \\hline\nOCUP & 18.67 & 24.93 & 56.39 & 100.00 & 0.00 \\ \\hline\nCODMUNRES & 0.00 & 0.46 & 99.54 & 99.54 & 0.46 \\ \\hline\nCODMUNOCOR & 0.00 & 0.18 & 99.82 & 99.82 & 0.18 \\ \\hline\nNATURAL & 16.82 & 0.00 & 83.18 & 100.00 & 0.00 \\ \\hline\nESC & 15.35 & 18.79 & 65.86 & 100.00 & 0.00 \\ \\hline\nESC2010 & 52.22 & 8.05 & 39.73 & 100.00 & 0.00 \\ \\hline\nESCFALAGR1 & 58.99 & 6.78 & 34.22 & 100.00 & 0.00 \\ \\hline\nESTCIV & 3.12 & 4.37 & 92.51 & 100.00 & 0.00 \\ \\hline\nLOCOCOR & 0.15 & 0.81 & 99.03 & 100.00 & 0.00 \\ \\hline\nR