# DATASUS Data to Dataframe merger

#### Merges the CSV files of all years into a single CSV file

In [1]:
# Imports
from IPython.display import display

import pandas as pd

import numpy as np
import datetime

# Silence every warning for the rest of the kernel session so repeated warnings do not flood the cell outputs.
# NOTE(review): this also hides any pandas warning raised while reading/merging the CSVs below — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# OS and File imports
import os
import sys

import importlib

from zipfile import ZipFile
from io import BytesIO

# Absolute paths of the project folders that must be importable from this notebook:
# the parent folder, its utils/ folder and utils/functions/.
directories_to_add = [
    os.path.abspath(relative_dir)
    for relative_dir in ('..', '../utils', '../utils/functions')
]

# Prepend each folder that is not on sys.path yet. Folders are inserted one at a
# time at position 0, so the last folder listed ends up first on sys.path.
for candidate in directories_to_add:
    if candidate in sys.path:
        continue
    sys.path.insert(0, candidate)
print(sys.path)

['/home/user/coding_env/BrSuicides-dataset/utils/functions', '/home/user/coding_env/BrSuicides-dataset/utils', '/home/user/coding_env/BrSuicides-dataset', '/home/user/coding_env/BrSuicides-dataset/data_creation', '/mnt/d/.MESTRADO/.Orientacao/br-suicides-ts-analysis', '/mnt/d/.MESTRADO/.Orientacao/br-suicides-ts-analysis/data_analysis/utils', '/usr/local/lib/python310.zip', '/usr/local/lib/python3.10', '/usr/local/lib/python3.10/lib-dynload', '', '/home/user/coding_env/venvpysus/lib/python3.10/site-packages']


In [3]:
# Project-local modules (presumably located in the folders added to sys.path in the previous cell).
# importlib.reload re-executes an already imported module, so edits made to the .py files
# are picked up without restarting the kernel.
import user_config
importlib.reload(user_config)
import preprocessing
importlib.reload(preprocessing)

# Bind the names after the reload so they come from the freshly reloaded modules.
from user_config import user_dir_path
from preprocessing import df_csv_filename

## Files Location Setup

Folder structure:
```
root_folder/
    data_storage/
        attrs-utils/"relevant_attributes_infos"
        0_datasus_csvs/"all years csvs"
        1_dirty/"merged csv of all years"
        utils/"utility functions and data"
    data_creation/
        0_Datasus_data_reader_v2.ipynb
        1_Datasus_merger_to_csv.ipynb
        2_Data_modeling.ipynb
        3_Data_preprocessing.ipynb
    plots/all_generated_plots...
```

In [4]:
# Project root relative to this notebook, and the storage sub-folders relative to the root.
root_dir = '..'

csv_dir = '/data_storage/'
csv_dir_datasus = f'{csv_dir}0_datasus_csvs/'
csv_dir_dirty = f'{csv_dir}1_dirty/'


def _as_directory(relative_dir):
    """Prefix `relative_dir` with `root_dir` and return it with exactly one trailing '/'."""
    return os.path.dirname(root_dir + relative_dir) + '/'


# Resolved folders: storage root, per-year DATASUS CSVs, and the merged ("dirty") output.
csv_data_dir, datasus_data_dir, csv_data_dir_dirty = map(
    _as_directory, (csv_dir, csv_dir_datasus, csv_dir_dirty)
)

for resolved_dir in (csv_data_dir, datasus_data_dir, csv_data_dir_dirty):
    print(resolved_dir)

../data_storage/
../data_storage/0_datasus_csvs/
../data_storage/1_dirty/


This code is specific for Data_merger_to_csv:

In [5]:
# Inventory of everything stored in the DATASUS input folder
all_files = os.listdir(datasus_data_dir)
print("All files in directory:", all_files)

# Split the inventory by extension; each list is sorted by file name
zip_files = sorted(filter(lambda name: name.endswith(".zip"), all_files))
csv_files = sorted(filter(lambda name: name.endswith(".csv"), all_files))

print("Zip files found:", zip_files)
print("CSV files found:", csv_files)

All files in directory: ['2016.csv', '2015.csv', '2017.csv', '.gitkeep', '2011.csv', '2004.csv', '2012.csv', '2020.csv', '1997.csv', '1996.csv', '2000.csv', '2021.csv', '2009.csv', '2008.csv', '2002.csv', '2013.csv', '1998.csv', '2022.csv', '2014.csv', '2024.csv', '2023.csv', '2006.csv', '2003.csv', '1999.csv', '2007.csv', '2019.csv', '2005.csv', '2018.csv', '2010.csv', '2001.csv']
Zip files found: []
CSV files found: ['1996.csv', '1997.csv', '1998.csv', '1999.csv', '2000.csv', '2001.csv', '2002.csv', '2003.csv', '2004.csv', '2005.csv', '2006.csv', '2007.csv', '2008.csv', '2009.csv', '2010.csv', '2011.csv', '2012.csv', '2013.csv', '2014.csv', '2015.csv', '2016.csv', '2017.csv', '2018.csv', '2019.csv', '2020.csv', '2021.csv', '2022.csv', '2023.csv', '2024.csv']


#### Load the yearly CSVs (from a ZIP archive or from loose CSV files)

In [6]:
def _concat_frames(frames):
    """Concatenate the per-file DataFrames in one call.

    Concatenating once avoids re-copying the accumulated rows on every loop
    iteration. Returns an empty DataFrame when `frames` is empty, because
    `pd.concat` raises ValueError on an empty list.
    """
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


if zip_files:
    # Use the first zip file found (the list is sorted by name)
    zip_file_name = zip_files[0]
    zip_path_files = os.path.join(datasus_data_dir, zip_file_name)

    print(f"Processing ZIP file: {zip_file_name}")

    # The year interval is parsed from the archive name: third '-'-separated field,
    # split on '_', with the second part cut to its first 4 characters.
    # NOTE(review): an archive name that does not follow this layout raises IndexError — confirm the naming convention.
    years_interval = zip_file_name.split('-')[2].split('_')
    years_interval[1] = years_interval[1][:4]

    # Read every CSV member of the archive, in name order
    frames = []
    with ZipFile(zip_path_files, 'r') as z:
        zip_csv_files = sorted(f for f in z.namelist() if f.endswith('.csv'))
        print(f'CSV files in ZIP: {zip_csv_files}')

        for csvf in zip_csv_files:
            print(f"Reading CSV from ZIP: {csvf}")
            with z.open(csvf) as f:
                frames.append(pd.read_csv(BytesIO(f.read())))
    csv_to_df = _concat_frames(frames)

elif csv_files:
    # Process unzipped CSV files directly
    print("No ZIP file found, processing unzipped CSVs.")
    # First and last file names (sorted) give the interval; the year is the text
    # after the last '-' and before the first '.' that follows it.
    years_interval = [csv_files[0].split('-')[-1].split('.')[0], csv_files[-1].split('-')[-1].split('.')[0]]

    frames = []
    for csv_file in sorted(csv_files):
        csv_path = os.path.join(datasus_data_dir, csv_file)
        print(f"Reading CSV: {csv_file}")
        frames.append(pd.read_csv(csv_path))
    csv_to_df = _concat_frames(frames)

else:
    # Fail loudly: without this branch `years_interval` and `csv_to_df` would be
    # undefined on a fresh kernel, or silently stale on a reused one.
    raise FileNotFoundError(f"No .zip or .csv files found in {datasus_data_dir}")

# Both branches above always bind a non-empty `years_interval`, so the name used
# by the following cells is always defined.
print("Years interval:", years_interval)
years_interval_name = f'{years_interval[0]}_{years_interval[-1]}'
print("Years interval name:", years_interval_name)

No ZIP file found, processing unzipped CSVs.
Reading CSV: 1996.csv
Reading CSV: 1997.csv
Reading CSV: 1998.csv
Reading CSV: 1999.csv
Reading CSV: 2000.csv
Reading CSV: 2001.csv
Reading CSV: 2002.csv
Reading CSV: 2003.csv
Reading CSV: 2004.csv
Reading CSV: 2005.csv
Reading CSV: 2006.csv
Reading CSV: 2007.csv
Reading CSV: 2008.csv
Reading CSV: 2009.csv
Reading CSV: 2010.csv
Reading CSV: 2011.csv
Reading CSV: 2012.csv
Reading CSV: 2013.csv
Reading CSV: 2014.csv
Reading CSV: 2015.csv
Reading CSV: 2016.csv
Reading CSV: 2017.csv
Reading CSV: 2018.csv
Reading CSV: 2019.csv
Reading CSV: 2020.csv
Reading CSV: 2021.csv
Reading CSV: 2022.csv
Reading CSV: 2023.csv
Reading CSV: 2024.csv
Years interval: ['1996', '2024']
Years interval name: 1996_2024


In [7]:
# Preview of the output file name: text before the first '.', the years interval, then the text after the last '.'
"{}-{}.{}".format(
    df_csv_filename.split('.')[0],
    years_interval_name,
    df_csv_filename.split('.')[-1],
)

'brazil-suicides-1996_2024.csv'

In [8]:
# Build the output file name from the pristine base name held by the `preprocessing`
# module, not from the notebook-level `df_csv_filename` that this cell rebinds.
# Re-running the cell therefore yields the same name instead of appending the
# years suffix a second time.
base_csv_filename = preprocessing.df_csv_filename
df_csv_filename = f"{base_csv_filename.split('.')[0]}-{years_interval_name}.{base_csv_filename.split('.')[-1]}"

# Make sure the output folder exists before writing the merged data
os.makedirs(csv_data_dir_dirty, exist_ok=True)
csv_to_df.to_csv(csv_data_dir_dirty + df_csv_filename, index=False)

In [9]:
# Read the merged file back from disk to check that it round-trips
merged_csv_path = f"{csv_data_dir_dirty}{df_csv_filename}"
dataframe = pd.read_csv(merged_csv_path, encoding="utf-8")

In [10]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render an extra "None".
dataframe.info()
display(dataframe)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301781 entries, 0 to 301780
Columns: 103 entries, ESTADO to ALTCAUSA
dtypes: float64(2), int64(6), object(95)
memory usage: 237.1+ MB


None

Unnamed: 0,ESTADO,ANO,level_2,contador,TIPOBITO,DTOBITO,NATURAL,DTNASC,IDADE,SEXO,...,VERSAOSIST,VERSAOSCB,ATESTADO,NUDIASOBCO,FONTES,TPRESGINFO,TPNIVELINV,NUDIASINF,FONTESINF,ALTCAUSA
0,PR,1996,49,30050.0,2,24101996,841,05081963,433,1,...,,,,,,,,,,
1,PR,1996,542,30543.0,2,4021996,152,21091920,475,1,...,,,,,,,,,,
2,PR,1996,543,30544.0,2,17011996,841,28081961,434,1,...,,,,,,,,,,
3,PR,1996,568,30569.0,2,7021996,841,24091977,418,1,...,,,,,,,,,,
4,PR,1996,631,30632.0,2,9021996,841,30081956,439,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301776,MT,2024,22693,,2,30122024,851,6011984,440,1,...,3.2.02,3.2,S069/X740 / / / ...,,,,,,,
301777,MT,2024,22701,,2,30122024,821,31072002,422,1,...,3.2.00,3.2,T71X/X700 / / / ...,,,,,,,
301778,MT,2024,22726,,2,31122024,850,8101969,455,1,...,3.2.30,3.4,T287/K922 / / / ...,,,,,,,
301779,MT,2024,22742,,2,31122024,851,5111994,430,1,...,3.2.30,3.3,T71X/X700 / / / ...,,,,,,,
