# DATASUS Data to Dataframe merger

#### Merges the CSV files of all years into a single CSV file

In [1]:
# Imports
from IPython.display import display

import pandas as pd

import numpy as np
import datetime

# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this blanket filter presumably also hides warnings emitted by pandas
# while reading the CSVs — consider narrowing it to specific warning categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# OS and File imports
import os
import sys

import importlib

from zipfile import ZipFile
from io import BytesIO

# Absolute paths of the parent folder and its utility sub-folders, which must be
# on sys.path so that modules stored there can be imported from this notebook.
directories_to_add = [
    os.path.abspath(relative_path)
    for relative_path in ('..', '../utils', os.path.join('../utils', 'functions'))
]

# Prepend each missing entry; entries already on sys.path are left where they are.
for directory in directories_to_add:
    if directory in sys.path:
        continue
    sys.path.insert(0, directory)
print(sys.path)

['d:\\.MESTRADO\\.Orientacao\\BrSuicides-dataset\\utils\\functions', 'd:\\.MESTRADO\\.Orientacao\\BrSuicides-dataset\\utils', 'd:\\.MESTRADO\\.Orientacao\\BrSuicides-dataset', 'c:\\Python312\\python312.zip', 'c:\\Python312\\DLLs', 'c:\\Python312\\Lib', 'c:\\Python312', '', 'C:\\Users\\pp0l0\\AppData\\Roaming\\Python\\Python312\\site-packages', 'C:\\Users\\pp0l0\\AppData\\Roaming\\Python\\Python312\\site-packages\\win32', 'C:\\Users\\pp0l0\\AppData\\Roaming\\Python\\Python312\\site-packages\\win32\\lib', 'C:\\Users\\pp0l0\\AppData\\Roaming\\Python\\Python312\\site-packages\\Pythonwin', 'c:\\Python312\\Lib\\site-packages']


In [3]:
# Import the helper modules (presumably project-local, found through the sys.path
# entries added above) and reload them right away, so that edits made to their
# source are picked up when this cell is re-run without restarting the kernel.
import user_config
importlib.reload(user_config)
import preprocessing
importlib.reload(preprocessing)

# Pull the names after the reloads so they come from the freshly loaded modules.
# NOTE(review): `user_dir_path` is not used in the cells visible here — confirm it is needed.
from user_config import user_dir_path
# `df_csv_filename` is the base file name used further down to name the merged CSV.
from preprocessing import df_csv_filename

## Files Location Setup

Folder structure:
```
root_folder/
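    data_storage/
        0_datasus_csvs/"yearly DATASUS csvs, or a zip containing them (read by this notebook)"
        1_dirty/"merged csv written by this notebook"
        attrs-utils/"relevant_attributes_infos"
        datasus/"all years csvs"
        utils/"utility functions and data"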
    data_creation/
        0_Datasus_data_reader_v2.ipynb
        1_Datasus_merger_to_csv.ipynb
        2_Data_modeling.ipynb
        3_Data_preprocessing.ipynb
    plots/all_generated_plots...
```

In [4]:
# All locations are plain '/'-separated strings, relative to the notebook folder.
root_dir = '..'

# Sub-paths below the root (leading and trailing slashes included)
csv_dir = '/data_storage/'
csv_dir_datasus = f'{csv_dir}0_datasus_csvs/'
csv_dir_dirty = f'{csv_dir}1_dirty/'

# dirname() drops the trailing slash of each joined path; it is then re-appended
# so the results can be concatenated directly with a file name.
csv_data_dir, datasus_data_dir, csv_data_dir_dirty = (
    os.path.dirname(root_dir + sub_path) + '/'
    for sub_path in (csv_dir, csv_dir_datasus, csv_dir_dirty)
)

for location in (csv_data_dir, datasus_data_dir, csv_data_dir_dirty):
    print(location)

../data_storage/
../data_storage/0_datasus_csvs/
../data_storage/1_dirty/


The following code is specific to the Data_merger_to_csv notebook:

In [5]:
# List all files in the csvs directory.
# os.listdir() returns entries in arbitrary order, and later cells rely on
# "the first" zip file and on the first/last CSV name, so sort for determinism.
all_files = sorted(os.listdir(datasus_data_dir))
print("All files in directory:", all_files)

# Split the listing by extension: a zip archive of CSVs and/or loose CSV files
zip_files = [file for file in all_files if file.endswith('.zip')]
csv_files = [file for file in all_files if file.endswith('.csv')]

print("Zip files found:", zip_files)
print("CSV files found:", csv_files)

All files in directory: ['.gitkeep', 'brazil-suicides-1996_2023.zip']
Zip files found: ['brazil-suicides-1996_2023.zip']
CSV files found: []


#### Zip File Paths

In [6]:
# Merge every yearly CSV into one DataFrame (`csv_to_df`) and derive the covered
# year interval (`years_interval`, `years_interval_name`) from the file names.
# Frames are collected in a list and concatenated once at the end: calling
# pd.concat inside the loop re-copies all previously read rows on every iteration.
yearly_frames = []

if zip_files:
    # Use the first zip file found
    zip_file_name = zip_files[0]
    zip_path_files = os.path.join(datasus_data_dir, zip_file_name)

    print(f"Processing ZIP file: {zip_file_name}")

    # Extract the year interval from the zip filename, which is expected to look
    # like '<region>-<subject>-<first>_<last>.zip' (e.g. 'brazil-suicides-1996_2023.zip')
    years_interval = zip_file_name.split('-')[2].split('_')
    years_interval[1] = years_interval[1][:4]  # keep the 4-digit year, dropping '.zip'

    # Read each CSV member straight from the archive (no intermediate in-memory copy)
    with ZipFile(zip_path_files, 'r') as z:
        zip_csv_files = sorted(member for member in z.namelist() if member.endswith('.csv'))
        print(f'CSV files in ZIP: {zip_csv_files}')

        for csvf in zip_csv_files:
            print(f"Reading CSV from ZIP: {csvf}")
            with z.open(csvf) as f:
                yearly_frames.append(pd.read_csv(f))

elif csv_files:
    # Process unzipped CSV files directly
    print("No ZIP file found, processing unzipped CSVs.")

    # Sort first so the interval really is (earliest, latest) regardless of the
    # order in which the directory listing returned the files.
    sorted_csv_files = sorted(csv_files)
    years_interval = [
        sorted_csv_files[0].split('-')[-1].split('.')[0],
        sorted_csv_files[-1].split('-')[-1].split('.')[0],
    ]

    for csv_file in sorted_csv_files:
        csv_path = os.path.join(datasus_data_dir, csv_file)
        print(f"Reading CSV: {csv_file}")
        yearly_frames.append(pd.read_csv(csv_path))

else:
    # Fail early with a clear message: without any input, `years_interval` and
    # `csv_to_df` would be undefined and the notebook would die with a NameError.
    raise FileNotFoundError(
        f"No .zip or .csv files found in {datasus_data_dir!r}; nothing to merge."
    )

# pd.concat raises on an empty list (e.g. a zip without CSV members), so fall back
# to an empty frame in that case.
csv_to_df = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()

if years_interval:
    print("Years interval:", years_interval)
    years_interval_name = f'{years_interval[0]}_{years_interval[-1]}'
    print("Years interval name:", years_interval_name)

Processing ZIP file: brazil-suicides-1996_2023.zip
CSV files in ZIP: ['1996.csv', '1997.csv', '1998.csv', '1999.csv', '2000.csv', '2001.csv', '2002.csv', '2003.csv', '2004.csv', '2005.csv', '2006.csv', '2007.csv', '2008.csv', '2009.csv', '2010.csv', '2011.csv', '2012.csv', '2013.csv', '2014.csv', '2015.csv', '2016.csv', '2017.csv', '2018.csv', '2019.csv', '2020.csv', '2021.csv', '2022.csv', '2023.csv']
Reading CSV from ZIP: 1996.csv
Reading CSV from ZIP: 1997.csv
Reading CSV from ZIP: 1998.csv
Reading CSV from ZIP: 1999.csv
Reading CSV from ZIP: 2000.csv
Reading CSV from ZIP: 2001.csv
Reading CSV from ZIP: 2002.csv
Reading CSV from ZIP: 2003.csv
Reading CSV from ZIP: 2004.csv
Reading CSV from ZIP: 2005.csv
Reading CSV from ZIP: 2006.csv
Reading CSV from ZIP: 2007.csv
Reading CSV from ZIP: 2008.csv
Reading CSV from ZIP: 2009.csv
Reading CSV from ZIP: 2010.csv
Reading CSV from ZIP: 2011.csv
Reading CSV from ZIP: 2012.csv
Reading CSV from ZIP: 2013.csv
Reading CSV from ZIP: 2014.csv
Readi

In [7]:
# Preview the merged file name: '<base>-<first>_<last><ext>'.
# os.path.splitext avoids nesting same-type quotes inside the f-string (a
# SyntaxError before Python 3.12) and keeps names with several dots, or with no
# extension at all, intact.
name_root, name_ext = os.path.splitext(df_csv_filename)
f'{name_root}-{years_interval_name}{name_ext}'

'brazil-suicides-1996_2023.csv'

In [8]:
# Build the output file name '<base>-<first>_<last><ext>' and write the merged data.
# os.path.splitext keeps multi-dot and extension-less names intact, and the
# endswith() guard makes this cell idempotent: re-running it no longer appends
# the year interval a second time.
name_root, name_ext = os.path.splitext(df_csv_filename)
interval_suffix = f'-{years_interval_name}'
if not name_root.endswith(interval_suffix):
    df_csv_filename = f'{name_root}{interval_suffix}{name_ext}'

# Make sure the target directory exists before writing (to_csv does not create it)
os.makedirs(csv_data_dir_dirty, exist_ok=True)
csv_to_df.to_csv(csv_data_dir_dirty + df_csv_filename, index=False)

In [9]:
# Load the merged CSV from the "dirty" data folder into a DataFrame.
merged_csv_path = csv_data_dir_dirty + df_csv_filename
dataframe = pd.read_csv(merged_csv_path, encoding='utf-8')

In [10]:
# DataFrame.info() prints its summary itself and returns None, so it must not be
# wrapped in display() (that only adds a stray "None" to the cell output).
dataframe.info()
# Rich (truncated) rendering of the merged frame
display(dataframe)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285030 entries, 0 to 285029
Columns: 103 entries, ESTADO to ALTCAUSA
dtypes: float64(3), int64(6), object(94)
memory usage: 224.0+ MB


None

Unnamed: 0,ESTADO,ANO,level_2,contador,TIPOBITO,DTOBITO,NATURAL,DTNASC,IDADE,SEXO,...,VERSAOSIST,VERSAOSCB,ATESTADO,NUDIASOBCO,FONTES,TPRESGINFO,TPNIVELINV,NUDIASINF,FONTESINF,ALTCAUSA
0,PR,1996,49,30050.0,2,24101996,841,05081963,433,1,...,,,,,,,,,,
1,PR,1996,542,30543.0,2,4021996,152,21091920,475,1,...,,,,,,,,,,
2,PR,1996,543,30544.0,2,17011996,841,28081961,434,1,...,,,,,,,,,,
3,PR,1996,568,30569.0,2,7021996,841,24091977,418,1,...,,,,,,,,,,
4,PR,1996,631,30632.0,2,9021996,841,30081956,439,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285025,MT,2023,21481,,2,26122023,841,04031986,437,1,...,3.2.30,3.4,T71/X700 ...,,,,,,,
285026,MT,2023,21535,,2,27122023,800,11081995,428,1,...,3.2.30,3.4,T71/X700 ...,,,,,,,
285027,MT,2023,21595,,2,28122023,851,09071967,456,1,...,3.2.30,3.3,S069/X749 ...,,,,,,,
285028,MT,2023,21606,,2,28122023,851,10081966,457,1,...,3.2.00,3.2,T71/X700 ...,,,,,,,
