# DATASUS Data to Dataframe merger

#### Merges the CSV files of all years into a single CSV file

In [1]:
# Imports
from IPython.display import display

import pandas as pd

import numpy as np
import datetime

# Prevent infinite warnings
import warnings
warnings.filterwarnings('ignore')

# OS and File imports
import os
from zipfile import ZipFile
from io import BytesIO

## Files Location Setup

Folder structure:
```
root_folder/
    data_storage/
        attrs-utils/"relevant_attributes_infos"
        0_datasus_csvs/"zip archive with the csv of every year"
        1_dirty/"merged csv written by this notebook"
        utils/"utility functions and data"
    data_creation/
        0_Datasus_data_reader_v2.ipynb
        1_Datasus_merger_to_csv.ipynb
        2_Data_modeling.ipynb
        3_Data_preprocessing.ipynb
    plots/all_generated_plots...
```

### User set folder path:

##### Only the initial `user_dir_path` should need to be changed

In [2]:
# ---------------- USER SET FOLDER PATH ----------------
# Optional prefix that later cells prepend to the zip archive paths.
user_dir_path = ''

# Project root, relative to the folder this notebook is run from.
root_dir = '..'

# Sub-folders below the project root (leading and trailing slash included).
csv_dir = '/data_storage/'
csv_dir_datasus = f'{csv_dir}0_datasus_csvs/'
csv_dir_dirty = f'{csv_dir}1_dirty/'
# The later pipeline stages ('2_clean/' and '3_preprocessed/') are left
# disabled in this notebook.

# Resolve each sub-folder against the root. dirname() drops the trailing
# slash, so it is appended again to keep the paths ready for concatenation.
csv_data_dir = f'{os.path.dirname(root_dir + csv_dir)}/'
datasus_data_dir = f'{os.path.dirname(root_dir + csv_dir_datasus)}/'
csv_data_dir_dirty = f'{os.path.dirname(root_dir + csv_dir_dirty)}/'

# Show the resolved folders, one per line.
print(csv_data_dir, datasus_data_dir, csv_data_dir_dirty, sep='\n')

../data_storage/
../data_storage/0_datasus_csvs/
../data_storage/1_dirty/


This code is specific to Data_merger_to_csv:

In [3]:
# List the files in the DATASUS folder. os.listdir() returns names in
# arbitrary order, so sort them to make the archive selection repeatable.
all_files = sorted(os.listdir(datasus_data_dir))

# Keep only the .zip archives
zip_files = [file for file in all_files if file.endswith('.zip')]
print('Zips: ', '\n', zip_files)

# Fail with an explicit message instead of an IndexError on zip_files[0]
if not zip_files:
    raise FileNotFoundError(
        'No .zip archive found in ' + repr(datasus_data_dir)
    )

# Only the first archive is processed
zip_file_name = zip_files[0]
print('Zip file name: ', '\n', zip_file_name)

# Archive names are expected to look like
# "<prefix>-<region>-<first year>_<last year>.zip",
# e.g. "suicidios-brazil-1996_2022.zip"; the region is the second field.
region_name = zip_file_name.split('-')[1]
print('Region name: ', '\n', region_name)

# The third field holds the two years; only the first 4 characters of the
# last year are kept, which drops the ".zip" extension.
years_interval = zip_file_name.split('-')[2].split('_')
years_interval[1] = years_interval[1][0:4]
print('Years interval: ', '\n' , years_interval)

# Every year covered by the archive, first and last year both included
years = list(range(int(years_interval[0]), int(years_interval[-1]) + 1))

print('Years:', '\n' , years)

Zips:  
 ['suicidios-brazil-1996_2022.zip']
Zip file name:  
 suicidios-brazil-1996_2022.zip
Region name:  
 brazil
Years interval:  
 ['1996', '2022']
Years: 
 [1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]


#### Zip File Paths

In [4]:
# Folder that contains the zip archive
zip_path = user_dir_path + datasus_data_dir
# Full path of the zip archive itself (folder + archive name)
zip_path_files = zip_path + zip_file_name
print(zip_path, zip_path_files, sep='\n')

# Google Colab variant (disabled): the same two paths would be prefixed with
# google_colab_dir + user_colab_folder_path instead of user_dir_path.

# Name of the merged CSV: "suicidios-<region>-<first year>_<last year>.csv"
df_csv_filename = f'suicidios-{region_name}-{years_interval[0]}_{years_interval[-1]}.csv'
print('\nCSV Filename: ', '\n', df_csv_filename)

../data_storage/0_datasus_csvs/
../data_storage/0_datasus_csvs/suicidios-brazil-1996_2022.zip

CSV Filename:  
 suicidios-brazil-1996_2022.csv


### CSV / Parquet to Dataframe conversion

In [5]:
# CSV CONVERSION
# Read every CSV stored in the zip archive, merge them into one DataFrame
# (csv_to_df) and write the result to the "dirty" data folder.

# One DataFrame per CSV member. They are concatenated once after the loop,
# so the rows accumulated so far are not re-copied on every iteration.
yearly_frames = []

# with ZipFile(zip_path_files_colab, 'r') as z:
with ZipFile(zip_path_files, 'r') as z:
    # CSV members of the archive, sorted by name
    csv_files = sorted([f for f in z.namelist() if f.endswith('.csv')])
    print('CSV files:', csv_files)

    # Iterate through the list of CSV files
    for csvf in csv_files:
        # BytesIO to read file into memory, then read with Pandas
        with z.open(csvf) as f:
            temp_df = pd.read_csv(BytesIO(f.read()))

        yearly_frames.append(temp_df)

# pd.concat() raises on an empty list, so an archive without CSV members
# still yields an empty DataFrame, as before.
if yearly_frames:
    csv_to_df = pd.concat(yearly_frames, ignore_index=True)
else:
    csv_to_df = pd.DataFrame()

# Local PC
# Create the output folder if needed; to_csv() does not create it itself.
os.makedirs(csv_data_dir_dirty, exist_ok=True)
csv_to_df.to_csv(csv_data_dir_dirty + df_csv_filename, index=False)

CSV files: ['1996.csv', '1997.csv', '1998.csv', '1999.csv', '2000.csv', '2001.csv', '2002.csv', '2003.csv', '2004.csv', '2005.csv', '2006.csv', '2007.csv', '2008.csv', '2009.csv', '2010.csv', '2011.csv', '2012.csv', '2013.csv', '2014.csv', '2015.csv', '2016.csv', '2017.csv', '2018.csv', '2019.csv', '2020.csv', '2021.csv', '2022.csv']


In [6]:
# # PARQUET CONVERSION
# # Disabled alternative to the CSV conversion cell: same read-and-concatenate
# # loop, but for archive members ending in .parquet.
# # NOTE(review): unlike the CSV cell, the final to_csv() below writes to
# # df_csv_filename without the csv_data_dir_dirty prefix - confirm the
# # intended output folder before re-enabling this cell.

# # Initialize parquet_to_df
# parquet_to_df = pd.DataFrame()

# # with ZipFile(zip_path_files_colab, 'r') as z:
# with ZipFile(zip_path_files, 'r') as z:
#     # List comprehension to find all Parquet files within the zip
#     parquet_files = [f for f in z.namelist() if f.endswith('.parquet')]
    
#     for pf in parquet_files:
#         # BytesIO to read file into memory, then read with Pandas
#         with z.open(pf) as f:
#             temp_df = pd.read_parquet(BytesIO(f.read()))
        
#         # Concatenate current DataFrame with the Resulting DataFrame
#         parquet_to_df = pd.concat([parquet_to_df, temp_df], ignore_index=True)

# # Local PC
# parquet_to_df.to_csv(df_csv_filename, index=False)

In [7]:
# Load the merged CSV back from the "dirty" folder, decoded as UTF-8
dataframe = pd.read_csv(
    filepath_or_buffer=csv_data_dir_dirty + df_csv_filename,
    encoding='utf-8',
)

In [8]:
# Print a summary of the reloaded dataframe (entries, columns, dtypes, memory)
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268028 entries, 0 to 268027
Columns: 103 entries, ESTADO to ALTCAUSA
dtypes: float64(3), int64(6), object(94)
memory usage: 210.6+ MB


In [9]:
# Optional clean-up (disabled): drop the columns whose values are all missing,
# then print the summary again.
# dataframe = dataframe.dropna(axis="columns", how="all")
# dataframe.info()