# DATASUS Data Collector using PySUS

##### Run this notebook on Google Colab or on a Linux distribution; it no longer works on Windows.

In [1]:
# Mount Google Drive inside the Colab runtime so later cells can write
# their output files under the mounted folder.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pkg_resources

# Package checking list
packages_to_check = [
    {
     'name': 'pandas',
     'version':'1.5.3'
     },
    {
     'name': 'pySUS',
     'version': '0.9.4'
     },
    ]

# check if necessary packages are installed, and install if not
for package in packages_to_check:
    print(package)
    try:
        pkg_resources.get_distribution(package['name'])
        print(f"{package['name']} is installed.")
    except pkg_resources.DistributionNotFound:
        print(f"{package['name']} is not installed. Installing:")
        %pip install {package['name']}=={package['version']}

{'name': 'pandas', 'version': '1.5.3'}
pandas is installed.
{'name': 'pySUS', 'version': '0.9.4'}
pySUS is installed.


In [3]:
from pathlib import Path

import pandas as pd
from pysus.online_data.SIM import download

In [None]:
# Download the SIM records for every selected (state, year) pair, keep only
# the rows whose CAUSABAS code is in the filter list, and write one CSV per
# year to a Google Drive folder.

# Definition of all states from all regions in Brazil
states_sul = ["PR", "RS", "SC"]
states_sudeste = ["ES", "MG", "RJ", "SP"]
states_nordeste = ["AL", "BA", "CE", "MA", "PE", "PI", "PB", "RN", "SE"]
states_norte = ["AC", "AM", "AP", "PA", "RO", "RR", "TO"]
states_centrooeste = ["DF", "GO", "MS", "MT"]

# all_states contains every state listed above (27 entries)
all_states = states_sul + states_sudeste + states_nordeste + states_norte + states_centrooeste

# states_to_download determines which states' data will be downloaded
states_to_download = all_states
print("States that will be downloaded:", states_to_download, '\n')

# Years that will be downloaded: 1996 through 2022 (the range end is exclusive)
years_to_download = list(range(1996, 2023))
print("Years that will be downloaded:", years_to_download, '\n')

# CAUSABAS values kept by the filter: "X600" through "X849".
# Built once here because it does not change between years.
icd10_filter_list = ["X{}".format(x) for x in range(600, 850)]

# Define user folder path (relative to the mounted Drive root)
user_folder_path = ".Mestrado/Orientação/.Datasus Data Reader/data_storage/0_datasus_csvs"

# Validate the destination before downloading anything, so a missing mount
# or folder is reported immediately instead of after a full year of downloads.
drive_root = Path('/content/drive/My Drive')
if not drive_root.is_dir():
  raise FileNotFoundError(
      "Google Drive is not mounted at '/content/drive'; run the drive.mount cell first."
  )
output_dir = drive_root / user_folder_path
output_dir.mkdir(parents=True, exist_ok=True)

# Iteration of years
for i_year in years_to_download:
  # Maps (state, year) -> whatever download() returned for that pair
  states_dict = {}

  # Iteration of states
  for i_state in states_to_download:
    states_dict[i_state, i_year] = download(i_state, i_year)
    print(str(i_state) + " database from " + str(i_year) + " downloaded!")

  print('states_dict result:', '\n', states_dict, '\n')

  # data_df holds all states' data for the current year. The (state, year)
  # dict keys become index levels, which reset_index() turns into the
  # "level_0" / "level_1" columns (plus "level_2" for the per-file row index).
  data_df = pd.concat({k: pd.read_parquet(v) for k, v in states_dict.items()}, axis=0).reset_index()
  # For PySUS version 0.10^
  # data_df = pd.concat({k: v.to_dataframe() for k, v in states_dict.items()}, axis=0).reset_index()

  # Strip surrounding whitespace, otherwise the isin() comparison below fails
  data_df["CAUSABAS"] = data_df["CAUSABAS"].str.strip()

  # Keep only the filtered codes and give the key columns meaningful names.
  # rename() returns a new frame here; renaming the filtered slice in place
  # would trigger pandas' SettingWithCopyWarning.
  suicides_df = (
      data_df[data_df["CAUSABAS"].isin(icd10_filter_list)]
      .rename(columns={"level_0": "ESTADO", "level_1": "ANO"})
  )

  # Upload to chosen folder, one file per year
  suicides_df.to_csv(output_dir / (str(i_year) + '.csv'), index=False)

  # Use parquets if RAM explodes
  # suicides_df.to_parquet(output_dir / (str(i_year) + '.parquet'), index=False)

  # End of year iteration

print("Download and upload finished!")