Importing libraries

In [43]:
import datetime
import os
import urllib.request

import pandas as pd

Downloading required files 

In [2]:
# Source: https://towardsdatascience.com/automatically-update-data-sources-in-python-e424dbea68d0
def download_files(url, output):
    """Download a gzip-compressed CSV and load it into a DataFrame.

    The remote file is first written to a temporary ``<output>.part`` file and
    only moved over ``output`` once the transfer has completed, so an
    interrupted download cannot overwrite a previously saved copy. If the
    download fails for any reason, the error is printed and whatever copy
    already exists at ``output`` is loaded instead.

    Parameters
    ----------
    url : str
        Location of the ``.csv.gz`` file to download.
    output : str or path-like
        Local path where the downloaded file is stored and read back from.

    Returns
    -------
    pandas.DataFrame
        Contents of ``output`` parsed as a gzip-compressed CSV.

    Raises
    ------
    FileNotFoundError
        If the download failed and no earlier copy exists at ``output``.
    """
    print("Start Program ... ")
    partial = f"{output}.part"
    try:
        print("Start Downloading file ... ")
        urllib.request.urlretrieve(url, partial)
        # Swap the finished download into place in a single step.
        os.replace(partial, output)
        print(f"File {output} -- saved!")
    except Exception as e:
        # Best effort on purpose: a failed refresh must not stop the notebook
        # as long as an earlier download is still available.
        print("Downloading file error: " + str(e))
        if os.path.exists(partial):
            os.remove(partial)
        if os.path.exists(output):
            print("Dataframe was created with the most recent download")
    return pd.read_csv(output, compression='gzip')

In [3]:
# Datasets are listed at https://data.brasil.io/dataset/covid19/_meta/list.html
# Each (url, local path) pair is passed to download_files, which tries to fetch
# the latest version and, if that fails, reads the copy already stored at the
# local path.

df_cases, df_full, df_death = (
    download_files(url, output)
    for url, output in (
        ('https://data.brasil.io/dataset/covid19/caso.csv.gz', 'cases.csv.gz'),
        ('https://data.brasil.io/dataset/covid19/caso_full.csv.gz', 'cases_full.csv.gz'),
        ('https://data.brasil.io/dataset/covid19/obito_cartorio.csv.gz', 'death_data.csv.gz'),
    )
)

Start Program ... 
Start Downloading file ... 
File cases.csv.gz -- saved!
Start Program ... 
Start Downloading file ... 
File cases_full.csv.gz -- saved!
Start Program ... 
Start Downloading file ... 
File death_data.csv.gz -- saved!


Loading files

In [9]:
# Load the five civil-registry CSV files, one DataFrame per file

cr_deaths, cr_states, cr_states_det, cr_city, cr_city_det = map(
    pd.read_csv,
    (
        'civil_registry_deaths.csv',
        'civil_registry_covid_states.csv',
        'civil_registry_covid_states_detailed.csv',
        'civil_registry_covid_cities.csv',
        'civil_registry_covid_cities_detailed.csv',
    ),
)

Data Exploration

In [16]:
# Show the dtype pandas assigned to each column of cr_states_det
cr_states_det.dtypes

date                           object
state                          object
state_ibge_code                 int64
place                          object
gender                         object
age_group                      object
deaths_sars                   float64
deaths_pneumonia              float64
deaths_respiratory_failure    float64
deaths_septicemia             float64
deaths_indeterminate          float64
deaths_others                 float64
deaths_covid19                float64
created_at                     object
dtype: object

In [17]:
# Fraction of missing values per column (mean of the boolean null mask)
cr_states_det.isnull().mean()

date                          0.000000
state                         0.000000
state_ibge_code               0.000000
place                         0.000000
gender                        0.000000
age_group                     0.033404
deaths_sars                   0.984158
deaths_pneumonia              0.683077
deaths_respiratory_failure    0.772754
deaths_septicemia             0.722384
deaths_indeterminate          0.969544
deaths_others                 0.172506
deaths_covid19                0.976924
created_at                    0.000000
dtype: float64

In [25]:
# Number of rows for each value of the gender column
cr_states_det['gender'].value_counts()

M    195319
F    162153
Name: gender, dtype: int64

In [29]:
# Number of rows per age group, ordered from the least to the most frequent
cr_states_det['age_group'].value_counts().sort_values()

10-19     8392
100+     12446
20-29    18540
9-       21307
30-39    24882
40-49    33007
90-99    39356
50-59    41617
60-69    47121
80-89    49307
70-79    49556
Name: age_group, dtype: int64

In [33]:
# Names of all columns whose label contains 'deaths'
cr_states_det.filter(like='deaths').columns

Index(['deaths_sars', 'deaths_pneumonia', 'deaths_respiratory_failure',
       'deaths_septicemia', 'deaths_indeterminate', 'deaths_others',
       'deaths_covid19'],
      dtype='object')

In [36]:
# Collect the columns whose label contains 'deaths', then show the total of
# each one across all rows, smallest total first
death_cols = cr_states_det.filter(like='deaths').columns
cr_states_det.loc[:, death_cols].sum().sort_values()

deaths_sars                     8298.0
deaths_indeterminate           11561.0
deaths_covid19                 26343.0
deaths_respiratory_failure    134343.0
deaths_septicemia             231664.0
deaths_pneumonia              295054.0
deaths_others                 871491.0
dtype: float64

In [21]:
# Date conversion: parse the 'date' column (loaded as object dtype) into
# datetimes and store the result back in the same column. The date-based
# filters elsewhere in this notebook compare this column against datetime
# objects, so this cell has to run before them.

cr_states_det['date'] = pd.to_datetime(cr_states_det['date'])

In [51]:
# Split the records at 1 January 2020: everything dated earlier goes into the
# "2019" frame, everything from that day onwards into the "2020" frame
cr_states_det_2019 = cr_states_det.loc[cr_states_det['date'] < datetime.datetime(2020, 1, 1)]
cr_states_det_2020 = cr_states_det.loc[cr_states_det['date'] >= datetime.datetime(2020, 1, 1)]

In [53]:
# Daily totals for each period. numeric_only=True restricts the aggregation to
# the numeric columns; without it, recent pandas versions also "sum" the text
# columns (state, place, gender, ...) by concatenating their strings.
# Note: state_ibge_code is an integer column, so it is summed as well; that
# total is meaningless and should be ignored.
deaths_2020 = cr_states_det_2020.groupby('date').sum(numeric_only=True)
deaths_2019 = cr_states_det_2019.groupby('date').sum(numeric_only=True)

In [70]:
# Rows whose deaths_covid19 value is exactly 1
cr_states_det.loc[cr_states_det['deaths_covid19'] == 1]

Unnamed: 0,date,state,state_ibge_code,place,gender,age_group,deaths_sars,deaths_pneumonia,deaths_respiratory_failure,deaths_septicemia,deaths_indeterminate,deaths_others,deaths_covid19,created_at
3703,2020-04-07,AC,12,hospital,F,70-79,,,,,,,1.0,2020-05-30 08:50
3766,2020-04-15,AC,12,hospital,M,80-89,,,,,,,1.0,2020-05-30 08:50
3790,2020-04-18,AC,12,hospital,M,50-59,,,,,,,1.0,2020-05-30 08:50
3803,2020-04-19,AC,12,hospital,M,80-89,,,,,,,1.0,2020-05-30 08:50
3807,2020-04-20,AC,12,hospital,M,40-49,,,,,,,1.0,2020-05-30 08:50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357449,2020-05-23,TO,17,public,M,40-49,,,,,,,1.0,2020-05-30 05:04
357452,2020-05-24,TO,17,hospital,F,60-69,,,,,,,1.0,2020-05-30 05:04
357458,2020-05-25,TO,17,hospital,M,40-49,,,,,,,1.0,2020-05-30 05:04
357470,2020-05-27,TO,17,hospital,M,60-69,,,,,,,1.0,2020-05-30 05:04


Data Cleaning