# Data Engineering Capstone Project

## Environment setup

In [1]:
# Import necessary libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id as mono_id
import configparser

In [2]:
# Read config file
config = configparser.ConfigParser()
# Open the file in a context manager so the handle is closed right after parsing
with open('dl.cfg') as config_file:
    config.read_file(config_file)

# Data locations come from the LOCAL section; the path of the JSON data
# dictionary comes from the COMMON section.
INPUT_DATA = config['LOCAL']['INPUT_DATA']
INPUT_DATA_VACCINES = config['LOCAL']['INPUT_DATA_VACCINES']
OUTPUT_DATA = config['LOCAL']['OUTPUT_DATA']
DATA_COLUMNS = config['COMMON']['DATA_COLUMNS']

In [3]:
# Spark session: reuse the active session if there is one, otherwise create it
spark = SparkSession.builder.getOrCreate()

In [4]:
def write_parquet(df, parquet_name):
    """Write `df` as Parquet under OUTPUT_DATA, overwriting existing output.

    The destination is OUTPUT_DATA immediately followed by
    '<parquet_name>.parquet'. A confirmation line is printed after the
    write call returns.
    """
    target_path = f'{OUTPUT_DATA}{parquet_name}.parquet'
    overwrite_writer = df.write.mode("overwrite")
    overwrite_writer.parquet(target_path)
    print(f'Writing {parquet_name} Table DONE.')


In [5]:
def read_parquet(parquet_name):
    """Load '<parquet_name>.parquet' from OUTPUT_DATA as a Spark DataFrame."""
    source_path = f'{OUTPUT_DATA}{parquet_name}.parquet'
    reader = spark.read
    return reader.parquet(source_path)

In [None]:
def check_nulls(df, columns_list, expected_value):
    """Compare the number of rows holding a NULL in any listed column with an expected count.

    Args:
        df: Spark DataFrame to inspect.
        columns_list: names of the columns to test for NULL.
        expected_value: number of offending rows the caller expects.

    Returns:
        True when the number of rows that have NULL in at least one of the
        listed columns equals `expected_value`, otherwise False.
    """
    if not columns_list:
        # Nothing to test, so no row can match
        null_rows = 0
    else:
        # Filter the DataFrame directly: no session-wide temp view is
        # registered (or overwritten) and no global Spark session is needed.
        null_predicate = ' OR '.join(f'{c} IS NULL' for c in columns_list)
        null_rows = df.filter(null_predicate).count()

    return null_rows == expected_value

In [None]:
def check_has_content(df):
    """Return True when `df` contains at least one row, otherwise False.

    Only the first row is requested, so the whole DataFrame does not have
    to be counted just to learn that it is non-empty.
    """
    return len(df.head(1)) > 0

## Step 1: Scope the Project and Gather Data
In this step, we’ll:

* Identify and gather the data we'll be using for our project (at least two sources and more than 1 million rows).
* Explain what end use cases we'd like to prepare the data for (e.g., analytics table, app back-end, source-of-truth database, etc.)

We choose the following datasets:
* Brazilian Government's dataset [COVID-19 population immunization program](https://dados.gov.br/dataset/covid-19-vacinacao/resource/ef3bd0b8-b605-474b-9ae5-c97390c197a8?inner_span=True)

In [6]:
# Load the raw vaccination records: ';'-separated CSV with a header row.
# No schema is supplied, so the columns come back as strings.
vaccines_df = (
    spark.read
    .option('sep', ';')
    .option('header', True)
    .csv(INPUT_DATA_VACCINES)
)

vaccines_df.printSchema()

root
 |-- data_importacao_rnds: string (nullable = true)
 |-- document_id: string (nullable = true)
 |-- estabelecimento_municipio_codigo: string (nullable = true)
 |-- estabelecimento_municipio_nome: string (nullable = true)
 |-- estabelecimento_razaosocial: string (nullable = true)
 |-- estabelecimento_uf: string (nullable = true)
 |-- estabelecimento_valor: string (nullable = true)
 |-- estalecimento_nofantasia: string (nullable = true)
 |-- id_sistema_origem: string (nullable = true)
 |-- paciente_datanascimento: string (nullable = true)
 |-- paciente_endereco_cep: string (nullable = true)
 |-- paciente_endereco_coibgemunicipio: string (nullable = true)
 |-- paciente_endereco_copais: string (nullable = true)
 |-- paciente_endereco_nmmunicipio: string (nullable = true)
 |-- paciente_endereco_nmpais: string (nullable = true)
 |-- paciente_endereco_uf: string (nullable = true)
 |-- paciente_enumsexobiologico: string (nullable = true)
 |-- paciente_id: string (nullable = true)
 |-- pac

## Step 2: Explore and Assess the Data
In this step we need to:
* Explore the data to identify data quality issues, like missing values, duplicate data, etc.
* Document steps necessary to clean the data

In [7]:
# Load the JSON data dictionary as a pandas Series; its index labels are
# the source columns that the data model keeps.
col_names = pd.read_json(path_or_buf=DATA_COLUMNS, typ='series')
valid_columns = col_names.index
valid_columns

Index(['paciente_id', 'paciente_idade', 'paciente_datanascimento',
       'paciente_enumsexobiologico', 'paciente_endereco_nmpais',
       'paciente_endereco_uf', 'paciente_endereco_nmmunicipio',
       'estabelecimento_razaosocial', 'estalecimento_nofantasia',
       'estabelecimento_uf', 'estabelecimento_municipio_nome',
       'vacina_categoria_codigo', 'vacina_categoria_nome',
       'vacina_grupoatendimento_codigo', 'vacina_grupoatendimento_nome',
       'vacina_fabricante_nome', 'vacina_codigo', 'vacina_nome',
       'vacina_dataaplicacao'],
      dtype='object')

In [8]:
# Get the dataframe columns that are not listed in the data dictionary.
# Walking the dataframe's own column order (instead of subtracting sets)
# keeps the resulting list in the same order on every kernel restart.
columns_todrop = [c for c in vaccines_df.columns if c not in valid_columns]

columns_todrop

['estabelecimento_valor',
 'id_sistema_origem',
 'paciente_racacor_codigo',
 'paciente_racacor_valor',
 'vacina_fabricante_referencia',
 'data_importacao_rnds',
 'paciente_endereco_coibgemunicipio',
 'paciente_endereco_cep',
 'document_id',
 'estabelecimento_municipio_codigo',
 'vacina_lote',
 'paciente_nacionalidade_enumnacionalidade',
 'paciente_endereco_copais',
 'sistema_origem',
 'vacina_descricao_dose']

In [9]:
# Keep only the columns named in the data dictionary, then show what is left
vaccines_df = vaccines_df.drop(*columns_todrop)
vaccines_df.printSchema()

root
 |-- estabelecimento_municipio_nome: string (nullable = true)
 |-- estabelecimento_razaosocial: string (nullable = true)
 |-- estabelecimento_uf: string (nullable = true)
 |-- estalecimento_nofantasia: string (nullable = true)
 |-- paciente_datanascimento: string (nullable = true)
 |-- paciente_endereco_nmmunicipio: string (nullable = true)
 |-- paciente_endereco_nmpais: string (nullable = true)
 |-- paciente_endereco_uf: string (nullable = true)
 |-- paciente_enumsexobiologico: string (nullable = true)
 |-- paciente_id: string (nullable = true)
 |-- paciente_idade: string (nullable = true)
 |-- vacina_categoria_codigo: string (nullable = true)
 |-- vacina_categoria_nome: string (nullable = true)
 |-- vacina_codigo: string (nullable = true)
 |-- vacina_dataaplicacao: string (nullable = true)
 |-- vacina_fabricante_nome: string (nullable = true)
 |-- vacina_grupoatendimento_codigo: string (nullable = true)
 |-- vacina_grupoatendimento_nome: string (nullable = true)
 |-- vacina_nome

In [10]:
# Replace nulls in the listed columns with explicit placeholder values
vaccines_df = vaccines_df.na.fill({
    'vacina_categoria_codigo': 0,
    'vacina_categoria_nome': 'N/A',
    'vacina_grupoatendimento_nome': 'N/A',
    'paciente_enumsexobiologico': 'N/A',
    'estalecimento_nofantasia': 'N/A',
})

## Step 3: Define the Data Model
_Map out the conceptual data model and explain why you chose that model_

The data model is a star schema consisting of 5 dimension tables and 1 fact table:
  * Dimensions tables:
      * vaccines table: Vaccines and suppliers
      * health_institution table: Hospitals, Nursing home, Clinics 
      * category table: Priority groups
      * population_group table: Demographic group (professions, age group, ethnicity)
      * patient table: Demographic data (age, city, gender)
  * Fact table:
      * imunization table: Dimensions, First | second dose, date

![ER Data Model - Star Schema](./docs/er-model-star.jpg)

_List the steps necessary to pipeline the data into the chosen data model_
* ETL starts with the environment setup: imports, reading the config file, defining helper functions and creating the Spark session
* ETL script takes the source data (the Brazilian Government's COVID-19 population immunization program dataset)
* Raw data is read into dataframe and cleaned (remove unused columns, fill nulls) 
* For each dimension and fact table 
	* Create a temporary view table
	* Read data to new dataframe
	* Check data quality: key columns don't have nulls, each table has content
	* Create id/indexes (if necessary)
	* Write parquet files

## Step 4: Run ETL to Model the Data
* Create the data pipelines and the data model
* Include a data dictionary
* Run data quality checks to ensure the pipeline ran as expected
	* Integrity constraints on the relational database (e.g., unique key, data type, etc.)
	* Unit tests for the scripts to ensure they are doing the right thing
	* Source/count checks to ensure completeness

In [11]:
# Create the vaccines dimension (the parquet write happens in a later cell).
# The cleaned dataframe is exposed to Spark SQL as a temp view, and the query
# keeps the distinct (code, name, manufacturer) rows sorted by manufacturer.
vaccines_df.createOrReplaceTempView("vaccines_table_DF")
vaccines_table_DF = spark.sql("""
    SELECT  DISTINCT vacina_codigo AS id, 
                     vacina_nome AS name, 
                     vacina_fabricante_nome AS supplier
    FROM vaccines_table_DF
    ORDER BY supplier
""")

# Inspect the resulting schema and a sample of rows
vaccines_table_DF.printSchema()
vaccines_table_DF.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- supplier: string (nullable = true)

+---+--------------------+--------------------+
| id|                name|            supplier|
+---+--------------------+--------------------+
| 86|Covid-19-Coronava...|   FUNDACAO BUTANTAN|
| 85|Vacina Covid-19 -...|FUNDACAO OSWALDO ...|
+---+--------------------+--------------------+



In [None]:
# Check data quality. The results are asserted so that a failed check stops
# the pipeline instead of being silently discarded.
assert check_nulls(vaccines_table_DF, ['id', 'name', 'supplier'], 0), (
    'vaccines: NULL values found in id, name or supplier'
)
assert check_has_content(vaccines_table_DF), 'vaccines: table has no rows'

In [12]:
# Persist the vaccines dimension, then reload it from the stored parquet
write_parquet(df=vaccines_table_DF, parquet_name='vaccines')
vaccines_table_DF = read_parquet(parquet_name='vaccines')

Writing vaccines Table DONE.


In [13]:
# Create the health institution dimension (the parquet write happens in a
# later cell). The query keeps the distinct (name, organization, state, city)
# rows of the cleaned dataframe, sorted by name.
vaccines_df.createOrReplaceTempView("health_institution_table_DF")
health_institution_table_DF = spark.sql("""
    SELECT DISTINCT estalecimento_nofantasia AS name,
                    estabelecimento_razaosocial AS organization,
                    estabelecimento_uf AS state,
                    estabelecimento_municipio_nome AS city
    FROM health_institution_table_DF
    ORDER BY name
""")

# The source has no institution key, so a surrogate 'id' column is prepended.
# mono_id is monotonically_increasing_id: the values are unique but not
# consecutive, as the sample output shows.
health_institution_table_DF = health_institution_table_DF.select(mono_id().alias('id'), '*')
health_institution_table_DF.printSchema()
health_institution_table_DF.show()

root
 |-- id: long (nullable = false)
 |-- name: string (nullable = false)
 |-- organization: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)

+------------+--------------------+--------------------+-----+-------------------+
|          id|                name|        organization|state|               city|
+------------+--------------------+--------------------+-----+-------------------+
|           0|AMBULATORIO MUNIC...|PREFEITURA MUNICI...|   MA|    POCAO DE PEDRAS|
|           1|CEADIM DE SERRANO...|MUNICIPIO DE SERR...|   MA|SERRANO DO MARANHAO|
|  8589934592|CENTRO DE ATENDIM...|PREFEITURA MUNICI...|   MA|      VARGEM GRANDE|
| 17179869184|CENTRO DE ESPECIA...|PREFEITURA MUNICI...|   MA|    BARAO DE GRAJAU|
| 25769803776|CENTRO DE ESPECIA...|PREFEITURA MUNICI...|   MA|           PINHEIRO|
| 34359738368|CENTRO DE SAUDE A...|MUNICIPIO DE URBA...|   MA|      URBANO SANTOS|
| 42949672960|CENTRO DE SAUDE A...|PREFEITURA MUNICI...|   M

In [None]:
# Check data quality. The results are asserted so that a failed check stops
# the pipeline instead of being silently discarded.
assert check_nulls(health_institution_table_DF, ['id', 'name', 'organization', 'state', 'city'], 0), (
    'health_institution: NULL values found in id, name, organization, state or city'
)
assert check_has_content(health_institution_table_DF), 'health_institution: table has no rows'

In [14]:
# Persist the health institution dimension, then reload it from the stored
# parquet so the generated ids are read back from disk in later cells
write_parquet(df=health_institution_table_DF, parquet_name='health_institution')
health_institution_table_DF = read_parquet(parquet_name='health_institution')

Writing health_institution Table DONE.


In [15]:
# Create the category dimension (the parquet write happens in a later cell).
# The query keeps the distinct (category code, category name) pairs of the
# cleaned dataframe, sorted by name.
vaccines_df.createOrReplaceTempView("category_table_DF")
category_table_DF = spark.sql("""
    SELECT DISTINCT vacina_categoria_codigo AS id,
                    vacina_categoria_nome AS name
            FROM category_table_DF
            ORDER BY name
""")

# Inspect the resulting schema and a sample of rows
category_table_DF.printSchema()
category_table_DF.show()

root
 |-- id: string (nullable = false)
 |-- name: string (nullable = false)

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|        Comorbidades|
|  2|        Faixa Etária|
| 11|Pessoas com Defic...|
|  3|Pessoas de 60 ano...|
|  7|     Povos Indígenas|
|  6|Povos e Comunidad...|
|  9|Trabalhadores de ...|
+---+--------------------+



In [None]:
# Check data quality. The results are asserted so that a failed check stops
# the pipeline instead of being silently discarded.
assert check_nulls(category_table_DF, ['id', 'name'], 0), (
    'category: NULL values found in id or name'
)
assert check_has_content(category_table_DF), 'category: table has no rows'

In [16]:
# Persist the category dimension, then reload it from the stored parquet
write_parquet(df=category_table_DF, parquet_name='category')
category_table_DF = read_parquet(parquet_name='category')

Writing category Table DONE.


In [17]:
# Create the population group dimension (the parquet write happens in a later
# cell). The query keeps the distinct (group code, group name) pairs of the
# cleaned dataframe, sorted by name.
# NOTE(review): only the group name is null-filled in the cleaning cell, not
# the group code, so 'id' can still be NULL here - confirm against the data.
vaccines_df.createOrReplaceTempView("population_group_table_DF")
population_group_table_DF = spark.sql("""
    SELECT DISTINCT vacina_grupoatendimento_codigo AS id,
                    vacina_grupoatendimento_nome AS name
            FROM population_group_table_DF
        ORDER BY name
""")

# Inspect the resulting schema and a sample of rows
population_group_table_DF.printSchema()
population_group_table_DF.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = false)

+----+--------------------+
|  id|                name|
+----+--------------------+
| 905|  Cuidador de Idosos|
| 103|   Diabetes Mellitus|
| 105|Doença Renal Crônica|
| 106|Doenças Cardiovas...|
| 907|       Enfermeiro(a)|
| 909|     Fisioterapeutas|
| 911|Funcionário do Si...|
| 107|Hipertensão de di...|
| 102|          Neoplasias|
| 915|       Nutricionista|
| 926|              Outros|
|1102|Pessoas com Defic...|
| 201|Pessoas de 18 a 6...|
| 301|Pessoas de 60 ano...|
| 202|Pessoas de 65 a 6...|
| 203|Pessoas de 70 a 7...|
| 204|Pessoas de 75 a 7...|
| 205|Pessoas de 80 ano...|
| 701|Povos indígenas e...|
| 917|Profissionais e A...|
+----+--------------------+
only showing top 20 rows



In [None]:
# Check data quality. The results are asserted so that a failed check stops
# the pipeline instead of being silently discarded.
assert check_nulls(population_group_table_DF, ['id', 'name'], 0), (
    'population_group: NULL values found in id or name'
)
assert check_has_content(population_group_table_DF), 'population_group: table has no rows'

In [18]:
# Persist the population group dimension, then reload it from the stored parquet
write_parquet(df=population_group_table_DF, parquet_name='population_group')
population_group_table_DF = read_parquet(parquet_name='population_group')

Writing population_group Table DONE.


In [19]:
# Create the patient dimension (the parquet write happens in a later cell).
# Rows without a patient id are excluded; the remaining distinct rows are
# sorted by id.
# NOTE(review): DISTINCT applies to the whole row, so a paciente_id that
# occurs with different age or address values produces several rows sharing
# the same id - confirm whether 'id' is required to be unique.
vaccines_df.createOrReplaceTempView("patient_table_DF")
patient_table_DF = spark.sql("""
    SELECT DISTINCT paciente_id AS id,
                    paciente_idade AS age,
                    paciente_datanascimento AS birthdate,
                    paciente_enumsexobiologico AS gender,
                    paciente_endereco_nmpais AS country,
                    paciente_endereco_uf AS state,
                    paciente_endereco_nmmunicipio AS city
            FROM patient_table_DF
            WHERE paciente_id IS NOT NULL
            ORDER BY id
""")

# Inspect the resulting schema and a sample of rows
patient_table_DF.printSchema()
patient_table_DF.show()

root
 |-- id: string (nullable = true)
 |-- age: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- gender: string (nullable = false)
 |-- country: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)

+--------------------+---+-------------------+------+-------+-----+--------------------+
|                  id|age|          birthdate|gender|country|state|                city|
+--------------------+---+-------------------+------+-------+-----+--------------------+
|001172d8023bf74e8...| 69|1951-05-03 00:00:00|     M| BRASIL|   MA|            SAO LUIS|
|0037915f9d8c82740...| 80|1940-07-14 00:00:00|     F| BRASIL|   MA|          ACAILANDIA|
|003f0c135c70c7f4d...| 82|1938-05-09 00:00:00|     F| BRASIL|   MA|  ITINGA DO MARANHAO|
|00ba3b39629b28be3...| 61|1960-01-21 00:00:00|     F| BRASIL|   MA|      BARRA DO CORDA|
|012c38283b39186dd...| 50|1970-04-03 00:00:00|     M| BRASIL|   MA|       PRIMEIRA CRUZ|
|014d4147b3c1cde08...

In [None]:
# Check data quality. The results are asserted so that a failed check stops
# the pipeline instead of being silently discarded.
assert check_nulls(patient_table_DF, ['id', 'age', 'birthdate', 'gender', 'country', 'state', 'city'], 0), (
    'patient: NULL values found in id, age, birthdate, gender, country, state or city'
)
assert check_has_content(patient_table_DF), 'patient: table has no rows'

In [20]:
# Persist the patient dimension, then reload it from the stored parquet
write_parquet(df=patient_table_DF, parquet_name='patient')
patient_table_DF = read_parquet(parquet_name='patient')

Writing patient Table DONE.


In [21]:
# Attach the surrogate health-institution id to every vaccination record.
# The dimension holds DISTINCT (name, organization, state, city) rows, so all
# four attributes take part in the join: matching on name and organization
# alone would pair a record with every institution row that shares those two
# values and duplicate it under several ids.
# eqNullSafe treats NULL as equal to NULL, so a record with a missing
# attribute still finds its dimension row instead of being dropped.
vaccines_df_joined = vaccines_df.join(
    health_institution_table_DF,
    [
        vaccines_df.estalecimento_nofantasia.eqNullSafe(health_institution_table_DF.name),
        vaccines_df.estabelecimento_razaosocial.eqNullSafe(health_institution_table_DF.organization),
        vaccines_df.estabelecimento_uf.eqNullSafe(health_institution_table_DF.state),
        vaccines_df.estabelecimento_municipio_nome.eqNullSafe(health_institution_table_DF.city),
    ],
)

vaccines_df_joined.printSchema()

root
 |-- estabelecimento_municipio_nome: string (nullable = true)
 |-- estabelecimento_razaosocial: string (nullable = true)
 |-- estabelecimento_uf: string (nullable = true)
 |-- estalecimento_nofantasia: string (nullable = false)
 |-- paciente_datanascimento: string (nullable = true)
 |-- paciente_endereco_nmmunicipio: string (nullable = true)
 |-- paciente_endereco_nmpais: string (nullable = true)
 |-- paciente_endereco_uf: string (nullable = true)
 |-- paciente_enumsexobiologico: string (nullable = false)
 |-- paciente_id: string (nullable = true)
 |-- paciente_idade: string (nullable = true)
 |-- vacina_categoria_codigo: string (nullable = false)
 |-- vacina_categoria_nome: string (nullable = false)
 |-- vacina_codigo: string (nullable = true)
 |-- vacina_dataaplicacao: string (nullable = true)
 |-- vacina_fabricante_nome: string (nullable = true)
 |-- vacina_grupoatendimento_codigo: string (nullable = true)
 |-- vacina_grupoatendimento_nome: string (nullable = false)
 |-- vacina

In [22]:
# Create the imunization fact table (the parquet write happens in a later
# cell). It is built from the joined dataframe, so 'id' below is the
# health-institution surrogate id; the distinct rows are sorted by jab date.
# NOTE(review): the columns_todrop output earlier in the notebook lists
# vacina_descricao_dose, and the schema printed under this cell has no
# vaccines_dose column, so the saved outputs appear to predate this query.
# Confirm that DATA_COLUMNS now includes vacina_descricao_dose; otherwise
# the column no longer exists at this point.
vaccines_df_joined.createOrReplaceTempView("imunization_table_DF")
imunization_table_DF = spark.sql("""
    SELECT distinct paciente_id AS patient_id,
            id AS health_institution_id,
            vacina_categoria_codigo AS category_id,
            vacina_grupoatendimento_codigo AS population_group_id,
            vacina_codigo AS vaccines_id,
            vacina_descricao_dose AS vaccines_dose,
            vacina_dataaplicacao AS jab_date
        FROM imunization_table_DF
        ORDER BY jab_date
""")

imunization_table_DF.printSchema()

root
 |-- patient_id: string (nullable = true)
 |-- health_institution_id: long (nullable = true)
 |-- category_id: string (nullable = false)
 |-- population_group_id: string (nullable = true)
 |-- vaccines_id: string (nullable = true)
 |-- jab_date: string (nullable = true)



In [None]:
# Check data quality. The results are asserted so that a failed check stops
# the pipeline instead of being silently discarded.
assert check_nulls(imunization_table_DF, ['patient_id', 'health_institution_id', 'category_id', 'population_group_id', 'vaccines_id', 'vaccines_dose', 'jab_date'], 0), (
    'imunization: NULL values found in one of the fact table columns'
)
assert check_has_content(imunization_table_DF), 'imunization: table has no rows'

In [23]:
# Persist the imunization fact table, then reload it from the stored parquet
write_parquet(df=imunization_table_DF, parquet_name='imunization')
imunization_table_DF = read_parquet(parquet_name='imunization')

Writing imunization Table DONE.


In [24]:
# Preview the first 20 rows of the fact table reloaded from parquet
imunization_table_DF.show(n=20, truncate=True)

+--------------------+---------------------+-----------+-------------------+-----------+-------------------+
|          patient_id|health_institution_id|category_id|population_group_id|vaccines_id|           jab_date|
+--------------------+---------------------+-----------+-------------------+-----------+-------------------+
|1e0a0f8549d3f0d4d...|         429496729600|          2|                203|         86|2021-03-29 21:00:00|
|4acc91ff29e63517e...|         163208757249|          6|                601|         85|2021-03-29 21:00:00|
|c38e7ab6cdb03b114...|         566935683072|          6|                601|         85|2021-03-29 21:00:00|
|7c66c5cf8de7570fc...|         618475290624|          2|                203|         86|2021-03-29 21:00:00|
|9d341f28d622689d6...|         618475290624|          2|                204|         86|2021-03-29 21:00:00|
|378809f96d3f0ded7...|         781684047872|          6|                601|         85|2021-03-29 21:00:00|
|2ac4028a431da5bc6.