In [None]:
# Import necessary libraries
import pandas as pd
from pyspark.sql import SparkSession
import configparser

In [None]:
# Read the notebook configuration from dl.cfg.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes; read_file() still raises if dl.cfg is missing.
config = configparser.ConfigParser()
with open('dl.cfg') as config_file:
    config.read_file(config_file)

# Data locations: the two input paths come from the [LOCAL] section,
# the data-dictionary path from [COMMON].
INPUT_DATA = config['LOCAL']['INPUT_DATA']
INPUT_DATA_VACCINES = config['LOCAL']['INPUT_DATA_VACCINES']
DATA_DICT = config['COMMON']['DATA_DICT']

### Step 1: Scope the Project and Gather Data
In this step, we’ll:

* Identify and gather the data we'll be using for our project (at least two sources and more than 1 million rows).
* Explain what end use cases we'd like to prepare the data for (e.g., analytics table, app back-end, source-of-truth database, etc.)

We choose the following datasets:
* Brazilian Government's dataset [COVID-19 population immunization program](https://dados.gov.br/dataset/covid-19-vacinacao/resource/ef3bd0b8-b605-474b-9ae5-c97390c197a8?inner_span=True)

In [None]:
# Create (or reuse) the Spark session, requesting the spark-sas7bdat package
# and enabling Hive support.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Load the vaccination CSV: fields are separated by ';' and the first row
# holds the column names. No schema or inferSchema option is passed.
vaccines_df = (
    spark.read
    .option("sep", ";")
    .option("header", True)
    .csv(INPUT_DATA_VACCINES)
)

### Step 2: Explore and Assess the Data

In [None]:
# Load the data dictionary JSON as a pandas Series (typ="series").
data_dict = pd.read_json(path_or_buf=DATA_DICT, typ="series")

In [None]:
# The index labels of the data dictionary are used below as the list of
# columns to select from the vaccination data.
col_names = data_dict.index.tolist()
col_names

In [None]:
# Preview only the columns listed in the data dictionary.
vaccines_df.select(*col_names).show()

In [None]:
# Return the first 10 rows to the driver as a list of Row objects.
vaccines_df.take(10)

In [None]:
# Show up to 10 establishment legal names for rows whose municipality is PINHEIRO.
# The filter runs before the projection so that it references a column that is
# still present in the DataFrame, instead of relying on Spark to resolve a
# column that select() has already dropped.
(
    vaccines_df
    .filter(vaccines_df.estabelecimento_municipio_nome == 'PINHEIRO')
    .select("estabelecimento_razaosocial")
    .show(10)
)

In [None]:
# List the distinct (code, name) pairs found in the two
# vacina_grupoatendimento_* columns.
(
    vaccines_df
    .select("vacina_grupoatendimento_codigo", "vacina_grupoatendimento_nome")
    .distinct()
    .show()
)