# Visualizações

Criar visualizações que permitam obter bons insights e acompanhar a evolução da
pandemia de COVID-19.

## Carregamento de datasets

In [0]:
# Load the two parquet datasets stored under the data lake's `_ready` area.
# NOTE(review): `spark` is not defined in this notebook; presumably it is the
# session object injected by the notebook runtime.
data_by_country_path = "dbfs:/FileStore/_covid_data_lake/_ready/_data_by_country/data_by_country.parquet"
summary_country_path = "dbfs:/FileStore/_covid_data_lake/_ready/_summary_country/summary_country.parquet"

df_data_by_country = spark.read.parquet(data_by_country_path)
df_summary_country = spark.read.parquet(summary_country_path)

In [0]:
import pyspark.sql.functions as f
# Collapse the data to one row per (NOME, DATE), keeping the largest value of
# each counter, and sort the result chronologically.
counters_max = [
    f.max(counter).alias(counter)
    for counter in ("CONFIRMED", "DEATHS", "RECOVERED")
]
new_data_by_country = (
    df_data_by_country
    .groupBy("NOME", "DATE")
    .agg(*counters_max)
    .orderBy(f.asc("DATE"))
)

display(new_data_by_country)

NOME,DATE,CONFIRMED,DEATHS,RECOVERED
Grenada,2020-01-22,0,0,0
Portugal,2020-01-22,0,0,0
Sao Tome and Principe,2020-01-22,0,0,0
Canada,2020-01-22,0,0,0
Malta,2020-01-22,0,0,0
Nepal,2020-01-22,0,0,0
Bulgaria,2020-01-22,0,0,0
Russian Federation,2020-01-22,0,0,0
Kazakhstan,2020-01-22,0,0,0
Poland,2020-01-22,0,0,0


In [0]:
# Preview the per-country summary dataset loaded above.
display(df_summary_country)

NOME,SIGLA,NEWCONFIRMED,TOTALCONFIRMED,NEWDEATHS,TOTALDEATHS,NEWRECOVERED,TOTALRECOVERED,DATE
Afghanistan,AF,18,55894,0,2451,90,49499,2021-03-12
Albania,AL,631,114840,17,1986,714,77498,2021-03-12
Algeria,DZ,138,114681,4,3026,109,79428,2021-03-12
Andorra,AD,41,11130,0,112,23,10708,2021-03-12
Angola,AO,47,21161,0,516,84,19761,2021-03-12
Antigua and Barbuda,AG,20,882,1,23,34,489,2021-03-12
Argentina,AR,7693,2169694,107,53359,5049,1961640,2021-03-12
Armenia,AM,340,175538,5,3237,163,165259,2021-03-12
Australia,AU,16,29090,0,909,9,22945,2021-03-12
Austria,AT,2528,481919,19,8776,2012,449053,2021-03-12


# Análise exploratória dos dados

Aqui realizaremos uma série de consultas (queries) sobre os dados.

## Países TOP 5 em número de casos confirmados de COVID-19

In [0]:
# Five countries with the highest TOTALCONFIRMED value in the summary dataset.
df = (
    df_summary_country
    .select('NOME', 'TOTALCONFIRMED')
    .sort(f.col('TOTALCONFIRMED').desc())
    .limit(5)
)

display(df)

NOME,TOTALCONFIRMED
United States of America,29236735
India,11285561
Brazil,11202305
Russian Federation,4302726
United Kingdom,4247879


## Países TOP 5 em número de mortes por COVID-19

In [0]:
# Five countries with the highest TOTALDEATHS value in the summary dataset.
df = (
    df_summary_country
    .select('NOME', 'TOTALDEATHS')
    .sort(f.col('TOTALDEATHS').desc())
    .limit(5)
)

display(df)

NOME,TOTALDEATHS
United States of America,529263
Brazil,270656
Mexico,192491
India,158189
United Kingdom,125222


In [0]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from datetime import datetime, date

def month_collected(data):
  """Return the full month name (e.g. "January") of a date value.

  Args:
    data: a ``datetime.date`` / ``datetime.datetime`` value, or ``None``.

  Returns:
    The string produced by ``data.strftime("%B")``, or ``None`` when
    ``data`` is ``None``.
  """
  # A NULL date reaches a Python UDF as None; propagate it instead of raising
  # AttributeError and failing the whole Spark job.
  if data is None:
    return None
  # NOTE: %B is locale-dependent, so the name follows the executing
  # process's locale.
  return data.strftime("%B")

def year_collected(data):
  """Return the four-digit year of a date value as a string (e.g. "2020").

  Args:
    data: a ``datetime.date`` / ``datetime.datetime`` value, or ``None``.

  Returns:
    The string produced by ``data.strftime("%Y")``, or ``None`` when
    ``data`` is ``None``.
  """
  # A NULL date reaches a Python UDF as None; propagate it instead of raising
  # AttributeError and failing the whole Spark job.
  if data is None:
    return None
  return data.strftime("%Y")

# Wrap the plain Python helpers as Spark UDFs whose result type is string.
month_collected_udf = udf(month_collected, returnType=StringType())
year_collected_udf = udf(year_collected, returnType=StringType())

In [0]:
# Derive the month name of each row from its DATE column.
# Spark's built-in date_format is used instead of the Python UDF: it is
# evaluated by Spark itself (no per-row Python round trip) and yields NULL
# for NULL dates instead of raising.
df = new_data_by_country.select("*", f.date_format("DATE", "MMMM").alias("MONTH"))

display(df)

NOME,DATE,CONFIRMED,DEATHS,RECOVERED,MONTH
Honduras,2020-01-22,0,0,0,January
Portugal,2020-01-22,0,0,0,January
Qatar,2020-01-22,0,0,0,January
Grenada,2020-01-22,0,0,0,January
Chad,2020-01-22,0,0,0,January
Iceland,2020-01-22,0,0,0,January
Mozambique,2020-01-22,0,0,0,January
Poland,2020-01-22,0,0,0,January
Italy,2020-01-22,0,0,0,January
Moldova,2020-01-22,0,0,0,January


In [0]:
# Derive the year of each row from its DATE column.
# Spark's built-in date_format is used instead of the Python UDF; the "yyyy"
# pattern keeps YEAR a string column, as the UDF did, and yields NULL for
# NULL dates instead of raising.
df2 = df.select("*", f.date_format("DATE", "yyyy").alias("YEAR"))

display(df2)

NOME,DATE,CONFIRMED,DEATHS,RECOVERED,MONTH,YEAR
Honduras,2020-01-22,0,0,0,January,2020
Portugal,2020-01-22,0,0,0,January,2020
Qatar,2020-01-22,0,0,0,January,2020
Grenada,2020-01-22,0,0,0,January,2020
Chad,2020-01-22,0,0,0,January,2020
Iceland,2020-01-22,0,0,0,January,2020
Mozambique,2020-01-22,0,0,0,January,2020
Poland,2020-01-22,0,0,0,January,2020
Italy,2020-01-22,0,0,0,January,2020
Moldova,2020-01-22,0,0,0,January,2020


In [0]:
import pyspark.sql.functions as f

## Evolução de casos e mortes no Brasil

In [0]:
# Daily counters for Brazil only, in chronological order.
brazil_rows = df2.filter("NOME in ('Brazil')")
cases_total_Brazil = (
    brazil_rows
    .select("NOME", "CONFIRMED", "DEATHS", "RECOVERED", "DATE")
    .sort(f.asc('DATE'))
)

display(cases_total_Brazil)

NOME,CONFIRMED,DEATHS,RECOVERED,DATE
Brazil,0,0,0,2020-01-22
Brazil,0,0,0,2020-01-23
Brazil,0,0,0,2020-01-24
Brazil,0,0,0,2020-01-25
Brazil,0,0,0,2020-01-26
Brazil,0,0,0,2020-01-27
Brazil,0,0,0,2020-01-28
Brazil,0,0,0,2020-01-29
Brazil,0,0,0,2020-01-30
Brazil,0,0,0,2020-01-31


## Evolução de casos e mortes no Mundo

In [0]:
# Worldwide totals per day: sum every country's counters for each DATE and
# sort the result chronologically.
world_totals = [
    f.sum(counter).alias(counter)
    for counter in ("CONFIRMED", "DEATHS", "RECOVERED")
]
cases_total_world = (
    df2
    .select("CONFIRMED", "DEATHS", "RECOVERED", "DATE")
    .groupBy("DATE")
    .agg(*world_totals)
    .sort(f.asc('DATE'))
)

display(cases_total_world)

DATE,CONFIRMED,DEATHS,RECOVERED
2020-01-22,557,17,30
2020-01-23,655,18,32
2020-01-24,941,26,39
2020-01-25,1433,42,42
2020-01-26,2118,56,56
2020-01-27,2927,82,65
2020-01-28,5578,131,108
2020-01-29,6167,133,127
2020-01-30,8235,171,145
2020-01-31,9927,213,225


## Análise de efeitos da vacinação

In [0]:
import matplotlib.pyplot as plt

In [0]:
# DEATHS per date for the four countries compared in this section.
# NOTE(review): this reads df_data_by_country directly rather than the
# (NOME, DATE)-aggregated new_data_by_country — confirm that is intended.
df_vacina_morte = (
    df_data_by_country
    .select("NOME", "DEATHS", "DATE")
    .filter("NOME in ('Brazil', 'United Kingdom', 'United States of America', 'Israel')")
    .sort(f.asc('DATE'))
)
display(df_vacina_morte)

NOME,DEATHS,DATE
United Kingdom,0,2020-01-22
Brazil,0,2020-01-22
Israel,0,2020-01-22
United States of America,0,2020-01-22
Israel,0,2020-01-23
United Kingdom,0,2020-01-23
Brazil,0,2020-01-23
United States of America,0,2020-01-23
United Kingdom,0,2020-01-24
United States of America,0,2020-01-24


In [0]:
# CONFIRMED per date for the four countries compared in this section.
df_vacina_novos = (
    df_data_by_country
    .select("NOME", "CONFIRMED", "DATE")
    .filter("NOME in ('Brazil', 'United Kingdom', 'United States of America', 'Israel')")
    .sort(f.asc('DATE'))
)
display(df_vacina_novos)

NOME,CONFIRMED,DATE
United Kingdom,0,2020-01-22
Brazil,0,2020-01-22
Israel,0,2020-01-22
United States of America,1,2020-01-22
Israel,0,2020-01-23
United Kingdom,0,2020-01-23
Brazil,0,2020-01-23
United States of America,1,2020-01-23
United Kingdom,0,2020-01-24
United States of America,2,2020-01-24


In [0]:
def _confirmed_series(country):
  """Return the (NOME, CONFIRMED, DATE) rows of one country, sorted by DATE.

  Args:
    country: exact value of the NOME column to keep.

  Returns:
    A Spark DataFrame derived from ``df_data_by_country``.
  """
  return (
      df_data_by_country
      .select("NOME", "CONFIRMED", "DATE")
      .where(f.col("NOME") == country)
      .orderBy(f.asc("DATE"))
  )


# One DataFrame per country; the same query was previously copy-pasted four
# times with only the country name changed.
df_vacina_brasil = _confirmed_series('Brazil')
df_vacina_israel = _confirmed_series('Israel')
df_vacina_eua = _confirmed_series('United States of America')
df_vacina_uk = _confirmed_series('United Kingdom')

# Display in the same order as before: Brazil, Israel, USA, UK.
for country_frame in (df_vacina_brasil, df_vacina_israel, df_vacina_eua, df_vacina_uk):
  display(country_frame)

NOME,CONFIRMED,DATE
Brazil,0,2020-01-22
Brazil,0,2020-01-23
Brazil,0,2020-01-24
Brazil,0,2020-01-25
Brazil,0,2020-01-26
Brazil,0,2020-01-27
Brazil,0,2020-01-28
Brazil,0,2020-01-29
Brazil,0,2020-01-30
Brazil,0,2020-01-31


NOME,CONFIRMED,DATE
Israel,0,2020-01-22
Israel,0,2020-01-23
Israel,0,2020-01-24
Israel,0,2020-01-25
Israel,0,2020-01-26
Israel,0,2020-01-27
Israel,0,2020-01-28
Israel,0,2020-01-29
Israel,0,2020-01-30
Israel,0,2020-01-31


NOME,CONFIRMED,DATE
United States of America,1,2020-01-22
United States of America,1,2020-01-23
United States of America,2,2020-01-24
United States of America,2,2020-01-25
United States of America,5,2020-01-26
United States of America,5,2020-01-27
United States of America,5,2020-01-28
United States of America,6,2020-01-29
United States of America,6,2020-01-30
United States of America,8,2020-01-31


NOME,CONFIRMED,DATE
United Kingdom,0,2020-01-22
United Kingdom,0,2020-01-23
United Kingdom,0,2020-01-24
United Kingdom,0,2020-01-25
United Kingdom,0,2020-01-26
United Kingdom,0,2020-01-27
United Kingdom,0,2020-01-28
United Kingdom,0,2020-01-29
United Kingdom,0,2020-01-30
United Kingdom,2,2020-01-31


In [0]:
def _deaths_series(country):
  """Return the (NOME, DEATHS, DATE) rows of one country with DEATHS >= 1.

  Rows are sorted by DATE, so the first row is the earliest date on which the
  country has at least one recorded death.

  Args:
    country: exact value of the NOME column to keep.

  Returns:
    A Spark DataFrame derived from ``df_data_by_country``.
  """
  return (
      df_data_by_country
      .select("NOME", "DEATHS", "DATE")
      .where((f.col("NOME") == country) & (f.col("DEATHS") >= 1))
      .orderBy(f.asc("DATE"))
  )


# One DataFrame per country; the same query was previously copy-pasted four
# times with only the country name changed.
df_morte_brasil = _deaths_series('Brazil')
df_morte_israel = _deaths_series('Israel')
df_morte_eua = _deaths_series('United States of America')
df_morte_uk = _deaths_series('United Kingdom')

# Display in the same order as before: Brazil, Israel, USA, UK.
for country_frame in (df_morte_brasil, df_morte_israel, df_morte_eua, df_morte_uk):
  display(country_frame)

NOME,DEATHS,DATE
Brazil,1,2020-03-17
Brazil,3,2020-03-18
Brazil,6,2020-03-19
Brazil,11,2020-03-20
Brazil,15,2020-03-21
Brazil,25,2020-03-22
Brazil,34,2020-03-23
Brazil,46,2020-03-24
Brazil,59,2020-03-25
Brazil,77,2020-03-26


NOME,DEATHS,DATE
Israel,1,2020-03-20
Israel,1,2020-03-21
Israel,1,2020-03-22
Israel,1,2020-03-23
Israel,4,2020-03-24
Israel,5,2020-03-25
Israel,10,2020-03-26
Israel,12,2020-03-27
Israel,13,2020-03-28
Israel,16,2020-03-29


NOME,DEATHS,DATE
United States of America,1,2020-02-29
United States of America,1,2020-03-01
United States of America,6,2020-03-02
United States of America,7,2020-03-03
United States of America,11,2020-03-04
United States of America,12,2020-03-05
United States of America,14,2020-03-06
United States of America,17,2020-03-07
United States of America,21,2020-03-08
United States of America,22,2020-03-09


NOME,DEATHS,DATE
United Kingdom,1,2020-03-06
United Kingdom,2,2020-03-07
United Kingdom,2,2020-03-08
United Kingdom,3,2020-03-09
United Kingdom,7,2020-03-10
United Kingdom,7,2020-03-11
United Kingdom,9,2020-03-12
United Kingdom,10,2020-03-13
United Kingdom,29,2020-03-14
United Kingdom,43,2020-03-15
