In [1]:
# In this Spark ETL process we load the Covid19 data set (downloaded from https://github.com/owid/covid-19-data/blob/master/public/data/README.md).

In [36]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import *
import os

In [37]:
# Directory the notebook kernel runs in; the input data paths are built from it.
cwd = os.getcwd()

# Configure the application name first, then reuse the active Spark session
# if there is one or start a new one otherwise.
session_builder = SparkSession.builder.appName('Covid19Data')
spark = session_builder.getOrCreate()

In [38]:
# Step 1 - Load the Covid19 data set

# Raw OWID Covid19 CSV. The first line of the file is treated as the header;
# no schema is supplied or inferred, so the columns are read as strings.
file = f'{cwd}/data/raw_data/covid19/owid-covid-data.csv'
covid19_reader = spark.read.option('header', True)
cdf = covid19_reader.csv(file)

# Make the raw data queryable from Spark SQL under the view name 'CDF'.
cdf.createOrReplaceTempView('CDF')
# cdf.printSchema()

# owid-covid-data.csv (downloaded from - https://github.com/owid/covid-19-data/blob/master/public/data/README.md).
# Schema:
# root
#  |-- iso_code: string (nullable = true)
#  |-- continent: string (nullable = true)
#  |-- location: string (nullable = true)
#  |-- date: string (nullable = true)
#  |-- total_cases: string (nullable = true)
#  |-- new_cases: string (nullable = true)
#  |-- new_cases_smoothed: string (nullable = true)
#  |-- total_deaths: string (nullable = true)
#  |-- new_deaths: string (nullable = true)
#  |-- new_deaths_smoothed: string (nullable = true)
#  |-- total_cases_per_million: string (nullable = true)
#  |-- new_cases_per_million: string (nullable = true)
#  |-- new_cases_smoothed_per_million: string (nullable = true)
#  |-- total_deaths_per_million: string (nullable = true)
#  |-- new_deaths_per_million: string (nullable = true)
#  |-- new_deaths_smoothed_per_million: string (nullable = true)
#  |-- reproduction_rate: string (nullable = true)
#  |-- icu_patients: string (nullable = true)
#  |-- icu_patients_per_million: string (nullable = true)
#  |-- hosp_patients: string (nullable = true)
#  |-- hosp_patients_per_million: string (nullable = true)
#  |-- weekly_icu_admissions: string (nullable = true)
#  |-- weekly_icu_admissions_per_million: string (nullable = true)
#  |-- weekly_hosp_admissions: string (nullable = true)
#  |-- weekly_hosp_admissions_per_million: string (nullable = true)
#  |-- new_tests: string (nullable = true)
#  |-- total_tests: string (nullable = true)
#  |-- total_tests_per_thousand: string (nullable = true)
#  |-- new_tests_per_thousand: string (nullable = true)
#  |-- new_tests_smoothed: string (nullable = true)
#  |-- new_tests_smoothed_per_thousand: string (nullable = true)
#  |-- positive_rate: string (nullable = true)
#  |-- tests_per_case: string (nullable = true)
#  |-- tests_units: string (nullable = true)
#  |-- total_vaccinations: string (nullable = true)
#  |-- people_vaccinated: string (nullable = true)
#  |-- people_fully_vaccinated: string (nullable = true)
#  |-- new_vaccinations: string (nullable = true)
#  |-- new_vaccinations_smoothed: string (nullable = true)
#  |-- total_vaccinations_per_hundred: string (nullable = true)
#  |-- people_vaccinated_per_hundred: string (nullable = true)
#  |-- people_fully_vaccinated_per_hundred: string (nullable = true)
#  |-- new_vaccinations_smoothed_per_million: string (nullable = true)
#  |-- stringency_index: string (nullable = true)
#  |-- population: string (nullable = true)
#  |-- population_density: string (nullable = true)
#  |-- median_age: string (nullable = true)
#  |-- aged_65_older: string (nullable = true)
#  |-- aged_70_older: string (nullable = true)
#  |-- gdp_per_capita: string (nullable = true)
#  |-- extreme_poverty: string (nullable = true)
#  |-- cardiovasc_death_rate: string (nullable = true)
#  |-- diabetes_prevalence: string (nullable = true)
#  |-- female_smokers: string (nullable = true)
#  |-- male_smokers: string (nullable = true)
#  |-- handwashing_facilities: string (nullable = true)
#  |-- hospital_beds_per_thousand: string (nullable = true)
#  |-- life_expectancy: string (nullable = true)
#  |-- human_development_index: string (nullable = true)

In [39]:
# Keep only the rows reported in week 14 and project the country code, report
# date, total cases, population and the cases expressed as a percentage of the
# population. A missing total_cases or percentage is replaced by 0.
# NOTE(review): weekofyear() does not look at the year, so week 14 of every
# year present in the file is kept - confirm this is intended.
covid19_week_14_query = """

select  iso_code                 as country_iso_3_code,
        date                     as report_date,
        coalesce(total_cases, 0) as total_cases,
        population               as population,
        coalesce(cast(total_cases as double)/cast(population as double)*100,0) as cases_population_ratio
from    CDF
where   1=1
        and weekofyear(date)=14
       
"""
covid19 = spark.sql(covid19_week_14_query)

# Register the projection so the following cells can query it as 'covid19'.
covid19.createOrReplaceTempView('covid19')



In [40]:
# Step 2 - Check Null percentage for each attribute

# spark.sql("""

# select  count(*) as row_count,
#         cast(sum( case when  country_iso_3_code     is not null then 1 else 0 end) as double)/count(*) *100 as NN_iso_code,
#         cast(sum( case when  report_date            is not null then 1 else 0 end) as double)/count(*) *100 as NN_date,
#         cast(sum( case when  total_cases            is not null then 1 else 0 end) as double)/count(*) *100 as NN_total_cases,
#         cast(sum( case when  population             is not null then 1 else 0 end) as double)/count(*) *100 as NN_population,
#         cast(sum( case when  cases_population_ratio is not null then 1 else 0 end) as double)/count(*) *100 as NN_cases_population_ratio
# from    covid19

# """).show(10,False)


# +---------+-----------+-------+--------------+----------------+-------------------------+
# |row_count|NN_iso_code|NN_date|NN_total_cases|NN_population   |NN_cases_population_ratio|
# +---------+-----------+-------+--------------+----------------+-------------------------+
# |2867     |100.0      |100.0  |100.0         |99.2675270317405|100.0                    |
# +---------+-----------+-------+--------------+----------------+-------------------------+

# As we can see, the population attribute has some null values (let's remove all rows without a population per country).

In [41]:
# Step 3 - Load Countries data set

# Countries dimension CSV. The first line of the file is treated as the header;
# no schema is supplied or inferred, so the columns are read as strings.
countries_file = f'{cwd}/data/dims/dim_countries.csv'
countries_reader = spark.read.option('header', True)
countries_df = countries_reader.csv(countries_file)

# Make the dimension queryable from Spark SQL under the view name 'Countries'.
countries_df.createOrReplaceTempView('Countries')
# countries_df.printSchema()

# countries.csv (downloaded from - https://github.com/google/dspl/blob/master/samples/google/canonical/countries.csv).
# Schema:
# root
#  |-- country_id: string (nullable = true)
#  |-- country_iso_3_code: string (nullable = true)
#  |-- country_name: string (nullable = true)
#  |-- country_lat: string (nullable = true)
#  |-- country_lng: string (nullable = true)

In [46]:
# TESTS - check that countries are synced in both data frames
# -------------------------------------------------------------
# spark.sql("""

# select  count(distinct country_iso_3_code) as cd_country_iso_3_code,
#         'covid19' as source_
# from    covid19

# union all

# select  count(distinct country_iso_3_code) as cd_country_iso_3_code,
#         'dimCountries' as source_
# from    countries

# """).show(100,False)

# +---------------------+------------+
# |cd_country_iso_3_code|source_     |
# +---------------------+------------+
# |222                  |covid19     |
# |242                  |dimCountries|
# +---------------------+------------+

#  As we can see, there are more countries in dim countries; let's check which countries are missing in each data set.


# spark.sql("""
# select  c19.iso_3,
#         con.iso_3
# from    (select  distinct lower(country_iso_3_code) as iso_3
#          from    covid19) as c19
#         full join
#         (select  distinct lower(country_iso_3_code) as iso_3
#         from    countries) as con
#         ON c19.iso_3 = con.iso_3
# where   c19.iso_3 is null or con.iso_3 is null
#         and length(c19.iso_3)=3 --Remove invalid iso_3 codes
# """).show(100,False)

#  As we can see, there are invalid iso_3 codes; let's remove them.
#  Countries are still missing from both data sets; for now I will create a fully joined data frame.

In [49]:
# Step 4 - Join both data sets and create dim_country enriched with covid 19 data

# The join is written as an explicit INNER JOIN. The previous LEFT JOIN was
# cancelled out by the WHERE filter on the covid19 side: countries without a
# covid19 match got a NULL c19.country_iso_3_code, failed the length()=3
# predicate and were dropped anyway. The result set is unchanged; the query
# now states what it actually returns (only countries that have week-14
# covid19 rows, hence no NULL attributes in the output).
countries_and_covid_19_week_14_data = spark.sql("""
select  distinct
        con.country_id                  as country_iso_2_code,
        c19.report_date                 as covid19_report_date,
        c19.total_cases                 as total_cases,
        c19.population                  as population,
        c19.cases_population_ratio      as cases_population_ratio,
        con.country_iso_3_code          as country_iso_3_code,
        con.country_name                as country_name,
        con.country_lat                 as country_lat,
        con.country_lng                 as country_lng
from    countries as con
        inner join
        covid19 as c19
        ON c19.country_iso_3_code = con.country_iso_3_code
where   length(c19.country_iso_3_code)=3 --Remove invalid iso_3 codes
""")

# Register the enriched dimension so it can be queried by name in later cells.
countries_and_covid_19_week_14_data.createOrReplaceTempView('countries_and_covid_19_week_14_data')


In [51]:
# Step 5 - Check Null percentage for each attribute

# spark.sql("""

# select  count(*) as row_count,
#         cast(sum( case when  country_iso_2_code     is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_iso_2_code,
#         cast(sum( case when  covid19_report_date    is not null then 1 else 0 end) as double)/count(*) *100 as NN_covid19_report_date,
#         cast(sum( case when  total_cases            is not null then 1 else 0 end) as double)/count(*) *100 as NN_total_cases,
#         cast(sum( case when  population             is not null then 1 else 0 end) as double)/count(*) *100 as NN_population,
#         cast(sum( case when  cases_population_ratio is not null then 1 else 0 end) as double)/count(*) *100 as NN_cases_population_ratio,
#         cast(sum( case when  country_iso_3_code     is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_iso_3_code,
#         cast(sum( case when  country_name           is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_name,
#         cast(sum( case when  country_lat            is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_lat,
#         cast(sum( case when  country_lng            is not null then 1 else 0 end) as double)/count(*) *100 as NN_country_lng
# from    countries_and_covid_19_week_14_data

# """).show(10,False)

# No Null values

In [54]:
countries_and_covid_19_week_14_data.repartition(1).write \
.mode('overwrite') \
.format("com.databricks.spark.csv") \
.option("header", "true") \
.option("quoteAll", "true") \
.save("countries_covid19")

!mkdir -p ./data/stg/covid19/
# Copy new file into proper data folder location
!cp ./countries_covid19/*.csv ./data/stg/covid19/countries_and_covid_19_week_14_data.csv
# Delete Spark output folder
!rm -rf ./countries_covid19
# Check that the folder deleted
!ls ./countries_covid19/*.csv

zsh:1: no matches found: ./countries_covid19/*.csv
