In [1]:
# This Spark ETL notebook builds the countries dimension (dim_countries).

In [1]:
from pyspark.sql import SparkSession
from datetime import datetime, timedelta
from pyspark.sql.functions import *
import os

In [2]:
# Current working directory; the input file paths in the cells below are built from it.
cwd = os.getcwd()

# Get the active Spark session, or create one with the application name 'DimCountries'.
spark = (
    SparkSession.builder
    .appName('DimCountries')
    .getOrCreate()
)

In [3]:
# Countries reference data.
# Source file: countries.csv, downloaded from
# https://github.com/google/dspl/blob/master/samples/google/canonical/countries.csv
countries_file = f'{cwd}/data/raw_data/labels/countries.csv'

# The first line of the file is used as the header row.
cdf = spark.read.option('header', True).csv(countries_file)

# Register the frame as temp view CDF so it can be queried with Spark SQL.
cdf.createOrReplaceTempView('CDF')

# Schema, as reported by cdf.printSchema():
# root
#  |-- country: string (nullable = true)
#  |-- latitude: string (nullable = true)
#  |-- longitude: string (nullable = true)
#  |-- name: string (nullable = true)

In [4]:
# Step 2 - Load the ISO 2-letter to 3-letter country code lookup

# NOTE(review): this cell used to cite owid-covid-data.csv
# (https://github.com/owid/covid-19-data/blob/master/public/data/README.md) as its source,
# but the file actually read is countries_iso_2_to3_lkp.csv - confirm its provenance.
file = f'{cwd}/data/raw_data/covid19/countries_iso_2_to3_lkp.csv'

# The first line of the file is used as the header row.
lkp = spark.read.option('header', True).csv(file)

# Register the frame as temp view LKP so it can be joined in Spark SQL.
lkp.createOrReplaceTempView('LKP')

# Print the schema of the lookup (see the cell output).
lkp.printSchema()


root
 |-- country: string (nullable = true)
 |-- iso_2_code: string (nullable = true)
 |-- iso_3_code: string (nullable = true)
 |-- numeric: string (nullable = true)



In [5]:
# Step 3 - Load continent label for each country

# NOTE(review): this cell used to cite owid-covid-data.csv
# (https://github.com/owid/covid-19-data/blob/master/public/data/README.md) as its source,
# but the file actually read is continents.csv - confirm its provenance.
file = f'{cwd}/data/raw_data/labels/continents.csv'

# The first line of the file is used as the header row.
lkp2 = spark.read.option('header', True).csv(file)

# Register the frame as temp view LKP2 so it can be joined in Spark SQL.
lkp2.createOrReplaceTempView('LKP2')

# Print the schema of the continent lookup (see the cell output).
lkp2.printSchema()


root
 |-- continent_name: string (nullable = true)
 |-- continent_code: string (nullable = true)
 |-- iso_2_code: string (nullable = true)
 |-- iso_3_code: string (nullable = true)



In [6]:
# Build the countries dimension from the temp views CDF, LKP and LKP2.
# - CDF drives the query; CDF.country is matched against iso_2_code in both lookups.
# - Both joins are LEFT joins, so a CDF row with no match in a lookup is kept and gets NULL
#   for country_iso_3_code / continent_name.
# - `select distinct` collapses rows that are identical across all six output columns.
# - `where 1=1` is always true and filters nothing.
# NOTE(review): if either lookup holds several differing rows for one iso_2_code, the joins
# fan out and country_id will not be unique in the result - confirm before using it as a key.
countries = spark.sql("""
select  distinct
        C.country       as country_id,
        L2.continent_name as continent_name,
        L.iso_3_code    as country_iso_3_code,
        C.name          as country_name,        
        C.latitude      as country_lat,        
        C.longitude     as country_lng  
from    CDF C left join LKP L
        ON C.country = L.iso_2_code
        left join LKP2 L2
        ON C.country = L2.iso_2_code
where   1=1

""")


In [7]:
countries.repartition(1).write \
.format("com.databricks.spark.csv") \
.option("header", "true") \
.option("quoteAll", "true") \
.save("countries")

# Copy new file into proper data folder location
!cp ./countries/*.csv ./data/dims/dim_countries.csv
# Delete Spark output folder
!rm -rf ./countries
# Check that the folder deleted
!ls ./countries/*.csv

zsh:1: no matches found: ./countries/*.csv
