In [1]:
import logging
import re

import pycountry
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Build (or reuse) a Hive-enabled session. Eager evaluation makes a bare
# DataFrame expression at the end of a cell render as a table.
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .config('spark.sql.repl.eagerEval.enabled', 'true')
    .getOrCreate()
)

In [3]:
# Load the daily-reports CSV: first row is the header, column types are inferred.
df = spark.read.csv(
    "./samples/output-files/csse_covid_19_daily_reports.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Inspect the (column name, type) pairs Spark inferred from the CSV (inferSchema=True).
df.dtypes

[('Last Update', 'timestamp'),
 ('Country/Region', 'string'),
 ('Province/State', 'string'),
 ('Confirmed', 'int'),
 ('Recovered', 'int'),
 ('Deaths', 'int'),
 ('Latitude', 'double'),
 ('Longitude', 'double'),
 ('input_file_name', 'string')]

In [15]:
def iso_country_udf(string: str, kind: str = 'alpha_3'):
    """Map a free-text country label to an attribute of a pycountry country.

    Matching is done in two passes over ``pycountry.countries``:

    1. Exact, case-insensitive comparison of the whole stripped label with
       each country's ``name``, ``alpha_2`` and ``alpha_3``.
    2. Fallback heuristic: take the last run of word characters in the label
       and return the first country whose ``name`` contains it
       (case-sensitive) or whose ``alpha_2`` equals it.

    The fallback is a best-effort substring match and can pick the wrong
    country; for example "Korea, South" falls back to the token "South" and
    resolves to ATF. Labels that need a guaranteed mapping should be
    normalised before calling this function.

    Args:
        string: Country label to resolve. ``None``, non-string values and
            labels without any word characters yield ``None`` instead of
            raising, so a null cell cannot fail a Spark job.
        kind: Name of the attribute to read from the matched country
            (default ``'alpha_3'``).

    Returns:
        The requested attribute of the matched country, or ``None`` when
        nothing matches.

    Raises:
        AttributeError: If ``kind`` is not an attribute of the matched country.
    """
    if not isinstance(string, str):
        return None

    label = string.strip()
    tokens = re.findall(r'\w+', label)
    if not tokens:
        return None

    # Pass 1: an exact match must win over any substring hit, otherwise
    # "Samoa" could resolve to "American Samoa" purely by iteration order.
    wanted = label.casefold()
    for country in pycountry.countries:
        exact_keys = (
            country.name.casefold(),
            country.alpha_2.casefold(),
            country.alpha_3.casefold(),
        )
        if wanted in exact_keys:
            return getattr(country, kind)

    # Pass 2: last-token heuristic. The alpha_2 test is an equality check; a
    # substring check would let a single letter match any code containing it.
    last_token = tokens[-1]
    for country in pycountry.countries:
        if last_token in country.name or last_token == country.alpha_2:
            return getattr(country, kind)

    return None

In [21]:
# Register the Python function as a Spark UDF named `to_iso_country` and keep
# the returned handle so it can be applied to DataFrame columns.
to_iso_country = spark.udf.register(
    name='to_iso_country',
    f=iso_country_udf,
)

In [22]:
# Add (or overwrite) `iso_country` with the UDF applied to the source country label.
df = df.withColumn(
    'iso_country',
    to_iso_country(df['Country/Region']),
)

In [25]:
# Difference between the number of distinct source labels and the number of
# distinct resolved values.
(
    df.select(df['Country/Region']).distinct().count()
    - df.select(df['iso_country']).distinct().count()
)

36

In [26]:
# Display the DataFrame; it renders as a table because
# spark.sql.repl.eagerEval.enabled is set on the session.
df

Last Update,Country/Region,Province/State,Confirmed,Recovered,Deaths,Latitude,Longitude,input_file_name,iso_alpha_3,iso_country
2020-03-21 10:13:08,China,Hubei,67800,58946,3139,30.9756,112.2707,file:/tmp/COVID-1...,CHN,CHN
2020-03-21 17:43:03,Italy,,53578,6072,4825,41.8719,12.5674,file:/tmp/COVID-1...,ITA,ITA
2020-03-21 13:13:30,Spain,,25374,2125,1375,40.4637,-3.7492,file:/tmp/COVID-1...,ESP,ESP
2020-03-21 20:43:02,Germany,,22213,233,84,51.1657,10.4515,file:/tmp/COVID-1...,DEU,DEU
2020-03-21 11:13:12,Iran,,20610,7635,1556,32.4279,53.688,file:/tmp/COVID-1...,IRN,IRN
2020-03-21 20:43:02,France,France,14282,12,562,46.2276,2.2137,file:/tmp/COVID-1...,FRA,FRA
2020-03-21 22:43:04,US,New York,11710,0,60,42.1657,-74.9481,file:/tmp/COVID-1...,USA,USA
2020-03-21 11:13:12,"Korea, South",,8799,1540,102,35.9078,127.7669,file:/tmp/COVID-1...,ATF,ATF
2020-03-21 20:43:02,Switzerland,,6575,15,75,46.8182,8.2275,file:/tmp/COVID-1...,CHE,CHE
2020-03-21 20:43:03,United Kingdom,United Kingdom,5018,65,233,55.3781,-3.436,file:/tmp/COVID-1...,GBR,GBR


In [19]:
# Count rows per source country label that the ISO lookup could not resolve.
# This filters on `iso_country`, the column added via `withColumn`; the
# previous `iso_alpha_3` reference is not created by any cell in this
# notebook and fails on a fresh Restart & Run All.
df.where(df['iso_country'].isNull()).groupby(df['Country/Region']).count()

Country/Region,count
Ivory Coast,1
Kosovo,7
occupied Palestin...,7
Reunion,11
Macau,48
Curacao,2
Hong Kong SAR,1
Congo (Kinshasa),11
Saint Barthelemy,7
Cruise Ship,11
