In [1]:
# Requires the covid-data-api package: https://pypi.org/project/covid-data-api/
# Uncomment to install it into this kernel's environment:
# %pip install covid-data-api

In [1]:
from covid.api import CovId19Data
from pyspark.sql import Row, SparkSession, functions
from pyspark.sql.types import StructType, StringType, IntegerType, DateType

In [2]:
# Create a Spark session with a local master, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [3]:
# Client object for the covid-data-api package; every later data request goes through it.
# NOTE(review): force=False presumably lets the client reuse previously fetched
# data instead of re-downloading it — confirm against the package documentation.
api = CovId19Data(force=False)

In [4]:
# The keys of the per-country record mapping are the country identifiers.
ls_countries = [name for name in api.get_all_records_by_country().keys()]

In [5]:
# Flatten every country's history into one dict keyed by "<country> * <date>".
dic_countries = {}
for country in ls_countries:
    # The request uses the name with "_" replaced by " ", while the response
    # is indexed by the unmodified name.
    res = api.get_history_by_country(country.replace("_", " "))
    history = res[country]['history']
    for date_time, stats in history.items():
        # Keep only the part of the timestamp before the first space.
        date = date_time.split(" ")[0]
        dic_countries[country + " * " + date] = {
            "confirmed": stats["confirmed"],
            "recovered": stats["recovered"],
            "deaths": stats["deaths"],
        }

In [6]:
# dic_countries

In [7]:
# One Row per dictionary entry: the dict key becomes the `key` field and the
# nested counts become the remaining fields.
rowdata = [
    Row(key=entry_key, **counts)
    for entry_key, counts in dic_countries.items()
]

In [8]:
# Build the DataFrame from the rows and pin the column order explicitly.
df = (
    spark.createDataFrame(rowdata)
    .select('key', 'confirmed', 'recovered', 'deaths')
)

In [9]:
# Recover the country and date from the composite key "<country> * <date>".
#
# functions.split() interprets its pattern as a regular expression, so the
# asterisk of the literal separator " * " has to be escaped. Unescaped, the
# pattern means "one or more spaces": the key is then cut into
# [country, "*", date], and any country name containing a space would shift
# the pieces. With the escaped separator the key yields exactly two parts.
split_col = functions.split(df['key'], r' \* ')
df = (
    df
    .withColumn('country', split_col.getItem(0))  # text before the separator
    .withColumn('date', split_col.getItem(1))     # text after the separator
)

In [10]:
#df.show(truncate=False)

In [11]:
# Print the DataFrame's dimensions as (row count, column count).
print((df.count(), len(df.columns)))
# Value printed by the recorded run: (229743, 6)

(229743, 6)


In [12]:
# Number of distinct dates. count() is evaluated by Spark, so the distinct
# rows are not collected into the driver just to measure their length.
df.select('date').distinct().count()

1143

In [13]:
# Number of distinct countries. count() is evaluated by Spark, so the distinct
# rows are not collected into the driver just to measure their length.
df.select('country').distinct().count()

201

In [14]:
# Columns to keep for the export, in output order.
ls_column = [
    "country",
    "date",
    "confirmed",
    "recovered",
    "deaths",
]

In [15]:
# Keep only the export columns, dropping the composite `key` helper column.
df_selection = df.select(*ls_column)

In [16]:
# Preview the first 20 rows (show()'s default) without truncating cell values.
df_selection.show(n=20, truncate=False)

+-----------+----------+---------+---------+------+
|country    |date      |confirmed|recovered|deaths|
+-----------+----------+---------+---------+------+
|afghanistan|2020-01-22|0        |0        |0     |
|afghanistan|2020-01-23|0        |0        |0     |
|afghanistan|2020-01-24|0        |0        |0     |
|afghanistan|2020-01-25|0        |0        |0     |
|afghanistan|2020-01-26|0        |0        |0     |
|afghanistan|2020-01-27|0        |0        |0     |
|afghanistan|2020-01-28|0        |0        |0     |
|afghanistan|2020-01-29|0        |0        |0     |
|afghanistan|2020-01-30|0        |0        |0     |
|afghanistan|2020-01-31|0        |0        |0     |
|afghanistan|2020-02-01|0        |0        |0     |
|afghanistan|2020-02-02|0        |0        |0     |
|afghanistan|2020-02-03|0        |0        |0     |
|afghanistan|2020-02-04|0        |0        |0     |
|afghanistan|2020-02-05|0        |0        |0     |
|afghanistan|2020-02-06|0        |0        |0     |
|afghanistan

In [17]:
# Print the column names and types of the selection. In the recorded run the
# date is still a string and the three counts are long.
df_selection.printSchema()

root
 |-- country: string (nullable = true)
 |-- date: string (nullable = true)
 |-- confirmed: long (nullable = true)
 |-- recovered: long (nullable = true)
 |-- deaths: long (nullable = true)



In [18]:
# Export the selection as CSV with a header row. repartition(1) puts all rows
# in a single partition before writing; mode "overwrite" replaces any earlier
# export at the same path.
# NOTE(review): Spark writes a directory at "covid-19.csv" holding the part
# file(s), not a single plain file — confirm this is what consumers expect.
(
    df_selection
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("covid-19.csv", header='true')
)

In [19]:
# Load the exported CSV back from disk with an explicit schema.

In [20]:
# Explicit schema for reading the export back: the date is parsed as a date
# and the three counts as integers; every column is nullable.
schema = (
    StructType()
    .add("country", StringType(), True)
    .add("date", DateType(), True)
    .add("confirmed", IntegerType(), True)
    .add("recovered", IntegerType(), True)
    .add("deaths", IntegerType(), True)
)
# The first line of the CSV is a header row, not data.
df_load = spark.read.csv("covid-19.csv", schema=schema, header=True)

In [21]:
# Print the schema of the reloaded data to check that the explicit types were
# applied. In the recorded run the date is a date and the counts are integer.
df_load.printSchema()

root
 |-- country: string (nullable = true)
 |-- date: date (nullable = true)
 |-- confirmed: integer (nullable = true)
 |-- recovered: integer (nullable = true)
 |-- deaths: integer (nullable = true)



In [22]:
# Preview the first 20 rows (show()'s default) of the reloaded data without
# truncating cell values.
df_load.show(n=20, truncate=False)

+-----------+----------+---------+---------+------+
|country    |date      |confirmed|recovered|deaths|
+-----------+----------+---------+---------+------+
|afghanistan|2020-01-22|0        |0        |0     |
|afghanistan|2020-01-23|0        |0        |0     |
|afghanistan|2020-01-24|0        |0        |0     |
|afghanistan|2020-01-25|0        |0        |0     |
|afghanistan|2020-01-26|0        |0        |0     |
|afghanistan|2020-01-27|0        |0        |0     |
|afghanistan|2020-01-28|0        |0        |0     |
|afghanistan|2020-01-29|0        |0        |0     |
|afghanistan|2020-01-30|0        |0        |0     |
|afghanistan|2020-01-31|0        |0        |0     |
|afghanistan|2020-02-01|0        |0        |0     |
|afghanistan|2020-02-02|0        |0        |0     |
|afghanistan|2020-02-03|0        |0        |0     |
|afghanistan|2020-02-04|0        |0        |0     |
|afghanistan|2020-02-05|0        |0        |0     |
|afghanistan|2020-02-06|0        |0        |0     |
|afghanistan