In [1]:
# https://pypi.org/project/covid-data-api/
!pip install covid-data-api

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import findspark
# Run before `import pyspark`: per the findspark docs, init() locates the Spark
# installation and adds pyspark to sys.path so the import below can succeed.
findspark.init()
import pyspark

In [3]:
from covid.api import CovId19Data
from pyspark.sql.functions import col
from pyspark.sql import Row, SparkSession, functions
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StringType, IntegerType, DateType

In [4]:
# Open a SparkContext against the cluster master at hd-master:7077,
# registering this notebook under the application name "big_data".
sc = pyspark.SparkContext(
    master='spark://hd-master:7077',
    appName='big_data',
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/30 13:05:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Echo the SparkContext so the notebook displays its representation.
sc

In [6]:
# SQLContext has been deprecated since Spark 3.0 in favour of SparkSession.
# getOrCreate() attaches to the SparkContext that is already running (`sc`),
# and the resulting session offers the same createDataFrame() used below.
spark = SparkSession.builder.getOrCreate()



In [7]:
# Client object from the covid-data-api package; all later data requests go through it.
# NOTE(review): force=False presumably allows the client to reuse already-fetched data
# instead of forcing a refresh — confirm against the package documentation.
api = CovId19Data(force=False)

In [8]:
# Country identifiers are the keys of the per-country record mapping.
records_by_country = api.get_all_records_by_country()
ls_countries = list(records_by_country.keys())

In [9]:
# Flatten every country's history into a single dict keyed by "<country> * <date>",
# where each value holds the confirmed / recovered / deaths counters for that day.
dic_countries = {}
for country in ls_countries:
    # The API is queried with spaces in the name, while the response is indexed
    # with the original (underscore) identifier.
    response = api.get_history_by_country(country.replace("_", " "))
    for timestamp, counts in response[country]['history'].items():
        day = timestamp.split(" ")[0]  # keep only the part before the first space
        dic_countries[country + " * " + day] = {
            "confirmed": counts["confirmed"],
            "recovered": counts["recovered"],
            "deaths": counts["deaths"],
        }

In [10]:
# Debug aid: uncomment the next line to inspect the flattened records.
# dic_countries

In [11]:
# One Row per "<country> * <date>" entry: the composite key plus its three counters.
rowdata = [
    Row(key=record_key, **counts)
    for record_key, counts in dic_countries.items()
]

In [12]:
# Turn the Row list into a DataFrame and pin the column order explicitly.
df = (
    spark.createDataFrame(rowdata)
    .select('key', 'confirmed', 'recovered', 'deaths')
)

In [13]:
# Recover `country` and `date` from the composite "<country> * <date>" key.
# functions.split() interprets its pattern as a Java regular expression, so the
# asterisk of the separator has to be escaped. Unescaped, ' * ' means "one or more
# spaces": the key was cut into three tokens (country, "*", date), the date was only
# found at index 2 by accident, and any identifier containing a space would be split
# in the wrong place.
split_col = functions.split(df['key'], r' \* ')
df = df.withColumn('country', split_col.getItem(0))
df = df.withColumn('date', split_col.getItem(1))

In [14]:
# Shape check as (rows, columns); the recorded run printed (229743, 6).
df_shape = (df.count(), len(df.columns))
print(df_shape)

23/11/30 13:05:45 WARN TaskSetManager: Stage 0 contains a task of very large size (1360 KiB). The maximum recommended task size is 1000 KiB.

(229743, 6)


                                                                                

In [15]:
# Number of distinct dates.
# count() is evaluated on the cluster and returns only the integer, instead of
# collecting every distinct row to the driver just to take len() of the list.
df.select('date').distinct().count()

23/11/30 13:05:49 WARN TaskSetManager: Stage 3 contains a task of very large size (1360 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

1143

In [16]:
# Number of distinct countries.
# count() is evaluated on the cluster and returns only the integer, instead of
# collecting every distinct row to the driver just to take len() of the list.
df.select('country').distinct().count()

23/11/30 13:05:52 WARN TaskSetManager: Stage 6 contains a task of very large size (1360 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

201

In [17]:
# Columns, in order, that are carried into the selection frame.
ls_column = [
    "country",
    "date",
    "confirmed",
    "recovered",
    "deaths",
]

In [18]:
# Keep only the listed columns (the helper `key` column is not in the list).
df_selection = df.select(*ls_column)

In [19]:
# Shape check as (rows, columns) after the column selection.
selection_shape = (df_selection.count(), len(df_selection.columns))
print(selection_shape)

23/11/30 13:05:54 WARN TaskSetManager: Stage 9 contains a task of very large size (1360 KiB). The maximum recommended task size is 1000 KiB.
[Stage 9:>                                                          (0 + 6) / 6]

(229743, 5)


                                                                                

In [20]:
# Dataset identifiers that are excluded from the country-level export.
non_countries = [
    'winter_olympics_2022',
    'summer_olympics_2020',
    'ms_zaandam',
    'diamond_princess',
    'holy_see',
]

In [21]:
# Drop every row whose country identifier is on the exclusion list.
is_excluded = col("country").isin(non_countries)
df_selection = df_selection.where(~is_excluded)

In [22]:
# Shape check as (rows, columns) after removing the excluded identifiers.
filtered_shape = (df_selection.count(), len(df_selection.columns))
print(filtered_shape)

23/11/30 13:05:55 WARN TaskSetManager: Stage 12 contains a task of very large size (1360 KiB). The maximum recommended task size is 1000 KiB.

(224028, 5)


                                                                                

In [23]:
# Preview the first five rows without truncating cell contents.
df_selection.show(n=5, truncate=False)

23/11/30 13:05:57 WARN TaskSetManager: Stage 15 contains a task of very large size (1360 KiB). The maximum recommended task size is 1000 KiB.


+-----------+----------+---------+---------+------+
|country    |date      |confirmed|recovered|deaths|
+-----------+----------+---------+---------+------+
|afghanistan|2020-01-22|0        |0        |0     |
|afghanistan|2020-01-23|0        |0        |0     |
|afghanistan|2020-01-24|0        |0        |0     |
|afghanistan|2020-01-25|0        |0        |0     |
|afghanistan|2020-01-26|0        |0        |0     |
+-----------+----------+---------+---------+------+
only showing top 5 rows



In [24]:
# Print column names and types. In the recorded run `country` and `date` were
# strings and the three counters were longs (`date` is not a date type).
df_selection.printSchema()

root
 |-- country: string (nullable = true)
 |-- date: string (nullable = true)
 |-- confirmed: long (nullable = true)
 |-- recovered: long (nullable = true)
 |-- deaths: long (nullable = true)



In [25]:
# Export to HDFS as CSV with a header row. coalesce(1) reduces the frame to one
# partition before writing, and 'overwrite' replaces any existing output at the path.
(
    df_selection
    .coalesce(1)
    .write
    .mode('overwrite')
    .option('header', 'true')
    .csv('hdfs://hd-master:9000/covid-19')
)

23/11/30 13:05:58 WARN TaskSetManager: Stage 16 contains a task of very large size (8234 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [26]:
# Shut down the SparkContext; this is the last step of the notebook.
sc.stop()