# COVID-19 testing by country (Python)

 - Task 1: Get a COVID-19 pandemic Wiki page using HTTP request
 - Task 2: Extract COVID-19 testing data table from the wiki HTML page
 - Task 3: Pre-process and export the extracted data frame
 - Task 4: Get a subset of the extracted data frame
 - Task 5: Calculate worldwide COVID testing positive ratio
 - Task 6: Get a sorted name list of countries that reported their testing data
 - Task 7: Identify country names with a specific pattern
 - Task 8: Pick two countries you are interested in, and then review their testing data
 - Task 9: Compare which one of the selected countries has a larger ratio of confirmed cases to population
 - Task 10: Find countries with a confirmed-cases-to-population ratio less than a threshold


In [39]:
# Libraries

import numpy as np
import pandas as pd

import re

import requests as rq
import xml.etree.ElementTree as et

from io import StringIO
from bs4 import BeautifulSoup as bs
from skimpy import clean_columns

In [None]:
# Page download helper
def pdl(url, *, timeout=10, session=None):
    """
    Download the target URL using browser-like request headers.

    All parameters after * must be passed by keyword, not positionally.

    url:     address of the page to fetch.
    timeout: forwarded unchanged as the ``timeout`` argument of ``get``.
    session: object exposing ``get(url, headers=..., timeout=...)``.
             For a persistent session, define session = rq.Session() and
             pass it to the fn. When omitted, ``rq.get`` is used.

    Returns the response object on success. If an ``rq.RequestException``
    is raised (this includes the HTTPError raised for 4xx/5xx statuses),
    a message is printed and ``None`` is returned, so callers must check
    for ``None`` before using the result.
    """

    # NOTE: "Accept-Encoding" is deliberately not set here. Advertising
    # "br" by hand asks the server for Brotli even when no Brotli decoder
    # is installed, which would leave the response body undecodable.
    # Leaving the header out lets the HTTP client advertise only the
    # encodings it is able to decode.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        ),
        "Accept": (
            "text/html,application/xhtml+xml,application/xml;"
            "q=0.9,*/*;q=0.8"
        ),
        "Accept-Language": "en-GB,en;q=0.9",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Referer": "https://www.google.com/",
    }

    # session.get(url, ...) when a session is supplied, rq.get(url, ...) otherwise
    client = session if session is not None else rq

    try:
        resp = client.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()  # raises HTTPError if status code is 4xx/5xx
    except rq.RequestException as e:
        print(f"Page not downloaded:\nURL: {url} ({e})")
        return None

    return resp


In [12]:
# Source URLs
# url1 -> template page, url2 -> current page
url1 = (
    "https://en.wikipedia.org/w/index.php"
    "?title=Template:COVID-19_testing_by_country"
)
url2 = "https://en.wikipedia.org/wiki/COVID-19_testing"

In [15]:
# Download the page
wiki_page = pdl(url2)

# pdl() returns None when the request fails; stop here with a clear message
# rather than failing later with an AttributeError on `wiki_page.text`.
if wiki_page is None:
    raise RuntimeError(f"Could not download the page: {url2}")

In [76]:
# Extract the table
# Parse the downloaded HTML and locate the container that holds the testing table.
html = bs(wiki_page.text, "html.parser")
div = html.find("div", class_="COVID-19_testing_by_country")

# find() gives None when no element matches; without this check str(None)
# would be handed to read_html and fail with an unrelated parsing error.
if div is None:
    raise ValueError(
        'No <div class="COVID-19_testing_by_country"> found in the downloaded page.'
    )

# read_html returns a list of tables; the first one inside the container is used.
df = pd.read_html(StringIO(str(div)))[0]
df.head()

Unnamed: 0,Country or region,Date[a],Tested,Units[b],Confirmed (cases),"Confirmed / tested, %","Tested / population, %","Confirmed / population, %",Ref.
0,Afghanistan,17 Dec 2020,154767,samples,49621,32.1,0.4,0.13,[248]
1,Albania,18 Feb 2021,428654,samples,96838,22.6,15.0,3.4,[249]
2,Algeria,2 Nov 2020,230553,samples,58574,25.4,0.53,0.13,[250][251]
3,Andorra,23 Feb 2022,300307,samples,37958,12.6,387.0,49.0,[252]
4,Angola,2 Feb 2021,399228,samples,20981,5.3,1.3,0.067,[253]


In [77]:
# Clean-up
    # Normalise the column names, then drop the footnote suffixes
    # left over from the "Date[a]" and "Units[b]" headers
df = clean_columns(df).rename(columns={"date_a": "date", "units_b": "units"})

    # Spell out a trailing "%" in any column name that still carries one
pc_cols = [name for name in df.columns if name.endswith("%")]
pc_cols2 = [name.replace("%", "percent") for name in pc_cols]
pc_map = {old: new for old, new in zip(pc_cols, pc_cols2)}

df = df.rename(columns=pc_map)

In [78]:
    # Format dates: values that do not match "day month-abbreviation year"
    # become NaT because of errors="coerce"
df["date"] = pd.to_datetime(df["date"], format="%d %b %Y", errors="coerce")

    # Filter out footnotes (rows whose date could not be parsed).
    # .copy() makes the result an independent frame, so the column
    # assignments below do not write into a slice of the original
    # (avoids pandas' SettingWithCopyWarning).
df = df.loc[df["date"].notna(), :].copy()

    # Format numeric cols: every column whose name mentions
    # "tested" or "confirmed" is cast to float
num_cols = [c for c in df.columns if "tested" in c or "confirmed" in c]
df[num_cols] = df[num_cols].astype("float")

    # Clean up entity names: strip bracketed footnote markers such as "[a]"
df["country_or_region"] = df["country_or_region"].replace(regex=r"\[.*?\]", value="")

df.sample(10)

Unnamed: 0,country_or_region,date,tested,units,confirmed_cases,confirmed_tested_percent,tested_population_percent,confirmed_population_percent,ref
125,Panama,2023-01-28,7475016.0,samples,1029701.0,13.8,179.0,24.7,[386]
142,Serbia,2023-02-02,12185475.0,cases,2473599.0,20.3,175.0,35.5,[405]
107,Montenegro,2021-05-10,394388.0,samples,98449.0,25.0,62.5,15.6,[365][366]
80,Jamaica,2022-09-30,1184973.0,samples,151931.0,12.8,43.5,5.6,[336]
69,Honduras,2021-11-26,1133782.0,samples,377859.0,33.3,11.8,3.9,[325]
81,Japan,2021-03-01,8487288.0,,432773.0,5.1,6.7,0.34,[337]
143,Singapore,2021-08-03,16206203.0,samples,65315.0,0.4,284.0,1.1,[406][407]
164,United Kingdom,2022-05-19,522526476.0,samples,22232377.0,4.3,774.0,32.9,[430]
103,Mauritius,2020-11-22,289552.0,samples,494.0,0.17,22.9,0.039,[361]
139,San Marino,2023-01-29,192613.0,samples,23427.0,12.2,563.0,68.4,[402]


In [79]:
# Dimension check
# df.shape is a (rows, columns) tuple; report both counts.
d = df.shape
print("{} rows and {} cols.".format(*d))

172 rows and 9 cols.
