# Canada Vaccine deliveries

This notebook extracts the vaccine delivery forecasts from [Canada.ca](https://www.canada.ca/en/public-health/services/diseases/2019-novel-coronavirus-infection/prevention-risks/covid-19-vaccine-treatment/vaccine-rollout.html#a4b).

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas

In [8]:
def find_table(heading):
    """
    Helper function to find the next table from a heading.

    Starting with `heading` itself, walk the `next_sibling` chain until a
    node whose `name` is 'table' is found.

    Parameters
    ----------
    heading : node exposing `name` and `next_sibling` attributes
        (for example a BeautifulSoup tag).

    Returns
    -------
    The first node in the sibling chain whose `name` is 'table'.

    Raises
    ------
    ValueError
        If the sibling chain ends without reaching a table.
    """
    sibling = heading
    # `next_sibling` is None once the last sibling has been passed, so stop
    # there instead of dereferencing None.
    while sibling is not None:
        if sibling.name == 'table':
            return sibling
        sibling = sibling.next_sibling
    raise ValueError('no <table> found among the siblings following %r' % (heading,))


def format_date(d):
    """
    Normalise a date-range label.

    The page mixes the en-dash (U+2013) and the hyphen-minus (U+002D) as the
    separator between dates. Every en-dash is turned into a hyphen-minus and
    the whitespace around each hyphen-separated part is removed.
    """
    unified = d.replace('–', '-')
    pieces = unified.split('-')
    trimmed = map(str.strip, pieces)
    return '-'.join(trimmed)

def parse_table(table):
    """
    Parse a table and return it as a Pandas dataframe.

    Column labels are taken from every `<th>` in the table (passed through
    `format_date`); the first label is dropped because it sits above the
    row-heading column. Each `<tr>` that contains `<td>` cells becomes one
    row: the text of the first cell is the row label and the remaining cells
    are parsed as integers after removing thousands separators (',').

    Cells that are empty or contain only whitespace become NaN instead of
    raising, so a single blank cell does not abort the whole scrape. Any
    other non-numeric text still raises ValueError.

    Parameters
    ----------
    table : node exposing `find_all` (for example a BeautifulSoup `<table>` tag).

    Returns
    -------
    pandas.DataFrame indexed by the row labels.
    """
    columns = [format_date(th.text) for th in table.find_all('th')]

    headings = []
    values = []
    for row in table.find_all('tr'):
        tds = row.find_all('td')
        if not tds:
            # Rows without <td> cells (e.g. the header row) carry no data
            continue
        headings.append(tds[0].text)

        row_values = []
        for cell in tds[1:]:
            text = cell.text.replace(',', '').strip()
            # A blank cell means "no figure published": record it as missing
            row_values.append(int(text) if text else float('nan'))
        values.append(row_values)
    return pandas.DataFrame(data=values, columns=columns[1:], index=headings)

In [9]:
# Download the web page and parse it using Beautiful Soup
response = requests.get(
    'https://www.canada.ca/en/public-health/services/diseases/2019-novel-coronavirus-infection/prevention-risks/covid-19-vaccine-treatment/vaccine-rollout.html',
    timeout=30,  # seconds; without a timeout the cell can hang forever
)
# Fail loudly on an HTTP error instead of scraping an error page
response.raise_for_status()
# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and to avoid the bs4 "no parser" warning)
soup = BeautifulSoup(response.text, 'html.parser')

## The web page doesn't have any useful tags or IDs for the tables.
## This finds all headers (h2) that contain 'forecasted allocation'. In general the
## names are like "Pfizer vaccine forecasted allocation", so we look for those headers.

distribution_headings = [head for head in soup.find_all('h2')
                         if 'forecasted allocation' in head.text]

if not distribution_headings:
    # Without this check pandas.concat would fail later with an unrelated message
    raise ValueError("no 'forecasted allocation' headings found; the page layout may have changed")

## Sanitize the name by removing 'forecasted allocation' and 'vaccine'.

names = [h.text.replace(' forecasted allocation', '').replace('vaccine','').strip()
         for h in distribution_headings]

## Read the table that follows each of those headings
table_data = [parse_table(find_table(t)) for t in distribution_headings]

## Join the tables with the headings so we get a dictionary {'vaccine': DataFrame}
by_type = dict(zip(names, table_data))

## Keep only the "Total forecasted allocations" row of each table
total_allocations = {key: data.filter(like='Total forecasted allocations', axis=0)
                     for key, data in by_type.items()}

## Now that each data frame contains one row, rename it with the name
## of the vaccine
renamed = [total_allocations[key].rename(lambda _ : key) for key in total_allocations]

## Join all data frames
pandas.concat(renamed, sort=False)

Unnamed: 0,29 Mar-4 Apr,5-11 Apr,12-18 Apr,19-25 Apr,26 Apr-2 May,3-9 May,10-16 May,17-23 May,24-30 May
Pfizer,1195740.0,1019070.0,1019070.0,1019070.0,1019070.0,1024920.0,1024920.0,1024920.0,1024920.0
Moderna,,855600.0,,1202400.0,,,,,
U.S. AstraZeneca,1504200.0,,,,,,,,
COVAX AstraZeneca,,316800.0,,,,,,,
