In [1]:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
load_dotenv()

# Statistics Canada table-view URL, written one query parameter per line so the
# request is easy to read and tweak. Adjacent literals are concatenated by
# Python into a single string.
url = (
    "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action"
    "?pid=1410020101"
    "&pickMembers%5B0%5D=1.3"
    "&pickMembers%5B1%5D=2.1"
    "&cubeTimeFrame.startMonth=01"
    "&cubeTimeFrame.startYear=2022"
    "&cubeTimeFrame.endMonth=10"
    "&cubeTimeFrame.endYear=2022"
    "&referencePeriods=20220101%2C20221001"
)

# Names of the column headers whose first value is copied onto every row of the
# final DataFrame (presumably matching the two pickMembers filters in the URL
# -- TODO confirm).
filter_names = [
    "Geography",
    "Type of employee",
]

# Helper function to find data between two strings
def find_between(s, first, last):
    """Return the text of ``s`` located between ``first`` and ``last``.

    ``last`` is searched for only after the end of the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    head = s.find(first)
    if head < 0:
        return ""

    begin = head + len(first)
    stop = s.find(last, begin)
    if stop < 0:
        return ""

    return s[begin:stop]

# Helper function to check if a string is a float
def isfloat(num):
    """Return True if ``num`` can be converted with ``float()``, else False.

    Besides unparsable strings (``ValueError``), this also answers False for
    values ``float()`` rejects outright, such as ``None`` or a list
    (``TypeError``), and for integers too large to represent
    (``OverflowError``), instead of letting those exceptions escape.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [2]:
# Download the table page. The timeout (seconds) keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() surfaces HTTP errors
# instead of trying to parse an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Narrow the prettified HTML down to the text between these two markers, then
# take the remainder of the line that follows "prepareTable(".
result = find_between(soup.prettify(), 'tableContainerElement = $(".tableContainer").clone();', 'window.addEventListener("resize", function() {') + 'end'
# The last two characters of that line are dropped (presumably the closing
# ");" of the call -- TODO confirm against the page source).
data = find_between(result, 'prepareTable(', '\n')[:-2]
if not data:
    # find_between returns "" when a marker is missing; fail with a clear
    # message rather than a JSON decode error on an empty string.
    raise ValueError("Could not locate the prepareTable(...) payload in the page; the page layout may have changed")

json_data = json.loads(data)
rows = json_data['rows']

In [3]:
# Flatten the table: one entry per cell, in row order, keeping only each
# cell's "value" field.
new_rows = [
    cell["value"]
    for table_row in rows
    for cell in table_row['values']
]

In [4]:
# Collect the labels of the "Reference period" column header; they become the
# per-period field names in the next cell.
headers = next(
    (item for item in json_data['headers']["columnHeaders"] if item["name"] == "Reference period"),
    None,
)
if headers is None:
    # Clearer than the bare StopIteration next() would otherwise raise.
    raise ValueError('No "Reference period" column header found in the table JSON')
header_values = [item["value"] for item in headers["values"]]

# Group the flat cell list by label: every non-numeric cell starts a new
# group, and the numeric cells that follow it are collected (as floats) under
# that label. Numeric cells seen before any label are filed under "".
rows_values = {}
key = ""
for row in new_rows:
    if isfloat(row):
        rows_values.setdefault(key, []).append(float(row))
    else:
        key = row
        # A repeated label restarts its list, discarding earlier values.
        rows_values[key] = []

In [5]:
# Build one record per label (original label under "key", then one field per
# reference period) and, in parallel, a normalized column name for each label.
test_data = []
keys = []
for label, series in rows_values.items():
    record = {"key": label}

    # Normalize the label: spaces and dashes become underscores, commas and
    # parentheses are removed, then lower-case and cap at 60 characters.
    column_name = label.replace(" ", "_")
    for unwanted in (",", "(", ")"):
        column_name = column_name.replace(unwanted, "")
    column_name = column_name.replace("-", "_").replace("__", "_").lower()[:60]
    keys.append(column_name)

    # Pair each value with the reference-period label at the same position.
    for position, number in enumerate(series):
        record[header_values[position]] = number
    test_data.append(record)

In [6]:
# Transpose so each reference period is a row and each label a column, then
# drop the "key" row and use the normalized names as column labels.
df = pd.DataFrame(test_data).T.drop("key")
df.columns = keys

# Stamp every row with the content of the page's first dcterms.issued meta tag.
issued_tags = soup.find_all('meta', attrs={'name': 'dcterms.issued'})
df["date"] = issued_tags[0]['content']

# Add one constant column per filter, holding the first value of the column
# header with that name.
column_headers = json_data['headers']["columnHeaders"]
for filter_name in filter_names:
    matching_header = next(item for item in column_headers if item["name"] == filter_name)
    df[filter_name] = matching_header["values"][0]["value"]


In [7]:

# Copy the index (the reference-period labels) into a "month" column, then
# replace the index with a default integer range.
df = df.assign(month=df.index).reset_index(drop=True)
