In [1]:
import pandas as pd
import requests
import json
from datetime import datetime
from datetime import timezone
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
import pytz
import itertools
import re
import os
from dotenv import load_dotenv
load_dotenv()

# Helper function to find data between two strings
def find_between(s, first, last):
    """Return the part of `s` located between the markers `first` and `last`.

    `last` is searched for only after the first occurrence of `first`.
    Returns an empty string when either marker is absent.
    """
    opening = s.find(first)
    if opening < 0:
        return ""
    begin = opening + len(first)
    closing = s.find(last, begin)
    if closing < 0:
        return ""
    return s[begin:closing]

# Helper function to check if a string is a float
def isfloat(num):
    """Return True when `float(num)` succeeds, False otherwise.

    Values that `float()` rejects with a TypeError (for example None or a
    list) are reported as "not a float" instead of propagating the error.
    Note that `float()` also accepts strings such as "nan" and "inf".
    """
    try:
        float(num)
    except (TypeError, ValueError):
        return False
    return True

def clean_string(string):
    """Turn a display label into a lower-case, underscore-separated name.

    Steps, in order: remove any "(...)" or "[...]" span, replace spaces and
    hyphens with underscores, drop commas and stray parentheses, collapse
    "__" pairs in a single pass, lower-case, truncate to 60 characters and
    finally drop one trailing underscore if present.

    Returns an empty string when nothing is left after cleaning (for example
    for an empty label or a label made only of a bracketed note).
    """
    # Raw string: the pattern relies on backslash escapes such as "\(" that are
    # not valid escape sequences in a normal string literal.
    without_brackets = re.sub(r"([\(\[]).*?([\)\]])", '', string)
    new_string = (
        without_brackets
        .replace(" ", "_")
        .replace(",", "")
        .replace("(", "")
        .replace(")", "")
        .replace("-", "_")
        .replace("__", "_")  # single pass only: longer underscore runs are shortened, not fully collapsed
        .lower()[:60]
    )
    # endswith() is safe on an empty string, unlike indexing the last character.
    if new_string.endswith('_'):
        return new_string[:-1]
    return new_string

def convert_to_date(string_date):
    """Parse a "<month name> <year>" label (e.g. "January 2022") into a UTC-aware datetime.

    The label is parsed with the "%B %Y" format and the resulting naive
    datetime is tagged with the UTC time zone (no clock shift is applied).
    """
    naive_month = datetime.strptime(string_date, "%B %Y")
    return naive_month.replace(tzinfo=pytz.UTC)


In [2]:
# Table (product) id sent as the `pid` query parameter:
#   employment count:  1410020101
#   employment salary: 1410020301
pid = "1410020301"

# Values for the pickMembers[0] / pickMembers[1] query parameters.
# Kept as string literals on purpose: building them from floats would silently
# drop trailing zeros (str(1.10) == "1.1"), producing a different member id.
# NOTE(review): presumably these select the "Geography" and "Type of employees"
# members listed in filter_names — confirm against the table page.
pickMembers1 = "1.3"
pickMembers2 = "2.1"

# Requested time frame (two-digit month, four-digit year, as strings).
startMonth = "01"
startYear = "2022"
endMonth = "12"
endYear = "2022"

# Names of the column headers whose selected value is copied into the final frame.
filter_names = ["Geography", "Type of employees"]

In [3]:
# Value of the `referencePeriods` query parameter: start and end dates written
# as YYYYMMDD and separated by a URL-encoded comma ("%2C"). The start day is
# fixed to "01" and the end day to "28".
period_start = startYear + startMonth + "01"
period_end = endYear + endMonth + "28"
referencePeriods = "%2C".join([period_start, period_end])

In [4]:
# Assemble the table-view URL from the configured query parameters.
# Keys and values are concatenated verbatim (the keys are already URL-encoded).
query_parts = [
    'pid=' + pid,
    'pickMembers%5B0%5D=' + pickMembers1,
    'pickMembers%5B1%5D=' + pickMembers2,
    'cubeTimeFrame.startMonth=' + startMonth,
    'cubeTimeFrame.startYear=' + startYear,
    'cubeTimeFrame.endMonth=' + endMonth,
    'cubeTimeFrame.endYear=' + endYear,
    'referencePeriods=' + referencePeriods,
]
url = 'https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?' + '&'.join(query_parts)

In [5]:
# Download the table page. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops on HTTP errors instead of
# trying to parse an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Cut out the text between the two script markers, then take the rest of the
# line that follows 'prepareTable(' as the JSON payload.
# NOTE(review): the [:-2] presumably strips the closing ");" of the call, and
# the purpose of the appended 'end' is not evident from this cell — confirm.
result = find_between(soup.prettify(), 'tableContainerElement = $(".tableContainer").clone();', 'window.addEventListener("resize", function() {') + 'end'
data = find_between(result, 'prepareTable(', '\n')[:-2]
if not data:
    # Without this check json.loads("") fails with an unhelpful decode error.
    raise ValueError("Could not find the prepareTable(...) payload in the page at " + url)
json_data = json.loads(data)


In [6]:
# Pick the column header named "Reference period" and collect the "value" of
# each of its entries; these become the labels of the data columns.
headers = next(
    header
    for header in json_data['headers']["columnHeaders"]
    if header["name"] == "Reference period"
)
header_values = [entry["value"] for entry in headers["values"]]

In [7]:
# Flatten every row's "values" list into one sequence of cells, then keep only
# each cell's "value" field.
rows = json_data['rows']
flattened_rows = [cell for row_entry in rows for cell in row_entry['values']]
new_rows = [cell['value'] for cell in flattened_rows]

In [8]:
# Walk the flat list of cell values: a non-numeric value starts a new series
# (its cleaned text becomes the key), and every numeric value is appended to
# the most recently started series.
keys = []
data = {}

for cell_value in new_rows:
    if isfloat(cell_value):
        data[key].append(float(cell_value))
    else:
        key = clean_string(cell_value)
        keys.append(key)
        data[key] = []

In [9]:
rows_values = dict(data)
# Build one record per series: its name under "key", followed by one entry per
# reference period (label -> value), pairing labels and values positionally.
final_data = []
for name, values in rows_values.items():
    record = {"key": name}
    record.update(zip(header_values, values))
    final_data.append(record)

In [10]:
# One row per reference period, one column per series.
# "key" is moved into the index BEFORE transposing: transposing a frame that
# still contains the string "key" column would turn every column into object
# dtype, whereas this keeps the measurements numeric.
df = pd.DataFrame(final_data).set_index("key").transpose()
df.columns = keys
# Stamp every row with the content of the page's "dcterms.issued" meta tag.
df["date"] = soup.find_all('meta', attrs={'name': 'dcterms.issued'})[0]['content']

In [11]:
# Add one constant column per filter, holding the first value listed under the
# column header of the same name.
column_headers = json_data['headers']["columnHeaders"]
for filter_name in filter_names:
    matching_header = next(item for item in column_headers if item["name"] == filter_name)
    df[clean_string(filter_name)] = matching_header["values"][0]["value"]

df = df.rename(columns={'type_of_employees': 'type_of_employee'})
# The index still holds the reference-period labels: convert them to UTC
# datetimes in a "month" column, then replace the index with a plain range.
df['month'] = df.index.to_series().apply(convert_to_date)
df = df.reset_index(drop=True)

Unnamed: 0,industrial_aggregate_excluding_unclassified_businesses,goods_producing_industries,forestry_logging_and_support,mining_quarrying_and_oil_and_gas_extraction,utilities,construction,manufacturing,service_producing_industries,trade,transportation_and_warehousing,...,educational_services,health_care_and_social_assistance,arts_entertainment_and_recreation,accommodation_and_food_services,other_services_,public_administration,date,geography,type_of_employees,month
0,973.52,1088.03,0.0,0.0,0.0,1046.3,1060.24,954.39,724.48,1063.94,...,1124.86,977.3,0.0,412.05,0.0,1308.19,2022-12-22,Prince Edward Island,All employees,2022-01-01 00:00:00+00:00
1,956.36,1078.05,0.0,0.0,0.0,1017.46,1058.3,936.72,696.58,974.04,...,1176.42,962.36,0.0,397.81,0.0,1330.2,2022-12-22,Prince Edward Island,All employees,2022-02-01 00:00:00+00:00
2,990.45,1155.95,0.0,0.0,0.0,1059.06,1161.95,963.84,714.67,1060.37,...,1123.77,1056.61,0.0,418.69,0.0,1349.3,2022-12-22,Prince Edward Island,All employees,2022-03-01 00:00:00+00:00
3,966.16,1167.3,0.0,0.0,0.0,1069.22,1179.71,933.61,739.58,0.0,...,1138.05,943.95,0.0,418.13,0.0,1251.02,2022-12-22,Prince Edward Island,All employees,2022-04-01 00:00:00+00:00
4,979.22,1147.17,0.0,0.0,0.0,1049.65,1171.07,948.74,853.27,0.0,...,1138.72,944.46,0.0,434.43,0.0,1309.54,2022-12-22,Prince Edward Island,All employees,2022-05-01 00:00:00+00:00


In [24]:
import os
import sys
from dotenv import load_dotenv
load_dotenv()
# Make the folder named by the LIBRARY_PATH environment variable importable.
# When the variable is not set, nothing is inserted, so sys.path never ends up
# containing None.
library_path = os.getenv('LIBRARY_PATH')
if library_path is not None:
    sys.path.insert(1, library_path)
import pandas as pd
import scrapper


In [26]:
# Call scrapper.simple_scrapper with the `url` and `filter_names` defined in
# the earlier cells and rebind `df` to whatever it returns.
# NOTE(review): this cell fails on a fresh kernel unless those earlier cells
# have been run; presumably the function packages the manual steps above —
# confirm against the scrapper module.
df = scrapper.simple_scrapper(url, filter_names)
