In [1]:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
import itertools
import os
import sys
from dotenv import load_dotenv
load_dotenv()
import sys
sys.path.insert(1, os.getenv('LIBRARY_PATH'))
import scrapper

# Notebook configuration.
# Variant of the table URL below that also carries a date range and reference periods in its query string (kept for reference, not used):
#https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1010013901&cubeTimeFrame.startDaily=2022-01-01&cubeTimeFrame.endDaily=2023-01-01&referencePeriods=20220101%2C20230101
# Page that is downloaded and parsed for the embedded table data.
url = "https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1010013901"
# Base name (without extension) of the CSV file written at the end of the notebook.
table_name = "money_markets_interest_rates"
# Names of column headers whose first value is copied into a constant column of the output.
filter_names = ["Geography"]

In [2]:
# Helper: extract the text that sits between two marker strings.
def find_between(s, first, last):
    """Return the part of ``s`` between ``first`` and the next ``last``.

    The search for ``last`` begins right after the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    try:
        begin = s.index(first)
    except ValueError:
        # Opening marker not present.
        return ""
    begin += len(first)
    try:
        stop = s.index(last, begin)
    except ValueError:
        # Closing marker not present after the opening one.
        return ""
    return s[begin:stop]

# Helper function to check whether a value can be converted to a float
def isfloat(num):
    """Return True if ``float(num)`` succeeds, otherwise False.

    Args:
        num: Value to test (typically a string taken from the scraped table).

    Returns:
        bool: True when the value converts to a float, False when the
        conversion raises ``ValueError`` (unparsable text) or ``TypeError``
        (e.g. ``None`` or a container), so the predicate never raises for
        those inputs.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        return False
    return True

In [3]:
# Download the table page. The timeout keeps a stalled connection from hanging
# the notebook forever, and raise_for_status() surfaces HTTP errors immediately
# instead of letting an error page flow into the parsing steps below.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Cut out the text located between the two script markers in the prettified HTML.
result = find_between(soup.prettify(), 'tableContainerElement = $(".tableContainer").clone();', 'window.addEventListener("resize", function() {') + 'end'

# Take the remainder of the line following 'prepareTable(' and drop its last two
# characters (presumably the closing ");" of the call - TODO confirm against the page).
data = find_between(result, 'prepareTable(', '\n')[:-2]
if not data:
    # find_between returns "" when a marker is missing; fail with a clear message
    # rather than an opaque JSON decoding error on an empty string.
    raise ValueError("Could not locate the prepareTable(...) payload in the page at " + url)

json_data = json.loads(data)
rows = json_data['rows']

In [4]:
# Locate the "Reference period" column header and collect the value of each of its entries.
headers = next(
    column_header
    for column_header in json_data['headers']["columnHeaders"]
    if column_header["name"] == "Reference period"
)
header_values = [entry["value"] for entry in headers["values"]]

In [5]:
# Flatten the cells of every row into one sequence of raw values.
rows = json_data['rows']
flattened_rows = list(itertools.chain.from_iterable(row['values'] for row in rows))
new_rows = [cell['value'] for cell in flattened_rows]

keys = []   # normalised series names, in order of appearance
data = {}   # series name -> list of float values that follow it
key = None  # reset so a value left over from an earlier run of this cell is never reused

# Walk the flat sequence: a non-numeric value starts a new series, and every
# numeric value is appended to the most recently started series.
# NOTE(review): any non-numeric cell is treated as a series label, so a textual
# placeholder inside the data would be read as a new series - confirm the page
# never emits one.
for row in new_rows:
    if isfloat(row):
        if key is None:
            raise ValueError("Numeric value %r appears before any series label" % (row,))
        data[key].append(float(row))
    else:
        # Normalise the label into a snake_case identifier of at most 60 characters.
        key = row.replace(" ", "_").replace(",","").replace("(","").replace(")","").replace("-","_").replace("__","_").lower()[:60]
        if key in data:
            # A repeated name would silently discard the earlier series and leave
            # `keys` longer than `data`, which only fails later when the column
            # names are assigned; fail here with a clear message instead.
            raise ValueError("Duplicate series name after normalisation: %r" % (key,))
        keys.append(key)
        data[key] = []

# Shallow copy of the collected series, keyed by normalised name.
rows_values = dict(data)

In [6]:
# Build one record per series: its name under "key", followed by one entry per
# header value paired positionally with the series' values.
final_data = []
for name, values in rows_values.items():
    record = {"key": name}
    record.update(zip(header_values, values))
    final_data.append(record)


In [7]:
# Turn the records into a frame, flip it so each series becomes a column, and
# discard the row that only holds the series names.
records_frame = pd.DataFrame(final_data)
df = records_frame.T.drop(index="key")
# Label the columns with the normalised series names.
df.columns = keys
# Stamp every row with the content of the page's first 'dcterms.issued' meta tag.
issued_meta = soup.find_all('meta', attrs={'name': 'dcterms.issued'})
df["date"] = issued_meta[0]['content']



In [8]:

# Copy the index into a regular 'month' column, then renumber the rows from 0.
df['month'] = df.index
df = df.reset_index(drop=True)

# For each configured filter, add a constant column named after the normalised
# filter name and filled with the first value of the matching column header.
for filter_name in filter_names:
    cleaned = filter_name
    for old, new in ((" ", "_"), (",", ""), ("(", ""), (")", ""), ("-", "_"), ("__", "_")):
        cleaned = cleaned.replace(old, new)
    new_name = cleaned.lower()[:60]
    matching_header = next(
        header for header in json_data['headers']["columnHeaders"]
        if header["name"] == filter_name
    )
    df[new_name] = matching_header["values"][0]["value"]


In [9]:
# Write the frame to <PROJECT_PATH>/data/<table_name>.csv without the index column.
project_path = os.getenv('PROJECT_PATH')
if project_path is None:
    # Previously this surfaced as an obscure "NoneType + str" TypeError.
    raise RuntimeError("PROJECT_PATH is not set; cannot determine where to write the CSV")
# os.path.join yields the same path when PROJECT_PATH ends with a separator and
# inserts the missing separator when it does not.
df.to_csv(os.path.join(project_path, "data", table_name + ".csv"), index=False)