In [1]:
from datetime import date
from dateutil.relativedelta import relativedelta
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
# `scrapper` is a project-local module. The relative import succeeds only when
# this code runs as part of a package; a notebook kernel or plain script has no
# parent package, so fall back to the absolute import. Only ImportError is
# caught so that a genuine failure inside `scrapper` is not silently masked.
try:
    from . import scrapper
except ImportError:
    import scrapper

# Capture "today" once and derive the window start from that same value, so
# both dates stay consistent even if the cell runs across midnight.
today = date.today()
four_months_ago = today - relativedelta(months=+4)

In [2]:
# Table identifier passed as `pid` in the request URL, the destination SQL
# table, and the filter labels handed to the scraper.
pid = "1410020101"
table_name = "monthly_employment_by_industry"
filter_names = ["Geography", "Type of employee"]

# Time window: from four months ago up to the current month, with the month
# numbers zero-padded to two digits.
startYear = str(four_months_ago.year)
startMonth = f"{four_months_ago.month:02d}"
endYear = str(today.year)
endMonth = f"{today.month:02d}"

# "<start>01,<end>28" with the comma already URL-encoded as %2C.
# NOTE(review): day 28 is presumably chosen because every month has one -- confirm.
referencePeriods = f"{startYear}{startMonth}01%2C{endYear}{endMonth}28"


In [6]:
# First filter dimension: member id -> geography label (parallel lists).
pick_members_1 = {
    "names": [
        "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7",
        "1.8", "1.9", "1.10", "1.11", "1.12", "1.14", "1.15",
    ],
    "values": [
        "Canada",
        "Newfoundland and Labrador",
        "Prince Edward Island",
        "Nova Scotia",
        "New Brunswick",
        "Quebec",
        "Ontario",
        "Manitoba",
        "Saskatchewan",
        "Alberta",
        "British Columbia",
        "Yukon",
        "Northwest Territories",
        "Nunavut",
    ],
}

# Second filter dimension: member id -> type-of-employee label.
pick_members_2 = {
    "names": ["2.2", "2.3"],
    "values": [
        "Salaried employees paid a fixed salary",
        "Employees paid by the hour",
    ],
}

# Lookup tables from member id to its human-readable label.
pick_members_1_dict = {
    member_id: label
    for member_id, label in zip(pick_members_1["names"], pick_members_1["values"])
}
pick_members_2_dict = {
    member_id: label
    for member_id, label in zip(pick_members_2["names"], pick_members_2["values"])
}

# Every (geography id, employee-type id) combination, geography-major order.
result = [
    (geography_id, employee_type_id)
    for geography_id in pick_members_1["names"]
    for employee_type_id in pick_members_2["names"]
]

In [4]:
# For every (geography, type of employee) pair: scrape the table, take the
# most recent row, and append it to `table_name` unless a row for that
# geography / employee type / month is already stored.

# Connecting to Planet Scale
ssl_args = {'ssl_ca': "/etc/ssl/cert.pem"}

conn_string = 'mysql+pymysql://' + os.getenv("USERNAME") + ':' + os.getenv("PASSWORD") + '@' + os.getenv("HOST") + '/' + os.getenv("DATABASE") 

# Build the engine once; it is reused by every iteration instead of being
# re-created (and never disposed) per request.
engine = create_engine(conn_string, connect_args=ssl_args)

# The filter values are passed as bound parameters rather than concatenated
# into the SQL text, because `month` comes from scraped web content.
# `%(name)s` is the DB-API "pyformat" placeholder style used by PyMySQL.
# The table name cannot be bound; it is a constant defined in this notebook.
query = (
    "SELECT * FROM " + table_name
    + " WHERE geography = %(geography)s"
    + " AND type_of_employee = %(type_of_employee)s"
    + " AND month = %(month)s;"
)

for geography_id, employee_type_id in result:
    geography = pick_members_1_dict[geography_id]
    type_of_employee = pick_members_2_dict[employee_type_id]
    url = (
        'https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=' + pid
        + '&pickMembers%5B0%5D=' + geography_id
        + '&pickMembers%5B1%5D=' + employee_type_id
        + '&cubeTimeFrame.startMonth=' + startMonth
        + '&cubeTimeFrame.startYear=' + startYear
        + '&cubeTimeFrame.endMonth=' + endMonth
        + '&cubeTimeFrame.endYear=' + endYear
        + '&referencePeriods=' + referencePeriods
    )

    # `.iloc[[-1]]` (list indexer) keeps the last row as a one-row DataFrame.
    # A plain `.iloc[-1]` returns a Series, which `to_sql` would write as one
    # row per field instead of a single record.
    # NOTE(review): assumes simple_scrapper returns a DataFrame with a
    # "month" column whose last row is the most recent period -- confirm.
    latest_row = scrapper.simple_scrapper(url, filter_names).iloc[[-1]]
    latest_month = latest_row["month"].iloc[0]

    # engine.begin() yields a connection inside a transaction that commits
    # on normal exit and rolls back if an exception escapes the block.
    with engine.begin() as conn:
        sql_data = pd.read_sql_query(
            query,
            conn,
            params={
                "geography": geography,
                "type_of_employee": type_of_employee,
                "month": latest_month,
            },
        )

        # An empty result means the row is new. The previous check indexed
        # `.values[0]` unconditionally and raised IndexError in exactly that
        # case, so the insert branch could never run.
        if not sql_data.empty:
            print(latest_month + " " + geography + " "+ type_of_employee + " data already exists in " + table_name + " table")
        else:
            latest_row.to_sql(table_name, conn, if_exists="append", index=False)
            print("Inserted data from " + geography_id + " and " + employee_type_id + " into " + table_name + " successfully")


October 2022 Canada Salaried employees paid a fixed salary data already exists in monthly_employment_by_industry table
October 2022 Canada Employees paid by the hour data already exists in monthly_employment_by_industry table
October 2022 Newfoundland and Labrador Salaried employees paid a fixed salary data already exists in monthly_employment_by_industry table
October 2022 Newfoundland and Labrador Employees paid by the hour data already exists in monthly_employment_by_industry table
October 2022 Prince Edward Island Salaried employees paid a fixed salary data already exists in monthly_employment_by_industry table
October 2022 Prince Edward Island Employees paid by the hour data already exists in monthly_employment_by_industry table
October 2022 Nova Scotia Salaried employees paid a fixed salary data already exists in monthly_employment_by_industry table
October 2022 Nova Scotia Employees paid by the hour data already exists in monthly_employment_by_industry table
October 2022 New Bru

In [13]:
# `data_pipeline` is a project-local module. Try the package-relative import
# first and fall back to the absolute import when there is no parent package
# (notebook kernel or plain script). Only ImportError is caught so that real
# errors raised while loading the module are not swallowed.
try:
    from . import data_pipeline
except ImportError:
    import data_pipeline


In [14]:
# Inputs for the packaged pipeline job.
# NOTE(review): data_pipeline_job presumably performs the same scrape-and-insert
# as the inline loop earlier in this notebook (its printed output matches) --
# confirm in data_pipeline.
filter_names = ["Geography", "Type of employee"]
pick_members_1 = {
    "names": [
        "1.1", "1.2", "1.3", "1.4", "1.5", "1.6", "1.7",
        "1.8", "1.9", "1.10", "1.11", "1.12", "1.14", "1.15",
    ],
    "values": [
        "Canada",
        "Newfoundland and Labrador",
        "Prince Edward Island",
        "Nova Scotia",
        "New Brunswick",
        "Quebec",
        "Ontario",
        "Manitoba",
        "Saskatchewan",
        "Alberta",
        "British Columbia",
        "Yukon",
        "Northwest Territories",
        "Nunavut",
    ],
}
pick_members_2 = {
    "names": ["2.2", "2.3"],
    "values": [
        "Salaried employees paid a fixed salary",
        "Employees paid by the hour",
    ],
}
table_name = "monthly_employment_by_industry"
pid = "1410020101"

data_pipeline.data_pipeline_job(
    pid=pid,
    table_name=table_name,
    pick_members_1=pick_members_1,
    pick_members_2=pick_members_2,
    filter_names=filter_names,
)


October 2022 Canada Salaried employees paid a fixed salary data already exists in monthly_employment_by_industry table
October 2022 Canada Employees paid by the hour data already exists in monthly_employment_by_industry table
October 2022 Newfoundland and Labrador Salaried employees paid a fixed salary data already exists in monthly_employment_by_industry table
October 2022 Newfoundland and Labrador Employees paid by the hour data already exists in monthly_employment_by_industry table
October 2022 Prince Edward Island Salaried employees paid a fixed salary data already exists in monthly_employment_by_industry table
October 2022 Prince Edward Island Employees paid by the hour data already exists in monthly_employment_by_industry table
October 2022 Nova Scotia Salaried employees paid a fixed salary data already exists in monthly_employment_by_industry table
October 2022 Nova Scotia Employees paid by the hour data already exists in monthly_employment_by_industry table
October 2022 New Bru

In [16]:
# SECURITY: this cell used to compare the environment-built connection string
# against a literal that embedded the database username and password. The
# literal has been removed; any credential that was ever committed in this
# notebook should be treated as leaked and rotated.
#
# The check now only verifies that every variable needed to build the
# connection string is present and non-empty.
required_env_vars = ("USERNAME", "PASSWORD", "HOST", "DATABASE")
missing_env_vars = [name for name in required_env_vars if not os.getenv(name)]

# Prints True when the configuration is complete, mirroring the old output.
print(not missing_env_vars)

if missing_env_vars:
    # Report variable names only -- never their values.
    print("Missing environment variables: " + ", ".join(missing_env_vars))
else:
    conn_string = 'mysql+pymysql://' + os.getenv("USERNAME") + ':' + os.getenv("PASSWORD") + '@' + os.getenv("HOST") + '/' + os.getenv("DATABASE")


True
