In [None]:
#init workspace
import requests
import pandas
import json
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql import SparkSession

# Maps each geography label to a sequential integer id starting at 1.
# Ids are assigned by position, so the order of the labels below is significant.
geography = {
    label: member_id
    for member_id, label in enumerate(
        (
            "Canada",
            "Newfoundland and Labrador",
            "Prince Edward Island",
            "Nova Scotia",
            "New Brunswick",
            "Quebec",
            "Ontario",
            "Manitoba",
            "Saskatchewan",
            "Alberta",
            "British Columbia",
            "Yukon",
            "Northwest Territories including Nunavut",
            "Northwest Territories",
            "Nunavut",
        ),
        start=1,
    )
}

# Maps each trade-flow label to a sequential integer id starting at 1.
# Ids are assigned by position, so the order of the labels below is significant.
trade = {
    label: member_id
    for member_id, label in enumerate(
        (
            "Total exports",
            "Domestic exports",
            "Re-exports",
            "Total imports",
        ),
        start=1,
    )
}

# Maps each U.S. partner label (an overall total, the individual states and
# D.C., and an "Other states" bucket) to a sequential integer id starting at 1.
# Ids are assigned by position, so the order of the labels below is significant.
united_states = {
    label: member_id
    for member_id, label in enumerate(
        (
            "Total United States",
            "Alaska",
            "Alabama",
            "Arkansas",
            "Arizona",
            "California",
            "Colorado",
            "Connecticut",
            "District of Columbia",
            "Delaware",
            "Florida",
            "Georgia",
            "Hawaii",
            "Iowa",
            "Idaho",
            "Illinois",
            "Indiana",
            "Kansas",
            "Kentucky",
            "Louisiana",
            "Massachusetts",
            "Maryland",
            "Maine",
            "Michigan",
            "Minnesota",
            "Missouri",
            "Mississippi",
            "Montana",
            "North Carolina",
            "North Dakota",
            "Nebraska",
            "New Hampshire",
            "New Jersey",
            "New Mexico",
            "Nevada",
            "New York",
            "Ohio",
            "Oklahoma",
            "Oregon",
            "Pennsylvania",
            "Rhode Island",
            "South Carolina",
            "South Dakota",
            "Tennessee",
            "Texas",
            "Utah",
            "Virginia",
            "Vermont",
            "Washington, State",
            "Wisconsin",
            "West Virginia",
            "Wyoming",
            "Other states",
        ),
        start=1,
    )
}

# Maps each Harmonized System (HS) section label (the overall total followed by
# sections I through XXI) to a sequential integer id starting at 1.
# Ids are assigned by position, so the order of the labels below is significant.
hs_sections = {
    label: member_id
    for member_id, label in enumerate(
        (
            "Total Harmonized System (HS) sections",
            "I - Live animals and animal products",
            "II - Vegetable products",
            "III - Animal or vegetable fats and oils and their cleavage products, prepared edible fats, animal or vegetable waxes",
            "IV - Prepared foodstuffs, beverages, spirits and vinegar, tobacco and manufactures tobacco substitutes",
            "V - Mineral products",
            "VI - Products of the chemical or allied industries",
            "VII - Plastics and articles thereof, rubber and articles thereof",
            "VIII - Raw hides and skins, leather, furskins and articles thereof, saddlery and harness, travel goods, handbags and similar containers, articles of animal gut (other than silk-worm gut)",
            "IX - Wood and articles of wood, wood charcoal, cork and articles of cork, manufactures of straw, of esparto or of other plaiting materials, basketware and wickerwork",
            "X - Pulp of wood or of other fibrous cellulosic material, recovered (waste and scrap) paper or paperboard",
            "XI - Textiles and textile articles",
            "XII - Footwear, headgear, umbrellas, sun umbrellas, walking-sticks, seat-sticks, whips, riding-crops and parts thereof, prepared feathers and articles made therewith, artificial flowers, articles of human hair",
            "XIII - Articles of stone, plaster, cement, asbestos, mica or similar materials, ceramic products, glass and glassware",
            "XIV - Natural or cultured pearls, precious or semi-precious stones, precious metals, metals clad with precious metal and articles thereof, imitation jewellery, coin",
            "XV - Base metals and articles of base metal",
            "XVI - Machinery and mechanical appliances, electrical equipment, parts thereof, sound recorders and reproducers, television image and sound recorders and reproducers, and parts and accessories of such articles",
            "XVII - Vehicles, aircraft, vessels and associated transport equipment",
            "XVIII - Optical, photographic, cinematographic, measuring, checking, precision, medical or surgical instruments and apparatus, clocks and watches, musical instruments, parts and accessories thereof",
            "XIX - Arms and ammunition, parts and accessories thereof",
            "XX - Miscellaneous manufactured articles",
            "XXI - Works of art, collectors' pieces and antiques",
        ),
        start=1,
    )
}
# Create (or reuse) the Spark session named "bronze_data"; the ingestion cells
# below use it to build and persist their DataFrames.
spark = (
    SparkSession.builder
    .appName("bronze_data")
    .getOrCreate()
)


In [None]:
#GET IMPORT DATA

import requests
import pandas
import json
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType

# Fetch the most recent period (latestN=1) of StatCan cube 12100099 for every
# partner x HS-section combination of one trade flow, then append the collected
# rows to the FACT_TRADE table.
# NOTE(review): the coordinate prefix "7.<flow>" presumably selects geography 7
# ("Ontario") and trade member 4 ("Total imports") per the lookup dicts in the
# init cell -- confirm against the cube metadata, and confirm that
# REPORTER_KEY 1 is the intended reporter key for that geography.
url = "https://www150.statcan.gc.ca/t1/wds/rest/getDataFromCubePidCoordAndLatestNPeriods"
states = range(2, 53)  # partner member ids 2..52 (ids 1 and 53 are not requested)
hs_categories = range(1, 23)  # HS member ids 1..22
flow_code = 4  # used both in the request coordinate and as FLOW_CODE
row_id = 1  # progress counter: one more than the number of rows collected

rows = []


def _post_with_retry(endpoint, payload, attempts=2, pause=5, timeout=10):
    """POST *payload* as JSON to *endpoint*, pausing and retrying on failure.

    A failure is either a ``requests`` exception (timeout, connection error,
    ...) or a response whose status code is not 200.

    Returns the most recent ``requests.Response`` received (which may still be
    non-200 once the attempts are exhausted), or ``None`` if every attempt
    raised. It never propagates a ``requests`` exception, so a single bad
    request cannot abort the whole crawl.
    """
    response = None
    for attempt in range(1, attempts + 1):
        try:
            response = requests.post(endpoint, json=payload, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            print(f"request error: {exc}")
        else:
            if attempt > 1 or response.status_code != 200:
                print(response.status_code)
            if response.status_code == 200:
                return response
        if attempt < attempts:
            print("too many requests/timeout... nap time ! zzzzzz")
            time.sleep(pause)
    return response


for state in states:
    for hs in hs_categories:
        body = [{
            "productId": "12100099",
            "coordinate": f"7.{flow_code}.{state}.{hs}.0.0.0.0.0.0",
            "latestN": 1,
        }]

        response = _post_with_retry(url, body)
        if response is None:
            print(f"Request failed for STATE {state}, HS {hs}. skipping :P")
            continue

        try:
            data = response.json()
            datapoints = data[0]["object"]["vectorDataPoint"]

            # Build this pair's rows separately so that a malformed data point
            # cannot leave a partial set of rows behind.
            new_rows = []
            for dp in datapoints:
                value = dp["value"]
                new_rows.append({
                    "TRADE_DATE": dp["refPer"],
                    "REPORTER_KEY": 1,
                    "PARTNER_KEY": state,
                    "HS_CODE": hs,
                    "FLOW_CODE": flow_code,
                    "CURRENCY_KEY": 1,
                    # Spark's FloatType verification only accepts Python
                    # floats; whole-number JSON values arrive as ints.
                    "VALUE": float(value) if value is not None else None,
                })
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # Covers invalid JSON, an unexpected payload shape (e.g. a FAILED
            # status object), and non-numeric values.
            print(f"Failed to parse data for STATE {state}, HS {hs}: {e}. skipping :P")
        else:
            rows.extend(new_rows)
            row_id += len(new_rows)
            print(row_id)
            time.sleep(0.5)  # throttle between successful requests

print("Done")

# NOTE(review): FloatType is single precision; large trade values may lose
# digits. Left unchanged so the schema keeps matching the existing table.
schema = StructType([
    StructField("TRADE_DATE", StringType(), True),
    StructField("REPORTER_KEY", IntegerType(), True),
    StructField("PARTNER_KEY", IntegerType(), True),
    StructField("HS_CODE", IntegerType(), True),
    StructField("FLOW_CODE", IntegerType(), True),
    StructField("CURRENCY_KEY", IntegerType(), True),
    StructField("VALUE", FloatType(), True),
])

df = spark.createDataFrame(rows, schema=schema)
df.write.mode("append").saveAsTable("FACT_TRADE")

In [None]:
#GET DOMESTIC EXPORT DATA

import requests
import pandas
import json
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType

# Fetch the most recent period (latestN=1) of StatCan cube 12100099 for every
# partner x HS-section combination of one trade flow, then append the collected
# rows to the FACT_TRADE table.
# NOTE(review): the coordinate prefix "7.<flow>" presumably selects geography 7
# ("Ontario") and trade member 2 ("Domestic exports") per the lookup dicts in
# the init cell -- confirm against the cube metadata, and confirm that
# REPORTER_KEY 1 is the intended reporter key for that geography.
url = "https://www150.statcan.gc.ca/t1/wds/rest/getDataFromCubePidCoordAndLatestNPeriods"
states = range(2, 53)  # partner member ids 2..52 (ids 1 and 53 are not requested)
hs_categories = range(1, 23)  # HS member ids 1..22
flow_code = 2  # used both in the request coordinate and as FLOW_CODE
row_id = 1  # progress counter: one more than the number of rows collected

rows = []


def _post_with_retry(endpoint, payload, attempts=2, pause=5, timeout=10):
    """POST *payload* as JSON to *endpoint*, pausing and retrying on failure.

    A failure is either a ``requests`` exception (timeout, connection error,
    ...) or a response whose status code is not 200.

    Returns the most recent ``requests.Response`` received (which may still be
    non-200 once the attempts are exhausted), or ``None`` if every attempt
    raised. It never propagates a ``requests`` exception, so a single bad
    request cannot abort the whole crawl.
    """
    response = None
    for attempt in range(1, attempts + 1):
        try:
            response = requests.post(endpoint, json=payload, timeout=timeout)
        except requests.exceptions.RequestException as exc:
            print(f"request error: {exc}")
        else:
            if attempt > 1 or response.status_code != 200:
                print(response.status_code)
            if response.status_code == 200:
                return response
        if attempt < attempts:
            print("too many requests/timeout... nap time ! zzzzzz")
            time.sleep(pause)
    return response


for state in states:
    for hs in hs_categories:
        body = [{
            "productId": "12100099",
            "coordinate": f"7.{flow_code}.{state}.{hs}.0.0.0.0.0.0",
            "latestN": 1,
        }]

        response = _post_with_retry(url, body)
        if response is None:
            print(f"Request failed for STATE {state}, HS {hs}. skipping :P")
            continue

        try:
            data = response.json()
            datapoints = data[0]["object"]["vectorDataPoint"]

            # Build this pair's rows separately so that a malformed data point
            # cannot leave a partial set of rows behind.
            new_rows = []
            for dp in datapoints:
                value = dp["value"]
                new_rows.append({
                    "TRADE_DATE": dp["refPer"],
                    "REPORTER_KEY": 1,
                    "PARTNER_KEY": state,
                    "HS_CODE": hs,
                    "FLOW_CODE": flow_code,
                    "CURRENCY_KEY": 1,
                    # Spark's FloatType verification only accepts Python
                    # floats; whole-number JSON values arrive as ints.
                    "VALUE": float(value) if value is not None else None,
                })
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # Covers invalid JSON, an unexpected payload shape (e.g. a FAILED
            # status object), and non-numeric values.
            print(f"Failed to parse data for STATE {state}, HS {hs}: {e}. skipping :P")
        else:
            rows.extend(new_rows)
            row_id += len(new_rows)
            print(row_id)
            time.sleep(0.5)  # throttle between successful requests

print("Done")

# NOTE(review): FloatType is single precision; large trade values may lose
# digits. Left unchanged so the schema keeps matching the existing table.
schema = StructType([
    StructField("TRADE_DATE", StringType(), True),
    StructField("REPORTER_KEY", IntegerType(), True),
    StructField("PARTNER_KEY", IntegerType(), True),
    StructField("HS_CODE", IntegerType(), True),
    StructField("FLOW_CODE", IntegerType(), True),
    StructField("CURRENCY_KEY", IntegerType(), True),
    StructField("VALUE", FloatType(), True),
])

df = spark.createDataFrame(rows, schema=schema)
df.write.mode("append").saveAsTable("FACT_TRADE")