In [2]:
import requests
import pandas
import json
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql import SparkSession

geography = {
    "Canada": 1,
    "Newfoundland and Labrador": 2,
    "Prince Edward Island": 3,
    "Nova Scotia": 4,
    "New Brunswick": 5,
    "Quebec": 6,
    "Ontario": 7,
    "Manitoba": 8,
    "Saskatchewan": 9,
    "Alberta": 10,
    "British Columbia": 11,
    "Yukon": 12,
    "Northwest Territories including Nunavut": 13,
    "Northwest Territories": 14,
    "Nunavut": 15
}

trade = {
    "Total exports": 1,
    "Domestic exports": 2,
    "Re-exports": 3,
    "Total imports": 4
}

united_states = {
    "Total United States": 1,
    "Alaska": 2,
    "Alabama": 3,
    "Arkansas": 4,
    "Arizona": 5,
    "California": 6,
    "Colorado": 7,
    "Connecticut": 8,
    "District of Columbia": 9,
    "Delaware": 10,
    "Florida": 11,
    "Georgia": 12,
    "Hawaii": 13,
    "Iowa": 14,
    "Idaho": 15,
    "Illinois": 16,
    "Indiana": 17,
    "Kansas": 18,
    "Kentucky": 19,
    "Louisiana": 20,
    "Massachusetts": 21,
    "Maryland": 22,
    "Maine": 23,
    "Michigan": 24,
    "Minnesota": 25,
    "Missouri": 26,
    "Mississippi": 27,
    "Montana": 28,
    "North Carolina": 29,
    "North Dakota": 30,
    "Nebraska": 31,
    "New Hampshire": 32,
    "New Jersey": 33,
    "New Mexico": 34,
    "Nevada": 35,
    "New York": 36,
    "Ohio": 37,
    "Oklahoma": 38,
    "Oregon": 39,
    "Pennsylvania": 40,
    "Rhode Island": 41,
    "South Carolina": 42,
    "South Dakota": 43,
    "Tennessee": 44,
    "Texas": 45,
    "Utah": 46,
    "Virginia": 47,
    "Vermont": 48,
    "Washington, State": 49,
    "Wisconsin": 50,
    "West Virginia": 51,
    "Wyoming": 52,
    "Other states": 53
}

hs_sections = {
    "Total Harmonized System (HS) sections": 1,
    "I - Live animals and animal products": 2,
    "II - Vegetable products": 3,
    "III - Animal or vegetable fats and oils and their cleavage products, prepared edible fats, animal or vegetable waxes": 4,
    "IV - Prepared foodstuffs, beverages, spirits and vinegar, tobacco and manufactures tobacco substitutes": 5,
    "V - Mineral products": 6,
    "VI - Products of the chemical or allied industries": 7,
    "VII - Plastics and articles thereof, rubber and articles thereof": 8,
    "VIII - Raw hides and skins, leather, furskins and articles thereof, saddlery and harness, travel goods, handbags and similar containers, articles of animal gut (other than silk-worm gut)": 9,
    "IX - Wood and articles of wood, wood charcoal, cork and articles of cork, manufactures of straw, of esparto or of other plaiting materials, basketware and wickerwork": 10,
    "X - Pulp of wood or of other fibrous cellulosic material, recovered (waste and scrap) paper or paperboard": 11,
    "XI - Textiles and textile articles": 12,
    "XII - Footwear, headgear, umbrellas, sun umbrellas, walking-sticks, seat-sticks, whips, riding-crops and parts thereof, prepared feathers and articles made therewith, artificial flowers, articles of human hair": 13,
    "XIII - Articles of stone, plaster, cement, asbestos, mica or similar materials, ceramic products, glass and glassware": 14,
    "XIV - Natural or cultured pearls, precious or semi-precious stones, precious metals, metals clad with precious metal and articles thereof, imitation jewellery, coin": 15,
    "XV - Base metals and articles of base metal": 16,
    "XVI - Machinery and mechanical appliances, electrical equipment, parts thereof, sound recorders and reproducers, television image and sound recorders and reproducers, and parts and accessories of such articles": 17,
    "XVII - Vehicles, aircraft, vessels and associated transport equipment": 18,
    "XVIII - Optical, photographic, cinematographic, measuring, checking, precision, medical or surgical instruments and apparatus, clocks and watches, musical instruments, parts and accessories thereof": 19,
    "XIX - Arms and ammunition, parts and accessories thereof": 20,
    "XX - Miscellaneous manufactured articles": 21,
    "XXI - Works of art, collectors' pieces and antiques": 22
}
spark = SparkSession.builder.appName("bronze_data").getOrCreate()

StatementMeta(, fb23e352-432f-4276-8a4a-d87e4f3760ab, 3, Finished, Available, Finished)

In [13]:

spark = SparkSession.builder.appName("bronze_data").getOrCreate()

# spark.sql("""
# CREATE OR REPLACE TABLE IMPORTS (
#     ID INT,
#     RECORD_DATE STRING,
#     STATE INT,
#     HS_CAT INT,
#     VALUE FLOAT
# )
# USING DELTA
# """)

spark.sql("""
CREATE OR REPLACE TABLE EXPORTS (
    ID INT,
    RECORD_DATE STRING,
    STATE INT,
    HS_CAT INT,
    VALUE FLOAT
)
USING DELTA
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS FACT_TRADE (
    TRADE_DATE STRING,
    REPORTER_KEY INT,
    PARTNER_KEY INT,
    HS_CODE INT,
    FLOW_CODE INT,
    CURRENCY_KEY INT,
    VALUE FLOAT
)
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS DIM_LOCATION (
    LOCATION_KEY INT,
    NAME STRING,
    ISO2 STRING
)
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS DIM_HS_PRODUCT (
    HS_CODE INT,
    DESCRIPTION STRING
)
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS DIM_FLOW (
    FLOW_CODE INT,
    FLOW_DESC STRING
)
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS DIM_CURRENCY (
    CURRENCY_KEY INT,
    ISO_CODE STRING
)
""")


StatementMeta(, 1f477e8d-bf03-4a78-a481-72df185d8cb0, 15, Finished, Available, Finished)

DataFrame[]

In [None]:
#insert location dims
invert_united_states = {v: k for k, v in united_states.items()}
location_rows = []

location_rows.append({
    "LOCATION_KEY": 1,
    "NAME": "Ontario",
    "ISO2": "CA"
})

for key in range(2,53):
    location_rows.append({
    "LOCATION_KEY": key,
    "NAME": invert_united_states[key],
    "ISO2": "US"})

schema = StructType([
    StructField("LOCATION_KEY", IntegerType(), True),
    StructField("NAME", StringType(), True),
    StructField("ISO2", StringType(), True)         
])

df = spark.createDataFrame(location_rows, schema=schema)
df.write.mode("overwrite").saveAsTable("DIM_LOCATION")

#insert hs dims
invert_hs_sections = {v: k for k, v in hs_sections.items()}
hs_rows = []

for key in range (1,23):
    hs_rows.append({
        "HS_CODE": key,
        "DESCRIPTION": invert_hs_sections[key]
    })

schema = StructType([
    StructField("HS_CODE", IntegerType(), True),
    StructField("DESCRIPTION", StringType(), True)        
])

df = spark.createDataFrame(hs_rows, schema=schema)
df.write.mode("overwrite").saveAsTable("DIM_HS_PRODUCT")

#insert flow dims
invert_trade = {v: k for k, v in trade.items()}
flow_rows = []

for key in range (1,5):
    flow_rows.append({
        "FLOW_CODE": key,
        "FLOW_DESC": invert_trade[key]
    })

schema = StructType([
    StructField("FLOW_CODE", IntegerType(), True),
    StructField("FLOW_DESC", StringType(), True)        
])

df = spark.createDataFrame(flow_rows, schema=schema)
df.write.mode("overwrite").saveAsTable("DIM_FLOW")

#insert currency dims
currency_rows = []

currency_rows.append({
    "CURRENCY_KEY": 1,
    "ISO_CODE": "CAD"
})

schema = StructType([
    StructField("CURRENCY_KEY", IntegerType(), True),
    StructField("ISO_CODE", StringType(), True)        
])

df = spark.createDataFrame(currency_rows, schema=schema)
df.write.mode("overwrite").saveAsTable("DIM_CURRENCY")

StatementMeta(, ca1433ba-c8fd-4ac6-9de1-aecd6f21b70b, 5, Finished, Available, Finished)

In [3]:
#GET IMPORT DATA

import requests
import pandas
import json
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType

url = "https://www150.statcan.gc.ca/t1/wds/rest/getDataFromCubePidCoordAndLatestNPeriods"
states = range(2,53)
hs_categories = range(1,23)
id = 1

rows = []

for state in states:
    # print("State: " + str(state))
    for hs in hs_categories:
        # print("HS:" + str(hs))

        body = [{"productId": "12100099", "coordinate":f"7.4.{state}.{hs}.0.0.0.0.0.0", "latestN":36}]


        try:

            response = requests.post(url,json=body, timeout=10)

            if(response.status_code != 200):
                print(response.status_code)
                print("too many requests/timeout... nap time ! zzzzzz")
                time.sleep(5)
                response = requests.post(url,json=body, timeout=10)
                print(response.status_code)

        except Exception as e:
                print("too many requests/timeout... nap time ! zzzzzz")
                time.sleep(5)
                response = requests.post(url,json=body, timeout=10)
                print(response.status_code)

        try:
            data = response.json()
            datapoints = data[0]["object"]["vectorDataPoint"]
            
            for dp in datapoints:
                rows.append({
                    "TRADE_DATE": dp["refPer"],
                    "REPORTER_KEY": 1,
                    "PARTNER_KEY": state,
                    "HS_CODE": hs,
                    "FLOW_CODE": 4,
                    "CURRENCY_KEY": 1,
                    "VALUE": dp["value"]
                })
                id += 1

            print(id)
            time.sleep(0.5)

        except Exception as e:
            print(f"Failed to parse data for STATE {state}, HS {hs}: {e}. skipping :P")
            continue

print("Done") 

schema = StructType([
    StructField("TRADE_DATE", StringType(), True),
    StructField("REPORTER_KEY", IntegerType(), True),
    StructField("PARTNER_KEY", IntegerType(), True),
    StructField("HS_CODE", IntegerType(), True),
    StructField("FLOW_CODE", IntegerType(), True),
    StructField("CURRENCY_KEY", IntegerType(), True),
    StructField("VALUE", FloatType(), True)         
])

df = spark.createDataFrame(rows, schema=schema)
df.write.mode("overwrite").saveAsTable("FACT_TRADE")

StatementMeta(, fb23e352-432f-4276-8a4a-d87e4f3760ab, 4, Finished, Available, Finished)

37
73
109
145
181
217
253
289
325
361
too many requests/timeout... nap time ! zzzzzz
200
397
433
469
505
541
577
613
649
685
721
757
793
too many requests/timeout... nap time ! zzzzzz
200
829
865
too many requests/timeout... nap time ! zzzzzz
200
901
937
973
1009
1045
1081
1117
1153
1189
1225
1261
1297
1333
1369
1405
too many requests/timeout... nap time ! zzzzzz
200
1441
1477
1513
1549
1585
1621
1657
1693
1729
1765
1801
1837
1873
1909
1945
1981
2017
2053
2089
2125
2161
2197
2233
2269
2305
2341
2377
2413
2449
2485
2521
2557
2593
2629
2665
2701
2737
2773
2809
2845
2881
2917
2953
2989
3025
3061
3097
3133
3169
3205
3241
3277
3313
3349
3385
3421
3457
3493
3529
3565
3601
3637
3673
3709
3745
3781
3817
3853
3889
3925
3961
3997
4033
4069
4105
4141
4177
4213
4249
4285
4321
4357
4393
4429
4465
4501
4537
4573
4609
4645
4681
4717
4753
4789
4825
4861
4897
4933
4969
5005
5041
5077
5113
5149
5185
5221
5257
5293
5329
5365
5401
5437
5473
5509
5545
5581
5617
5653
5689
5725
5761
5797
5833
5869
5905
5941


In [4]:
#GET DOMESTIC EXPORT DATA

import requests
import pandas
import json
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType

url = "https://www150.statcan.gc.ca/t1/wds/rest/getDataFromCubePidCoordAndLatestNPeriods"
states = range(2,53)
hs_categories = range(1,23)
id = 1

rows = []

for state in states:
    # print("State: " + str(state))
    for hs in hs_categories:
        # print("HS:" + str(hs))

        body = [{"productId": "12100099", "coordinate":f"7.2.{state}.{hs}.0.0.0.0.0.0", "latestN":36}]


        try:

            response = requests.post(url,json=body, timeout=10)

            if(response.status_code != 200):
                print(response.status_code)
                print("too many requests/timeout... nap time ! zzzzzz")
                time.sleep(5)
                response = requests.post(url,json=body, timeout=10)
                print(response.status_code)

        except Exception as e:
                print("too many requests/timeout... nap time ! zzzzzz")
                time.sleep(5)
                response = requests.post(url,json=body, timeout=10)
                print(response.status_code)

        try:
            data = response.json()
            datapoints = data[0]["object"]["vectorDataPoint"]
            
            for dp in datapoints:
                rows.append({
                    "TRADE_DATE": dp["refPer"],
                    "REPORTER_KEY": 1,
                    "PARTNER_KEY": state,
                    "HS_CODE": hs,
                    "FLOW_CODE": 2,
                    "CURRENCY_KEY": 1,
                    "VALUE": dp["value"]
                })
                id += 1

            print(id)
            time.sleep(0.5)

        except Exception as e:
            print(f"Failed to parse data for STATE {state}, HS {hs}: {e}. skipping :P")
            continue

print("Done") 

schema = StructType([
    StructField("TRADE_DATE", StringType(), True),
    StructField("REPORTER_KEY", IntegerType(), True),
    StructField("PARTNER_KEY", IntegerType(), True),
    StructField("HS_CODE", IntegerType(), True),
    StructField("FLOW_CODE", IntegerType(), True),
    StructField("CURRENCY_KEY", IntegerType(), True),
    StructField("VALUE", FloatType(), True)         
])

df = spark.createDataFrame(rows, schema=schema)
df.write.mode("append").saveAsTable("FACT_TRADE")

StatementMeta(, fb23e352-432f-4276-8a4a-d87e4f3760ab, 5, Finished, Available, Finished)

37
73
109
145
181
217
253
289
325
361
397
433
469
505
541
577
613
649
685
721
757
793
829
865
901
937
973
1009
1045
1081
1117
1153
1189
1225
1261
1297
1333
1369
1405
1441
1477
1513
1549
1585
1621
1657
1693
1729
1765
1801
1837
1873
1909
1945
1981
2017
2053
2089
2125
2161
2197
2233
2269
2305
2341
2377
2413
2449
2485
2521
2557
2593
2629
2665
2701
2737
2773
2809
2845
2881
2917
2953
2989
3025
3061
3097
3133
3169
3205
3241
3277
3313
3349
3385
3421
3457
3493
3529
3565
3601
3637
3673
3709
3745
3781
3817
3853
3889
3925
3961
3997
4033
4069
4105
4141
4177
4213
4249
4285
4321
4357
4393
4429
4465
4501
4537
4573
4609
4645
4681
4717
4753
4789
4825
4861
4897
4933
4969
5005
5041
too many requests/timeout... nap time ! zzzzzz
200
5077
5113
5149
5185
5221
5257
5293
5329
5365
5401
5437
5473
5509
5545
5581
5617
5653
5689
5725
5761
5797
5833
5869
5905
5941
5977
6013
6049
6085
6121
6157
6193
6229
6265
6301
6337
6373
6409
6445
6481
6517
6553
6589
6625
6661
6697
6733
6769
6805
6841
6877
6913
6949
6985
7021
705

In [None]:
#GET TOTAL EXPORT DATA (RETURNS BLANK BECAUSE NO RE-EXPORT FOR NON-COUNTRY LEVEL)

import requests
import pandas
import json
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType

url = "https://www150.statcan.gc.ca/t1/wds/rest/getDataFromCubePidCoordAndLatestNPeriods"
states = range(2,53)
hs_categories = range(1,23)
id = 1

rows = []

for state in states:
    # print("State: " + str(state))
    for hs in hs_categories:
        # print("HS:" + str(hs))

        body = [{"productId": "12100099", "coordinate":f"7.1.{state}.{hs}.0.0.0.0.0.0", "latestN":12}]


        try:

            response = requests.post(url,json=body, timeout=10)

            if(response.status_code != 200):
                print(response.status_code)
                print("too many requests/timeout... nap time ! zzzzzz")
                time.sleep(5)
                response = requests.post(url,json=body, timeout=10)
                print(response.status_code)

        except Exception as e:
                print("too many requests/timeout... nap time ! zzzzzz")
                time.sleep(5)
                response = requests.post(url,json=body, timeout=10)
                print(response.status_code)

        try:
            data = response.json()
            datapoints = data[0]["object"]["vectorDataPoint"]
            
            for dp in datapoints:
                rows.append({
                    "TRADE_DATE": dp["refPer"],
                    "REPORTER_KEY": 1,
                    "PARTNER_KEY": state,
                    "HS_CODE": hs,
                    "FLOW_CODE": 1,
                    "CURRENCY_KEY": 1,
                    "VALUE": dp["value"]
                })
                id += 1

            print(id)
            time.sleep(0.5)

        except Exception as e:
            print(f"Failed to parse data for STATE {state}, HS {hs}: {e}. skipping :P")
            continue

print("Done") 

schema = StructType([
    StructField("TRADE_DATE", StringType(), True),
    StructField("REPORTER_KEY", IntegerType(), True),
    StructField("PARTNER_KEY", IntegerType(), True),
    StructField("HS_CODE", IntegerType(), True),
    StructField("FLOW_CODE", IntegerType(), True),
    StructField("CURRENCY_KEY", IntegerType(), True),
    StructField("VALUE", FloatType(), True)         
])

df = spark.createDataFrame(rows, schema=schema)
df.write.mode("append").saveAsTable("FACT_TRADE")

In [None]:
#GET RE-EXPORT DATA (ONLY AVAILABLE AT COUNTRY LEVEL)

import requests
import pandas
import json
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType

url = "https://www150.statcan.gc.ca/t1/wds/rest/getDataFromCubePidCoordAndLatestNPeriods"
states = range(2,53)
hs_categories = range(1,23)
id = 1

rows = []

for state in states:
    # print("State: " + str(state))
    for hs in hs_categories:
        # print("HS:" + str(hs))

        body = [{"productId": "12100099", "coordinate":f"7.3.{state}.{hs}.0.0.0.0.0.0", "latestN":12}]


        try:

            response = requests.post(url,json=body, timeout=10)

            if(response.status_code != 200):
                print(response.status_code)
                print("too many requests/timeout... nap time ! zzzzzz")
                time.sleep(5)
                response = requests.post(url,json=body, timeout=10)
                print(response.status_code)

        except Exception as e:
                print("too many requests/timeout... nap time ! zzzzzz")
                time.sleep(5)
                response = requests.post(url,json=body, timeout=10)
                print(response.status_code)

        try:
            data = response.json()
            datapoints = data[0]["object"]["vectorDataPoint"]
            
            for dp in datapoints:
                rows.append({
                    "TRADE_DATE": dp["refPer"],
                    "REPORTER_KEY": 1,
                    "PARTNER_KEY": state,
                    "HS_CODE": hs,
                    "FLOW_CODE": 3,
                    "CURRENCY_KEY": 1,
                    "VALUE": dp["value"]
                })
                id += 1

            print(id)
            time.sleep(0.5)

        except Exception as e:
            print(f"Failed to parse data for STATE {state}, HS {hs}: {e}. skipping :P")
            continue

print("Done") 

schema = StructType([
    StructField("TRADE_DATE", StringType(), True),
    StructField("REPORTER_KEY", IntegerType(), True),
    StructField("PARTNER_KEY", IntegerType(), True),
    StructField("HS_CODE", IntegerType(), True),
    StructField("FLOW_CODE", IntegerType(), True),
    StructField("CURRENCY_KEY", IntegerType(), True),
    StructField("VALUE", FloatType(), True)         
])

df = spark.createDataFrame(rows, schema=schema)
df.write.mode("append").saveAsTable("FACT_TRADE")

In [14]:
# spark.sql("""
# CREATE OR REPLACE TABLE EXPORTS (
#     ID INT,
#     RECORD_DATE STRING,
#     STATE INT,
#     HS_CAT INT,
#     VALUE FLOAT
# )
# USING DELTA
# """)

schema = StructType([
   StructField("ID", IntegerType()),
   StructField("RECORD_DATE", StringType()),
   StructField("STATE", IntegerType()),
   StructField("HS_CAT", IntegerType()),
   StructField("VALUE", FloatType()),   
])

from pyspark.sql.window import Window
import pyspark.sql.functions as F

exports_df_raw = (
    spark.table("FACT_TRADE")
        .filter("FLOW_CODE = 2")
        .select(
            "TRADE_DATE",
            "PARTNER_KEY",
            "HS_CODE",
            "VALUE"
        )
)

exports_df = (
    exports_df_raw
        .withColumn("ID", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
        .withColumnRenamed("TRADE_DATE", "RECORD_DATE")
        .withColumnRenamed("PARTNER_KEY", "STATE")
        .withColumnRenamed("HS_CODE", "HS_CAT")
)


exports_df.write.mode("append").saveAsTable("EXPORTS")

StatementMeta(, 1f477e8d-bf03-4a78-a481-72df185d8cb0, 16, Finished, Available, Finished)

In [16]:
spark.sql("SELECT COUNT(*) FROM EXPORTS").show()

StatementMeta(, 1f477e8d-bf03-4a78-a481-72df185d8cb0, 18, Finished, Available, Finished)

+--------+
|count(1)|
+--------+
|   13464|
+--------+

