# 02_ingest_comtrade_data
Bronze layer: UN Comtrade monthly imports for Germany (HS 2-digit shortlist, 2018–2025)
Writes managed Delta table: fx_impact.bronze_comtrade_imports


In [0]:
# Make the repo importable, then import and reload the project helper modules
import sys, importlib, inspect

repo_root = "/Workspace/Users/chiuyunhan@gmail.com/germany_import_cost_fx_impact"
# Append only when absent so re-running this cell does not grow sys.path.
if repo_root not in sys.path:
    sys.path.append(repo_root)

import scripts.api_clients as api_clients
import scripts.utils as utils

# Reload both helpers first so edits to their source files are picked up ...
for _mod in (api_clients, utils):
    importlib.reload(_mod)

# ... then report which source file each module was loaded from.
for _label, _mod in (("api_clients:", api_clients), ("utils:", utils)):
    print(_label, inspect.getsourcefile(_mod))

# Drop the loop temporaries so the notebook namespace is left unchanged.
del _label, _mod


In [0]:
# Widget-only secret setup (no getpass)
import os

# Create the key widget WITHOUT calling dbutils.widgets.removeAll() first:
#  - Databricks documents that a widget cannot be created in the same cell
#    that removes widgets, and
#  - removing the widget blanks a key that was already pasted, so every
#    "Run all" would stop at the emptiness check in the next cell.
# NOTE(review): presumably calling text() for a widget that already exists
# keeps its current value — confirm on your runtime.
dbutils.widgets.text("COMTRADE_API_KEY", "")

print("👉 Paste your key into the COMTRADE_API_KEY box at the top of the notebook, then press Enter.")


In [0]:
# read widget → env var → sanity check
import os

# Strip surrounding whitespace: a key pasted with a stray space or newline
# would otherwise pass the emptiness check and be sent verbatim to the API.
API_KEY = dbutils.widgets.get("COMTRADE_API_KEY").strip()
if not API_KEY:
    raise RuntimeError("Set the COMTRADE_API_KEY widget (top of notebook) and press Enter.")

# Export to the process environment; the later cells read the key via os.getenv.
os.environ["COMTRADE_API_KEY"] = API_KEY

def mask(k, n=4):
    """Return a display-safe preview of the secret ``k``.

    The preview is the first ``n`` and last ``n`` characters joined by an
    ellipsis. The literal string "set" is returned instead when nothing can be
    shown safely:

    * ``k`` is empty or None,
    * ``n`` is not positive (``k[-0:]`` would be the whole string),
    * ``k`` has 8 characters or fewer, or is not longer than ``2 * n``
      (head and tail would overlap and reveal the full secret).
    """
    if not k or n <= 0 or len(k) <= max(8, 2 * n):
        return "set"
    return k[:n] + "…" + k[-n:]
# Confirm the key is present without printing it in full.
print("Key set:", mask(os.getenv("COMTRADE_API_KEY")))


In [0]:
# Smoke test: one small request to confirm the key and parameters before the full fetch
import os, requests

url = "https://comtradeapi.un.org/data/v1/get/C/M/HS"
params = dict(
    cmdCode="27",
    reporterCode=276,      # Germany
    partnerCode=0,         # World
    flowCode="M",          # Import
    period="202001",       # YYYYMM (monthly period format)
    motCode=0,             # TOTAL mode of transport
    partner2Code=0,        # World
    customsCode="C00",     # TOTAL customs
    includeDesc="TRUE",
)
# The subscription key is sent as a request header, read from the environment.
headers = {"Ocp-Apim-Subscription-Key": os.environ.get("COMTRADE_API_KEY")}

r = requests.get(url, params=params, headers=headers, timeout=60)

# Show the diagnostics first so they are visible even when the status check below raises.
print(f"URL: {r.request.url}")
print(f"Status: {r.status_code}")
print(f"Body (first 500): {r.text[:500]}")
r.raise_for_status()


In [0]:
# Full fetch → Pandas DF
import time, pandas as pd
from scripts.api_clients import fetch_comtrade_monthly_v1
import os

YEARS = list(range(2018, 2026))   # 2018 … 2025 inclusive
HS_2  = ("27","84","85","39","72")

# One request per year; years that come back as None or empty are skipped.
pdfs = []
for y in YEARS:
    print(f"Fetching {y} …")
    df_y = fetch_comtrade_monthly_v1(
        year=y,
        hs_codes=HS_2,
        reporter_code=276,    # Germany
        partner_code=0,       # World
        flow_code="M",        # Import
        api_key=os.getenv("COMTRADE_API_KEY")
    )
    print("  rows:", 0 if df_y is None else len(df_y))
    if df_y is not None and not df_y.empty:
        pdfs.append(df_y)
    time.sleep(0.6)  # be polite / avoid 429

# Fail with an actionable message instead of pandas' "No objects to concatenate".
if not pdfs:
    raise RuntimeError(
        f"No Comtrade rows were fetched for years {YEARS[0]}–{YEARS[-1]}; "
        "check the API key, quota and request parameters."
    )

pdf = pd.concat(pdfs, ignore_index=True)

# Keep only the expected columns that are actually present, and report any
# that are absent so a later failure on a missing column is easy to trace.
wanted = [
    "period","period_date","cmdCode","cmdDesc",
    "reporterCode","reporterDesc","partnerCode","partnerDesc",
    "flowCode","flowDesc","TradeValue","netWgt","year"
]
keep = [c for c in wanted if c in pdf.columns]
missing = [c for c in wanted if c not in pdf.columns]
if missing:
    print("⚠️ Expected columns absent from fetched data (skipped):", missing)
pdf = pdf[keep]
print("Final rows:", len(pdf))


In [0]:
#  Write managed Delta (Bronze)
from pyspark.sql import functions as F
from scripts.utils import with_ingest_meta

# Convert the pandas frame to Spark and pin the column types used downstream.
sdf = (
    spark.createDataFrame(pdf)
    .withColumn("period_date", F.to_date("period_date"))
    .withColumn("cmdCode", F.col("cmdCode").cast("string"))
    .withColumn("TradeValue", F.col("TradeValue").cast("double"))
    .withColumn("netWgt", F.col("netWgt").cast("double"))
    .withColumn("year", F.col("year").cast("int"))
)

sdf = with_ingest_meta(sdf, src="COMTRADE_API_V1_HS2_WORLD_IMPORTS")

# Quick DQ — each check reports the offending count so a failure is diagnosable.
# 1) (period_date, cmdCode) must identify a row uniquely.
dup_keys = sdf.groupBy("period_date","cmdCode").count().filter("count > 1").count()
assert dup_keys == 0, f"DQ failed: {dup_keys} (period_date, cmdCode) key(s) occur more than once"

# 2) No zero or negative trade values.
#    Rows with a NULL TradeValue are not caught here: NULL <= 0 evaluates to NULL.
non_positive = sdf.filter(F.col("TradeValue") <= 0).count()
assert non_positive == 0, f"DQ failed: {non_positive} row(s) have TradeValue <= 0"

spark.sql("CREATE DATABASE IF NOT EXISTS fx_impact")
# Full refresh: replace the table contents, and its schema if columns changed.
(sdf.write
   .format("delta")
   .mode("overwrite")
   .option("overwriteSchema", "true")      # harmless if schema same; required if you added cols
   .partitionBy("year","cmdCode")
   .saveAsTable("fx_impact.bronze_comtrade_imports"))

print("✅ Wrote: fx_impact.bronze_comtrade_imports")


In [0]:
%sql
-- Date coverage of the bronze table; the latest period should now reach into 2025.
-- NOTE(review): a multi-statement %sql cell may display only the last result set — confirm.
SELECT
  MIN(period_date),
  MAX(period_date)
FROM fx_impact.bronze_comtrade_imports;

-- Row count per year, oldest first.
SELECT
  year,
  COUNT(*) AS rows
FROM fx_impact.bronze_comtrade_imports
GROUP BY year
ORDER BY year;