In [0]:
%run "../01_setup/01_config"

In [0]:
import requests
from pyspark.sql.functions import current_timestamp, lit


# Pull both WHO ICD API credentials from the "rcm-secrets" secret scope.
# The lookups run in the same order as the names on the left-hand side.
icd_client_id, icd_client_secret = (
    dbutils.secrets.get(scope="rcm-secrets", key=secret_key)
    for secret_key in ("icd-api-client-id", "icd-api-client-secret")
)


def get_access_token(client_id, client_secret, timeout=30):
    """Request an OAuth2 client-credentials access token for the WHO ICD API.

    Args:
        client_id: Client id sent to the token endpoint.
        client_secret: Client secret sent alongside ``client_id``.
        timeout: Seconds to wait for the token endpoint. ``requests`` applies
            no timeout by default, so without one a stalled connection would
            block the notebook indefinitely.

    Returns:
        The ``access_token`` value from the endpoint's JSON response.

    Raises:
        ValueError: If the response contains no ``access_token``.
        Exception: Any error raised by the HTTP call, the status check or the
            JSON decoding is printed and then re-raised unchanged.
    """
    endpoint_url = 'https://icdaccessmanagement.who.int/connect/token'
    payload = {
        'grant_type': 'client_credentials',
        'scope': 'icdapi_access',
        'client_id': client_id,
        'client_secret': client_secret
    }
    try:
        response = requests.post(endpoint_url, data=payload, timeout=timeout)
        response.raise_for_status()
        access_token = response.json().get('access_token')
        if not access_token:
            # Fail here rather than hand the caller None, which would only
            # show up later as a rejected "Bearer None" header.
            raise ValueError("Token endpoint response did not contain an access_token")
        return access_token
    except Exception as e:
        print(f"‚ùå Error authenticating with WHO API: {e}")
        # Bare raise re-raises the active exception with its traceback intact.
        raise

token = get_access_token(icd_client_id, icd_client_secret)
# Only report success when a token actually came back; an empty value would
# otherwise be formatted into the Authorization header as "Bearer None".
if not token:
    raise RuntimeError("WHO ICD API returned no access token; cannot continue.")
print("‚úÖ API Token retrieved successfully.")

In [0]:
# Request headers shared by every ICD API call made through get_url_data:
# bearer authentication, the API version, and the response language.
headers = {
    "Authorization": "Bearer {}".format(token),
    "API-Version": "v2",
    "Accept-Language": "en",
}

def get_url_data(url, timeout=30):
    """Fetch ``url`` with the module-level ``headers`` and return its JSON body.

    This is a best-effort fetch: every failure is printed and reported as
    ``None`` instead of being raised.

    Args:
        url: URL to request. URLs of the form ``http://id.who.int/...`` are
            requested over https instead (see the comment in the body).
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so a stalled connection would otherwise hang.

    Returns:
        The decoded JSON body when the response status is 200, else ``None``.
    """
    try:
        # The bearer token travels in `headers`; never send it over plain
        # http. Only the request is upgraded - callers keep the URL they passed.
        request_url = url
        if url.startswith('http://id.who.int/'):
            request_url = 'https://' + url[len('http://'):]

        response = requests.get(request_url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        # Say why the URL is being skipped instead of dropping it silently.
        print(f"‚ö†Ô∏è Unexpected HTTP {response.status_code} requesting {url}")
        return None
    except Exception as e:
        print(f"‚ö†Ô∏è Error requesting {url}: {e}")
        return None

def extract_codes_recursive(url, collected_codes):
    """Collect every coded entity reachable from ``url`` into ``collected_codes``.

    Each URL is fetched with ``get_url_data``. An entity that has both a
    ``code`` and a ``title`` contributes one record dict; the URLs listed
    under its ``child`` key are then visited depth-first, in listed order.
    URLs for which the fetch returns nothing are skipped.

    Args:
        url: URL of the entity to start from.
        collected_codes: List that receives the record dicts (mutated in place).

    Returns:
        None. Results are delivered through ``collected_codes``.
    """
    # Explicit stack instead of call recursion; the top of the stack is the
    # next URL to visit.
    pending = [url]
    while pending:
        current_url = pending.pop()
        entity = get_url_data(current_url)
        if not entity:
            continue

        if 'code' in entity and 'title' in entity:
            record = {
                'icd_code': entity.get('code'),
                'icd_code_type': 'ICD-10',
                'code_description': entity['title'].get('@value', 'Unknown'),
                'source_url': current_url
            }
            collected_codes.append(record)

        if 'child' in entity:
            # Push children reversed so they are popped in their listed
            # order, giving the same pre-order walk as plain recursion.
            pending.extend(reversed(list(entity['child'])))


In [0]:
print("üöÄ Starting Extraction...")

# Accumulator filled in place by the crawl.
all_codes = []
# Entry point of the crawl: a single small chapter, used for testing.
root_url = 'http://id.who.int/icd/release/10/2019/A00-A09'

extract_codes_recursive(url=root_url, collected_codes=all_codes)
print(f"‚úÖ Found {len(all_codes)} codes.")

In [0]:

from pyspark.sql.types import StructType, StructField, StringType

# Every column of the extracted records is a nullable string, so the fields
# are generated from the column names, in output order.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "icd_code",
        "icd_code_type",
        "code_description",
        "source_url",
    )
])

if not all_codes:
    print("‚ö†Ô∏è No data found.")
else:
    df = spark.createDataFrame(all_codes, schema)

    # Audit columns: insert/update timestamps and a "current row" marker.
    df_final = (
        df
        .withColumn("inserted_date", current_timestamp())
        .withColumn("updated_date", current_timestamp())
        .withColumn("is_current_flag", lit(True))
    )

    # bronze_path is not defined in this cell; presumably it comes from the
    # config notebook pulled in via %run - confirm if that notebook changes.
    output_path = f"{bronze_path}/icd_codes"

    print(f"üíæ Saving to {output_path}...")
    # "overwrite" replaces whatever already exists at output_path.
    df_final.write.mode("overwrite").parquet(output_path)
    print("‚úÖ Success!")
    display(df_final)