In [0]:
%run "../01_setup/01_config"

In [0]:
# =============================================================================
# 1. SETUP
# =============================================================================
import requests
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import current_timestamp, lit

# =============================================================================
# 2. EXTRACTION LOGIC
# =============================================================================
base_url = 'https://npiregistry.cms.hhs.gov/api/'

# Query parameters sent to the NPI Registry search endpoint.
params = {
    "version": "2.1",
    "state": "CA",
    "city": "Los Angeles",
    "limit": 200,  # Increased limit for better batching
}

# Seconds to wait for the registry before giving up. `requests` applies no
# timeout by default, so a stalled connection would hang the notebook forever.
request_timeout_seconds = 30

print(f"üöÄ Fetching NPI data from {base_url}...")
response = requests.get(base_url, params=params, timeout=request_timeout_seconds)

# One flat dict per registry record; consumed by the Bronze write step below.
detailed_results = []

# Fail fast on any non-200 reply so nothing downstream runs on a bad payload.
if response.status_code != 200:
    raise Exception(f"‚ùå API Request failed: {response.status_code} - {response.text}")

data = response.json()
# `or []` also covers a payload where "results" is present but null.
results = data.get('results') or []

print(f"‚úÖ Found {len(results)} records. Processing...")

for result in results:
    # "basic" holds the name/organization attributes; tolerate it being
    # missing or null so one odd record does not abort the whole batch.
    basic_info = result.get("basic") or {}

    # The Bronze schema declares npi_id as a string column, so coerce any
    # numeric identifier here rather than failing schema verification later.
    npi_number = result.get('number')
    if npi_number is not None:
        npi_number = str(npi_number)

    if result.get('enumeration_type') == 'NPI-1':
        # Individual provider: the record carries the person's own name.
        fname = basic_info.get('first_name', "")
        lname = basic_info.get('last_name', "")
        position = basic_info.get('credential', "")  # Example mapping
    else:
        # Organization (NPI-2): fall back to the authorized official's details.
        fname = basic_info.get('authorized_official_first_name', "")
        lname = basic_info.get('authorized_official_last_name', "")
        position = basic_info.get('authorized_official_title_or_position', "")

    organization = basic_info.get('organization_name', "")
    last_updated = basic_info.get('last_updated', "")

    detailed_results.append({
        'npi_id': npi_number,
        'first_name': fname,
        'last_name': lname,
        'position': position,
        'organization_name': organization,
        'last_updated': last_updated,
    })

# =============================================================================
# 3. SAVE TO BRONZE
# =============================================================================

# Define Explicit Schema (Best Practice)
# Every column is a nullable string: values are stored exactly as extracted,
# with no type conversion applied at this stage.
schema = StructType([
    StructField("npi_id", StringType(), True),
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("position", StringType(), True),
    StructField("organization_name", StringType(), True),
    StructField("last_updated", StringType(), True),
])

if detailed_results:
    # Build the DataFrame from the extracted records using the explicit schema.
    df = spark.createDataFrame(detailed_results, schema)

    # Audit columns are computed by Spark functions, not Python values.
    df_final = (
        df.withColumn("inserted_date", current_timestamp())
          .withColumn("updated_date", current_timestamp())
          .withColumn("is_current_flag", lit(True))
    )

    # Target location under the Bronze root supplied by the config notebook.
    output_path = f"{bronze_path}/npi_codes"

    # The row count equals the number of extracted records, so report it from
    # the Python list instead of running an extra Spark job with count().
    print(f"üíæ Saving {len(detailed_results)} rows to {output_path}...")

    # Overwrite replaces any previous extract at this path.
    df_final.write.mode('overwrite').parquet(output_path)

    print("‚úÖ Success!")
    display(df_final)
else:
    # f-string so the status code and body are interpolated rather than
    # printed as literal "{response.status_code}" placeholders.
    print(f"‚ö†Ô∏è No results found to save. {response.status_code} - {response.text}")