In [None]:
/* ===================================================================================
   VIEW: vw_cost_center_mapping_bootstrap
   SOURCE: hive_metastore.ra_adido_2025.fy25_cost_center_mapping
   
   BUSINESS RULES & LOGIC APPLIED:
   
   1. MUTUALLY EXCLUSIVE BRANCHING (The "Yes" Rule):
      - If Col E is exactly 'Yes' (after trimming), blank, or NULL: Use ONLY Primary
        Columns B, C, D. Ignore E & F.
      - Otherwise (Col E holds any other non-blank value): Use ONLY Columns E & F.
        Ignore B, C, D entirely.
      
   2. DIRTY STRING PARSING (Mashed Concatenations):
      - Col E and F often contain multiple Assessable Units mashed together without 
        spacing (e.g., "123456 - Name - Seg123457 - Name - Seg").
      - Uses Regex to slice the string into individual blocks: each block starts at a run of
        6 consecutive digits and ends just before the next such run (or at the end of the string).
      
   3. HYPHEN PROTECTION & FALLBACK:
      - Extracts the 6-digit AU_ID.
      - For the remaining text: If hyphens exist, splits Name and Segment based on the 
        LAST hyphen (protecting internal hyphens like "Retail - Banking").
      - If no hyphens exist, treats the entire text as the AU Name with a blank Segment.
      
   4. DATA CLEANSING:
      - Strips rogue leading/trailing double quotes (caused by CSV escaping).
      - Converts double/multiple spaces into a single space.
      - Strictly drops any ghost rows where the AU_ID or AU_Name is missing/blank.
      
   5. SINGLE SOURCE OF TRUTH (Deduplication & Standardization):
      - Uses UNION and DISTINCT to remove duplicate AU mappings per Cost Center.
      - Resolves naming conflicts across different Cost Centers (e.g., "Insurance" vs 
        "General Insurance" for the same ID) by using a Window Function to force a 
        single, standardized AU Name and Segment per AU_ID alphabetically.
=================================================================================== */

CREATE OR REPLACE TEMP VIEW vw_cost_center_mapping_bootstrap AS

WITH Base_Data AS (
    -- Stage 0: project the six source columns under short working names so the
    -- later CTEs never have to touch the raw, back-ticked identifiers.
    -- Both ID columns are cast to STRING before trimming; the rest are trimmed as-is.
    SELECT
        TRIM(CAST(src.`CostCenterId` AS STRING))                 AS Cost_Center_ID,
        TRIM(CAST(src.`AssessableUnitID` AS STRING))             AS Primary_AU_ID,
        TRIM(src.`AssessableUnitName`)                           AS Primary_AU_Name,
        TRIM(src.`Segment`)                                      AS Primary_Segment,
        TRIM(src.`AdditionalAssessableUnitIDandNameandSegment`)  AS Col_E,
        TRIM(src.`AdditionalAUID`)                               AS Col_F
    FROM hive_metastore.ra_adido_2025.fy25_cost_center_mapping AS src
),

Branch_Primary AS (
    -- Rule 1a: rows whose Col_E is 'Yes', empty, or NULL contribute only their
    -- primary AU columns; Col_E / Col_F are not read beyond this filter.
    -- NOTE(review): 'Yes' is an exact string match, so variants such as
    -- 'yes' / 'YES' would not land in this branch -- confirm against the data.
    SELECT
        Cost_Center_ID,
        Primary_AU_ID   AS AU_ID,
        Primary_AU_Name AS AU_Name,
        Primary_Segment AS Segment_Name
    FROM Base_Data
    -- COALESCE folds NULL into '' so a single IN-list covers all three accepted values.
    WHERE COALESCE(Col_E, '') IN ('Yes', '')
),

Branch_Additional_Raw AS (
    -- Rule 1b: every row NOT taken by the primary branch (Col_E is non-NULL,
    -- non-empty and not 'Yes') is represented solely by its E and F text.
    SELECT
        Cost_Center_ID,
        -- Col_E cannot be NULL here because of the filter below. A NULL Col_F is
        -- treated as '', which leaves a single trailing space after Col_E.
        CONCAT(Col_E, ' ', COALESCE(Col_F, '')) AS Mashed_String
    FROM Base_Data
    -- Exact complement of the primary-branch filter.
    WHERE COALESCE(Col_E, '') NOT IN ('Yes', '')
),

Extracted_Blocks AS (
    -- LOGIC 2: Slice the mashed E & F text into one row per Assessable Unit block.
    -- A block starts at a run of 6 digits and lazily extends until the next run of
    -- 6 digits (lookahead, not consumed) or the end of the string.
    SELECT 
        Cost_Center_ID,
        EXPLODE(
            regexp_extract_all(
                -- In Java regex '.' does not match line terminators. Without this
                -- normalisation, a block that is followed by an embedded line break
                -- before the next ID can never reach the lookahead and is silently
                -- dropped. Collapsing CR/LF runs to one space keeps every block.
                REGEXP_REPLACE(Mashed_String, '[\r\n]+', ' '),
                '([0-9]{6}.*?(?=[0-9]{6}|$))'
            )
        ) AS Raw_Block
    FROM Branch_Additional_Raw
    WHERE Mashed_String != ''
),

Separated_ID_And_Remainder AS (
    -- Rule 3a: split each block into its leading 6-digit ID and whatever follows it.
    SELECT
        Cost_Center_ID,
        -- Capture group 1 = the six digits anchored at the start of the block.
        TRIM(
            regexp_extract(Raw_Block, '^([0-9]{6})', 1)
        ) AS AU_ID,
        -- Drop the ID together with any run of spaces, tabs or hyphens that
        -- immediately follows it, leaving only the descriptive text.
        TRIM(
            REGEXP_REPLACE(Raw_Block, '^[0-9]{6}[ \t-]*', '')
        ) AS Remainder
    FROM Extracted_Blocks
    -- Skip blocks that contain nothing but spaces.
    WHERE TRIM(Raw_Block) <> ''
),

Parsed_Additionals AS (
    -- LOGIC 3b: Split the text after the ID into AU_Name and Segment_Name.
    --   * Hyphen present -> split on the LAST hyphen, so internal hyphens such as
    --                       "Retail - Banking" stay inside the name.
    --   * No hyphen      -> the whole text is the AU name and the segment is ''.
    SELECT 
        Cost_Center_ID,
        AU_ID,
        CASE 
            WHEN Tidy_Remainder LIKE '%-%' THEN TRIM(regexp_extract(Tidy_Remainder, '^(.*)[ \t]*-[ \t]*[^-]+$', 1))
            ELSE Tidy_Remainder 
        END AS AU_Name,
        CASE 
            WHEN Tidy_Remainder LIKE '%-%' THEN TRIM(regexp_extract(Tidy_Remainder, '.*[ \t]*-[ \t]*([^-]+)$', 1))
            ELSE '' 
        END AS Segment_Name
    FROM (
        -- Both patterns above need at least one non-hyphen character after the last
        -- hyphen. A remainder ending in a dangling separator (e.g. "Name - Seg -",
        -- as happens when units are joined by " - ") would not match, and
        -- regexp_extract would return '' for BOTH name and segment. Strip trailing
        -- spaces / tabs / hyphens first, mirroring the leading-separator strip
        -- applied when the ID is removed.
        SELECT
            Cost_Center_ID,
            AU_ID,
            TRIM(REGEXP_REPLACE(Remainder, '[ \t-]+$', '')) AS Tidy_Remainder
        FROM Separated_ID_And_Remainder
    ) AS tidied
),

Cleaned_Stack AS (
    -- LOGIC 4: Combine BOTH branches, clean quotes, remove weird spaces, drop blanks
    SELECT DISTINCT 
        Cost_Center_ID, 
        AU_ID, 
        TRIM(REGEXP_REPLACE(REGEXP_REPLACE(AU_Name, '^"|"$', ''), '[ ]+', ' ')) AS AU_Name, 
        TRIM(REGEXP_REPLACE(REGEXP_REPLACE(Segment_Name, '^"|"$', ''), '[ ]+', ' '))