In [None]:
-- vw_cost_center_mapping_bootstrap
-- Produces distinct (Cost_Center_ID, AU_ID) pairs from fy25_cost_center_mapping,
-- combining the primary assessable unit (AU) columns with AUs parsed out of the
-- free-text "additional AU" columns. Every AU_ID is given exactly one
-- AU_Name / Segment_Name in the final step.
-- Output columns: Cost_Center_ID, AU_ID, AU_Name, Segment_Name.
CREATE OR REPLACE TEMP VIEW vw_cost_center_mapping_bootstrap AS

WITH Primary_AU AS (
    -- 1. Primary AU: taken directly from the base columns, trimmed and cast to STRING.
    SELECT
        TRIM(CAST(`CostCenterId` AS STRING)) AS Cost_Center_ID,
        TRIM(CAST(`AssessableUnitID` AS STRING)) AS AU_ID,
        TRIM(`AssessableUnitName`) AS AU_Name,
        TRIM(`Segment`) AS Segment_Name
    FROM hive_metastore.ra_adido_2025.fy25_cost_center_mapping
    WHERE `AssessableUnitID` IS NOT NULL
),

Additional_Strings AS (
    -- 2. Col E "Yes" rule: when the combined column holds only the flag 'Yes',
    --    parse AdditionalAUID alone; otherwise parse both columns joined by a space.
    --    Line breaks are replaced with a space first: in the step-3 pattern '.'
    --    does not match a line terminator and '$' only matches at the end of the
    --    string, so an ID block followed by a line break would otherwise never match.
    SELECT
        TRIM(CAST(`CostCenterId` AS STRING)) AS Cost_Center_ID,
        REGEXP_REPLACE(
            CASE
                WHEN TRIM(`AdditionalAssessableUnitIDandNameandSegment`) = 'Yes'
                THEN COALESCE(`AdditionalAUID`, '')
                ELSE CONCAT_WS(' ',
                        COALESCE(`AdditionalAssessableUnitIDandNameandSegment`, ''),
                        COALESCE(`AdditionalAUID`, '')
                     )
            END,
            '[\r\n]+', ' '
        ) AS Mashed_String
    FROM hive_metastore.ra_adido_2025.fy25_cost_center_mapping
),

Extracted_Blocks AS (
    -- 3. Slice the mashed text into blocks: each block starts at a run of 6 digits
    --    and extends (lazily) up to the next run of 6 digits or the end of the string.
    --    Text before the first 6-digit run is not part of any block.
    SELECT
        Cost_Center_ID,
        EXPLODE(regexp_extract_all(Mashed_String, '([0-9]{6}.*?(?=[0-9]{6}|$))')) AS Raw_Block
    FROM Additional_Strings
    WHERE Mashed_String != ''
),

Separated_ID_And_Remainder AS (
    -- 4a. Leading 6 digits become the AU_ID; the rest of the block, minus any
    --     leading spaces/tabs/hyphens, is kept as the remainder to be parsed.
    SELECT
        Cost_Center_ID,
        TRIM(regexp_extract(Raw_Block, '^([0-9]{6})', 1)) AS AU_ID,
        TRIM(REGEXP_REPLACE(Raw_Block, '^[0-9]{6}[ \t-]*', '')) AS Remainder
    FROM Extracted_Blocks
    WHERE TRIM(Raw_Block) != ''
),

Parsed_Additionals AS (
    -- 4b. Split the remainder on its LAST hyphen: text before it is the name,
    --     text after it is the segment. With no hyphen, the whole remainder is
    --     the name and the segment is ''.
    --     A remainder that ends in a hyphen matches neither pattern, so its name
    --     comes back '' and the row is removed by the name filter in Cleaned_Stack.
    --     NOTE(review): a hyphenated name with no segment (no second hyphen) will
    --     be split into name/segment here -- confirm this cannot occur in the sheet.
    SELECT
        Cost_Center_ID,
        AU_ID,
        CASE
            WHEN Remainder LIKE '%-%' THEN TRIM(regexp_extract(Remainder, '^(.*)[ \t]*-[ \t]*[^-]+$', 1))
            ELSE Remainder
        END AS AU_Name,
        CASE
            WHEN Remainder LIKE '%-%' THEN TRIM(regexp_extract(Remainder, '.*[ \t]*-[ \t]*([^-]+)$', 1))
            ELSE ''
        END AS Segment_Name
    FROM Separated_ID_And_Remainder
),

Cleaned_Stack AS (
    -- 5. Stack primary + additional rows, strip one leading/trailing double quote,
    --    collapse runs of spaces, then drop rows with no usable ID or name.
    --    * UNION ALL is enough: the outer DISTINCT removes duplicates after cleaning.
    --    * Filters run on the CLEANED values and on BOTH branches, so a name made
    --      only of quotes or a blank primary AU_ID cannot reach step 6.
    --    NOTE(review): an additional AU listed with an ID but no name text is
    --    dropped here even if another row carries that AU's name -- confirm intended.
    SELECT DISTINCT
        Cost_Center_ID,
        AU_ID,
        AU_Name,
        Segment_Name
    FROM (
        SELECT
            Cost_Center_ID,
            AU_ID,
            TRIM(REGEXP_REPLACE(REGEXP_REPLACE(AU_Name, '^"|"$', ''), '[ ]+', ' ')) AS AU_Name,
            TRIM(REGEXP_REPLACE(REGEXP_REPLACE(Segment_Name, '^"|"$', ''), '[ ]+', ' ')) AS Segment_Name
        FROM (
            SELECT Cost_Center_ID, AU_ID, AU_Name, Segment_Name FROM Primary_AU
            UNION ALL
            SELECT Cost_Center_ID, AU_ID, AU_Name, Segment_Name FROM Parsed_Additionals
        ) AS stacked
    ) AS cleaned
    WHERE AU_ID IS NOT NULL AND AU_ID != ''
      AND AU_Name IS NOT NULL AND AU_Name != ''
)

-- 6. STANDARDIZATION: force one Name and Segment per AU_ID.
--    Both columns read from the same named window so they always come from the
--    same row. Ordering: alphabetically first name, then (among rows sharing that
--    name) a populated segment before a NULL/empty one, then segment alphabetically.
--    This is a total order over (AU_Name, Segment_Name), so the pick is deterministic.
SELECT DISTINCT
    Cost_Center_ID,
    AU_ID,
    FIRST_VALUE(AU_Name) OVER canonical_au AS AU_Name,
    FIRST_VALUE(Segment_Name) OVER canonical_au AS Segment_Name
FROM Cleaned_Stack
WINDOW canonical_au AS (
    PARTITION BY AU_ID
    ORDER BY
        AU_Name ASC,
        CASE WHEN Segment_Name IS NULL OR Segment_Name = '' THEN 1 ELSE 0 END ASC,
        Segment_Name ASC
    ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
)