In [None]:
-- Temp view producing one row per (Cost_Center_ID, AU_ID, AU_Name, Segment_Name),
-- combining each cost center's primary AU with any additional AUs parsed from
-- the free-text columns of fy25_cost_center_mapping.
CREATE OR REPLACE TEMP VIEW vw_cost_center_mapping_bootstrap AS

WITH Primary_AU AS (
    -- 1. Grab the primary AU directly from the base columns
    SELECT 
        TRIM(CAST(`CostCenterId` AS STRING)) AS Cost_Center_ID,
        TRIM(CAST(`AssessableUnitID` AS STRING)) AS AU_ID,
        TRIM(`AssessableUnitName`) AS AU_Name,
        TRIM(`Segment`) AS Segment_Name
    FROM hive_metastore.ra_adido_2025.fy25_cost_center_mapping
    -- Filter on the trimmed value: comparing to '' rejects NULL (the comparison
    -- evaluates to NULL) as well as blank / space-only IDs, so this branch applies
    -- the same "no empty AU_ID" rule that the additional-AU branch applies.
    WHERE TRIM(CAST(`AssessableUnitID` AS STRING)) != ''
),

Additional_Strings AS (
    -- 2. Build one parseable string per source row from the two "additional AU" columns.
    --    Col E "Yes" rule: when the combined column only holds the flag 'Yes', the
    --    payload is `AdditionalAUID` alone; otherwise both columns are joined with a space.
    SELECT 
        TRIM(CAST(`CostCenterId` AS STRING)) AS Cost_Center_ID,
        -- Collapse every whitespace run (including CR/LF from multi-line cells) into a
        -- single space. The downstream block-splitting regex uses '.', which does not
        -- match line terminators, so an un-normalised line break between two AU entries
        -- would make the earlier entry unmatchable and silently drop it.
        TRIM(REGEXP_REPLACE(
            CASE 
                WHEN TRIM(`AdditionalAssessableUnitIDandNameandSegment`) = 'Yes' 
                -- CAST keeps the fallback to '' well-typed even if the column was
                -- loaded as a numeric type (no-op when it is already a string).
                THEN COALESCE(CAST(`AdditionalAUID` AS STRING), '')
                
                ELSE CONCAT_WS(' ', 
                        COALESCE(`AdditionalAssessableUnitIDandNameandSegment`, ''), 
                        COALESCE(CAST(`AdditionalAUID` AS STRING), '')
                     )
            END,
            '\\s+', ' '
        )) AS Mashed_String
    FROM hive_metastore.ra_adido_2025.fy25_cost_center_mapping
),

Extracted_Blocks AS (
    -- 3. Emit one row per AU block found in the mashed text.
    --    A block starts at a run of six digits and extends (lazily) up to, but not
    --    including, the next run of six digits or the end of the string.
    --    Rows whose string yields no block produce no output rows.
    SELECT
        src.Cost_Center_ID,
        blk.Raw_Block
    FROM Additional_Strings AS src
    LATERAL VIEW EXPLODE(
        regexp_extract_all(src.Mashed_String, '(\\d{6}.*?(?=\\d{6}|$))')
    ) blk AS Raw_Block
    WHERE src.Mashed_String != ''
),

Parsed_Additionals AS (
    -- 4. Split every raw block into the three target columns.
    --    The patterns expect the shape "<6 digits> - <name> - <segment>";
    --    regexp_extract yields '' for a field whose pattern does not match.
    SELECT
        b.Cost_Center_ID,

        -- AU id: the six digits the block starts with
        TRIM(
            regexp_extract(b.Raw_Block, '^(\\d{6})', 1)
        ) AS AU_ID,

        -- AU name: everything between the hyphen after the id and the last hyphen
        -- (greedy, so hyphens inside the name are kept)
        TRIM(
            regexp_extract(b.Raw_Block, '^\\d{6}\\s*-\\s*(.*)\\s*-\\s*[^-]+$', 1)
        ) AS AU_Name,

        -- Segment: the hyphen-free text after the last hyphen
        TRIM(
            regexp_extract(b.Raw_Block, '.*\\s*-\\s*([^-]+)$', 1)
        ) AS Segment_Name
    FROM Extracted_Blocks AS b
    WHERE NOT (TRIM(b.Raw_Block) = '')
)

-- 5. Force absolute distinctness across the entire combined dataset.
--    De-duplication happens exactly once, here, AFTER whitespace normalisation;
--    the inner set operation is therefore UNION ALL (a plain UNION would run an
--    extra de-dup pass on the un-normalised values without changing the result).
SELECT DISTINCT 
    Cost_Center_ID, 
    AU_ID, 
    -- Clean up any hidden carriage returns or double spaces that cause fake duplicates
    TRIM(REGEXP_REPLACE(AU_Name, '\\s+', ' ')) AS AU_Name, 
    TRIM(REGEXP_REPLACE(Segment_Name, '\\s+', ' ')) AS Segment_Name 
FROM (
    SELECT Cost_Center_ID, AU_ID, AU_Name, Segment_Name FROM Primary_AU
    UNION ALL
    -- The filter below belongs to this branch only: it discards parsed blocks
    -- for which no AU id could be extracted.
    SELECT Cost_Center_ID, AU_ID, AU_Name, Segment_Name FROM Parsed_Additionals
    WHERE AU_ID != '' AND AU_ID IS NOT NULL
) AS combined