In [None]:
-- Builds a temp view with one row per (Cost_Center_ID, AU_ID) pair from the FY25
-- cost-center mapping table.
--   * Rows whose "additional" column (Col_E) is 'Yes' or blank contribute their
--     primary AU ID / name / segment columns.
--   * All other rows are parsed out of the free text in Col_E + Col_F, which is
--     sliced into blocks that each start with a 6-digit ID.
-- Finally, one AU_Name and one Segment_Name is enforced per AU_ID across all rows.
-- Output columns: Cost_Center_ID, AU_ID, AU_Name, Segment_Name.
CREATE OR REPLACE TEMP VIEW vw_cost_center_mapping_bootstrap AS

WITH Base_Data AS (
    -- 0. Pull the raw columns once, trimmed and aliased, so downstream CTEs stay readable.
    --    Col_E / Col_F are the two free-text "additional" columns.
    SELECT
        TRIM(CAST(`CostCenterId` AS STRING)) AS Cost_Center_ID,
        TRIM(CAST(`AssessableUnitID` AS STRING)) AS Primary_AU_ID,
        TRIM(`AssessableUnitName`) AS Primary_AU_Name,
        TRIM(`Segment`) AS Primary_Segment,
        TRIM(`AdditionalAssessableUnitIDandNameandSegment`) AS Col_E,
        TRIM(`AdditionalAUID`) AS Col_F
    FROM hive_metastore.ra_adido_2025.fy25_cost_center_mapping
),

Branch_Primary AS (
    -- BRANCH 1: Col_E is 'Yes' (or NULL / empty) -> use the primary ID, name and segment as-is.
    -- NOTE(review): the comparison is case-sensitive, so 'yes' / 'YES' fall into BRANCH 2
    -- and are parsed as free text. Confirm that is intended for the source data.
    SELECT
        Cost_Center_ID,
        Primary_AU_ID AS AU_ID,
        Primary_AU_Name AS AU_Name,
        Primary_Segment AS Segment_Name
    FROM Base_Data
    WHERE Col_E = 'Yes'
       OR Col_E IS NULL
       OR Col_E = '' -- blank Col_E is treated the same as 'Yes'
),

Branch_Additional_Raw AS (
    -- BRANCH 2: Col_E holds something other than 'Yes' -> ignore the primary columns and
    -- parse Col_E + Col_F instead.
    -- Runs of spaces, tabs and line breaks are collapsed to a single space first: the
    -- block-extraction regex below uses '.' and '$', which do not cross line breaks, so
    -- without this step every entry located before an embedded line break is silently lost.
    SELECT
        Cost_Center_ID,
        REGEXP_REPLACE(
            CONCAT_WS(' ', COALESCE(Col_E, ''), COALESCE(Col_F, '')),
            '[ \t\r\n]+',
            ' '
        ) AS Mashed_String
    FROM Base_Data
    WHERE Col_E != 'Yes'
      AND Col_E IS NOT NULL
      AND Col_E != ''
),

Extracted_Blocks AS (
    -- Slice the mashed text into blocks: each block starts at a 6-digit run and extends
    -- (lazily) up to the next 6-digit run or the end of the string. Text before the first
    -- 6-digit run is not part of any block; rows with no 6-digit run produce no blocks.
    SELECT
        Cost_Center_ID,
        EXPLODE(regexp_extract_all(Mashed_String, '([0-9]{6}.*?(?=[0-9]{6}|$))')) AS Raw_Block
    FROM Branch_Additional_Raw
    WHERE Mashed_String != ''
),

Separated_ID_And_Remainder AS (
    -- Split each block into its leading 6-digit ID and the remaining text
    -- (separator spaces / tabs / hyphens right after the ID are stripped).
    SELECT
        Cost_Center_ID,
        TRIM(regexp_extract(Raw_Block, '^([0-9]{6})', 1)) AS AU_ID,
        TRIM(REGEXP_REPLACE(Raw_Block, '^[0-9]{6}[ \t-]*', '')) AS Remainder
    FROM Extracted_Blocks
    WHERE TRIM(Raw_Block) != ''
),

Parsed_Additionals AS (
    -- If the remainder contains a hyphen, split at the LAST hyphen:
    -- everything before it is the name, everything after it is the segment.
    -- Without a hyphen the whole remainder is the name and the segment is empty.
    SELECT
        Cost_Center_ID,
        AU_ID,
        CASE
            WHEN Remainder LIKE '%-%' THEN TRIM(regexp_extract(Remainder, '^(.*)[ \t]*-[ \t]*[^-]+$', 1))
            ELSE Remainder
        END AS AU_Name,
        CASE
            WHEN Remainder LIKE '%-%' THEN TRIM(regexp_extract(Remainder, '.*[ \t]*-[ \t]*([^-]+)$', 1))
            ELSE ''
        END AS Segment_Name
    FROM Separated_ID_And_Remainder
),

Combined_Branches AS (
    -- Stack both branches. UNION ALL is sufficient because Cleaned_Stack de-duplicates
    -- after cleaning anyway.
    SELECT Cost_Center_ID, AU_ID, AU_Name, Segment_Name FROM Branch_Primary
    UNION ALL
    SELECT Cost_Center_ID, AU_ID, AU_Name, Segment_Name FROM Parsed_Additionals
),

Cleaned_Stack AS (
    -- Strip one leading / trailing double quote, collapse repeated spaces, de-duplicate.
    -- The blank-row filter is applied to the CLEANED values so that a name consisting
    -- only of quote characters / spaces is dropped instead of surviving as ''.
    SELECT DISTINCT
        Cost_Center_ID,
        AU_ID,
        AU_Name,
        Segment_Name
    FROM (
        SELECT
            Cost_Center_ID,
            AU_ID,
            TRIM(REGEXP_REPLACE(REGEXP_REPLACE(AU_Name, '^"|"$', ''), '[ ]+', ' ')) AS AU_Name,
            TRIM(REGEXP_REPLACE(REGEXP_REPLACE(Segment_Name, '^"|"$', ''), '[ ]+', ' ')) AS Segment_Name
        FROM Combined_Branches
    ) AS cleaned
    -- Strictly drop any blank rows
    WHERE AU_ID IS NOT NULL AND AU_ID != ''
      AND AU_Name IS NOT NULL AND AU_Name != ''
)

-- FINAL STANDARDIZATION: force one Name and one Segment per AU_ID across the entire dataset.
-- The alphabetically first AU_Name wins. Both window functions share the same total ordering
-- so Name and Segment always come from the same source row; ties on AU_Name are broken by
-- preferring a non-empty Segment_Name, then by Segment_Name, which makes the result
-- deterministic from run to run.
SELECT DISTINCT
    Cost_Center_ID,
    AU_ID,
    FIRST_VALUE(AU_Name) OVER (
        PARTITION BY AU_ID
        ORDER BY
            AU_Name ASC,
            CASE WHEN Segment_Name IS NULL OR Segment_Name = '' THEN 1 ELSE 0 END ASC,
            Segment_Name ASC
    ) AS AU_Name,
    FIRST_VALUE(Segment_Name) OVER (
        PARTITION BY AU_ID
        ORDER BY
            AU_Name ASC,
            CASE WHEN Segment_Name IS NULL OR Segment_Name = '' THEN 1 ELSE 0 END ASC,
            Segment_Name ASC
    ) AS Segment_Name
FROM Cleaned_Stack