In [None]:
-- Lists Canadian employees from the PEP list whose cost center starts with four
-- digits, and attaches mapping columns B-F from cost_center_mapping by matching
-- on a zero-padded, 4-character cost-center key.
WITH Cleaned_Canadian_PEPs AS (
    -- STEP 1 & 2: Keep Region = 'Canada' rows and take the leading 4 digits of the Cost Center
    SELECT
        EmployeeName,
        ACIF,
        `Title/Department`,
        Costcenter AS Original_PEP_CostCenter,
        PeopleManager,
        Segment AS PEP_Segment,
        Region,
        -- The digit class is written as [0-9] instead of \d on purpose: Spark SQL
        -- unescapes backslashes in ordinary string literals by default, so '\d'
        -- would reach the regex engine as a literal 'd' and never match a digit.
        -- Only the first 4 digits are captured; anything after them is ignored.
        -- NOTE(review): a value such as '12345' yields '1234' -- confirm this prefix
        -- behaviour is what "exactly 4 digits" is meant to be.
        REGEXP_EXTRACT(TRIM(Costcenter), '^([0-9]{4})', 1) AS Extracted_CC
    FROM hive_metastore.ra_adido_2025.employee_pep_list_as_of_oct312025
    WHERE TRIM(Region) = 'Canada'
      -- Drops rows whose trimmed cost center does not start with 4 digits
      -- (e.g., completely removes 'ZHAH2'). Rows with a NULL cost center are
      -- dropped as well, because the predicate evaluates to NULL for them.
      AND TRIM(Costcenter) RLIKE '^[0-9]{4}'
)

-- STEP 3: Join to the mapping table and pull Columns B, C, D, E, F
SELECT
    p.EmployeeName,
    p.ACIF,
    p.`Title/Department`,
    p.Original_PEP_CostCenter,
    p.Extracted_CC,

    -- Columns B through F from the mapping file, as requested in the EMP01 logic
    m.`Assessable Unit ID and Name` AS Mapping_Col_B_AU,
    m.Segment AS Mapping_Col_C_Segment,
    m.`Additional Assessable Unit ID and Name` AS Mapping_Col_D_Addtl_AU,
    m.Segment2 AS Mapping_Col_E_Segment2,
    m.`Contact of the mapping Confirmation` AS Mapping_Col_F_Contact

FROM Cleaned_Canadian_PEPs AS p
-- LEFT JOIN keeps every filtered PEP row; the mapping columns are NULL when no
-- cost center matches.
-- NOTE(review): if the mapping table holds more than one row per cost center,
-- each match is returned and PEP rows will be duplicated -- confirm uniqueness.
LEFT JOIN cost_center_mapping AS m
    -- LPAD turns the mapping file's "533" into "0533" so the string comparison lines up.
    -- NOTE(review): LPAD also shortens values longer than 4 characters to their
    -- first 4 characters -- confirm the mapping table has no such cost centers.
    ON p.Extracted_CC = LPAD(CAST(m.`Cost Center` AS STRING), 4, '0')