# Standardization steps for the death data integration process. 


### 1 Normalization of individual Variables

In [None]:
%%bigquery  --project $project_dev
-- Shared functions used to build every token.
-- They return NULL when the (cleaned) input is ''.
-- They return everything in upper case.


CREATE OR REPLACE FUNCTION Death_Data_Integration.IsValid_DATE(input ANY TYPE)
-- Year-range sanity check for a date-like value (expected layout: YYYYMMDD,
-- or any prefix of it that starts with the 4-digit year).
--
-- input:   any value that can be CAST to STRING.
-- returns: the input as a STRING when its year is plausible,
--          ''   when the year is missing, unparsable, before 1850 or after
--               the current calendar year,
--          NULL when the input is NULL.
--
-- The year is taken from the first four digits left after stripping every
-- non-digit character. Only the year is checked: the month and day parts
-- are NOT validated here.
-- TODO: remove date 1901-01-01
RETURNS STRING
AS (
  CASE
    WHEN input IS NULL THEN CAST(NULL AS STRING)
    -- SAFE_CAST yields NULL (instead of raising) when no digits remain,
    -- which makes BETWEEN evaluate to NULL and fall through to ELSE.
    WHEN SAFE_CAST(
           LEFT(REGEXP_REPLACE(CAST(input AS STRING), '[^0-9]', ''), 4)
           AS INT64
         ) BETWEEN 1850 AND EXTRACT(YEAR FROM CURRENT_DATE())
      THEN CAST(input AS STRING)
    ELSE ''
  END
);


CREATE OR REPLACE FUNCTION Death_Data_Integration.IsValid_SSN(input STRING)
-- Return '' if the SSN is invalid, else return the SSN unchanged.
--
-- input:   SSN as a string of digits (callers strip punctuation first).
-- returns: ''    when the whole string is a 9-digit SSN that was never assigned,
--          input otherwise; NULL when the input is NULL.
--
-- A 9-digit SSN is treated as invalid when:
--   * the first three digits are "000", "666" or in the 900 series;
--   * the second group (two digits) is "00";
--   * the third group (four digits) is "0000";
--   * all nine digits are identical;
--   * it is one of the listed well-known placeholder numbers.
--
-- The pattern is anchored as a whole (^(?: ... )$), so a value is either
-- rejected entirely or returned untouched; it is never partially rewritten.
-- NOTE(review): strings that are not exactly 9 digits match none of the
-- alternatives and are returned as-is; confirm whether they should be rejected.
RETURNS STRING
AS (
  IF(
    REGEXP_CONTAINS(
      input,
      '^(?:(?:000|666|9[0-9]{2})[0-9]{6}|[0-9]{3}00[0-9]{4}|[0-9]{5}0000|0{9}|1{9}|2{9}|3{9}|4{9}|5{9}|6{9}|7{9}|8{9}|9{9}|219099999|078051120|123456789|012345678|001010001|090909090)$'
    ),
    '',
    input
  )
);



CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_SSN(input ANY TYPE)
-- Normalize an SSN: keep digits only, then blank out never-assigned numbers.
--
-- input:   SSN in any type that can be CAST to STRING (punctuation allowed).
-- returns: the digit-only SSN, or NULL when nothing is left after cleaning
--          or when IsValid_SSN rejects the value.
--
-- The explicit CAST lets non-STRING inputs through, matching
-- Normalize_Extract_Last_4_Digits_SSN.
-- NOTE(review): an integer-typed SSN has already lost its leading zeros
-- before it reaches this function; confirm how such columns should be handled.
  RETURNS STRING
  AS (
    NULLIF(
      Death_Data_Integration.IsValid_SSN(
        -- Removing every non-digit also removes surrounding whitespace.
        REGEXP_REPLACE(CAST(input AS STRING), '[^0-9]', '')
      ),
      ''
    )
  );

CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_String(input STRING)
-- Canonical form of a text value: ASCII letters only, upper-cased.
--
-- input:   raw text.
-- returns: the letters a-z / A-Z of the input in upper case, or NULL when
--          the input is NULL or contains no such letters.
RETURNS STRING
AS (
  NULLIF(
    UPPER(REGEXP_REPLACE(TRIM(input), '[^a-zA-Z]', '')),
    ''
  )
);

CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_Middle_Name(input STRING)
-- Normalized middle name: ASCII letters only, upper-cased, at most 15 characters.
--
-- input:   raw middle name.
-- returns: the cleaned name truncated to 15 characters, or NULL when the
--          input is NULL or contains no letters.
RETURNS STRING
AS (
  NULLIF(
    -- Only a-z / A-Z survive the replace, so upper-casing after the
    -- truncation gives the same result as upper-casing before it.
    UPPER(
      LEFT(REGEXP_REPLACE(TRIM(input), '[^a-zA-Z]', ''), 15)
    ),
    ''
  )
);

CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_First_Name(input STRING)
-- Normalized first name: ASCII letters only, upper-cased, at most 15 characters.
--
-- input:   raw first name.
-- returns: the cleaned name truncated to 15 characters, or NULL when the
--          input is NULL or contains no letters.
RETURNS STRING
AS (
  NULLIF(
    -- Only a-z / A-Z survive the replace, so upper-casing after the
    -- truncation gives the same result as upper-casing before it.
    UPPER(
      LEFT(REGEXP_REPLACE(TRIM(input), '[^a-zA-Z]', ''), 15)
    ),
    ''
  )
);

CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_Last_Name(input STRING)
-- Normalized last name: ASCII letters only, upper-cased, at most 20 characters.
--
-- input:   raw last name.
-- returns: the cleaned name truncated to 20 characters, or NULL when the
--          input is NULL or contains no letters.
RETURNS STRING
AS (
  NULLIF(
    -- SUBSTR(x, 1, 20) keeps the first 20 characters, like LEFT(x, 20).
    SUBSTR(
      UPPER(REGEXP_REPLACE(TRIM(input), '[^a-zA-Z]', '')),
      1,
      20
    ),
    ''
  )
);


CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_DATE(input ANY TYPE)
-- Normalize a date-like value to its leading (up to) 8 digits, i.e. YYYYMMDD.
--
-- input:   any value that can be CAST to STRING (separators are allowed).
-- returns: the first 8 digits of the input when IsValid_DATE accepts them,
--          otherwise NULL. NULL is also returned for NULL input and for
--          input that contains no digits at all.
-- NOTE(review): inputs with fewer than 8 digits are passed on as-is, so the
-- result can be shorter than YYYYMMDD; confirm whether that is intended.
  RETURNS STRING
  AS (
    NULLIF(
      Death_Data_Integration.IsValid_DATE(
        -- The inner NULLIF turns "no digits left" into NULL so the year
        -- check never has to convert an empty string to an integer.
        NULLIF(
          LEFT(REGEXP_REPLACE(CAST(input AS STRING), '[^0-9]', ''), 8),
          ''
        )
      ),
      ''
    )
  );

-- first N letters of a string (N = 1 gives the initial, e.g. of the first name)
CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_Extract_First_N_Characters(input STRING, characters INT64)
-- First `characters` letters of the input, upper-cased.
--
-- input:      raw text; everything except a-z / A-Z is discarded first.
-- characters: number of leading letters to keep.
-- returns:    the leading letters in upper case, or NULL when the result is
--             empty or an argument is NULL.
RETURNS STRING
AS (
  NULLIF(
    -- Only ASCII letters remain after the replace, so upper-casing before
    -- the truncation gives the same result as upper-casing after it.
    LEFT(
      UPPER(REGEXP_REPLACE(TRIM(input), '[^a-zA-Z]', '')),
      characters
    ),
    ''
  )
);

-- SOUNDEX
CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_Soundex(input STRING)
-- Upper-cased Soundex code of the letters in the input.
--
-- input:   raw text; everything except a-z / A-Z is discarded before encoding.
-- returns: the Soundex code, or NULL when the input is NULL or the code
--          comes back empty.
RETURNS STRING
AS (
  NULLIF(
    UPPER(
      SOUNDEX(REGEXP_REPLACE(TRIM(input), '[^a-zA-Z]', ''))
    ),
    ''
  )
);

-- extract Year
CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_Extract_Year(input ANY TYPE)
-- Extract the year (first four digits) from a date-like value.
--
-- input:   any value that can be CAST to STRING (separators are allowed).
-- returns: the first four digits of the input when IsValid_DATE accepts
--          them, otherwise NULL. NULL is also returned for NULL input and
--          for input that contains no digits at all.
  RETURNS STRING
  AS (
    NULLIF(
      Death_Data_Integration.IsValid_DATE(
        -- The inner NULLIF turns "no digits left" into NULL so the year
        -- check never has to convert an empty string to an integer.
        NULLIF(
          LEFT(REGEXP_REPLACE(CAST(input AS STRING), '[^0-9]', ''), 4),
          ''
        )
      ),
      ''
    )
  );

-- extract last 4 digits of SSN
CREATE OR REPLACE FUNCTION Death_Data_Integration.Normalize_Extract_Last_4_Digits_SSN(input ANY TYPE)
-- Last four digits of an SSN after cleaning and validation.
--
-- input:   SSN in any type that can be CAST to STRING (punctuation allowed).
-- returns: the last four digits of the digit-only SSN, or NULL when the
--          input is NULL, has no digits, or is rejected by IsValid_SSN.
  RETURNS STRING
  AS (
    NULLIF(
      RIGHT(
        Death_Data_Integration.IsValid_SSN(
          -- Digits only; upper-casing a digit string would change nothing.
          REGEXP_REPLACE(TRIM(CAST(input AS STRING)), '[^0-9]', '')
        ),
        4
      ),
      ''
    )
  );
     


