In [13]:
import pandas as pd
import re

#This code is a python port of the R code at:
#https://github.com/LACountyDPH/generic-bot/blob/main/Web%20CMR/2_matching/scripts/clean_addr_source.R. 
#This code was used to preprocess the address information only prior to geolocation and deduplication/record linkage. 
#It does not perform a complete cleanup as seen in the R code from:
#https://github.com/LACountyDPH/generic-matching/blob/main/match_scripts/clean_addr_source.R.
######################################FUNCTION CALL###################################################################
## Assuming the address, city, state, and zip columns are named 'address', 'city', 'state', and 'zip' respectively.
#df_cleaned = clean_address(df, 'address', 'city', 'state', 'zip'). That code also preprocesses: email, telephone numbers, etc..

# Define PEH and invalid address keywords and patterns
invalid_addr1 = (
    "UNKNO|HOMELES|NEED ADDR|NEED INFO|HOMESLESS|TRANSIENT|SUPPRESS|ENCAMPM|UNDOMIC|UNDERPASS|HOMELESS|NOADDRESS|"
    "NO ADDRESS|NONE PROVIDED|REFUSED|123 DEFAULT ST|PT STATES NONE|999 TRANSIENT WAY|UNSHELTERED|UNHOUSED|"
    "UNKKNOWN|UNKNWOWN|UNKONW|UNKOWN|UNSTABLE HOUSING|XNEEDX|XNEEDSX|HOMELEXX|X NEED X|TRANSET|MISSING|"
    "MAILING ADDRESS|MAILING ONLY|NOT AVAILABLE|NOT PROVIDED|THIS IS NOT A HOME ADDRESS|MAIL RTND BAD ADDRESS|"
    "MAILING ADDRESS ONLY|NEED NEW ADDRESS|FILLING OUT FORMS|ADDRS VERF|NO ACTUAL ADDRESS|NEEDS ADDRESS|"
    "DOES NOT HAVE AN ADDRESS|REQUEST CALL|UNABLE TO PROVIDE|RETURN MAIL|BAD ADDRESS|UNABLE TO LOCATE|NOT GIVEN|"
    "NONE GIVEN|NO KNOWN ADDRESS|450 BAUCHET"
)

peh_keywords1 = [
    'NOT PROVIDED', 'UNSPECIFIED', '*NONE', 'REFUSED', '`', 'NULL', 'NONE', 'NA', 'N/A',
    'ODR', 'SHELTER', 'IN TRANSIT', 'UNK', 'CA', 'NOT CALIFORNIA', 'LAC'
]

invalid_addr5 = peh_keywords1 + [
    'LOS ANGELES', 'LA', '# LA', '#LA', 'N A', 'NA',
    '999 TRANSIENT WAY', '999 TRANSIENT', '99 JAIL CASE', '123 UNK', '9999 TRANSIENT', '9999 TRANSIENT WAY', '9999 TRANSIENT',
    'SOUTHBAY ER', 'XXX HOMELESS XXX', 'POST OFFICE BOX', 'PTIENT STATES', 'SUNSET ED', 'VALLEY PALMS CARE CENTER', 'W LOS ANGELES ED', 'WLA ED', 'UNK'
]

pobox_keywords = (
    "POBOX|PO BOX|P0 BOX|POSTAL|PO BX|9O BOX|OK BOX|P 0 BOX|P BOX|PB BOX|PI BOX|PIC BOX|P.O. BOX|P.O.BOX|"
    "PIO BOX|PITZER BOX|PO B0X|PO BAX|PO BOS|PO BOT|PO BOV|POX BOX|SHOW BOX|POP BOX|P O BOX|POX BOX"
)

# Cleaning function
def clean_addr_source(df, addr_var, city_var, state_var, zip_var):
    # Address cleaning based on the patterns
    def check_and_clean_address(row):
        address = re.sub(r"[^a-zA-Z0-9\s,/]", " ", str(row[addr_var]).upper()).strip()
        if any(keyword in address for keyword in invalid_addr5):
            return ""
        if re.search(pobox_keywords, address, re.IGNORECASE):
            return ""
        return re.sub(r"^\s*$", "", address)

    df['address_source2'] = df.apply(check_and_clean_address, axis=1)
    
    return df

# Example data
data = {
    'address': [
        "1234 Main St. UNKNO",
        "PO Box 123",
        "9999 Transient Way",
        "450 Bauchet St",
        "100 Normal St",
        "456 Elm St, Apt 789",
        "UNK 7890",
        "NA",
        "SOUTHBAY ER Medical Facility",
        "PO BOX@EMAIL.COM"
    ],
    'city': ["Anytown"] * 10,
    'state': ["CA"] * 10,
    'zip': ["90001"] * 10
}

df = pd.DataFrame(data)

# Clean the DataFrame
df_cleaned = clean_addr_source(df, 'address', 'city', 'state', 'zip')

# Display the cleaned DataFrame
print(df_cleaned[['address', 'address_source2']])

                        address      address_source2
0           1234 Main St. UNKNO                     
1                    PO Box 123                     
2            9999 Transient Way                     
3                450 Bauchet St       450 BAUCHET ST
4                 100 Normal St        100 NORMAL ST
5           456 Elm St, Apt 789  456 ELM ST, APT 789
6                      UNK 7890                     
7                            NA                     
8  SOUTHBAY ER Medical Facility                     
9              PO BOX@EMAIL.COM                     


In [14]:
print(df_cleaned)

                        address     city state    zip      address_source2
0           1234 Main St. UNKNO  Anytown    CA  90001                     
1                    PO Box 123  Anytown    CA  90001                     
2            9999 Transient Way  Anytown    CA  90001                     
3                450 Bauchet St  Anytown    CA  90001       450 BAUCHET ST
4                 100 Normal St  Anytown    CA  90001        100 NORMAL ST
5           456 Elm St, Apt 789  Anytown    CA  90001  456 ELM ST, APT 789
6                      UNK 7890  Anytown    CA  90001                     
7                            NA  Anytown    CA  90001                     
8  SOUTHBAY ER Medical Facility  Anytown    CA  90001                     
9              PO BOX@EMAIL.COM  Anytown    CA  90001                     
