Implement fuzzy matching on LLCs that have filed evictions to lookup registered LLC info

Updated 4/4/25

In [None]:
import pandas as pd
import numpy as np
import re
from concurrent.futures import ProcessPoolExecutor
import unicodedata
from rapidfuzz import process, fuzz

In [None]:
evictions_df = pd.read_parquet('DATA/evictions.parquet')

In [None]:
llc_df = pd.read_parquet('DATA/llc.parquet')

Conservative pre-processing approach to maintain efficiency

Can make this more aggressive if it doesn't work out

In [None]:
def preprocess_names(df, column_name):
    """
    Normalize names, sort words alphabetically for natural language processing if we need it later
    @df: pandas dataframe
    @column_name: str column name to preprocess
    
    """
    column = column_name + "_normalized"
    df[column] = (
        df[column_name]
        .str.lower()
        .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    ) 

    df[column] = (
        df[column]
        .str.split()
        .map(lambda x: " ".join(sorted(x)) if isinstance(x, list) else ' ')
    )

    return df

In [None]:
e_columns_for_p = ['plaintiff_name'] # columns for preprocessing, add more as needed
l_columns_for_p = ['EntityID']

In [None]:
for i in e_columns_for_p: # apply preprocessing for both df
    evictions_df = preprocess_names(evictions_df, i)

for i in l_columns_for_p:
    llc_df = preprocess_names(llc_df, i)

In [None]:
# create flag 'is_llc' to restrict lookup to only llcs
llc_keywords = r"\b(llc|l\.l\.c|inc|inc\.|corporation|corp|corp\.|co|co\.|company|ltd|ltd\.|lp|l\.p\.|pllc|plc|plc\.|limited|limited liability company)\b"
evictions_df["is_llc"] = evictions_df["plaintiff_name_normalized"].str.contains(llc_keywords, regex=True)

  evictions["is_llc"] = evictions["plaintiff_name_normalized"].str.contains(llc_keywords, regex=True)


In [None]:
# Now that we've set the 'is_llc' flag, we can remove these terms to reduce noise
suffixes = [
    "llc", "l.l.c", "inc", "incorporated", "corp", "corporation",
    "co", "company", "ltd", "l.p", "lp", "pllc", "plc", "llp", "p.c"
]

suffix_pattern = r'\s*(?:' + '|'.join(suffixes) + r')\.?\s*$'

evictions_df['plaintiff_name_normalized'] = evictions_df['plaintiff_name_normalized'].str.replace(suffix_pattern, '', regex=True)
llc_df['EntityID_normalized'] = llc_df['EntityID_normalized'].str.replace(suffix_pattern, '', regex=True)

In [6]:
evictions_df = evictions_df[evictions_df['is_llc']] # restrict lookup to only llcs
plaintiffs = set(evictions_df['plaintiff_name_normalized']) # set of unique LLC plaintiffs
llcs = set(llc_df['EntityID_normalized']) # set of unique registered LLCs

In [None]:
# Function to process matching for a subset of plaintiff names
def process_chunk(plaintiff_chunk, llc_names, confidence=80):
    matches = {}
    for i, plaintiff in enumerate(plaintiff_chunk):
        best_match, score = process.extractOne(plaintiff, llc_names, scorer=fuzz.WRatio)
        if best_match and score >= confidence:
            matches[plaintiff] = (best_match, score)
        else:
            matches[plaintiff] = (None, 0)
        if (i + 1) % 100 == 0:
            print(f"Processed {i + 1} out of {len(plaintiff_chunk)} plaintiffs in this chunk.")
    return matches

# Function to split the plaintiff set and process in parallel
def parallel_match_plaintiffs(plaintiff_names_set, llc_names_set, chunk_size=1000):
    plaintiff_chunks = [list(plaintiff_names_set)[i:i + chunk_size] for i in range(0, len(plaintiff_names_set), chunk_size)]
    print(f"Total of {len(plaintiff_chunks)} chunks to process.")

    all_matches = {}
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(process_chunk, chunk, llc_names_set) for chunk in plaintiff_chunks]
        for i, future in enumerate(futures):
            all_matches.update(future.result())
            print(f"Chunk {i + 1} processed. Total matches found: {len(all_matches)}")
    
    return all_matches

In [8]:
chunk_size = len(plaintiffs) // 16 # adjust for num of cores, 16 worked well under an hour
# probably only needed ~100GB memory or less

In [None]:
# Matches looks like:
# {plaintiff_name: (best_match_llc, confidence_score)}
matches = parallel_match_plaintiffs(plaintiffs, llcs, chunk_size=chunk_size) 

Total of 17 chunks to process.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 100 out of 973 plaintiffs in this chunk.
Processed 200 out of 973 plaintiffs in this chunk.
Processed 200 out of 973 plaintiffs in this chunk.
Processed 200 out of 973 plaintiffs in this chunk.


In [None]:
evictions_df['match_tuple'] = evictions_df['plaintiff_name_normalized'].map(matches) # Match tuple looks like (best_match, confidence_score)
# Unpack into two separate columns
evictions_df[['best_match', 'match_confidence']] = pd.DataFrame(evictions_df['match_tuple'].tolist(), index=evictions_df.index)

In [22]:
evictions_df = pd.merge(
    left=evictions_df,
    right=llc_df,
    left_on='best_match',
    right_on='EntityID_normalized',
    how='left'
).drop(columns=['best_match'])

In [None]:
evictions_df.to_parquet('evictions_matched.parquet')