In [5]:
from pathlib import Path

import numpy as np
import pandas as pd

# Input Data and Quick Cleaning

In [12]:
# Read the two cleaned input tables used for matching: the FSIS dataset and
# the NETS dataset. Paths are relative to the notebook's working directory.
fsis_df = pd.read_csv(filepath_or_buffer="../data/clean/fsis_cleaned_vWeek3.csv")
nets_df = pd.read_csv(filepath_or_buffer="../data/clean/nets_cleaned_vWeek3.csv")

In [13]:
# Drop columns that are not referenced in the visible remainder of this
# notebook, then keep only the rows whose "Activities" text contains the
# phrase "Poultry Processing".
fsis_df_cleaned = fsis_df.drop(columns=["DBAs", "LatestMPIActiveDate", "GrantDate"])

# na=False: a missing "Activities" value counts as "does not contain" instead
# of putting NaN into the mask, which boolean indexing would reject.
# regex=False: the pattern is a literal phrase, not a regular expression.
is_poultry_processor = fsis_df_cleaned["Activities"].str.contains(
    "Poultry Processing", na=False, regex=False
)
# .copy() so that later column assignments operate on an independent frame
# rather than on a slice of fsis_df.
fsis_df_cleaned = fsis_df_cleaned[is_poultry_processor].copy()

# Alternative filter, currently disabled: keep only rows with a non-null
# "Chicken\nSlaughter" value.
#fsis_df_cleaned = fsis_df_cleaned[fsis_df_cleaned["Chicken\nSlaughter"].notnull()]

# Fuzzy Matching

In [14]:
from fuzzywuzzy import fuzz
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm as tqdm_progress



In [15]:
def find_best_match(df_fsis, df_nets, fuzz_ratio: float=75):
    """Add NETS parent-company and sales data to FSIS rows by fuzzy address match.

    For every row of ``df_fsis`` the lower-cased "Full Address" is compared
    with each "ADDRESS" of ``df_nets`` (in row order) using
    ``fuzz.token_sort_ratio``. The first NETS row whose score is strictly
    greater than ``fuzz_ratio`` provides that FSIS row's "Parent Company"
    (from "PARENT COMPANY") and "Sales" (from "SALESHERE"), and the scan for
    that FSIS row stops there.

    NOTE: despite the function name, this takes the *first* NETS row above the
    threshold, not the highest-scoring one.

    Args:
        df_fsis: DataFrame with a "Full Address" column. It is modified in
            place: "Parent Company" and "Sales" columns are (re)created.
        df_nets: DataFrame with "ADDRESS", "PARENT COMPANY" and "SALESHERE"
            columns.
        fuzz_ratio: Score a candidate must exceed to count as a match
            (presumably on token_sort_ratio's 0-100 scale -- TODO confirm).

    Returns:
        The same ``df_fsis`` object. Rows without a match, or whose
        "Full Address" is missing / not a string, hold NaN in both new columns.
    """
    # Pull the NETS columns out once. Iterating df_nets.iterrows() again for
    # every FSIS row rebuilds a Series per NETS row each time and dominates
    # the run time; plain lists give the same values in the same order.
    nets_addresses = df_nets["ADDRESS"].tolist()
    nets_parents = df_nets["PARENT COMPANY"].tolist()
    nets_sales = df_nets["SALESHERE"].tolist()

    fsis_addresses = df_fsis["Full Address"].tolist()

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    # Results are collected by row position and assigned once at the end, so
    # duplicated index labels cannot make one row's match overwrite another's,
    # and no string is written into a float-typed column.
    parent_companies = [np.nan] * len(fsis_addresses)
    sales = [np.nan] * len(fsis_addresses)

    progress = tqdm_progress(fsis_addresses, total=len(fsis_addresses))
    for row_pos, fsis_address in enumerate(progress):
        if not isinstance(fsis_address, str):
            # Missing address: nothing to compare, leave the row unmatched.
            continue
        fsis_address = fsis_address.lower()
        for nets_pos, nets_address in enumerate(nets_addresses):
            if fuzz.token_sort_ratio(nets_address, fsis_address) > fuzz_ratio:
                parent_companies[row_pos] = nets_parents[nets_pos]
                sales[row_pos] = nets_sales[nets_pos]
                break

    df_fsis["Parent Company"] = parent_companies
    df_fsis["Sales"] = sales

    return df_fsis

In [16]:
def find_best_match_threaded(df_fsis, df_nets, fuzz_ratio: float=75, num_threads: int=4):
    """Thread-pool variant of the fuzzy FSIS-to-NETS address match.

    For every row of ``df_fsis`` the lower-cased "Full Address" is compared
    with each "ADDRESS" of ``df_nets`` (in row order) using
    ``fuzz.token_sort_ratio``. The first NETS row whose score is strictly
    greater than ``fuzz_ratio`` provides that FSIS row's "Parent Company"
    (from "PARENT COMPANY") and "Sales" (from "SALESHERE"). One task per FSIS
    row is submitted to a ``ThreadPoolExecutor``.

    NOTE: this takes the *first* NETS row above the threshold, not the
    highest-scoring one.
    NOTE(review): the per-row scan is Python-level looping, so the threads
    presumably contend for the interpreter lock; the notebook's own run
    comment reports no speed-up from this variant.

    Args:
        df_fsis: DataFrame with a "Full Address" column. It is modified in
            place: "Parent Company" and "Sales" columns are (re)created.
        df_nets: DataFrame with "ADDRESS", "PARENT COMPANY" and "SALESHERE"
            columns.
        fuzz_ratio: Score a candidate must exceed to count as a match
            (presumably on token_sort_ratio's 0-100 scale -- TODO confirm).
        num_threads: ``max_workers`` for the thread pool.

    Returns:
        The same ``df_fsis`` object. Rows without a match, or whose
        "Full Address" is missing / not a string, hold NaN in both new columns.
    """
    # Extract the NETS columns once; every task only reads these lists, so
    # sharing them between threads needs no copying or locking.
    nets_addresses = df_nets["ADDRESS"].tolist()
    nets_parents = df_nets["PARENT COMPANY"].tolist()
    nets_sales = df_nets["SALESHERE"].tolist()

    def find_match(fsis_address):
        """Return the position of the first NETS row above the threshold, else None."""
        if not isinstance(fsis_address, str):
            # Missing address: nothing to compare.
            return None
        fsis_address = fsis_address.lower()
        for nets_pos, nets_address in enumerate(nets_addresses):
            if fuzz.token_sort_ratio(nets_address, fsis_address) > fuzz_ratio:
                return nets_pos
        return None

    # Work on the address values by row position rather than by index label,
    # so duplicated labels in df_fsis cannot break the lookup.
    fsis_addresses = df_fsis["Full Address"].tolist()

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # executor.map yields results in input order, so match_positions[p]
        # belongs to the p-th row of df_fsis. (chunksize is not passed: it has
        # no effect for ThreadPoolExecutor.)
        match_positions = list(
            tqdm_progress(executor.map(find_match, fsis_addresses), total=len(fsis_addresses))
        )

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    df_fsis["Parent Company"] = [
        np.nan if pos is None else nets_parents[pos] for pos in match_positions
    ]
    df_fsis["Sales"] = [
        np.nan if pos is None else nets_sales[pos] for pos in match_positions
    ]

    return df_fsis

In [17]:
# The single-threaded matcher is used here: find_best_match_threaded did not
# speed up performance.
# A copy is passed because find_best_match adds its result columns to the
# frame it is given; this leaves fsis_df_cleaned unchanged, so `df` is an
# independent frame rather than a second name for the cleaned input.
df = find_best_match(fsis_df_cleaned.copy(), nets_df)
df.head(2)

  0%|          | 9/4189 [00:03<24:14,  2.87it/s]


KeyboardInterrupt: 

In [None]:
# Missing-value count per column (unmatched rows have no Parent Company / Sales).
print(df.isnull().sum())
# Total number of rows in the matched frame.
print(len(df))
# Number of rows attributed to each parent company.
print(df.loc[:, "Parent Company"].value_counts())

In [13]:
# Keep only the rows that received a Sales value, i.e. drop rows where it is missing.
df_vF = df.dropna(subset=["Sales"])

In [14]:
# Write the matched rows to CSV. The output directory is created first so the
# export does not fail when "output/" does not exist yet.
output_path = Path("output/cleaned_matched_plants_v3.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_vF.to_csv(output_path)