In [111]:
import numpy as np
import pandas as pd

# Input Data and Cleaning

In [128]:
fsis_df = pd.read_csv("data/FSIS.csv")
fsis_df.head(2)

Unnamed: 0.1,Unnamed: 0,EstNumber,Establishment ID,Company,Phone,GrantDate,Activities,DBAs,LatestMPIActiveDate,Size,Chicken\nSlaughter,Full Address
0,0,G1028,195,Papetti's Hygrade Egg Products Inc.,(908) 282-7900,12/05/2019,Egg Product,,2023-09-18,Small,,One Papetti Plaza
1,1,G1105,126734,"American Egg Products, LLC",(912) 449-5700,09/22/2021,Egg Product,,2023-09-18,Small,,375 Pierce Industrial Blvd.


In [129]:
fsis_df[fsis_df["Chicken\nSlaughter"].notnull()]

Unnamed: 0.1,Unnamed: 0,EstNumber,Establishment ID,Company,Phone,GrantDate,Activities,DBAs,LatestMPIActiveDate,Size,Chicken\nSlaughter,Full Address
270,270,M10038 + P10038 + V10038,7125,Scotts Hook & Cleaver Inc.,(269) 626-8891,08/11/2021,"Meat Slaughter, Meat Processing, Poultry Slaug...",Pease Packing Corporation,2023-09-18,Very Small,Yes,8713 South 38th Street
274,274,M10053 + P10053,7130,Michigan State University Dept of Animal Science,(517) 355-8452,09/09/2014,"Meat Slaughter, Meat Processing, Poultry Slaug...",MSU Meat lab,2023-09-18,Very Small,Yes,Anthony Hall
297,297,M1015 + P1015,6413,"Table Trust Brands, LLC",(717) 436-5921,10/28/2021,"Meat Processing, Poultry Slaughter, Poultry Pr...",Alle; Empire Kosher; Galil; Kosher Valley,2023-09-18,Large,Yes,247 Empire Drive
422,422,M112 + P112,3847,"Tyson Foods, Inc",(870) 438-5211,12/20/2019,"Poultry Slaughter, Poultry Processing","Tyson Foods , Inc. M112/P112",2023-09-18,Large,Yes,601 Tyson Drive
474,474,M1234 + P1234,6163896,Mountaire Farms Inc.,,09/14/2020,"Meat Processing, Poultry Slaughter, Poultry Pr...",Mountaire Farms of North Carolina Corp.,2023-09-18,Large,Yes,1101 East Third Street
...,...,...,...,...,...,...,...,...,...,...,...,...
6182,6182,P912,4493,Wayne Farms LLC,(334) 738-2148,04/05/2019,"Poultry Slaughter, Poultry Processing","Covington Farms; Sanderson Farms, Inc.; Sander...",2023-09-18,Large,Yes,444 Baskin Street South
6184,6184,P9197,696,"Perdue Foods, LLC.",(252) 348-4200,02/04/2021,"Poultry Slaughter, Poultry Processing",Chef's Choice; Cook 'n Good; Country Corner; H...,2023-09-18,Large,Yes,3539 Governor's Road
6187,6187,P935,2260,Allen Harim LLC,(302) 684-1640,03/30/2021,"Poultry Slaughter, Poultry Processing",,2023-09-18,Large,Yes,18752 Harbeson Road
6188,6188,P9378 + V9378,124559,Baffoni's Poultry Farm Inc.,,02/07/2020,"Poultry Slaughter, Poultry Processing, Volunta...",,2023-09-18,Very Small,Yes,324 Greenville Avenue


In [130]:
fsis_df.Activities.value_counts()[:10]

Meat Processing, Poultry Processing                                        2823
Meat Processing                                                             874
Meat Slaughter, Meat Processing                                             496
Certification - Export, Identification - Meat, Identification - Poultry     304
Poultry Slaughter, Poultry Processing                                       206
Meat Processing, Poultry Processing, Certification - Export                 181
Imported Product                                                            148
Poultry Processing                                                          142
Meat Slaughter, Meat Processing, Poultry Processing                         113
Meat Processing, Poultry Processing, Voluntary Processing - Meat             93
Name: Activities, dtype: int64

In [160]:
fsis_df_cleaned = fsis_df.drop(columns=["DBAs","LatestMPIActiveDate",
                                        "GrantDate"])
fsis_df_cleaned = fsis_df_cleaned[fsis_df_cleaned["Activities"].str.contains("Poultry Processing")].copy()
#fsis_df_cleaned = fsis_df_cleaned[fsis_df_cleaned["Chicken\nSlaughter"].notnull()]

In [161]:
fsis_df_cleaned.head(2)

Unnamed: 0.1,Unnamed: 0,EstNumber,Establishment ID,Company,Phone,Activities,Size,Chicken\nSlaughter,Full Address
260,260,M1 + P1370,2766,Vienna Beef Ltd.,(312) 278-7800,"Meat Processing, Poultry Processing",Small,,1000 West Pershing Road
261,261,M10 + P7212 + V10,4119,Buckhead Meat & Seafood of Houston.,(281) 405-3200,"Meat Processing, Poultry Processing, Certifica...",Small,,10310 Greens Crossing Blvd.


In [162]:
cleaned_nets = pd.read_csv("data/nets/nets_cleaned.csv")
cleaned_nets.head(2)

Unnamed: 0.1,Unnamed: 0,DUNSNUMBER,COMPANY,ADDRESS,CITY,STATE,ZIPCODE,PARENT COMPANY,SIC8,PARENT DUNS,SALESHERE,SALESHEREC,SALESGROWTH,LATITUDE,LONGITUDE
0,0,2344232,WATSONS QULTY TURKEY PDTS INC ...,641 STATE RTE 168 ...,BLACKWOOD,NJ,8012,WATSONS QULTY TURKEY PDTS INC ...,20150101,2344232,26654460.0,3.0,2.0,39.7912,75.0597
1,1,1326545,HENNINGSEN FOODS INC ...,10025 I ST ...,OMAHA,NE,68127,POST HOLDINGS INC ...,20150000,805730178,6876800.0,3.0,2.0,41.2156,96.0697


# Fuzzy Matching

In [163]:
from fuzzywuzzy import fuzz

In [174]:
# The function takes in the fsis and nets datasets and uses fuzzy matching on address to add in
# parent company and sales information from the nets dataset to the fsis dataset

def find_best_match(df_fsis, df_nets, fuzz_ratio: float=75):
    df_fsis["Parent Company"] = np.NaN
    df_fsis["Sales"] = np.NaN
    df_fsis["latitude"] = np.NaN
    df_fsis["longitude"] = np.NaN
    for i, fsis in df_fsis.iterrows():
        fsis_address = fsis["Full Address"].lower()
        for k, nets in df_nets.iterrows():
            nets_address = nets["ADDRESS"]
            if fuzz.token_sort_ratio(nets_address, fsis_address) > fuzz_ratio:
                df_fsis.loc[i, "Parent Company"] = nets["PARENT COMPANY"]
                df_fsis.loc[i, "Sales"] = nets["SALESHERE"]
                df_fsis.loc[i, "latitude"] = nets["LATITUDE"]
                df_fsis.loc[i, "longitude"] = nets["LONGITUDE"]
                break
                
        # Progress reporting as function is slow
        if i % 50 == 0:
            print("{}%".format(np.round(i / df_fsis.shape[0] * 100,2)))
        # Band-aid fix for infinite looping error
        if i >= df_fsis.shape[0]:
            return df_fsis
    return df_fsis

In [175]:
df = find_best_match(fsis_df_cleaned, cleaned_nets)
df.head(2)

7.16%
8.36%
9.55%
10.74%
11.94%
13.13%
15.52%
17.9%
19.1%
20.29%
21.48%
22.68%
23.87%
25.07%
26.26%
27.45%
28.65%
32.23%
33.42%
34.61%
35.81%
37.0%
38.2%
39.39%
40.58%
41.78%
42.97%
45.36%
46.55%
47.74%
50.13%
52.52%
54.91%
56.1%
58.49%
59.68%
64.45%
68.04%
69.23%
70.42%
71.62%
75.2%
76.39%
77.58%
78.78%
81.16%
82.36%
83.55%
85.94%
87.13%
88.33%
93.1%
95.49%
96.68%
97.88%
99.07%


Unnamed: 0.1,Unnamed: 0,EstNumber,Establishment ID,Company,Phone,Activities,Size,Chicken\nSlaughter,Full Address,Parent Company,Sales,latitude,longitude
260,260,M1 + P1370,2766,Vienna Beef Ltd.,(312) 278-7800,"Meat Processing, Poultry Processing",Small,,1000 West Pershing Road,,,,
261,261,M10 + P7212 + V10,4119,Buckhead Meat & Seafood of Houston.,(281) 405-3200,"Meat Processing, Poultry Processing, Certifica...",Small,,10310 Greens Crossing Blvd.,,,,


In [176]:
print(df.isna().sum())
print(df.shape[0])
print(df["Parent Company"].value_counts())

Unnamed: 0               0
EstNumber                0
Establishment ID         0
Company                  0
Phone                  619
Activities               0
Size                    41
Chicken\nSlaughter    3932
Full Address             0
Parent Company        3734
Sales                 3734
latitude              3734
longitude             3734
dtype: int64
4189
TYSON FOODS INC                                       48
PERDUE FARMS INCORPORATED                             26
PERDUE FARMS INC                                      26
DLISTED                                               17
JBS USA HOLDINGS INC                                  17
                                                      ..
EL JAY POULTRY CORP                                    1
IDEAL PLTY BREEDING FARMS INC                          1
CRIDER CLAXTON LLC                                     1
TALISMAN FOODS INC                                     1
AMERICAN OSTRICH COMPANY LLC                           1
Name

In [177]:
df_vF = df[df.Sales.notnull()]

In [178]:
df_vF.to_csv("output/cleaned_matched_plants_v2.csv")