In [131]:
import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np

This notebook filters the Infogroup dataset by SIC or NAICS code. It also uses fuzzy string matching to match the filtered business addresses to the FSIS dataset on meat packing plants.

In [142]:
# Input file locations, relative to this notebook.
FSIS_DATA = "../data/fsis-processors-with-location.csv"

# Infogroup business listings, one list of files per year.
INFOGROUP_2021 = [
    "../data/2021_Business_FullFile_QCQ-A.txt",
    "../data/2021_Business_FullFile_QCQ-B.txt",
]
INFOGROUP_2022 = [
    "../data/2022_Business_Academic_QCQ.txt",
]

# When True, filter_infogroup stops after the first chunk of each file.
SMOKE_TEST = False

In [133]:
# Company name and street-address fields shown alongside the code columns.
USEFUL_COLS = ["COMPANY", "ADDRESS LINE 1", "CITY", "STATE"]

# Columns that hold SIC codes.
SIC_COLS = [
    "PRIMARY SIC CODE",
    "SIC CODE 1",
    "SIC CODE 2",
    "SIC CODE 3",
    "SIC CODE 4",
]

# Columns that hold NAICS codes.
NAICS_COLS = ["NAICS CODE", "PRIMARY NAICS CODE"]

# Free-text description columns.
DESCRIPTION_COLS = [
    # "NAICS8 DESCRIPTIONS", # only in 2021
    # "SIC6_DESCRIPTIONS (PRIMARYSIC)" # only in 2021
    "SIC6_DESCRIPTIONS (SIC)",
]

In [134]:
# Industry codes used to select poultry businesses.
SIC_CODE = "2015" # Poultry Slaughtering and Processing
NAICS_CODE = "311615" # Poultry Processing

# NOTE(review): this reassignment replaces the "2015" value set above, so only
# "0251" is in effect once this cell has run — confirm which code is intended.
SIC_CODE = "0251" # Broiler, Fryer, and Roaster Chickens

In [135]:
# Read only the first 10,000 rows of the first 2022 Infogroup file, with every column loaded as a string.
df_head = pd.read_csv(INFOGROUP_2022[0], nrows=10000, dtype=str)

In [136]:
# List the column names found in the sample.
df_head.columns

Index(['COMPANY', 'ADDRESS LINE 1', 'CITY', 'STATE', 'ZIPCODE', 'ZIP4',
       'COUNTY CODE', 'AREA CODE', 'IDCODE', 'LOCATION EMPLOYEE SIZE CODE',
       'LOCATION SALES VOLUME CODE', 'PRIMARY SIC CODE', 'SIC6_DESCRIPTIONS',
       'PRIMARY NAICS CODE', 'NAICS8 DESCRIPTIONS', 'SIC CODE',
       'SIC6_DESCRIPTIONS (SIC)', 'SIC CODE 1', 'SIC6_DESCRIPTIONS (SIC1)',
       'SIC CODE 2', 'SIC6_DESCRIPTIONS(SIC2)', 'SIC CODE 3',
       'SIC6_DESCRIPTIONS(SIC3)', 'SIC CODE 4', 'SIC6_DESCRIPTIONS(SIC4)',
       'ARCHIVE VERSION YEAR', 'YELLOW PAGE CODE',
       'EMPLOYEE SIZE (5) - LOCATION', 'SALES VOLUME (9) - LOCATION',
       'BUSINESS STATUS CODE', 'INDUSTRY SPECIFIC FIRST BYTE',
       'YEAR ESTABLISHED', 'OFFICE SIZE CODE', 'COMPANY HOLDING STATUS', 'ABI',
       'SUBSIDIARY NUMBER', 'PARENT NUMBER', 'PARENT ACTUAL EMPLOYEE SIZE',
       'PARENT ACTUAL SALES VOLUME', 'PARENT EMPLOYEE SIZE CODE',
       'PARENT SALES VOLUME CODE', 'SITE NUMBER', 'ADDRESS TYPE INDICATOR',
       'POPULAT

In [137]:
# Show only the name/address, SIC-code and description columns of the sample.
df_head[USEFUL_COLS + SIC_COLS + DESCRIPTION_COLS]

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,PRIMARY SIC CODE,SIC CODE 1,SIC CODE 2,SIC CODE 3,SIC CODE 4,SIC6_DESCRIPTIONS (SIC)
0,MIRAGE DRAPERIES,4731 E SUNNY DUNES RD,PALM SPRINGS,CA,225903,259103,259198,502328,519953,DRAPERIES & CURTAINS-RETAIL/CUSTOM MADE
1,,11700 OLIO RD,FISHERS,IN,804907,,,,,
2,STONE RESIN SURFACING LLC,1 MAYFLOWER PL,MILFORD,CT,282105,,,,,
3,,2121 7TH ST,PARKERSBURG,WV,832221,,,,,
4,IRONWOOD STATE PRISON,19005 WILEYS WELL RD,BLYTHE,CA,912102,922302,,,,FEDERAL GOVERNMENT CONTRACTORS
...,...,...,...,...,...,...,...,...,...,...
9995,GOODMAN CAMPBELL BRAIN & SPINE,13345 ILLINOIS ST,CARMEL,IN,801101,,,,,
9996,UVAPE LLC,467 S MAIN ST,COLCHESTER,CT,599302,,,,,
9997,DQ GRILL & CHILL,80 BLUE HERON DR,RIDGELAND,SC,581208,581203,581206,,,SERVICE STATIONS-GASOLINE & OIL
9998,,19 MOSS CREEK VLG # D,HILTON HEAD ISLE,SC,804907,,,,,


In [144]:
def filter_infogroup(filenames: list[str], search_cols: list[str], search_str: str, chunksize: int=10000, smoke_test: "bool | None"=None):
    """Return the rows of the given CSV files in which any search column contains ``search_str``.

    Each file is read in chunks of ``chunksize`` rows so the full file never has
    to be held in memory at once.

    Args:
        filenames: Paths of the CSV files to scan.
        search_cols: Columns to search (for example the SIC or NAICS code columns).
        search_str: Text to look for. It is matched case-insensitively anywhere in
            the value and is interpreted as a regular expression by pandas.
        chunksize: Number of rows read per chunk.
        smoke_test: If true, only the first chunk of each file is scanned. When
            omitted, the current value of the module-level ``SMOKE_TEST`` is used.

    Returns:
        A DataFrame with the matching rows from all files, keeping the row labels
        assigned by ``read_csv``. An empty DataFrame if ``filenames`` is empty.
    """
    if smoke_test is None:
        # Looked up at call time so that reassigning SMOKE_TEST in a later cell
        # takes effect (a default argument would be frozen when the def cell runs).
        smoke_test = SMOKE_TEST

    # Read the searched columns as text: if pandas infers a numeric dtype, a code
    # such as "0259" loses its leading zero and can never match.
    code_dtypes = {col: str for col in search_cols}

    def _contains_code(col: pd.Series) -> pd.Series:
        # Missing values must not match (astype(str) would turn them into "nan").
        return col.notna() & col.astype(str).str.contains(search_str, case=False)

    # Collect the per-chunk matches and concatenate once at the end; concatenating
    # inside the loop re-copies the accumulated frame on every chunk.
    matches = []
    for filename in filenames:
        for chunk in pd.read_csv(filename, dtype=code_dtypes, chunksize=chunksize):
            hit_mask = chunk[search_cols].apply(_contains_code).any(axis=1)
            matches.append(chunk[hit_mask])
            if smoke_test:
                break
    if not matches:
        return pd.DataFrame([])
    return pd.concat(matches, axis=0)

In [155]:
SMOKE_TEST = False
SIC_CODE = "0259" # Poultry and Eggs, Not Elsewhere Classified
# no results for 0259 or 0251
# NOTE(review): both of those codes start with "0". If the SIC columns are parsed
# as numbers the leading zero is dropped and a text search for "0259" cannot match;
# confirm the columns are read as strings before concluding there are no such rows.

# SMOKE_TEST is passed explicitly: a default argument captured when the function
# was defined would not see the reassignment at the top of this cell.
df_filtered = filter_infogroup(INFOGROUP_2022, SIC_COLS, SIC_CODE, chunksize=1000000, smoke_test=SMOKE_TEST)

In [156]:
# Preview the first rows that matched the SIC code.
df_filtered.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE


In [59]:
# List the columns carried over into the filtered frame.
df_filtered.columns

Index(['COMPANY', 'ADDRESS LINE 1', 'CITY', 'STATE', 'ZIPCODE', 'ZIP4',
       'COUNTY CODE', 'AREA CODE', 'IDCODE', 'LOCATION EMPLOYEE SIZE CODE',
       'LOCATION SALES VOLUME CODE', 'PRIMARY SIC CODE', 'SIC6_DESCRIPTIONS',
       'PRIMARY NAICS CODE', 'NAICS8 DESCRIPTIONS', 'SIC CODE',
       'SIC6_DESCRIPTIONS (SIC)', 'SIC CODE 1', 'SIC6_DESCRIPTIONS (SIC1)',
       'SIC CODE 2', 'SIC6_DESCRIPTIONS(SIC2)', 'SIC CODE 3',
       'SIC6_DESCRIPTIONS(SIC3)', 'SIC CODE 4', 'SIC6_DESCRIPTIONS(SIC4)',
       'ARCHIVE VERSION YEAR', 'YELLOW PAGE CODE',
       'EMPLOYEE SIZE (5) - LOCATION', 'SALES VOLUME (9) - LOCATION',
       'BUSINESS STATUS CODE', 'INDUSTRY SPECIFIC FIRST BYTE',
       'YEAR ESTABLISHED', 'OFFICE SIZE CODE', 'COMPANY HOLDING STATUS', 'ABI',
       'SUBSIDIARY NUMBER', 'PARENT NUMBER', 'PARENT ACTUAL EMPLOYEE SIZE',
       'PARENT ACTUAL SALES VOLUME', 'PARENT EMPLOYEE SIZE CODE',
       'PARENT SALES VOLUME CODE', 'SITE NUMBER', 'ADDRESS TYPE INDICATOR',
       'POPULAT

In [27]:
# Save the filtered Infogroup rows to CSV (the DataFrame index is written as the first column).
FILENAME = "../data/poultry_plants_2022.csv"
df_filtered.to_csv(FILENAME)

#### Do Fuzzy String Matching

In [55]:
# Build one "street, city, ST zip" string per business, the same layout as the
# FSIS "Full Address" column (e.g. "651 S. 91st Ave, Tolleson, AZ 85353").
# The ZIP code is converted via int and then left-padded back to five digits, so
# codes that begin with 0 (e.g. 06460) are not shortened to four digits.
df_filtered["Full Address"] = (
    df_filtered["ADDRESS LINE 1"]
    + ", " + df_filtered["CITY"]
    + ", " + df_filtered["STATE"]
    + " " + df_filtered["ZIPCODE"].astype(int).astype(str).str.zfill(5)
)

In [70]:
# Cast to str so every entry supports string methods such as .lower();
# a missing address becomes the literal text "nan".
df_filtered["Full Address"] = df_filtered["Full Address"].astype(str)

In [33]:
# Load the FSIS establishments, using the file's first column as the index.
df_fsis = pd.read_csv(FSIS_DATA, index_col=0)

In [36]:
# Preview the FSIS establishments.
df_fsis.head()

Unnamed: 0,EstNumber,EstID,Parent Corporation,Establishment Name,State,Size,Animals Processed,Processed\nVolume\nCategory,Slaughter\nVolume\nCategory,Full Address,latitude,longitude
0,M267,4802,JBS,JBS Tolleson Inc.,AZ,Large,Beef,5.0,4.0,"651 S. 91st Ave, Tolleson, AZ 85353",33.44166,-112.252559
1,M354,5117,Cargill,Cargill Meat Solutions,CA,Large,Beef,5.0,4.0,"3115 S. Fig Ave., Fresno, CA 93706",36.689651,-119.80195
2,M6063A,5289,Central Valley Meat Company,"Central Valley Meat Co., Inc.",CA,Large,Beef,5.0,4.0,"10431 8 3/4 Ave., Hanford, CA 93230",36.321273,-119.612222
3,M783 + P783 + V783,5144,Central Valley Meat Company,Harris Ranch Beef Company,CA,Large,Beef,5.0,4.0,"16277 S McCall Ave., Selma, CA 93662",36.499212,-119.614553
4,M21488 + V21488,8349,Independent,OWB Packers LLC,CA,Large,Beef,4.0,4.0,"57 East Shank Road, Brawley, CA 92227",33.000969,-115.521786


In [82]:
# Keep only establishments whose "Animals Processed" value is "Chicken";
# .copy() gives an independent frame so later column assignments do not touch df_fsis.
df_poultry = df_fsis[df_fsis["Animals Processed"] == "Chicken"].copy()

In [122]:
# Start every plant with a missing sales volume; matches fill it in later.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_poultry["Sales Volume (Location)"] = np.nan

In [123]:
# Number of chicken-processing establishments.
len(df_poultry)

184

In [124]:
# Preview the chicken plants; the sales-volume column is still empty at this point.
df_poultry.head()

Unnamed: 0,EstNumber,EstID,Parent Corporation,Establishment Name,State,Size,Animals Processed,Processed\nVolume\nCategory,Slaughter\nVolume\nCategory,Full Address,latitude,longitude,Sales Volume (Location)
71,P1317 + V1317,4495,Cargill,Wayne Farms LLC,AL,Large,Chicken,5.0,5.0,"700 McDonald Avenue, Albertville, AL 35950",34.260726,-86.203222,
72,P7485 + V7485,4518,Cargill,Wayne Farms LLC,AL,Large,Chicken,5.0,5.0,"1020 County Road 114, Jack, AL 36346",31.500628,-85.903438,
73,P912,4493,Cargill,"Wayne Farms, LLC",AL,Large,Chicken,5.0,5.0,"444 Baskin Street South, Union Springs, AL 36089",32.139166,-85.721704,
74,P7342 + V7342,4516,Cargill,Wayne Farms LLC,AL,Large,Chicken,5.0,5.0,"808 Ross Clark Circle NE, Dothan, AL 36303",31.225754,-85.362068,
75,P1235,4509,Cargill,Wayne Farms LLC,AL,Large,Chicken,5.0,5.0,"254 Ipsco Road, Decatur, AL 35601",34.607601,-87.042595,


In [99]:
# Empty frame with a single, empty "Sales Volume (Location)" column.
# NOTE(review): df_match is not referenced by any other cell shown here — confirm it is still needed.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_match = pd.DataFrame()
df_match["Sales Volume (Location)"] = np.nan

In [125]:
# Copy each matched Infogroup sales volume onto the FSIS chicken plant whose
# address it most closely resembles.

# A candidate's fuzz.token_sort_ratio score (0-100) must exceed this to count as a match.
MATCH_THRESHOLD = 75

# Prepare the Infogroup side once, outside the loops, rather than lower-casing
# every address again for each FSIS plant.
infogroup_addresses = df_filtered["Full Address"].astype(str).str.lower().tolist()
infogroup_labels = df_filtered.index.tolist()
infogroup_sales = df_filtered["SALES VOLUME (9) - LOCATION"].tolist()

for i, fsis in df_poultry.iterrows():
    if pd.isna(fsis["Full Address"]):
        continue  # no address to compare, leave the sales volume missing
    fsis_address = fsis["Full Address"].lower()

    # Keep the highest-scoring candidate above the threshold. Stopping at the
    # first one that clears the bar would make the result depend on row order
    # and could pick a weaker match over an exact one further down.
    best_pos, best_score = None, MATCH_THRESHOLD
    for pos, infogroup_address in enumerate(infogroup_addresses):
        score = fuzz.token_sort_ratio(infogroup_address, fsis_address)
        if score > best_score:
            best_pos, best_score = pos, score

    if best_pos is not None:
        print(f"Found a match at index {infogroup_labels[best_pos]}")
        print(infogroup_addresses[best_pos])
        print(fsis_address)
        # Positional lookup, so repeated row labels in df_filtered cannot return several values.
        df_poultry.loc[i, "Sales Volume (Location)"] = infogroup_sales[best_pos]

Found a match at index 15009214
700 mcdonald ave, albertville, al 35950
700 mcdonald avenue, albertville, al 35950
Found a match at index 6550918
1020 county road 114, jack, al 36346
1020 county road 114, jack, al 36346
Found a match at index 16502311
444 baskin st s, union springs, al 36089
444 baskin street  south, union springs, al 36089
Found a match at index 5944563
2045 highway 244, russellville, al 35654
2045 highway 244, russellville, al 35654
Found a match at index 14173500
3500 lake guntersville park dr, guntersville, al 35976
3500 lake guntersville park drive, guntersville, al 35976
Found a match at index 6892968
4693 county road 636, enterprise, al 36330
4693 county road 636, enterprise, al 36330
Found a match at index 14950143
764 george cagle dr, collinsville, al 35961
764 george cagle drive, collinsville, al 35961
Found a match at index 13707675
3301 3rd ave s, jasper, al 35501
3301 3rd avenue, jasper, al 35501
Found a match at index 14937138
57 melvin clark rd, eufaula,

In [127]:
# Preview the plants again now that matched sales volumes have been filled in.
df_poultry.head()

Unnamed: 0,EstNumber,EstID,Parent Corporation,Establishment Name,State,Size,Animals Processed,Processed\nVolume\nCategory,Slaughter\nVolume\nCategory,Full Address,latitude,longitude,Sales Volume (Location)
71,P1317 + V1317,4495,Cargill,Wayne Farms LLC,AL,Large,Chicken,5.0,5.0,"700 McDonald Avenue, Albertville, AL 35950",34.260726,-86.203222,438268.0
72,P7485 + V7485,4518,Cargill,Wayne Farms LLC,AL,Large,Chicken,5.0,5.0,"1020 County Road 114, Jack, AL 36346",31.500628,-85.903438,576660.0
73,P912,4493,Cargill,"Wayne Farms, LLC",AL,Large,Chicken,5.0,5.0,"444 Baskin Street South, Union Springs, AL 36089",32.139166,-85.721704,271713.0
74,P7342 + V7342,4516,Cargill,Wayne Farms LLC,AL,Large,Chicken,5.0,5.0,"808 Ross Clark Circle NE, Dothan, AL 36303",31.225754,-85.362068,
75,P1235,4509,Cargill,Wayne Farms LLC,AL,Large,Chicken,5.0,5.0,"254 Ipsco Road, Decatur, AL 35601",34.607601,-87.042595,


In [128]:
# Count the plants that still have no sales volume, i.e. no address match was found.
df_poultry["Sales Volume (Location)"].isna().sum()

74

In [130]:
# Save the poultry plants together with their matched sales data.
df_poultry.to_csv("../data/poultry_plants_with_sales.csv")