In [2]:
import pandas as pd
from fuzzywuzzy import fuzz
import numpy as np
import dask.dataframe as dd



In [3]:
FSIS_DATA = "../data/fsis-processors-with-location.csv"
SIC_CODE = "2015" # Poultry Slaughtering and Processing
NAICS_CODE = "311615" # Poultry Processing
INFOGROUP_2022 = ["../data/2022_Business_Academic_QCQ.txt"]
# df = pd.read_csv(INFOGROUP_2022[0], nrows=10000, dtype=str)


## Method Using Dask DataFrames

In [4]:
# Read in csv file using Dask dataframe (rather than pandas)
df = dd.read_csv(INFOGROUP_2022[0], dtype=str, encoding='unicode_escape')
df.columns = df.columns.str.upper() # force all columns to be uppercase
df.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
0,MIRAGE DRAPERIES,4731 E SUNNY DUNES RD,PALM SPRINGS,CA,92264,1536,65,760,2,A,...,6,44804,1,33.812119,-116.4945,0,40140,2,348,6065
1,,11700 OLIO RD,FISHERS,IN,46037,7618,57,317,1,A,...,7,110808,1,39.959999,-85.921268,P,26900,2,294,18057
2,STONE RESIN SURFACING LLC,1 MAYFLOWER PL,MILFORD,CT,6460,4520,9,203,2,D,...,7,150200,2,41.207739,-73.070136,P,35300,2,408,9009
3,,2121 7TH ST,PARKERSBURG,WV,26101,3803,107,740,1,,...,7,702,1,39.269405,-81.520599,P,37620,2,425,54107
4,IRONWOOD STATE PRISON,19005 WILEYS WELL RD,BLYTHE,CA,92225,2287,65,760,2,A,...,7,46900,1,33.570065,-114.898031,0,40140,2,348,6065


In [5]:
# function to filter the DataFrame that has everything read in from the master CSV.
# Inputs are the master dataframe and the SIC code of choice. Returns a Dask DataFrame
# that has been filtered to be the lines from Infogroup records that contain the desired
# SIC Code in 1 of 5 different columns

def dask_sic_matches_df(mst_df, sic_code):
    
    # reassure SIC Code is a string    
    sic_code = str(sic_code)
    
    # this filters checks SIC Code 1 through 4 & Primary SIC Code if they contain desired SIC Code we are searching for
    filtered_df = mst_df[ mst_df['SIC CODE'].str.contains(sic_code, na=False) |
                          mst_df['SIC CODE 1'].str.contains(sic_code, na=False) |
                          mst_df['SIC CODE 2'].str.contains(sic_code, na=False) |
                          mst_df['SIC CODE 3'].str.contains(sic_code, na=False) |
                          mst_df['SIC CODE 4'].str.contains(sic_code, na=False) |
                          mst_df['PRIMARY SIC CODE'].str.contains(sic_code, na=False)
                         ]
    
    result = filtered_df.compute()
            
    return result

##### SIC Code 8611: Farm Organizations

In [30]:
code_8611 = dask_sic_matches_df(df, '8611')

In [31]:
code_8611.to_csv("../data/code_8611.csv") # save as CSV file

##### SIC Code 0761: Farm Labor

In [32]:
code_0761 = dask_sic_matches_df(df, '0761')

In [33]:
code_0761.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
1380,ROLLING R ENTERPRISES INC,628 US HIGHWAY 78 # B,BRAWLEY,CA,92227,9201,25,760,2,E,...,7,10300,1,32.979027,-115.480782,P,20940,2,0,6025
5926,CUSTOM HARVEST,340 W K ST,BRAWLEY,CA,92227,3120,25,760,2,C,...,7,10600,4,32.972783,-115.546545,P,20940,2,0,6025
17795,FRESH PIC HARVEST,86235 AVENUE 52,COACHELLA,CA,92236,2720,65,760,2,C,...,7,45609,2,33.670396,-116.160083,P,40140,2,348,6065
24196,ESPARZA ENTERPRISES INC,51335 CESAR CHAVEZ ST # 112,COACHELLA,CA,92236,1528,65,760,2,A,...,7,45705,1,33.675853,-116.182276,P,40140,2,348,6065
24770,WILLIAMS-WILLIAMS HAY CNTRCTNG,1679 RIVER DR,BRAWLEY,CA,92227,1747,25,760,2,C,...,7,10400,4,32.986658,-115.510196,P,20940,2,0,6025


In [34]:
code_0761.to_csv("../data/code_0761.csv") # save as CSV file

##### SIC Code 0762: Farm Management Services

In [35]:
code_0762 = dask_sic_matches_df(df, '0762')

In [36]:
code_0762.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
1054,DRAGO INDIANA,6147 N COUNTY ROAD 500 W,MULBERRY,IN,46058,9445.0,23,765,2,D,...,3,950300,3,40.377054,-86.596185,P,23140,1.0,320,18023
5545,AURORA COOPERATIVE ELEVATOR CO,407 SOUTH DAKOTA HIGHWAY 50,TYNDALL,SD,57066,,9,605,2,A,...,3,967600,2,42.9889,-97.8647,X,0,,0,46009
5580,UNITED FARM & RANCH MANAGEMENT,400 W BRAASCH AVE # D,NORFOLK,NE,68701,4157.0,119,402,2,A,...,7,961000,1,42.0341,-97.412,P,35740,1.0,0,31119
5926,CUSTOM HARVEST,340 W K ST,BRAWLEY,CA,92227,3120.0,25,760,2,C,...,7,10600,4,32.972783,-115.546545,P,20940,2.0,0,6025
13051,SAN PASQUAL LAND & CATTLE CO,5300 KALIN RD,BRAWLEY,CA,92227,9804.0,25,760,2,B,...,7,10200,1,33.03004,-115.578442,0,20940,2.0,0,6025


In [37]:
code_0762.to_csv("../data/code_0762.csv") # save as CSV file

##### SIC Code 8748: Farm Management Systems

In [38]:
code_8748 = dask_sic_matches_df(df, '8748')

In [39]:
code_8748.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
95,HOSPITALITY CONSULTANTS LLC,199 FOXON RD,NORTH BRANFORD,CT,6471,1075,9,203,2,A,...,5,186100,5,41.327977,-72.815337,0,35300,2,408,9009
129,FRONTIER K9 TRAINING,18019 JOLIET RD,SHERIDAN,IN,46069,9118,57,317,2,A,...,5,110300,5,40.047932,-86.230566,P,26900,2,294,18057
148,THERMAL AVIATION,56935 WARHAWK WAY,THERMAL,CA,92274,9344,65,760,2,A,...,6,45609,3,33.635843,-116.165491,0,40140,2,348,6065
275,PENSION PORTFOLIOS,79200 COYOTE CRK,LA QUINTA,CA,92253,4528,65,760,2,A,...,7,45121,2,33.683134,-116.282457,P,40140,2,348,6065
310,ARSENAL 401K ADVISORS,1614 EDINBURGH AVE,PORT ROYAL,SC,29935,1808,13,804,2,A,...,4,800,4,32.382146,-80.697951,P,25940,2,0,45013


In [40]:
code_8748.to_csv("../data/code_8748.csv") # save as CSV file

##### SIC Code 5431: Farm Markets

In [6]:
code_5431 = dask_sic_matches_df(df, '5431')

In [7]:
code_5431.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
70,CIRCLE PRODUCE CO,2420 M L KING ST # A,CALEXICO,CA,92231,3214,25,760,2,C,...,7,11900,2,32.692673,-115.491417,P,20940,2,0,6025
239,SMOOTHIE KING,894 N COLONY RD,WALLINGFORD,CT,6492,2411,9,203,2,B,...,7,175400,1,41.482231,-72.809883,P,35300,2,408,9009
1697,SOUTHLAND COOLING,115 W ROSS RD,EL CENTRO,CA,92243,9751,25,760,2,D,...,7,11300,2,32.780925,-115.545362,P,20940,2,0,6025
2368,MEIJER BAKERY,17000 MERCANTILE BLVD,NOBLESVILLE,IN,46060,3941,57,309,2,,...,7,110506,2,40.03519,-85.992059,P,26900,2,294,18057
3797,BYRD'S HOOT OWL PECANS,8869 SW STATE ROUTE V,BUTLER,MO,64730,4523,13,660,2,A,...,5,70100,2,38.233732,-94.278296,2,28140,2,312,29013


In [8]:
code_5431.to_csv("../data/code_5431.csv") # save as CSV file

##### SIC Code 0723: Farm Produce

In [9]:
code_0723 = dask_sic_matches_df(df, '0723')

In [10]:
code_0723.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
6947,RUBIN SEEDS LLC,4746 US HIGHWAY 111,BRAWLEY,CA,92227,9618,25,760,2,B,...,7,10400,5,32.997196,-115.524737,P,20940,2,0,6025
12109,FIFIELD LAND CO,4307 FIFIELD RD,BRAWLEY,CA,92227,9520,25,760,2,D,...,7,10300,1,32.967908,-115.457591,0,20940,2,0,6025
21175,HEFTY SEED CO,2210 STATE ST,CENTERVILLE,SD,57014,2313,125,605,2,B,...,3,965200,2,43.11372,-96.971866,P,43620,2,0,46125
32491,GREEN THUMB PRODUCE,2648 W RAMSEY ST,BANNING,CA,92220,3716,65,951,2,F,...,7,44101,1,33.925105,-116.904558,P,40140,2,348,6065
36273,GAVILON,2700 GRAND AVE,KEARNEY,NE,68847,4134,19,308,2,B,...,6,969300,2,40.702693,-99.061435,P,28260,1,0,31019


In [11]:
code_0723.to_csv("../data/code_0723.csv") # save as CSV file

##### SIC Code 245: Poultry Farms

In [12]:
code_254 = dask_sic_matches_df(df, '254')

In [13]:
code_254.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
3129,ACTION BEHAVIOR CTR-ABA THRPY,320 E 1ST AVE # 101,BROOMFIELD,CO,80020,3786.0,14,720,2,D,...,7,30300,2,39.915481,-105.065944,P,19740,2.0,216,8014
5326,HASSELBACH MEATS,4637 OAK HARBOR RD,FREMONT,OH,43420,9373.0,143,419,2,C,...,7,960900,2,41.441192,-83.134802,P,23380,1.0,534,39143
6850,MURRAY CABINETRY LUXURY & BATH,407 N BLOOMINGTON ST,STREATOR,IL,61364,2201.0,99,815,2,B,...,7,963900,4,41.125039,-88.835096,P,36837,1.0,176,17099
11380,COOTERS FARM,1784 TUCK WILKES RD,NOXAPATER,MS,39346,9311.0,159,662,2,I,...,3,950200,4,33.011665,-89.126923,P,0,,0,28159
11845,MCM POULTRY,711 W FOURTH ST,BEAUMONT,CA,92223,,65,951,2,C,...,7,43807,2,33.9479,-116.9771,X,40140,2.0,348,6065


In [14]:
code_254.to_csv("../data/code_254.csv") # save as CSV file

##### SIC Code 0251: Broiler, Fryer, and Roaster Chickens

In [8]:
code_0251 = dask_sic_matches_df(df, '0251')

In [9]:
code_0251.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
53368,SANDERSON FARMS,2535 SANDERSON DR,LAUREL,MS,39440,4741,67,601,2,H,...,7,950600,3,31.667854,-89.163265,0,29860,1.0,279,28067
81980,CONERLY POULTRY,12 MCCRAY RD,JAYESS,MS,39641,8053,147,601,2,A,...,4,950100,2,31.304582,-90.159784,P,0,,0,28147
110002,TRIPLE P RANCH & EQUIPMENT LLC,22211 FIVE BRIDGES AVE,COLE CAMP,MO,65325,2303,15,660,2,B,...,4,460100,1,38.360331,-93.211563,P,0,,0,29015
51821,THREE WAY LIVE POULTRY,3071 WEBSTER AVE,BRONX,NY,10467,4900,5,718,2,A,...,8,42500,1,40.870614,-73.878419,P,35620,2.0,408,36005
42882,GOLDEN-ROD BROILER INC,2352 COUNTY ROAD 719,CULLMAN,AL,35055,9655,43,256,2,H,...,6,964900,6,34.154194,-86.768567,0,18980,1.0,142,1043


In [10]:
code_0251.to_csv("../data/code_0251.csv") # save as CSV file

## Panda Dataframe Method

In [None]:
# read in entire Infrogroup 2022 file
df = pd.read_csv(INFOGROUP_2022[0], dtype=str)

In [4]:
# covert all the elements of these columns to strings
df['SIC CODE'] = df['SIC CODE'].astype(str)
df['SIC CODE 1'] = df['SIC CODE 1'].astype(str)
df['SIC CODE 2'] = df['SIC CODE 2'].astype(str)
df['SIC CODE 3'] = df['SIC CODE 3'].astype(str)
df['SIC CODE 4'] = df['SIC CODE 4'].astype(str)
df['PRIMARY SIC CODE'] = df['PRIMARY SIC CODE'].astype(str)
            

In [5]:
# functin to build a dataframe of rows from Infogroup that have SIC Codes that match
# the desired input SIC Code

def sic_matches_df(mst_df, sic_code):
    df_sic = pd.DataFrame(columns=df.columns)
    sic_code = str(sic_code)
    
    for i in range(len(df)):
        if (  (df.iloc[i]['SIC CODE'].__contains__(sic_code) ) |
              (df.iloc[i]['SIC CODE 1'].__contains__(sic_code)) |
              (df.iloc[i]['SIC CODE 2'].__contains__(sic_code)) |
              (df.iloc[i]['SIC CODE 3'].__contains__(sic_code)) |
              (df.iloc[i]['SIC CODE 4'].__contains__(sic_code)) |
              (df.iloc[i]['PRIMARY SIC CODE'].__contains__(sic_code))
            ):
            df_sic.loc[len(df_sic.index)] = mst_df.iloc[i]
            
    return df_sic

In [7]:
# create new empty dataframe (to append to) containing only SIC Codes that we 0291
# df_sic_0291 = pd.DataFrame(columns=df.columns)
df_sic_0291 = sic_matches_df(df, '0291')


In [8]:
df_sic_0291.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
0,CHURCH RANCH,20009 HIGHWAY 72,ARVADA,CO,80007,8236,59,303,2,A,...,5,60500,3,39.869196,-105.230716,P,19740,2,216,8059
1,SAN PABLITO RANCH,7660 WOODWAY DR,HOUSTON,TX,77063,1533,201,713,2,A,...,7,430300,1,29.752743,-95.503489,P,26420,2,288,48201
2,REMINGTON CREEK RANCH SALES,1015 STABLE SIDE CT,HOUSTON,TX,77073,6409,201,281,2,A,...,7,240702,2,29.966813,-95.393341,P,26420,2,288,48201
3,MOUNTAIN SHADOWS RANCH,3135 SHADY HOLLOW LN,JAMUL,CA,91935,2236,73,619,2,A,...,5,21302,2,32.73798,-116.818903,P,41740,2,0,6073
4,SMOKE TREE RANCH,1850 SMOKE TREE LN,PALM SPRINGS,CA,92264,9270,65,760,2,E,...,6,44807,1,33.797426,-116.521896,P,40140,2,348,6065


In [7]:
FILENAME = "../data/code_0291.csv"
df_sic_0291.to_csv(FILENAME)

In [8]:
# create new empty dataframe (to append to) containing only SIC Codes: 0241
df_sic_0241 = sic_matches_df(df, '0241')


In [9]:
df_sic_0241.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
0,MOSSING'S DAIRY LLC,47447 233A ST,EGAN,SD,57024,6439,101,605,2,B,...,2,959600,2,43.99813,-96.72161,P,0,,0,46101
1,LONG DAIRY FARM INC,27164 477TH AVE,HARRISBURG,SD,57032,8210,83,605,2,A,...,5,10107,3,43.450902,-96.667723,P,43620,2.0,0,46083
2,CYPHER DAIRY CONSULTING LLC,26443 461ST AVE,HARTFORD,SD,57033,6703,99,605,2,A,...,4,10300,2,43.5518,-96.992924,P,43620,2.0,0,46099
3,UNITED DAIRY INC,6 GARVIN DAIRY RD,FAIRMONT,WV,26554,5058,49,304,2,D,...,7,21000,2,39.481453,-80.082759,P,21900,1.0,390,54049
4,DAIRYNET INC,2301 RESEARCH PARK WAY # 155,BROOKINGS,SD,57006,1724,11,605,2,B,...,7,958900,5,44.320347,-96.764656,P,15100,1.0,0,46011


In [10]:
FILENAME = "../data/code_0241.csv"
df_sic_0241.to_csv(FILENAME)

In [6]:
df_sic_5144 = sic_matches_df(df, '5144')

In [7]:
df_sic_5144.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
0,SONSTEGARD FOODS CO,5005 S BUR OAK PL,SIOUX FALLS,SD,57108,2228,99,605,2,B,...,6,10102,4,43.499985,-96.762425,0,43620,2,0,46099
1,STEMCOR,1 E BROWARD BLVD # 1599,FORT LAUDERDALE,FL,33301,2040,11,954,2,A,...,6,42500,3,26.122817,-80.143243,P,33100,2,370,12011
2,ROCK RIDGE FARM,752 STATE RD,RICHMOND,MA,1254,5246,3,413,2,A,...,2,935100,3,42.409203,-73.355258,P,38340,2,0,25003
3,CHICK-A-RAY POULTRY,24877 THOMPSON RD,ALBEMARLE,NC,28001,7467,167,704,2,A,...,7,931202,1,35.319232,-80.25505,P,10620,1,172,37167
4,MAR-JAC POULTRY INC,1301 JAMES ST,HATTIESBURG,MS,39401,4570,35,601,2,H,...,7,10500,1,31.307197,-89.278029,P,25620,2,279,28035


In [8]:
FILENAME = "../data/code_5144.csv"
df_sic_5144.to_csv(FILENAME)

In [9]:
df_sic_5154 = sic_matches_df(df, '5154')

In [10]:
df_sic_5154.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
0,MADISON LIVESTOCK SALES CO,1209 E CENTER ST,MADISON,SD,57042,,79,605,2,A,...,5,960200,4,44.0005,-97.1185,X,0,,0,46079
1,UNITED PRODUCERS INC,3491 E STATE ROAD 44,RUSHVILLE,IN,46173,7846.0,139,765,2,B,...,6,974400,3,39.615453,-85.379248,P,0,,0,18139
2,LAFLEUR BROTHERS LIVESTOCK,33022 482ND AVE,JEFFERSON,SD,57038,6800.0,127,605,2,B,...,3,20300,1,42.60222,-96.565863,P,43580,2.0,0,46127
3,TYSON FOODS INC,47283 SD HIGHWAY 34,COLMAN,SD,57017,6543.0,101,605,2,A,...,3,959600,2,43.978463,-96.7531,P,0,,0,46101
4,MENNO LIVESTOCK AUCTION,602 S PEARL ST,MENNO,SD,57045,2064.0,67,605,2,D,...,2,968600,3,43.234794,-97.574566,P,0,,0,46067


In [11]:
FILENAME = "../data/code_5154.csv"
df_sic_5154.to_csv(FILENAME)

In [12]:
df_sic_0191 = sic_matches_df(df, '0191')

In [13]:
df_sic_0191.head()

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,LOCATION EMPLOYEE SIZE CODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
0,OLD RIVER FARM,50 MILLER RD,MIDDLEFIELD,CT,6455,1229,7,860,2,A,...,4,580100,4,41.500023,-72.69973,P,25540,2,278,9007
1,FELIX CHAC CHUO FARMS INC,89785 80TH AVE,THERMAL,CA,92274,8906,65,760,2,B,...,6,45605,5,33.468129,-116.098759,P,40140,2,348,6065
2,C & N PETERSON FARMS INC,30310 455TH AVE,WAKONDA,SD,57073,6300,27,605,2,A,...,2,965800,3,42.992997,-97.096458,P,46820,1,0,46027
3,E-I-E-I-O FARM LLC,1003 NEWFIELD ST,MIDDLETOWN,CT,6457,1817,7,860,2,A,...,7,541200,3,41.586472,-72.67469,P,25540,2,278,9007
4,DRAGO INDIANA,6147 N COUNTY ROAD 500 W,MULBERRY,IN,46058,9445,23,765,2,D,...,3,950300,3,40.377054,-86.596185,P,23140,1,320,18023


In [14]:
FILENAME = "../data/code_0191.csv"
df_sic_0191.to_csv(FILENAME)

In [15]:
# left off here
# df_sic_8611 = sic_matches_df(df, '8611')

: 

: 

In [32]:
USEFUL_COLS = [
    "COMPANY",
    "ADDRESS LINE 1",
    "CITY",
    "STATE",
    ]
SIC_COLS = [
    "PRIMARY SIC CODE", 
    'SIC CODE 1', 
    'SIC CODE 2', 
    'SIC CODE 3',
    'SIC CODE 4'
    ]
NAICS_COLS = [
    "NAICS CODE",
    "PRIMARY NAICS CODE"
    ]
DESCRIPTION_COLS = [
    # "NAICS8 DESCRIPTIONS", # only in 2021
    # "SIC6_DESCRIPTIONS (PRIMARYSIC)" # only in 2021
    "SIC6_DESCRIPTIONS (SIC)"
]

In [33]:
poultry_2022 = pd.read_csv("../data/poultry_plants_2022.csv")
poultry_2022.iloc[5]['SIC CODE']

poultry_2022[USEFUL_COLS + SIC_COLS + DESCRIPTION_COLS]

# poultry_2022.columns


Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,PRIMARY SIC CODE,SIC CODE 1,SIC CODE 2,SIC CODE 3,SIC CODE 4,SIC6_DESCRIPTIONS (SIC)
0,COOTERS FARM,1784 TUCK WILKES RD,NOXAPATER,MS,201501,,,,,POULTRY FARMS
1,TYSON FOODS INC,47283 SD HIGHWAY 34,COLMAN,SD,201104,25401.0,201501.0,204803.0,207702.0,LIVESTOCK-DEALERS (WHLS)
2,SIMMONS FOODS INC,2101 TWIN CIRCLE DR,VAN BUREN,AR,201501,,,,,
3,TYSON FOODS INC,605 235-3301 #DD813,NORTH SIOUX CITY,SD,25401,201501.0,204803.0,207702.0,209903.0,MEAT PRODUCTS (MFRS)
4,MAR-JAC POULTRY INC,1301 JAMES ST,HATTIESBURG,MS,201501,,,,,POULTRY-WHOLESALE
...,...,...,...,...,...,...,...,...,...,...
715,PILGRIMS PRIDE,2237 WHITLEY RD E,WILSON,NC,201501,,,,,TRUCKING-CONTRACT HAULING
716,2-C'S ENTERPRISES,961 WHITES BOTTOM RD,BURKESVILLE,KY,201501,,,,,
717,DAYBREAK FOODS INC,533 E TYRANENA PARK RD,LAKE MILLS,WI,541105,201501.0,,,,FEDERAL GOVERNMENT CONTRACTORS
718,BELLARD'S POULTRY,405 S BULLARD ST,OPELOUSAS,LA,201501,549911.0,549907.0,571912.0,,POULTRY FARMS
