## Match plants between Infogroup and FSIS by location
- Candidate pairs are plants located within 5 km of each other; every match was verified by hand against the establishment name and parent corporation
    - Made 20 matches
- Once matched, add sales volume data from Infogroup to FSIS entry
- For all plants that still did not have a match, filled in the median sales volume of the relevant parent corporation (a fixed fallback value is used where the parent corporation has no usable median)
    - Reduced 54 remaining unmatched plants to 6 (out of 184)

### Import and load poultry plant data

In [19]:
import pandas as pd
import numpy as np
import time

In [20]:
# Load the three input tables; paths are relative to the notebook's directory.
pp_2022 = pd.read_csv(filepath_or_buffer="../data/poultry_plants_2022.csv")
pp_sales = pd.read_csv(filepath_or_buffer="../data/poultry_plants_with_sales.csv")
fsis = pd.read_csv(filepath_or_buffer="../data/fsis-processors-with-location.csv")

In [21]:
# Plants that have no sales volume attached yet.
missing_sales = pp_sales["Sales Volume (Location)"].isna()
no_match = pp_sales[missing_sales]
len(no_match)

74

### Longitude/Latitude Distance Calculator

In [4]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points on a sphere.

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Radius of the sphere; it determines the unit of the result.
        The default 6371 is the Earth's radius in kilometers (use 3956
        for miles).

    Returns
    -------
    float
        Distance along the sphere's surface, in the unit of `radius`.
        NaN coordinates yield NaN.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for (near-)
    # antipodal points, which would make asin() raise a domain error.
    # Only the upper side is clamped so that NaN still propagates.
    if a > 1.0:
        a = 1.0

    return 2 * asin(sqrt(a)) * radius

### Matching plants by location

In [22]:
# Plants that still lack a sales volume. An explicit copy is taken because
# loc_match() assigns into this frame; writing into a bare boolean-indexed
# selection is what triggers pandas' SettingWithCopyWarning.
no_match = pp_sales[pp_sales["Sales Volume (Location)"].isna()].copy()
len(no_match)  # unmatched plants before location matching

74

In [16]:
def loc_match(no_match, pp_2022, threshold):
    """
    Interactively match plants lacking sales data to nearby candidate records.

    For every row of `no_match` whose "Sales Volume (Location)" is missing,
    the rows of `pp_2022` are scanned in order. Whenever a candidate lies
    within `threshold` of the plant (distance from `haversine`, kilometers
    with its default radius), both records are printed and the user is
    prompted. Answering "yes" copies the candidate's
    "SALES VOLUME (9) - LOCATION" into the plant's row and moves on to the
    next plant; any other answer continues with the next candidate.

    Parameters
    ----------
    no_match : pandas.DataFrame
        Plants to match. Reads the columns "latitude", "longitude",
        "Parent Corporation", "Establishment Name" and
        "Sales Volume (Location)"; the last one is updated in place.
    pp_2022 : pandas.DataFrame
        Candidate records. Reads "LATITUDE", "LONGITUDE", "PARENT NUMBER",
        "COMPANY" and "SALES VOLUME (9) - LOCATION".
    threshold : float
        Maximum distance for a candidate to be offered for confirmation.

    Returns
    -------
    None

    Notes
    -----
    Besides `no_match`, the confirmed value is also written into the
    module-level `pp_sales` frame under the same index label, so `no_match`
    must share its index labels with `pp_sales`.
    """
    sales_col = "Sales Volume (Location)"
    pending = no_match[no_match[sales_col].isna()]

    for index, row in pending.iterrows():
        target_point = (row["latitude"], row["longitude"])

        for _, infogroup in pp_2022.iterrows():
            candidate_point = infogroup["LATITUDE"], infogroup["LONGITUDE"]
            distance = haversine(target_point[1], target_point[0],
                                 candidate_point[1], candidate_point[0])
            # A NaN distance (missing coordinates) compares False and is skipped.
            if not distance <= threshold:
                continue

            # str() keeps the report working when a name field is NaN or
            # otherwise not a string; plain concatenation would raise TypeError.
            parent = str(row["Parent Corporation"])
            establishment = str(row["Establishment Name"])
            company = str(infogroup["COMPANY"])

            print("current point: " + str(target_point) + "; match from pp22: " + str(candidate_point))
            print("current company: " + parent + ", " + establishment +
                  "; matched: parent ABI (" + str(infogroup["PARENT NUMBER"]) + ") " + company)
            # NOTE(review): presumably this pause lets the notebook render the
            # printed lines before the input box appears -- confirm.
            time.sleep(2)
            answer = input("confirm location")
            # Tolerate stray whitespace and capitalisation in the confirmation.
            if answer.strip().lower() == "yes":
                sales_volume = infogroup["SALES VOLUME (9) - LOCATION"]
                pp_sales.loc[index, sales_col] = sales_volume
                no_match.loc[index, sales_col] = sales_volume
                break

In [23]:
# Offer every candidate within 5 km for manual confirmation.
loc_match(no_match, pp_2022, threshold=5)

current point: (34.6076012, -87.04259549999999); match from pp22: (34.611448, -87.048549)
current company: Cargill, Wayne Farms LLC; matched: parent ABI (433353331.0) WAYNE FARMS FURTHER PROCNG
current point: (35.7652697, -91.641319); match from pp22: (35.764666, -91.648842)
current company: George's, Ozark Mountain Poultry, Inc.; matched: parent ABI (531052413.0) PECO FOODS INC
current point: (35.473009, -93.457503); match from pp22: (35.4904, -93.4823)
current company: Tyson, Tyson Foods, Inc.; matched: parent ABI (7537913.0) TYSON FOODS INC
current point: (36.1905228, -94.1254707); match from pp22: (36.154021, -94.154599)
current company: Tyson, Tyson Foods, Inc.; matched: parent ABI (7537913.0) BRUSS CO
current point: (36.7164767, -119.8229319); match from pp22: (36.693249, -119.783614)
current company: Foster Farms, Zorro Leasing LLC; matched: parent ABI (9564816.0) FOSTER FARMS
current point: (37.6559523, -87.5184314); match from pp22: (37.673862, -87.546565)
current company: Tys

In [24]:
# Plants still lacking a sales volume after the manual location pass.
int(no_match["Sales Volume (Location)"].isna().sum())

54

In [26]:
# Persist both views of the location-matching result.
# NOTE(review): the index is written as an extra unnamed column; confirm that
# downstream readers expect it.
no_match.to_csv(path_or_buf="../data/location_match_only.csv")
pp_sales.to_csv(path_or_buf="../data/location_match_fuller.csv")

In [86]:
(184-54)/184 # fraction (not a count) of the 184 plants that have a sales volume, with 54 still missing

0.7065217391304348

In [27]:
# Display the candidate table used for location matching (pandas truncates the rendering).
pp_2022

Unnamed: 0.1,Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,ZIP4,COUNTY CODE,AREA CODE,IDCODE,...,POPULATION CODE,CENSUS TRACT,CENSUS BLOCK,LATITUDE,LONGITUDE,MATCH CODE,CBSA CODE,CBSA LEVEL,CSA CODE,FIPS CODE
0,11380,COOTERS FARM,1784 TUCK WILKES RD,NOXAPATER,MS,39346.0,9311.0,159.0,662,2,...,3,950200.0,4.0,33.011665,-89.126923,P,0.0,,0.0,28159.0
1,21772,TYSON FOODS INC,47283 SD HIGHWAY 34,COLMAN,SD,57017.0,6543.0,101.0,605,2,...,3,959600.0,2.0,43.978463,-96.753100,P,0.0,,0.0,46101.0
2,21912,SIMMONS FOODS INC,2101 TWIN CIRCLE DR,VAN BUREN,AR,72956.0,6027.0,33.0,479,2,...,7,20501.0,2.0,35.428061,-94.336725,P,22900.0,2.0,0.0,5033.0
3,39534,TYSON FOODS INC,605 235-3301 #DD813,NORTH SIOUX CITY,SD,57049.0,,127.0,605,2,...,5,20300.0,2.0,42.524600,-96.497100,X,43580.0,2.0,0.0,46127.0
4,74599,MAR-JAC POULTRY INC,1301 JAMES ST,HATTIESBURG,MS,39401.0,4570.0,35.0,601,2,...,7,10500.0,1.0,31.307197,-89.278029,P,25620.0,2.0,279.0,28035.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,16677215,PILGRIMS PRIDE,2237 WHITLEY RD E,WILSON,NC,27893.0,7951.0,195.0,252,2,...,7,700.0,1.0,35.732556,-77.880145,P,48980.0,1.0,468.0,37195.0
716,16774099,2-C'S ENTERPRISES,961 WHITES BOTTOM RD,BURKESVILLE,KY,42717.0,,57.0,270,2,...,5,950100.0,3.0,36.780400,-85.381500,X,0.0,,0.0,21057.0
717,16797787,DAYBREAK FOODS INC,533 E TYRANENA PARK RD,LAKE MILLS,WI,53551.0,9683.0,55.0,920,2,...,5,100500.0,2.0,43.087388,-88.893628,P,48020.0,1.0,376.0,55055.0
718,16806974,BELLARD'S POULTRY,405 S BULLARD ST,OPELOUSAS,LA,70570.0,2575.0,97.0,337,2,...,7,961600.0,1.0,30.532858,-92.098679,P,36660.0,1.0,318.0,22097.0


In [28]:
# Independent snapshot of pp_sales, taken before the median fill.
pp_sales_updated = pp_sales.copy(deep=True)

### Filling in median sales volume for remaining NaNs

In [29]:
# Median sales volume per parent corporation; a parent whose plants all lack
# sales data comes out as NaN.
sales_by_parent = pp_sales.groupby(["Parent Corporation"])["Sales Volume (Location)"]
median = sales_by_parent.median().reset_index()
median

Unnamed: 0,Parent Corporation,Sales Volume (Location)
0,Amick,
1,Aterian Investment Partners,
2,Bachoco,16680.0
3,Cargill,243269.0
4,Case Farms,0.0
5,Costco,
6,Farmer Focus,19328.0
7,Fieldale Farms Corporation,270690.5
8,Foster Farms,972698.0
9,George's,14743.5


In [35]:
# Fallback used when a parent corporation has no usable median of its own.
# NOTE(review): 114909.0 looks like the overall median, i.e.
# pp_sales["Sales Volume (Location)"].median() -- confirm, and consider
# computing it here instead of hard-coding it.
FALLBACK_SALES_VOLUME = 114909.0

# Parents whose plants all lack sales data have a NaN median.
median["Sales Volume (Location)"] = median["Sales Volume (Location)"].fillna(FALLBACK_SALES_VOLUME)

# Case Farms has a median of 0.0, which is treated as missing. The row is
# selected by name rather than by the positional label 4, so the override
# keeps hitting the right row if the set of parent corporations changes.
median.loc[median["Parent Corporation"] == "Case Farms", "Sales Volume (Location)"] = FALLBACK_SALES_VOLUME

In [37]:
# Lookup table: parent corporation -> sales volume used to fill gaps.
dict1 = {
    parent: volume
    for parent, volume in zip(median["Parent Corporation"], median["Sales Volume (Location)"])
}

In [38]:
# Separate frame for the median fill, so pp_sales_updated stays untouched.
pp_sales_updated2 = pp_sales_updated.copy(deep=True)

In [39]:
# Fill every missing sales volume with the value looked up for the plant's
# parent corporation. This is vectorized rather than a row-by-row loop; a
# parent that is absent from dict1 leaves the value as NaN (surfaced by the
# check in the next cell) instead of raising KeyError midway through.
fill_values = pp_sales_updated2["Parent Corporation"].map(dict1)
pp_sales_updated2["Sales Volume (Location)"] = (
    pp_sales_updated2["Sales Volume (Location)"].fillna(fill_values)
)

pp_sales_updated2

Unnamed: 0.1,Unnamed: 0,EstNumber,EstID,Parent Corporation,Establishment Name,State,Size,Animals Processed,Processed\nVolume\nCategory,Slaughter\nVolume\nCategory,Full Address,latitude,longitude,Sales Volume (Location)
0,71,P1317 + V1317,4495,Cargill,Wayne Farms LLC,Alabama,Large,Chicken,5.0,5.0,"700 McDonald Avenue, Albertville, AL 35950",34.260726,-86.203222,438268.0
1,72,P7485 + V7485,4518,Cargill,Wayne Farms LLC,Alabama,Large,Chicken,5.0,5.0,"1020 County Road 114, Jack, AL 36346",31.500628,-85.903438,576660.0
2,73,P912,4493,Cargill,"Wayne Farms, LLC",Alabama,Large,Chicken,5.0,5.0,"444 Baskin Street South, Union Springs, AL 36089",32.139166,-85.721704,271713.0
3,74,P7342 + V7342,4516,Cargill,Wayne Farms LLC,Alabama,Large,Chicken,5.0,5.0,"808 Ross Clark Circle NE, Dothan, AL 36303",31.225754,-85.362068,243269.0
4,75,P1235,4509,Cargill,Wayne Farms LLC,Alabama,Large,Chicken,5.0,5.0,"254 Ipsco Road, Decatur, AL 35601",34.607601,-87.042595,8474.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,610,P45912,6162782,Independent,Midwest Poultry Processing LLC,Missouri,Very Small,Chicken,2.0,4.0,"47422 State Hwy V, Rutledge, MO 63563",40.272931,-92.025286,1631.0
180,611,M47472 + P47472,6165638,Independent,"Pure Pasture Packing, LLC",Missouri,Very Small,Chicken,2.0,3.0,"29612 Pony Path Road, Sedalia, MO 65301",38.687938,-93.140737,46949.0
181,612,M46730 + P46730,6164667,Independent,"NGF Processing, LLC",Mississippi,Very Small,Chicken,1.0,2.0,"367-A Mckenzie Rd, Petal, MS 39465",31.237684,-89.105619,46949.0
182,617,M744 + P744,846,Independent,Vineland Poultry LLC,New Jersey,Very Small,Chicken,4.0,4.0,"1100 South Mill Road, Vineland, NJ 8360",39.471963,-75.063579,19330.0


In [40]:
# Any plants still without a sales volume after the fill (expected: none).
pp_sales_updated2[pp_sales_updated2["Sales Volume (Location)"].isna()]

Unnamed: 0.1,Unnamed: 0,EstNumber,EstID,Parent Corporation,Establishment Name,State,Size,Animals Processed,Processed\nVolume\nCategory,Slaughter\nVolume\nCategory,Full Address,latitude,longitude,Sales Volume (Location)


In [41]:
# Persist the fully filled table.
pp_sales_updated2.to_csv(path_or_buf="../data/location_match_fullest.csv")