In [1]:
import pandas as pd
import geopandas as gpd
from shapely import Point
from math import radians, cos, sin, asin, sqrt

In [57]:
# Largest Infogroup-to-Counterglow distance, in kilometers, that still counts as a match;
# passed to infogroup_counterglow_dict, which forwards it to potential_farms.
RADIUS_KM = 1.7

In [2]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance, in kilometers, between two points on the earth.

    All four arguments are decimal degrees. Note that each point is given
    as (longitude, latitude), in that order.
    """
    earth_radius_km = 6371  # Use 3956 for miles. Determines return value units.

    # everything below works in radians
    lon1, lat1, lon2, lat2 = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    # haversine formula
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = sin(half_dlat)**2 + cos(lat1) * cos(lat2) * sin(half_dlon)**2
    central_angle = 2 * asin(sqrt(a))
    return central_angle * earth_radius_km

In [25]:
"""
function will take 4 inputs. infogroup_df & counterglow_df are the dataframes being analyzed.
the third input is the specific state that we are choosing to analyze
the fourth input is the threshold distance. this distance is the max allowable distance between 2
datapoints. the output is a dataframe that in each row, we have a datapoint that represents a
datapoint from infogroup and (x distance away) a farm spotted in the counterglow data. we are
speculating that it is possible for a spotted farm in counterglow to not have the most precise address
and actually exist in the Infogroup data, but with slightly different longitude and latitude
"""

def potential_farms(infogroup_df, counterglow_df, two_letter_state, max_dist_km ):
    """
    Pair up Infogroup and Counterglow records of one state that lie close together.

    Every Infogroup row of the state is compared with every Counterglow row of
    the state; each pair whose haversine distance is <= max_dist_km is a match.

    Parameters
    ----------
    infogroup_df : pandas.DataFrame
        Needs the columns "STATE", "LONGITUDE" and "LATITUDE".
    counterglow_df : pandas.DataFrame
        Needs the columns "State", "Lat" (read as latitude) and "Lat.1"
        (read as longitude).
    two_letter_state : str
        Value compared against "STATE" / "State" to select the rows to analyze.
    max_dist_km : float
        Largest distance, in kilometers, still counted as a match.

    Returns
    -------
    (infogroup_trim, counterglow_trim) : tuple of pandas.DataFrame
        Row k of the first frame and row k of the second frame form the k-th
        match, so a record that matches several partners appears once per
        match. Both frames are indexed 0..k-1. The coordinate columns are
        renamed to 'LATITUDE 1' / 'LONGITUDE 1' (Infogroup) and
        'LATITUDE 2' / 'LONGITUDE 2' (Counterglow). Both frames are empty when
        nothing matches.

    Side effects
    ------------
    Prints "<state>: Total number of matches: <n>" when at least one match exists.
    """
    # reduce each dataframe to the rows of the specified two_letter_state
    infogroup_state_df      = infogroup_df[infogroup_df["STATE"] == two_letter_state]
    counterglow_state_df    = counterglow_df[counterglow_df["State"] == two_letter_state]

    # pull the coordinates out once, as (longitude, latitude) pairs, instead of
    # re-reading a whole row from the dataframe on every pass of the loops
    infogroup_coords    = list(zip(infogroup_state_df["LONGITUDE"].tolist(),
                                   infogroup_state_df["LATITUDE"].tolist()))
    counterglow_coords  = list(zip(counterglow_state_df["Lat.1"].tolist(),
                                   counterglow_state_df["Lat"].tolist()))

    # positions (within the per-state frames) of both sides of every match
    infogroup_hits      = []
    counterglow_hits    = []
    for i, (infogroup_longitude, infogroup_latitude) in enumerate(infogroup_coords):
        for j, (counterglow_longitude, counterglow_latitude) in enumerate(counterglow_coords):
            dist_km = haversine(infogroup_longitude, infogroup_latitude,
                                counterglow_longitude, counterglow_latitude)
            if dist_km <= max_dist_km:
                infogroup_hits.append(i)
                counterglow_hits.append(j)

    # build each result in one step; selecting existing rows (rather than
    # appending to an empty frame) also keeps the original column dtypes
    infogroup_trim      = infogroup_state_df.iloc[infogroup_hits].reset_index(drop=True)
    counterglow_trim    = counterglow_state_df.iloc[counterglow_hits].reset_index(drop=True)

    # rename the coordinate columns so the two sources stay distinguishable
    infogroup_trim      = infogroup_trim.rename(columns={'LATITUDE': 'LATITUDE 1',
                                                        'LONGITUDE': 'LONGITUDE 1'})
    counterglow_trim    = counterglow_trim.rename(columns={'Lat' : 'LATITUDE 2',
                                                           'Lat.1': 'LONGITUDE 2'})

    counter = len(infogroup_hits)
    if counter != 0:
        print(two_letter_state + ":", "Total number of matches:", counter)

    return infogroup_trim, counterglow_trim
    

In [59]:
"""
function takes two dataframes, a list of states and a radius in km. This function calls the function 'potential_farms' for each state.
if the two dataframes returned from 'potential_farms' are NOT empty, they are added to the dictionaries
the keys for the dictionaries are the STATE in the current iteration (in the for loop) and the values
are the dataframes associated with the state. This function will return two dictionaries
"""

def infogroup_counterglow_dict(sic_df, counterglow_df, list_of_states, radium_km):
    """
    Run potential_farms once per state and collect the non-empty results.

    sic_df and counterglow_df are handed to potential_farms unchanged, together
    with each entry of list_of_states and radium_km (the maximum match distance).

    Returns two dictionaries keyed by state: the first maps to the matched
    Infogroup frame, the second to the matched Counterglow frame. A state is
    only stored when both of its frames contain at least one row.
    """
    igroup_by_state, ctrglow_by_state = {}, {}

    for state_code in list_of_states:
        igroup_hits, ctrglow_hits = potential_farms(sic_df, counterglow_df, state_code, radium_km)

        # states without matches get no key in either dictionary
        if len(igroup_hits) == 0 or len(ctrglow_hits) == 0:
            continue

        igroup_by_state[state_code] = igroup_hits
        ctrglow_by_state[state_code] = ctrglow_hits

    return igroup_by_state, ctrglow_by_state


In [62]:
"""
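function will take a dictionary of dataframes and trim off all columns of each dataframe, only leaving:
Company, address line 1, city, state, zipcode, primary sic code, primary naics code, sic code,
sic code 1, sic code 2, sic code 3, sic code 4, their description columns, and parent number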
"""

def keep_certain_columns(dict):
    """
    Trim every dataframe stored in `dict` down to a fixed set of columns.

    The values of `dict` are replaced in place, key by key, with a frame that
    holds only the columns listed below, in that order. The same dictionary
    object is returned.
    """
    wanted_columns = [
        'COMPANY', 'ADDRESS LINE 1', 'CITY', 'STATE', 'ZIPCODE',
        'PRIMARY SIC CODE', 'SIC6_DESCRIPTIONS',
        'PRIMARY NAICS CODE', 'NAICS8 DESCRIPTIONS',
        'SIC CODE', 'SIC6_DESCRIPTIONS (SIC)',
        'SIC CODE 1', 'SIC6_DESCRIPTIONS (SIC1)',
        'SIC CODE 2', 'SIC6_DESCRIPTIONS(SIC2)',
        'SIC CODE 3', 'SIC6_DESCRIPTIONS(SIC3)',
        'SIC CODE 4', 'SIC6_DESCRIPTIONS(SIC4)',
        'PARENT NUMBER',
    ]

    # only values are reassigned, so iterating over the items while doing it is safe
    for state_key, frame in dict.items():
        dict[state_key] = frame[wanted_columns]

    return dict


In [4]:
# Load the complete Counterglow facility list; the path is relative to the
# directory the notebook is run from.
df_counterglow = pd.read_csv("../data/Counterglow+Facility+List+Complete.csv")

In [5]:
# keep only the poultry/chicken facilities of the counterglow dataset,
# i.e. rows whose 'Farm Type' is one of the two meat-chicken categories
df_counterglow_poultry = df_counterglow[
    df_counterglow['Farm Type'].isin(['Chickens & Other Birds (Meat)', 'Chickens (Meat)'])
]


In [6]:
# Two-letter codes of the states to analyze. Each state is listed once: the
# original list contained 'OK' twice, which made every per-state run scan
# Oklahoma a second time.
states = ['IA', 'OK', 'MO', 'AL', 'LA', 'MS', 'IL', 'IN', 'OH', 'KY', 'TN', 'AR', 'NC', 'SC', 'GA']

## SIC Code 0291 Exploration: General Farms, Primarily Livestock and Animal Specialties.

In [60]:
# To see how many matches exist within the dataframe consisting of SIC Codes containing 0291
df_sic_0291 = pd.read_csv("../data/code_0291.csv")

# Both results are dictionaries keyed by state; only states with at least one match get a key.
igroup_sic_0291_dict, ctrglow_0291_dict = infogroup_counterglow_dict(df_sic_0291, df_counterglow_poultry, states, RADIUS_KM)


MO: Total number of matches: 1
AL: Total number of matches: 2
NC: Total number of matches: 1


In [63]:
# Trim every matched Infogroup frame to the fixed column set chosen in keep_certain_columns.
igroup_sic_0291_dict = keep_certain_columns(igroup_sic_0291_dict)

In [64]:
# Matched Infogroup rows (SIC 0291) stored under the 'MO' state key.
igroup_sic_0291_dict['MO']

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,PRIMARY SIC CODE,SIC6_DESCRIPTIONS,PRIMARY NAICS CODE,NAICS8 DESCRIPTIONS,SIC CODE,SIC6_DESCRIPTIONS (SIC),SIC CODE 1,SIC6_DESCRIPTIONS (SIC1),SIC CODE 2,SIC6_DESCRIPTIONS(SIC2),SIC CODE 3,SIC6_DESCRIPTIONS(SIC3),SIC CODE 4,SIC6_DESCRIPTIONS(SIC4),PARENT NUMBER
0,K-MAC RANCH,15223 STATE HIGHWAY 37,CASSVILLE,MO,65625,29101,RANCHES,11299013.0,ALL OTHER ANIMAL PRODUCTION,,,,,,,,,,,


In [65]:
# Matched Infogroup rows (SIC 0291) stored under the 'AL' state key.
igroup_sic_0291_dict['AL']

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,PRIMARY SIC CODE,SIC6_DESCRIPTIONS,PRIMARY NAICS CODE,NAICS8 DESCRIPTIONS,SIC CODE,SIC6_DESCRIPTIONS (SIC),SIC CODE 1,SIC6_DESCRIPTIONS (SIC1),SIC CODE 2,SIC6_DESCRIPTIONS(SIC2),SIC CODE 3,SIC6_DESCRIPTIONS(SIC3),SIC CODE 4,SIC6_DESCRIPTIONS(SIC4),PARENT NUMBER
0,RAEANN RANCH LLC,2665 KELLY CREEK RD,ODENVILLE,AL,35120,29101,RANCHES,11299013.0,ALL OTHER ANIMAL PRODUCTION,,,,,,,,,,,
1,NARROW GATE RANCH,5652 BIRD FARM RD,JASPER,AL,35503,29101,RANCHES,11299013.0,ALL OTHER ANIMAL PRODUCTION,,,,,,,,,,,


In [66]:
# Matched Infogroup rows (SIC 0291) stored under the 'NC' state key.
igroup_sic_0291_dict['NC']

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,PRIMARY SIC CODE,SIC6_DESCRIPTIONS,PRIMARY NAICS CODE,NAICS8 DESCRIPTIONS,SIC CODE,SIC6_DESCRIPTIONS (SIC),SIC CODE 1,SIC6_DESCRIPTIONS (SIC1),SIC CODE 2,SIC6_DESCRIPTIONS(SIC2),SIC CODE 3,SIC6_DESCRIPTIONS(SIC3),SIC CODE 4,SIC6_DESCRIPTIONS(SIC4),PARENT NUMBER
0,LATTAS EGG RANCH INC,1016 GOVERNOR BURKE RD,HILLSBOROUGH,NC,27278,29101,RANCHES,11299013.0,ALL OTHER ANIMAL PRODUCTION,,,,,,,,,,,


## SIC Code 0241 Exploration: Dairy Farms

In [67]:
# To see how many matches exist within the dataframe consisting of SIC Codes containing 0241
df_sic_0241 = pd.read_csv("../data/code_0241.csv")

# Both results are dictionaries keyed by state; only states with at least one match get a key.
igroup_sic_0241_dict, ctrglow_0241_dict = infogroup_counterglow_dict(df_sic_0241, df_counterglow_poultry, states, RADIUS_KM)


IA: Total number of matches: 1
MS: Total number of matches: 1
IN: Total number of matches: 3
OH: Total number of matches: 2
NC: Total number of matches: 3


In [69]:
# Trim every matched Infogroup frame to the fixed column set chosen in keep_certain_columns.
igroup_sic_0241_dict = keep_certain_columns(igroup_sic_0241_dict)

In [71]:
# Matched Infogroup rows (SIC 0241) stored under the 'IA' state key.
igroup_sic_0241_dict['IA']

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,PRIMARY SIC CODE,SIC6_DESCRIPTIONS,PRIMARY NAICS CODE,NAICS8 DESCRIPTIONS,SIC CODE,SIC6_DESCRIPTIONS (SIC),SIC CODE 1,SIC6_DESCRIPTIONS (SIC1),SIC CODE 2,SIC6_DESCRIPTIONS(SIC2),SIC CODE 3,SIC6_DESCRIPTIONS(SIC3),SIC CODE 4,SIC6_DESCRIPTIONS(SIC4),PARENT NUMBER
0,WIELENGA DAIRY INC,3387 JAY AVE,BOYDEN,IA,51234,24101,DAIRIES (MILK),11212001.0,DAIRY CATTLE & MILK PRODUCTION,545101.0,DAIRY PRODUCTS-RETAIL,,,,,,,,,


In [72]:
# Matched Infogroup rows (SIC 0241) stored under the 'MS' state key.
igroup_sic_0241_dict['MS']

Unnamed: 0,COMPANY,ADDRESS LINE 1,CITY,STATE,ZIPCODE,PRIMARY SIC CODE,SIC6_DESCRIPTIONS,PRIMARY NAICS CODE,NAICS8 DESCRIPTIONS,SIC CODE,SIC6_DESCRIPTIONS (SIC),SIC CODE 1,SIC6_DESCRIPTIONS (SIC1),SIC CODE 2,SIC6_DESCRIPTIONS(SIC2),SIC CODE 3,SIC6_DESCRIPTIONS(SIC3),SIC CODE 4,SIC6_DESCRIPTIONS(SIC4),PARENT NUMBER
0,BENNETT DAIRY INC,1173 KOKOMO RD,KOKOMO,MS,39643,24101,DAIRIES (MILK),11212001.0,DAIRY CATTLE & MILK PRODUCTION,999966.0,FEDERAL GOVERNMENT CONTRACTORS,,,,,,,,,


In [31]:
# To see how many matches exist within the dataframe consisting of SIC Codes containing 0241: Dairy Farms
df_sic_0241 = pd.read_csv("../data/code_0241.csv")

# The match threshold comes from RADIUS_KM so this cell cannot drift away from
# the other runs in the notebook.
# NOTE: both result names are reassigned on every pass, so after the loop they
# only hold the frames of the last state in `states`; the per-state counts are
# the lines printed by potential_farms.
for state in states:
    infogroup_matches_0241, counterglow_matches_0241 = potential_farms(df_sic_0241, df_counterglow_poultry, state, RADIUS_KM)

IA: Total number of matches: 1
OK: Total number of matches: 0
MO: Total number of matches: 0
OK: Total number of matches: 0
AL: Total number of matches: 0
LA: Total number of matches: 0
MS: Total number of matches: 1
IL: Total number of matches: 0
IN: Total number of matches: 3
OH: Total number of matches: 2
KY: Total number of matches: 0
TN: Total number of matches: 0
AL: Total number of matches: 0
NC: Total number of matches: 3
SC: Total number of matches: 0
GA: Total number of matches: 0


In [32]:
# To see how many matches exist within the dataframe consisting of SIC Codes containing 5144
df_sic_5144 = pd.read_csv("../data/code_5144.csv")

# The match threshold comes from RADIUS_KM so this cell cannot drift away from
# the other runs in the notebook.
# NOTE: both result names are reassigned on every pass, so after the loop they
# only hold the frames of the last state in `states`; the per-state counts are
# the lines printed by potential_farms.
for state in states:
    infogroup_matches_5144, counterglow_matches_5144 = potential_farms(df_sic_5144, df_counterglow_poultry, state, RADIUS_KM)

IA: Total number of matches: 12
OK: Total number of matches: 0
MO: Total number of matches: 10
OK: Total number of matches: 0
AL: Total number of matches: 6
LA: Total number of matches: 0
MS: Total number of matches: 3
IL: Total number of matches: 0
IN: Total number of matches: 4
OH: Total number of matches: 0
KY: Total number of matches: 1
TN: Total number of matches: 0
AL: Total number of matches: 6
NC: Total number of matches: 1
SC: Total number of matches: 0
GA: Total number of matches: 1


In [33]:
# To see how many matches exist within the dataframe consisting of SIC Codes containing 5154
df_sic_5154 = pd.read_csv("../data/code_5154.csv")

# The match threshold comes from RADIUS_KM so this cell cannot drift away from
# the other runs in the notebook.
# NOTE: both result names are reassigned on every pass, so after the loop they
# only hold the frames of the last state in `states`; the per-state counts are
# the lines printed by potential_farms.
for state in states:
    infogroup_matches_5154, counterglow_matches_5154 = potential_farms(df_sic_5154, df_counterglow_poultry, state, RADIUS_KM)

IA: Total number of matches: 3
OK: Total number of matches: 0
MO: Total number of matches: 0
OK: Total number of matches: 0
AL: Total number of matches: 3
LA: Total number of matches: 0
MS: Total number of matches: 0
IL: Total number of matches: 0
IN: Total number of matches: 2
OH: Total number of matches: 0
KY: Total number of matches: 0
TN: Total number of matches: 0
AL: Total number of matches: 3
NC: Total number of matches: 2
SC: Total number of matches: 0
GA: Total number of matches: 0
