In [1]:
import pandas as pd
import geopandas as gpd
from shapely import Point
from math import radians, cos, sin, asin, sqrt

In [36]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    # print("Distance:", c * r)
    return c * r

In [58]:
# funtion will take 4 inputs. infogroup_df & counterglow_df2 are the dataframes being analyzed.
# the third input is the specific state that we are choosing to analyze
# the fourth input is the threshold distance. this distance is the max allowable distance between 2
# datapoints. the output is a dataframe that in each row, we have a datapoint that represents a
# datapoint from infogroup and (x distance away) a farm spotted in the counterglow data. we are
# speculating that it is possible for a spotted farm in counterglow to not have the most precise address
# and actually exist in the Infogroup data, but with slightly different longitude and latitude

def potential_farms(infogroup_df, counterglow_df, two_letter_state, max_dist ):
    
    # ruduce dataframe to only consist rows with specified two_letter_state
    infogroup_state_df      = infogroup_df[infogroup_df["STATE"] == two_letter_state]
    counterglow_state_df    = counterglow_df[counterglow_df["State"] == two_letter_state]
    print("Complete step 1. infogroup state length is ", len(infogroup_state_df), "counterglow length:", len(counterglow_state_df) )
    print("ig size:", len(infogroup_state_df), "cg size:", len(counterglow_state_df))
    print()
    
    # print("Ig:", infogroup_state_df.head())
    # print("cg:", counterglow_state_df.head())
    
    # create two new empty dataframe that we will be adding to when match is found
    infogroup_trim     = pd.DataFrame(columns=infogroup_state_df.columns)
    counterglow_trim   = pd.DataFrame(columns=counterglow_state_df.columns)
    print("Created step 2: blank dataframes complete")
    
    counter = 0
    for i in range(len(infogroup_state_df)):
        
        infogroup_longitude  = infogroup_state_df.iloc[i]["LONGITUDE"]  # get longitude
        infogroup_latitude   = infogroup_state_df.iloc[i]["LATITUDE"]   # get latitude
        
        for j in range(len(counterglow_state_df)):
            
            counterglow_latitude    = counterglow_state_df.iloc[j]["Lat"]    # get latitude
            counterglow_longitude   = counterglow_state_df.iloc[j]["Lat.1"]  # get longitude
            
            dist_km                 = haversine(infogroup_longitude, infogroup_latitude, 
                                                counterglow_longitude, counterglow_latitude)
            
            if(dist_km <= max_dist):
                counter += 1 # delete later
            
                infogroup_trim.loc[len(infogroup_trim.index)]      = infogroup_state_df.iloc[i]
                counterglow_trim.loc[len(counterglow_trim.index)]  = counterglow_state_df.iloc[j]
        
    # change the name of the columns for Latitude & Longitude in each dataframe
    print("length of infogroup_trim", infogroup_trim)
    print("length of counterglow trim", len(counterglow_trim))
    print("all done bruv, total:", counter)        
    return infogroup_trim, counterglow_trim   
    

In [59]:
df_sic_0291 = pd.read_csv("../data/code_0291.csv")
df_counterglow = pd.read_csv("../data/Counterglow+Facility+List+Complete.csv")

# print(df_counterglow.columns)

infogroup_matches_0291, counterglow_matches_0291 = potential_farms(df_sic_0291, df_counterglow, 'IA', 5 )

Complete step 1. infogroup state length is  95 counterglow length: 6811
ig size: 95 cg size: 6811

Created step 2: blank dataframes complete
length of infogroup_trim     Unnamed: 0               COMPANY       ADDRESS LINE 1         CITY STATE  \
0         2508    HIDDEN ACRES RANCH         275 300TH ST  COON RAPIDS    IA   
1         2508    HIDDEN ACRES RANCH         275 300TH ST  COON RAPIDS    IA   
2         2508    HIDDEN ACRES RANCH         275 300TH ST  COON RAPIDS    IA   
3         2672     DOGGIE DUDE RANCH  26962 SAND HILL TRL         AMES    IA   
4         2764          JONBAR RANCH        34985 UTE AVE       WAUKEE    IA   
..         ...                   ...                  ...          ...   ...   
255      13054  ROCKN BOOT RANCH LLC     2505 BAYFIELD RD    MUSCATINE    IA   
256      13116    GREY BARN FARM LLC       1945 HIGHWAY 6     ATALISSA    IA   
257      13116    GREY BARN FARM LLC       1945 HIGHWAY 6     ATALISSA    IA   
258      13116    GREY BARN FARM L

In [60]:
infogroup_matches_0291.head()

counterglow_matches_0291.head()

Unnamed: 0,Name,Lat,Lat.1,Address,City,State,County,Description,Business/company name,Postal address,Phone number,Region,Facility name,Number of animals,Full address,Website URL,Postcode,Suburb/city,Contracted to,Farm Type
0,743 - Woodford Creek Farms Llc - (fka Nfp Farm...,41.878101,-94.610558,"2396 B Avenue Coon Rapids 50058, United States",,IA,Carroll County,,,,,,,,,,,,,Pigs (Meat)
1,746 - Woodford Creek Farms Llc (fka Nfp-farm 3...,41.907509,-94.642853,"33291 310th Street Coon Rapids 50058, United S...",,IA,Carroll County,,,,,,,,,,,,,Pigs (Meat)
2,Brian Hunter Farm,41.920311,-94.569908,"2104 D Avenue Scranton 51462, United States",,IA,Carroll County,,,,,,,,,,,,,Pigs (Meat)
3,Dave Henderson Finisher,41.948394,-93.531879,3273 280th St,Ames,IA,Story County,,,,,Story County,Dave Henderson Finisher,996.0,3273 280th St,,,Ames,,Pigs (Meat)
4,Brandon Burger #1,41.57576,-93.908188,"2939 Ashworth Road Adel 50003, United States",,IA,Dallas County,,,,,,,,,,,,,Pigs (Meat)


In [35]:
df_counterglow = pd.read_csv("../data/Counterglow+Facility+List+Complete.csv")

df_sic_0291 = pd.read_csv("../data/code_0291.csv")


### Finding the similarites in counterglow and 0291 data

In [None]:
# small sample test using the state of Iowa

# round Lat and Lat.1 to 2 decimnal place
df_counterglow_IA          = df_counterglow[df_counterglow["State"] == "IA"]
df_counterglow_IA["Lat"]   = df_counterglow_IA["Lat"].round(2)
df_counterglow_IA["Lat.1"] = df_counterglow_IA["Lat.1"].round(2)

# round long/lat to 2 decimal place
df_0291_IA              = df_0291[df_0291["STATE"] == "IA"]
df_0291_IA["LONGITUDE"] = df_0291_IA["LONGITUDE"].round(2)
df_0291_IA["LATITUDE"]  = df_0291_IA["LATITUDE"].round(2)
df_0291_IA.head()


In [37]:
# using the entire 0291 file and counterglow file

# round Lat and Lat.1 to 2 decimnal place
df_counterglow["Lat"]   = df_counterglow["Lat"].round(2)
df_counterglow["Lat.1"] = df_counterglow["Lat.1"].round(2)

# round long/lat to 2 decimal place
df_sic_0291["LONGITUDE"] = df_sic_0291["LONGITUDE"].round(2)
df_sic_0291["LATITUDE"]  = df_sic_0291["LATITUDE"].round(2)
df_sic_0291.head()

# create new empty dataframe using the columns from the pre-existing dfs
df_sic_0291_trimmed = pd.DataFrame(columns=df_sic_0291.columns)
df_counterglow_trimmed = pd.DataFrame(columns=df_counterglow.columns)

for i in range(len(df_sic_0291)):
    latitude  = df_sic_0291.iloc[i]["LATITUDE"]
    longitude = df_sic_0291.iloc[i]["LONGITUDE"]
    for j in range(len(df_counterglow)):
        if( (latitude == df_counterglow.iloc[j]["Lat"]) & (longitude == df_counterglow.iloc[j]["Lat.1"]) ):
            df_sic_0291_trimmed.loc[len(df_sic_0291_trimmed.index)] = df_sic_0291.iloc[i]
            df_counterglow_trimmed.loc[len(df_counterglow_trimmed.index)] = df_counterglow.iloc[j]


In [None]:
# finding the similarites in counterglow and 0241 data
