## <span style=color:blue>Fetching the approximately central latitude/longitude for each county/state pair of interest in our ML pipeline</span>


In [2]:
# This is useful if I want to give unique names to directories or files
import datetime
def curr_timestamp():
    """Return the current local date and time as a ``YYYY-MM-DD_HH-MM-SS`` string.

    The value is built from ``datetime.datetime.now()`` and contains only
    digits, hyphens and one underscore.
    """
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

<span style=color:blue>The soybean_yield_data.csv file (in the "yield data" directory) holds all of the year-county-state triples of interest along with total soybean yield.</span>

In [3]:
import pandas as pd

# Where the yield CSV lives and what it is called
archives_dir = './yield data/'
file = 'soybean_yield_data.csv'
csv_path = archives_dir + file

# Read the full table and show its size plus the first few rows
df = pd.read_csv(csv_path)
print('number of rows in csv cleaned for ML: ', len(df), end='\n\n')
print(df.head())

# Keep one row per distinct (state, county) pair; the original row labels of
# the first occurrence of each pair are retained as the index of df1.
pair_columns = ['state_name', 'county_name']
df1 = df[pair_columns].drop_duplicates()
print('\nNumber of state-county pairs is: ', len(df1))


number of rows in csv cleaned for ML:  9952

  source_desc sector_desc   group_desc commodity_desc   class_desc   
0      SURVEY       CROPS  FIELD CROPS       SOYBEANS  ALL CLASSES  \
1      SURVEY       CROPS  FIELD CROPS       SOYBEANS  ALL CLASSES   
2      SURVEY       CROPS  FIELD CROPS       SOYBEANS  ALL CLASSES   
3      SURVEY       CROPS  FIELD CROPS       SOYBEANS  ALL CLASSES   
4      SURVEY       CROPS  FIELD CROPS       SOYBEANS  ALL CLASSES   

        prodn_practice_desc         util_practice_desc statisticcat_desc   
0  ALL PRODUCTION PRACTICES  ALL UTILIZATION PRACTICES             YIELD  \
1  ALL PRODUCTION PRACTICES  ALL UTILIZATION PRACTICES             YIELD   
2  ALL PRODUCTION PRACTICES  ALL UTILIZATION PRACTICES             YIELD   
3  ALL PRODUCTION PRACTICES  ALL UTILIZATION PRACTICES             YIELD   
4  ALL PRODUCTION PRACTICES  ALL UTILIZATION PRACTICES             YIELD   

   unit_desc                               short_desc  ...   
0  BU / ACRE  S

<span style=color:blue>The function geocode_county defined below won't work on "DU PAGE" county in Illinois, but it does work on "DUPAGE".  So the next cell changes the name to "DUPAGE" in both df and df1.</span>

In [4]:

# Both spellings are matched (the "or") so that this cell is idempotent:
# running it again after the rename still selects and prints the same rows.
def _rename_du_page(frame):
    """Set county_name to 'DUPAGE' on every 'DU PAGE'/'DUPAGE' row of ``frame``.

    Prints the matching index labels, then the updated name of each row, and
    returns the list of matching labels.  ``frame`` is modified in place.
    """
    hits = frame.index[frame['county_name'].isin(['DU PAGE', 'DUPAGE'])].tolist()
    print(hits)
    for row_label in hits:
        frame.at[row_label, 'county_name'] = 'DUPAGE'
        print(frame.at[row_label, 'county_name'])
    return hits


index = _rename_du_page(df)
index1 = _rename_du_page(df1)


[279, 280, 281, 282, 283]
DUPAGE
DUPAGE
DUPAGE
DUPAGE
DUPAGE
[279]
DUPAGE


<span style=color:blue>Using geopy to fetch lon-lat for (approx center of) each county.  This cell takes a while to run</span>

In [6]:
from geopy.geocoders import Nominatim

# Look up coordinates for one county through geopy's Nominatim geocoder
def geocode_county(state, county):
    """Return ``(longitude, latitude)`` for a county, or ``(None, None)``.

    Parameters
    ----------
    state, county : str
        Joined into the query string ``"<county>, <state>, USA"``.

    Returns
    -------
    tuple
        ``(location.longitude, location.latitude)`` of the geocoder's result.
        When the geocoder returns nothing, a message is printed and
        ``(None, None)`` is returned instead.
    """
    geocoder = Nominatim(user_agent="county_geocoder")
    query = ", ".join((county, state, "USA"))
    location = geocoder.geocode(query)
    if not location:
        print('no lat-lon found for ', state, county)
        return None, None
    return location.longitude, location.latitude
        
        
# Geocode every state-county pair exactly once and split the resulting
# (lon, lat) tuples into the two columns.  Calling geocode_county separately
# for 'lon' and for 'lat' sends every lookup to the geocoding service twice,
# doubling the run time of this cell and repeating each "no lat-lon found"
# message.
# NOTE(review): the public Nominatim service has a usage policy on request
# rate; consider wrapping the lookups with geopy's RateLimiter -- confirm.
lon_lat_pairs = [
    geocode_county(state, county)
    for state, county in zip(df1['state_name'], df1['county_name'])
]
df1['lon'] = [lon for lon, _ in lon_lat_pairs]
df1['lat'] = [lat for _, lat in lon_lat_pairs]

print(df1.head())
print()

# Spot check: a single direct lookup for one known county
print('lon-lat for ILLINOIS-BUREAU is: ', geocode_county('ILLINOIS', 'BUREAU'))


   state_name county_name        lon        lat
0    ILLINOIS      BUREAU -89.534118  41.401629
20   ILLINOIS     CARROLL -89.955679  42.064735
40   ILLINOIS       HENRY -90.117744  41.341855
60   ILLINOIS  JO DAVIESS -90.174374  42.350666
79   ILLINOIS         LEE -89.286030  41.747311

lon-lat for ILLINOIS-BUREAU is:  (-89.5341179, 41.4016294)


<span style=color:blue>Archiving df1 for later use </span>

In [8]:
import os

# Destination for the state/county/lon/lat table
archives_dir = './lon_lat data/'
filename = 'state_county_lon_lat.csv'

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists; otherwise a fresh checkout fails with OSError.
os.makedirs(archives_dir, exist_ok=True)

# index=False: the row labels inherited from df are not written to the file
df1.to_csv(archives_dir + filename, index=False)
print('wrote file: ', archives_dir + filename)

wrote file:  ./lon_lat data/state_county_lon_lat.csv
