In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import geopandas as gpd
import keplergl
#from shapely.geometry import Point, Polygon

In [2]:
# DONE:
# Add employment data.
# Create 'ag' firm flag which can be changed to a different criterion at any time.
# ag-firm flag, rural flag, and employment in each spatial unit.

# What do I want?
# 1. Total employment in ag firms in the urban and non-urban areas of each state.
# 2. The ratio of urban to rural employment in ag firms in each county.

In [3]:
%%time
year = 2017
infile = 'data/df_%d_OMB_Census_HRSA_with_ag_flag.csv' % year
df = pd.read_csv(infile)
df['Longitude'] = df['Longitude'].astype(float)
df['Latitude'] = df['Latitude'].astype(float)
df.columns

CPU times: user 1min 11s, sys: 57.2 s, total: 2min 8s
Wall time: 1min 28s


Index(['Company', 'Address Line 1', 'City', 'State', 'ZipCode', 'County Code',
       'Primary NAICS Code', 'Employee Size (5) - Location', 'Census Tract',
       'Census Block', 'Latitude', 'Longitude', 'CBSA Code', 'CBSA Level',
       'CSA Code', 'FIPS Code', 'State FIPS', 'NAICS2', 'NAICS2 desc', 'UA',
       'NAICS6', 'NAICS6 desc', 'rural_OMB', 'rural_Census',
       'Full Census Tract', 'rural_HRSA', 'ag_flag'],
      dtype='object')

In [4]:
# Remove Alaska and Hawaii for now.
df = df[~df['State'].isin(['HI','AK'])]

In [5]:
# BEGIN -- state-level files with and without the rural_Census distinction with the locational
# variables being versions of the geographical midpoint of each state. Used for hexbin keplergl maps.

In [6]:
# Select only the ag businesses. These are defined very exclusively by NAICS in an earlier notebook and will be
# more numerous in the future when we have a better definition.
df_len = len(df)
ag_df = df[df['ag_flag'] == True]
print(str(df_len),'original records.', str(len(ag_df)),'ag firms selected.')

14621688 original records. 50091 ag firms selected.


In [7]:
# State midpoint coordinates are from Wikipedia.
state_midpoints = {
'Alabama' : [32.7794,-86.8287],
'Alaska' : [64.0685,-152.2782],
'Arizona' : [34.2744,-111.6602],
'Arkansas' : [34.8938,-92.4426],
'California' : [37.1841,-119.4696],
'Colorado' : [38.9972,-105.5478],
'Connecticut' : [41.6219,-72.7273],
'Delaware' : [38.9896,-75.5050],
'District of Columbia' : [38.9101,-77.0147],
'Florida' : [28.6305,-82.4497],
'Georgia' : [32.6415,-83.4426],
'Hawaii' : [20.2927,-156.3737],
'Idaho' : [44.3509,-114.6130],
'Illinois' : [40.0417,-89.1965],
'Indiana' : [39.8942,-86.2816],
'Iowa' : [42.0751,-93.4960],
'Kansas' : [38.4937,-98.3804],
'Kentucky' : [37.5347,-85.3021],
'Louisiana' : [31.0689,-91.9968],
'Maine' : [45.3695,-69.2428],
'Maryland' : [39.0550,-76.7909],
'Massachusetts' : [42.2596,-71.8083],
'Michigan' : [44.3467,-85.4102],
'Minnesota' : [46.2807,-94.3053],
'Mississippi' : [32.7364,-89.6678],
'Missouri' : [38.3566,-92.4580],
'Montana' : [47.0527,-109.6333],
'Nebraska' : [41.5378,-99.7951],
'Nevada' : [39.3289,-116.6312],
'New Hampshire' : [43.6805,-71.5811],
'New Jersey' : [40.1907,-74.6728],
'New Mexico' : [34.4071,-106.1126],
'New York' : [42.9538,-75.5268],
'North Carolina' : [35.5557,-79.3877],
'North Dakota' : [47.4501,-100.4659],
'Ohio' : [40.2862,-82.7937],
'Oklahoma' : [35.5889,-97.4943],
'Oregon' : [43.9336,-120.5583],
'Pennsylvania' : [40.8781,-77.7996],
'Rhode Island' : [41.6762,-71.5562],
'South Carolina' : [33.9169,-80.8964],
'South Dakota' : [44.4443,-100.2263],
'Tennessee' : [35.8580,-86.3505],
'Texas' : [31.4757,-99.3312],
'Utah' : [39.3055,-111.6703],
'Vermont' : [44.0687,-72.6658],
'Virginia' : [37.5215,-78.8537],
'Washington' : [47.3826,-120.4472],
'West Virginia' : [38.6409,-80.6227],
'Wisconsin' : [44.6243,-89.9941],
'Wyoming' : [42.9957,-107.551],
}

state_abbrevs = {
'AL':'Alabama',
'AK':'Alaska',
'AZ':'Arizona',
'AR':'Arkansas',
'CA':'California',
'CO':'Colorado',
'CT':'Connecticut',
'DE':'Delaware',
'DC':'District of Columbia',
'FL':'Florida',
'GA':'Georgia',
'HI':'Hawaii',
'ID':'Idaho',
'IL':'Illinois',
'IN':'Indiana',
'IA':'Iowa',
'KS':'Kansas',
'KY':'Kentucky',
'LA':'Louisiana',
'ME':'Maine',
'MD':'Maryland',
'MA':'Massachusetts',
'MI':'Michigan',
'MN':'Minnesota',
'MS':'Mississippi',
'MO':'Missouri',
'MT':'Montana',
'NE':'Nebraska',
'NV':'Nevada',
'NH':'New Hampshire',
'NJ':'New Jersey',
'NM':'New Mexico',
'NY':'New York',
'NC':'North Carolina',
'ND':'North Dakota',
'OH':'Ohio',
'OK':'Oklahoma',
'OR':'Oregon',
'PA':'Pennsylvania',
'RI':'Rhode Island',
'SC':'South Carolina',
'SD':'South Dakota',
'TN':'Tennessee',
'TX':'Texas',
'UT':'Utah',
'VT':'Vermont',
'VA':'Virginia',
'WA':'Washington',
'WV':'West Virginia',
'WI':'Wisconsin',
'WY':'Wyoming'
}

# How to get coordinates with only the abbreviation
#for k in state_abbrevs:
#    print(k, state_midpoints[state_abbrevs[k]])

In [8]:
# Eliminate unnecessary variables
ag_df2 = ag_df[['State','rural_Census','Employee Size (5) - Location']]

In [9]:
ag_df2.columns

Index(['State', 'rural_Census', 'Employee Size (5) - Location'], dtype='object')

In [10]:
# Get Polygon coordinates for each state.

states_df = states_df[['STUSPS','geometry']]
states_df.rename(columns={'STUSPS': 'State'},inplace=True)

In [11]:
# 1. Total employment in ag firms in the urban and non-urban areas of each state.
ag_state_grouped = ag_df2.groupby(['State']).agg({'Employee Size (5) - Location' : 'sum'}).reset_index()
ag_rural_grouped = ag_df2.groupby(['State','rural_Census']).agg({'Employee Size (5) - Location' : 'sum'}).reset_index()

In [12]:
ag_rural_grouped.head()

Unnamed: 0,State,rural_Census,Employee Size (5) - Location
0,AL,0,5999.0
1,AL,1,1416.0
2,AR,0,9254.0
3,AR,1,2640.0
4,AZ,0,9713.0


In [13]:
ag_state_grouped.rename(columns={'Employee Size (5) - Location': 'Total Employment'},inplace=True)
ag_state_grouped.head()

Unnamed: 0,State,Total Employment
0,AL,7415.0
1,AR,11894.0
2,AZ,10118.0
3,CA,103383.0
4,CO,10206.0


In [14]:
ag_rural_grouped.rename(columns={'Employee Size (5) - Location': 'Category Employment'},inplace=True)
ag_rural_grouped.head()

Unnamed: 0,State,rural_Census,Category Employment
0,AL,0,5999.0
1,AL,1,1416.0
2,AR,0,9254.0
3,AR,1,2640.0
4,AZ,0,9713.0


In [15]:
# Apply state total to each rural_Census category in each state, then compute the percentage of the total 
# in each category.
ag_rural_grouped2 = ag_rural_grouped.merge(ag_state_grouped,on='State',how='inner')

In [16]:
def pct(row):
    return round(row['Category Employment'] / row['Total Employment'] * 100.0,2)
ag_rural_grouped2['Category Percentage'] = ag_rural_grouped2.apply(pct,axis=1)

In [17]:
# For the first map, retain just the Category Percentage.
ag_rural_grouped2.drop(columns=['Category Employment','Total Employment'],inplace=True)

In [18]:
urban_pct_df = ag_rural_grouped2[ag_rural_grouped2['rural_Census'] == 0].copy()
urban_pct_df.drop('rural_Census',axis=1,inplace=True)

In [19]:
rural_pct_df = ag_rural_grouped2[ag_rural_grouped2['rural_Census'] == 1].copy()
rural_pct_df.drop('rural_Census',axis=1,inplace=True)

In [20]:
# Add midpoint coordinates to rural_pct_df and an adjacent point coordinates to urban_pct_df

In [21]:
def add_rural_x(abb):
    return state_midpoints[state_abbrevs[abb]][0]
def add_rural_y(abb):
    return state_midpoints[state_abbrevs[abb]][1]

rural_pct_df['Latitude'] = rural_pct_df['State'].apply(add_rural_x)
rural_pct_df['Longitude'] = rural_pct_df['State'].apply(add_rural_y)

In [22]:
rural_pct_df.head()

Unnamed: 0,State,Category Percentage,Latitude,Longitude
1,AL,19.1,32.7794,-86.8287
3,AR,22.2,34.8938,-92.4426
5,AZ,4.0,34.2744,-111.6602
7,CA,2.87,37.1841,-119.4696
9,CO,6.83,38.9972,-105.5478


In [23]:
def add_urban_x(abb):
    return state_midpoints[state_abbrevs[abb]][0] + 0.4
def add_urban_y(abb):
    return state_midpoints[state_abbrevs[abb]][1] - 0.4

urban_pct_df['Latitude'] = urban_pct_df['State'].apply(add_urban_x)
urban_pct_df['Longitude'] = urban_pct_df['State'].apply(add_urban_y)

In [24]:
urban_pct_df.head()

Unnamed: 0,State,Category Percentage,Latitude,Longitude
0,AL,80.9,33.1794,-87.2287
2,AR,77.8,35.2938,-92.8426
4,AZ,96.0,34.6744,-112.0602
6,CA,97.13,37.5841,-119.8696
8,CO,93.17,39.3972,-105.9478


In [25]:
rural_pct_df.to_csv('maps/rural_pct_df.csv',index=None)
urban_pct_df.to_csv('maps/urban_pct_df.csv',index=None)

In [26]:
# Bars of different heights indicating relative values of 'Category Percentage' for each combination of State
# and rural_Census. Configs created interactively on the website and modified as (or if) necessary to get
# the same maps to appear in the notebook.

In [29]:
# Make a basic map, modify it interactively, save the modified config, make a final map from that.
# Config output from the interactive website:
#  maps/urban-rural-bars-by-state.json
#  maps/rural-bars-by-state.json
#
# An interactive html map of the urban-rural-bars-by-state map:
# ../maps/urban-rural-bars-by-state.html

In [30]:
# END -- state-level files with and without the rural_Census distinction with the locational
# variables being versions of the coordinates of the geographical midpoint of each state. 

In [31]:
# BEGIN -- state-level files with and without the rural_Census distinction with the locational
# variables being a geoDataFrame Polygon. Suitable for choropleth maps.

In [32]:
df.columns

Index(['Company', 'Address Line 1', 'City', 'State', 'ZipCode', 'County Code',
       'Primary NAICS Code', 'Employee Size (5) - Location', 'Census Tract',
       'Census Block', 'Latitude', 'Longitude', 'CBSA Code', 'CBSA Level',
       'CSA Code', 'FIPS Code', 'State FIPS', 'NAICS2', 'NAICS2 desc', 'UA',
       'NAICS6', 'NAICS6 desc', 'rural_OMB', 'rural_Census',
       'Full Census Tract', 'rural_HRSA', 'ag_flag'],
      dtype='object')

In [33]:
df = df[['State','Employee Size (5) - Location','rural_Census']]
df.rename(columns={'Employee Size (5) - Location':'Employees'},inplace=True)
df.columns

Index(['State', 'Employees', 'rural_Census'], dtype='object')

In [34]:
# For now, just whole states...
state_file = 'map_files/tl_2017_us_state.shp'
states_gdf = gpd.read_file(state_file)

In [35]:
states_gdf = states_gdf[['STUSPS','geometry']]
states_gdf.rename(columns={'STUSPS':'State'},inplace=True)
states_gdf.head()

Unnamed: 0,State,geometry
0,WV,"POLYGON ((-81.747254 39.095379, -81.746354 39...."
1,FL,"(POLYGON ((-82.987477 24.625379, -82.987477 24..."
2,IL,"POLYGON ((-91.185295 40.637803, -91.1751 40.64..."
3,MN,"POLYGON ((-96.784381 46.63050399999999, -96.78..."
4,MD,"POLYGON ((-77.45880799999999 39.22027, -77.458..."


In [36]:
grouped = df.groupby(['State','rural_Census']).agg({'Employees':'sum'})

In [37]:
grouped.reset_index(inplace=True)
grouped.head()

Unnamed: 0,State,rural_Census,Employees
0,AL,0,1804744.0
1,AL,1,499509.0
2,AR,0,1059959.0
3,AR,1,448406.0
4,AZ,0,2822020.0


In [38]:
merged = grouped.merge(states_gdf,on='State',how='outer',indicator=True)

In [39]:
merged['_merge'].value_counts()

both          93
right_only     7
left_only      0
Name: _merge, dtype: int64

In [40]:
merged[merged['_merge']=='right_only']

Unnamed: 0,State,rural_Census,Employees,geometry,_merge
93,HI,,,"(POLYGON ((-166.386532 23.846363, -166.384284 ...",right_only
94,VI,,,"(POLYGON ((-64.95671399999999 17.678977, -64.9...",right_only
95,MP,,,"(POLYGON ((145.558836 16.374152, 145.571326 16...",right_only
96,GU,,,"POLYGON ((144.563426 13.448065, 144.563574 13....",right_only
97,AK,,,"(POLYGON ((172.34934 52.922194, 172.353886 52....",right_only
98,AS,,,"(POLYGON ((-171.141181 -11.046608, -171.141162...",right_only
99,PR,,,"(POLYGON ((-67.99869099999999 18.17076, -67.99...",right_only


In [44]:
merged = merged[merged['_merge']=='both']
merged.drop('_merge',axis=1,inplace=True)
merged.head()
# 'rural_Census' indicates whether the firm is within a Census-defined Urban Area or Urban Cluster (0)
# or not (1).

KeyError: '_merge'

In [45]:
merged['rural_Census'] = merged['rural_Census'].astype(int)
merged['rural_Census'].value_counts()

0    49
1    44
Name: rural_Census, dtype: int64

In [46]:
gdf = gpd.GeoDataFrame(merged,crs={'init':'EPSG:3857'})

In [47]:
# Compute the urban and rural employment percentages for each state.

In [48]:
temp_df = gdf.groupby(['State']).agg({"Employees":'sum'})
temp_df.head()

Unnamed: 0_level_0,Employees
State,Unnamed: 1_level_1
AL,2304253.0
AR,1508365.0
AZ,2911669.0
CA,17089702.0
CO,2893058.0


In [49]:
state_gdf = gdf.merge(temp_df,on='State',how='inner')
state_gdf.rename(columns={'Employees_x':'Employees','Employees_y':'Total Employees'},inplace=True)
state_gdf.head()

Unnamed: 0,State,rural_Census,Employees,geometry,Total Employees
0,AL,0,1804744.0,"POLYGON ((-88.139988 34.581703, -88.1399689999...",2304253.0
1,AL,1,499509.0,"POLYGON ((-88.139988 34.581703, -88.1399689999...",2304253.0
2,AR,0,1059959.0,"POLYGON ((-94.55217999999999 36.102236, -94.55...",1508365.0
3,AR,1,448406.0,"POLYGON ((-94.55217999999999 36.102236, -94.55...",1508365.0
4,AZ,0,2822020.0,"POLYGON ((-114.516454 33.027617, -114.516639 3...",2911669.0


In [50]:
state_gdf['Emp Pct'] = (state_gdf['Employees'] / state_gdf['Total Employees']) * 100.0
state_gdf.head()

Unnamed: 0,State,rural_Census,Employees,geometry,Total Employees,Emp Pct
0,AL,0,1804744.0,"POLYGON ((-88.139988 34.581703, -88.1399689999...",2304253.0,78.322302
1,AL,1,499509.0,"POLYGON ((-88.139988 34.581703, -88.1399689999...",2304253.0,21.677698
2,AR,0,1059959.0,"POLYGON ((-94.55217999999999 36.102236, -94.55...",1508365.0,70.27205
3,AR,1,448406.0,"POLYGON ((-94.55217999999999 36.102236, -94.55...",1508365.0,29.72795
4,AZ,0,2822020.0,"POLYGON ((-114.516454 33.027617, -114.516639 3...",2911669.0,96.921044


In [51]:
map = keplergl.KeplerGl(height=500,data={'states':state_gdf})

User Guide: https://github.com/keplergl/kepler.gl/blob/master/docs/keplergl-jupyter/user-guide.md


In [52]:
map

KeplerGl(data={'states':    State  rural_Census  Employees  \
0     AL             0  1804744.0   
1     AL   …

In [53]:
map_config = map.config

In [54]:
map_config

{}

In [55]:
# This established the filter requiring the value of 1 on the 'rural_Census' variable.
map_config['config']['visState']['filters'][0]['value'] = [1,1]

KeyError: 'config'

In [None]:
map_config['config']['visState']

In [None]:
map_2 = keplergl.KeplerGl(height=500,data={'states':state_gdf},config=map_config)

In [None]:
map_2

In [None]:
# The above is a choropleth map showing the percentage of employment in rural areas by state.

# From the user manual
# "this will save map with provided data and config"
# "map_1.save_to_html(data={'data_1': df}, config=config, file_name='first_map.html')"

# "You need to manually select Widget > Save Notebook Widget State before shut downing the kernel 
# to make sure it will be reloaded.""

In [None]:
map_2.save_to_html(data={'states':state_gdf},config=map_config,data_file='maps/rural_emp_by_state.html')

In [None]:
# BEGIN --  The ratio of urban to rural employment in ag firms in each county.

In [None]:
%%time
year = 2017
infile = 'data/df_%d_OMB_Census_HRSA_with_ag_flag.csv' % year
df = pd.read_csv(infile,dtype=object)
df['Longitude'] = df['Longitude'].astype(float)
df['Latitude'] = df['Latitude'].astype(float)

In [None]:
# Remove Alaska and Hawaii for now.
df = df[~df['State'].isin(['HI','AK'])]

In [None]:
df['rural_OMB'].value_counts()

In [None]:
df['rural_Census'].value_counts()

In [None]:
df['rural_HRSA'].value_counts()

In [None]:
df['Employee Size (5) - Location'] = df['Employee Size (5) - Location'].astype(float)

In [None]:
rural_OMB_grouped = df.groupby('rural_OMB').agg({'Employee Size (5) - Location':'sum'})
rural_OMB_grouped.head()

In [None]:
rural_Census_grouped = df.groupby('rural_Census').agg({'Employee Size (5) - Location':'sum'})
rural_Census_grouped.head()

In [None]:
rural_HRSA_grouped = df.groupby('rural_HRSA').agg({'Employee Size (5) - Location':'sum'})
rural_HRSA_grouped.head()

In [None]:
# All of the above show that the Census Bureau's definition of rural (actually non-UA) includes the largest
# number of InfoGroup firms and the largest quantity of rural employment.

In [None]:
xdf = df[['State FIPS','FIPS Code','Employee Size (5) - Location','rural_Census']].copy()
xdf.rename(columns={'Employee Size (5) - Location':'Employment'},inplace=True)

In [None]:
agr_df = xdf[xdf['rural_Census']=='1']

In [None]:
non_agr_df = xdf[xdf['rural_Census']=='0']
print(len(non_agr_df))
print(len(agr_df))

In [None]:
xdf['FIPS Code'] = xdf['FIPS Code'].astype(str).apply(lambda x: x[:-2])
xdf.head()

In [None]:
def prepend(code, num):
    if len(code) == num:
        return '0' + code
    else:
        return code

In [None]:
xdf['State FIPS'] = xdf['State FIPS'].apply(prepend,args=(1,))
xdf['FIPS Code'] = xdf['FIPS Code'].apply(prepend,args=(4,))

In [None]:
xdf.dtypes

In [None]:
xdf['Employment'] = xdf['Employment'].astype(float).fillna(0.0)

In [None]:
grouped = xdf.groupby(['State FIPS','FIPS Code','rural_Census']).agg({'Employment':'sum'})

In [None]:
grouped.reset_index(inplace=True)

In [None]:
len(grouped['FIPS Code'].drop_duplicates())

In [None]:
len(grouped)

In [None]:
# Add the Polygon geometry.
counties = gpd.read_file('map_files/tl_2017_us_county.shp')

In [None]:
counties = counties[['GEOID','geometry']]

In [None]:
merged = grouped.merge(counties,left_on='FIPS Code',right_on='GEOID',how='left')

In [None]:
merged.drop('GEOID',axis=1,inplace=True)

In [None]:
merged.head()

In [None]:
total = merged.groupby('FIPS Code').agg({'Employment':'sum'})
len(total)

In [None]:
merged['rural_Census'].value_counts()
# Thus there are only 411 counties (or parts of counties) with any InfoGroup agricultural employment.

In [None]:
total.head()