# Geocode

Mapping the H2A visa work sites

*Added by Steve* 

Essentially, this notebook takes the various, often dirty, geolocation data from the transformed data and resolves these geolocation entities by city and state. When the city column contains dirty data, it has the added benefit of "cleaning" that column.

In [1]:
import os
import csv
import time
import random
import calculate
import numpy as np
import pandas as pd
import timeout_decorator
from geopy import Location
from geopy.geocoders import Bing

In [2]:
import warnings
warnings.filterwarnings("ignore")

Read in all the visas

In [3]:
df = pd.concat([
    pd.read_csv("./output/transformed_master_cases.csv"),
    pd.read_csv("./output/transformed_sub_cases.csv"),
])

In [4]:
# Added by Steve, peek at df
df.head()

Unnamed: 0,case_number,case_status,certification_start_date,certification_start_year,city,crop,employer,fiscal_year,job_title,latimes_crop,latimes_id,master_case,row_number,state,workers_certified
0,C-07318-06766,Certified - Full,2008-01-01,2008.0,BASILE,,TEPETATE GATOR CORP,2008,"LABORER, AQUATIC LIFE",,2008-3,False,3,LA,5
1,C-07271-06353,Certified - Full,2008-01-02,2008.0,CENTURIA,,HANJO FARMS,2008,"FARMWORKER, DIVERSIFIED CROPS I",,2008-4,False,4,WI,1
2,C-07261-06276,Certified - Full,2008-01-01,2008.0,SPANISH FARK,,SOUTH SHORE FARMS,2008,"FARMWORKER, FRUIT II",,2008-5,False,5,UT,20
3,C-07310-06649,Certified - Full,2008-01-01,2008.0,ENID,,BRYAN KROEKER,2008,"FARMWORKER, GENERAL I",,2008-6,False,6,OK,8
4,C-07317-06700,Certified - Full,2008-01-01,2008.0,LAKE CHARLES,,PAUL HEINEN FARMS,2008,"FARMWORKER, RICE",,2008-7,False,7,LA,5


Extract the distinct locations

In [5]:
# Added by Steve, city column still requires cleaning
df.groupby(['city', 'state']).size()

city                                                state
1) WENDEN 2)TONOPAH  3) DATELAND                    AZ         1
1.- TONOPAH                           2.- DATELAND  AR         1
1090 STEVENS RANCH ROAD                             TX         1
1561 RAYBURN RD                                     KY         1
16001 311TH AVE                                     SD         1
18 MILES NORTH OF CAREY                             ID         2
208 Hesperian Way                                   WA         2
2279 S.E. REYNOLDS STREET                           FL         1
230 CAMERON AVENUE                                  NC         5
28525                                               NC         1
33 CADDO                                            TX         1
7317 N. Andre Rd.                                   KS         1
93219                                               CA         2
ABBEVILLE                                           AL         7
                                

In [6]:
locations = df.groupby(['city', 'state']).size().reset_index().rename(columns={0: "count"})

In [7]:
# Added by Steve, what do we have?
locations.head()

Unnamed: 0,city,state,count
0,1) WENDEN 2)TONOPAH 3) DATELAND,AZ,1
1,1.- TONOPAH 2.- DATE...,AR,1
2,1090 STEVENS RANCH ROAD,TX,1
3,1561 RAYBURN RD,KY,1
4,16001 311TH AVE,SD,1


Read in previously geocoded locations

In [8]:
geocoded = pd.read_csv("./output/geocoded.csv")

In [9]:
# Added by Steve
geocoded.head()

Unnamed: 0,key,geocoder_address,lat,lng,geocoder_type
0,"1) WENDEN 2)TONOPAH 3) DATELAND, AZ","Dateland, AZ, United States",32.799599,-113.540932,bing
1,1.- TONOPAH 2.- DATE...,"AR, United States",34.899921,-92.438873,bing
2,"1090 STEVENS RANCH ROAD, TX","1090 Stevens Ranch Rd, Pipe Creek, TX 78063, USA",29.774711,-99.078929,RANGE_INTERPOLATED
3,"1561 RAYBURN RD, KY","1561 Rayburn Rd, Murray, KY 42071, USA",36.569439,-88.441078,RANGE_INTERPOLATED
4,"16001 311TH AVE, SD","16001 311th Ave, Agar, SD 57520, USA",44.896279,-99.939442,RANGE_INTERPOLATED


In [10]:
# Index the previously geocoded rows by their key so lookups are a dict access.
# Each value is the full row (a pandas Series) from geocoded.csv.
geocode_cache = {
    cached_row['key']: cached_row
    for _, cached_row in geocoded.iterrows()
}

Identify how many remain unmapped

In [11]:
# Build the lookup key as "<city>, <state>", the same shape as the `key` column in geocoded.csv.
# NOTE(review): unlike create_key further down, null city/state values are not blanked here,
# so they are formatted straight into the key -- confirm that is intended.
df['key'] = df.apply(lambda x: "{}, {}".format(x.city, x.state), axis=1)

In [18]:
# Added by Steve, how unique are these keys? Semi-unique
df.key.value_counts().head(5)

Vass, NC           6676
VASS, NC           1078
CASA GRANDE, AZ     686
MURRAY, KY          679
Casa Grande, AZ     446
Name: key, dtype: int64

In [47]:
df.sort_values(['city']).head()

Unnamed: 0,case_number,case_status,certification_start_date,certification_start_year,city,crop,employer,fiscal_year,job_title,latimes_crop,latimes_id,master_case,row_number,state,workers_certified,key
3071,C-08066-08800,Certified - Partial,2008-05-15,2008.0,1) WENDEN 2)TONOPAH 3) DATELAND,,TEAM PACKING INC,2008,HARVEST WORKER FRUIT,,2008-5105,False,5105,AZ,460,"1) WENDEN 2)TONOPAH 3) DATELAND, AZ"
1341,C-08023-08004,Certified - Full,2008-03-09,2008.0,1.- TONOPAH 2.- DATE...,,TEAM PACKING INC,2008,FARMWORKER FRUIT II,,2008-1630,False,1630,AR,35,1.- TONOPAH 2.- DATE...
7599,C-09064-18476,Certified - Full,2009-04-20,2009.0,1090 STEVENS RANCH ROAD,,JOHN E. FIGUEROA D.B.A. RUNNING L. RANCH,2009,STABLE ATTENDANT,,2009-4340,False,4340,TX,5,"1090 STEVENS RANCH ROAD, TX"
10847,C-10008-21978,Certified - Full,2010-03-01,2010.0,1561 RAYBURN RD,,ROB MORTON FARMS,2010,"FARMWORKERS AND LABORERS, CROP, NURSERY, AND G...",,2010-1654,False,1654,KY,9,"1561 RAYBURN RD, KY"
14891,C-11006-26359,Certified - Full,2011-03-01,2011.0,16001 311TH AVE,Cultivating,AHLEMEIER FARMS INC.,2011,AGRICULTURAL EQUIPMENT OPERATOR,Cultivating,2011-1602,False,1602,SD,2,"16001 311TH AVE, SD"


In [48]:
# Comment by Steve: this cell and the next one measure how many rows still lack a geocode,
# i.e. rows whose key does not appear in the previously geocoded cache file
not_geocoded = df[~df.key.isin(geocoded.key)]

In [49]:
# Report how many rows already have a cached geocode, as a count and a percentage
total_count = len(df)
geocoded_count = total_count - len(not_geocoded)
summary_template = "{:,} of {:,} geocoded ({}%)"
print(summary_template.format(
    geocoded_count,
    total_count,
    calculate.percentage(geocoded_count, total_count)
))

83,088 of 83,088 geocoded (100.0%)


Extract the unmapped locations

In [50]:
unmapped = not_geocoded.groupby(['key']).size().reset_index().rename(columns={0: "count"})

In [51]:
df_list = list(unmapped.iterrows())

In [52]:
random.shuffle(df_list)

In [53]:
# Added by Steve, we don't have any unmapped values
unmapped.size

0

Try to geocode them

In [54]:
@timeout_decorator.timeout(10)
def bingit(key):
    """Geocode one "city, state" key with Bing and store the hit in geocode_cache.

    Args:
        key: location string in the "<city>, <state>" form used as the cache key.

    Returns:
        None. On success the first Bing result is stored in the module-level
        ``geocode_cache`` under ``key``; nothing is stored when the key is
        already cached or Bing returns no results.

    The whole call runs under ``timeout_decorator.timeout(10)``, and the
    trailing half-second sleep counts against that limit.
    """
    address = "{}, United States".format(key)
    print("Geocoding {}".format(address))

    # Check the cache before building the API client: a hit needs no client
    # and no network round trip.
    if key in geocode_cache:
        print("Already mapped")
        return

    bing = Bing(os.getenv("BING_API_KEY"), timeout=10)
    result = bing.geocode(address, exactly_one=False)
    if not result:
        return
    # exactly_one=False yields a list of candidates; keep only the first one.
    first_result = result[0]

    print("Mapped to {}".format(first_result))
    geocode_cache[key] = first_result
    # Pause after each real lookup -- presumably to throttle requests to the API.
    time.sleep(0.5)

In [55]:
# Best-effort pass over the unmapped keys: a failure on one key must not stop the rest.
for i, row in df_list:
    try:
        bingit(row.key)
    except timeout_decorator.TimeoutError:
        print("TIMEOUT")
        continue
    except Exception as error:
        # A bare `except:` would also swallow KeyboardInterrupt and would report
        # every failure (bad API key, network error, ...) as a timeout. Report
        # real errors as such, and let interrupts stop the loop.
        print("ERROR geocoding {!r}: {!r}".format(row.key, error))
        continue

Merge the newly geocoded locations with the old ones

In [56]:
def transform_geocode(key, value):
    """Flatten one geocode_cache entry into a row for geocoded.csv.

    Args:
        key: the cache key for the entry.
        value: either a pandas Series (a row loaded from a previous
            geocoded.csv) or an object exposing ``address``, ``latitude``
            and ``longitude`` attributes (a fresh geocoder result).

    Returns:
        A list ``[key, address, lat, lng, geocoder_type]``. Fresh results are
        always tagged with the geocoder type "bing"; cached rows keep the
        type they were stored with.
    """
    if not isinstance(value, pd.Series):
        return [key, value.address, value.latitude, value.longitude, "bing"]

    cached_columns = ('geocoder_address', 'lat', 'lng', 'geocoder_type')
    return [key] + [value[column] for column in cached_columns]

In [57]:
rows = [transform_geocode(k, v) for k, v in geocode_cache.items()]

In [58]:
rows.sort(key=lambda x:x[0])

Save the geocoded locations

In [59]:
# Overwrite the cache file that was loaded into `geocoded` earlier with the rows
# built from geocode_cache.
with open("./output/geocoded.csv", 'w') as f:
    w = csv.writer(f)
    # Header order must match the list layout produced by transform_geocode.
    w.writerow(["key", "geocoder_address", "lat", "lng", "geocoder_type"])
    w.writerows(rows)

Merge geocoded points onto cases

In [60]:
mapped = pd.read_csv("./output/geocoded.csv")

In [66]:
# Added by Steve, what was `mapped` schema?
mapped.head()

Unnamed: 0,key,geocoder_address,lat,lng,geocoder_type
0,"1) WENDEN 2)TONOPAH 3) DATELAND, AZ","Dateland, AZ, United States",32.799599,-113.540932,bing
1,1.- TONOPAH 2.- DATE...,"AR, United States",34.899921,-92.438873,bing
2,"1090 STEVENS RANCH ROAD, TX","1090 Stevens Ranch Rd, Pipe Creek, TX 78063, USA",29.774711,-99.078929,RANGE_INTERPOLATED
3,"1561 RAYBURN RD, KY","1561 Rayburn Rd, Murray, KY 42071, USA",36.569439,-88.441078,RANGE_INTERPOLATED
4,"16001 311TH AVE, SD","16001 311th Ave, Agar, SD 57520, USA",44.896279,-99.939442,RANGE_INTERPOLATED


In [61]:
def create_key(row):
    """Build the "<city>, <state>" merge key for one row.

    Args:
        row: any object with ``city`` and ``state`` attributes (for example a
            DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns:
        ``"<city>, <state>"``, or an empty string when either field is
        missing (null/NaN/None, the literal string 'nan', or '').
    """
    for field in (row.city, row.state):
        # pd.isnull catches every NaN object, not just the np.nan singleton;
        # a list-membership test matches NaN only by identity because
        # NaN != NaN.
        if pd.isnull(field) or field in ('nan', ''):
            return ''
    return "{}, {}".format(row.city, row.state)

In [62]:
def add_points(name):
    """Attach cached geocodes to one transformed cases file and save the result.

    Reads ``./output/transformed_<name>.csv``, builds a merge key for every
    row with ``create_key``, left-merges the module-level ``mapped`` frame on
    that key, drops the helper key column and writes the result to
    ``./output/geocoded_<name>.csv`` (UTF-8, without the index).

    Args:
        name: file-name stem such as "master_cases" or "sub_cases".
    """
    source_path = "./output/transformed_{}.csv".format(name)
    target_path = "./output/geocoded_{}.csv".format(name)

    cases = pd.read_csv(source_path)
    cases['key'] = cases.apply(create_key, axis=1)

    # Left merge keeps every case row, including those with no geocode match.
    cases_with_points = cases.merge(mapped, on=["key"], how="left").drop('key', axis=1)
    cases_with_points.to_csv(target_path, index=False, encoding="utf-8")

In [67]:
# Added by Steve, what is `add_points` doing?
# It does not call the geocoder: it left-merges the already geocoded points in
# `mapped` onto the case rows via the city/state key. The lines below repeat its
# steps for sub_cases, minus the final CSV write, so the result can be inspected.

kas = pd.read_csv("./output/transformed_{}.csv".format('sub_cases'))
kas['key'] = kas.apply(create_key, axis=1)
mapped_kas = kas.merge(mapped, on=["key"], how="left")
mapped_kas.drop('key', axis=1, inplace=True)
mapped_kas

Unnamed: 0,case_number,case_status,certification_start_date,certification_start_year,city,crop,employer,fiscal_year,job_title,latimes_id,row_number,state,workers_certified,latimes_crop,geocoder_address,lat,lng,geocoder_type
0,A-07297-04840,Certified - Full,2008-01-06,2008.0,BRANDON,,RIVERS PLANT FARM,2008,HORTICULTURAL WORKER,2008-108,108,MS,21,,"Brandon, MS, USA",32.273202,-89.985916,APPROXIMATE
1,A-07297-04840,Certified - Full,2008-01-06,2008.0,LUCEDALE,,DEEP SOUTH NURSERY,2008,HORTICULTURAL WORKER,2008-109,109,MS,8,,"Lucedale, MS, United States",30.924320,-88.592888,bing
2,A-07297-04840,Certified - Full,2008-01-06,2008.0,LUCEDALE,,P & L NURSERY,2008,HORTICULTURAL WORKER,2008-110,110,MS,8,,"Lucedale, MS, United States",30.924320,-88.592888,bing
3,A-07297-04840,Certified - Full,2008-01-06,2008.0,LUCEDALE,,W & W NURSERY,2008,HORTICULTURAL WORKER,2008-111,111,MS,3,,"Lucedale, MS, United States",30.924320,-88.592888,bing
4,A-07297-04840,Certified - Full,2008-01-06,2008.0,LUCEDALE,,"DUTCH BROTHERS GREENHOUSES, INC.",2008,HORTICULTURAL WORKER,2008-112,112,MS,3,,"Lucedale, MS, United States",30.924320,-88.592888,bing
5,A-07299-04849,Certified - Full,2008-02-01,2008.0,ISOLA,,DUTCH BRAKE FISHERIES,2008,FISH HATCHERY WORKER,2008-533,533,MS,8,,"Isola, MS 38754, USA",33.262066,-90.592314,APPROXIMATE
6,A-07325-04931,Certified - Full,2008-01-10,2008.0,HOMERVILLE,,"HORNER FARMS, INC.",2008,"FARMWORKER, FRUIT",2008-190,190,GA,4,,"Homerville, GA, United States",31.036900,-82.746582,bing
7,A-07327-04937,Certified - Full,2008-01-27,2008.0,PERKINSTON,,GREENFOREST NURSERY,2008,HORTICULTURAL WORKER,2008-405,405,MS,12,,"Perkinston, MS, United States",30.782030,-89.140961,bing
8,A-07327-04937,Certified - Full,2008-01-27,2008.0,LUCEDALE,,BARNHILL FARMS,2008,HORTICULTURAL WORKER,2008-406,406,MS,5,,"Lucedale, MS, United States",30.924320,-88.592888,bing
9,A-07337-04977,Certified - Full,2008-01-25,2008.0,LENOIR,,JOHN S. COFFEY,2008,HORTICULTURAL WORKER II,2008-921,921,NC,12,,"Lenoir, NC, USA",35.914020,-81.538985,APPROXIMATE


In [63]:
add_points("master_cases")

In [64]:
add_points("sub_cases")

In [65]:
add_points("all_cases")