# Data 2 | Geocode Voters

For each cleaned voter file, I check whether a geocoded version already exists. Files that have not been geocoded yet are geocoded with batch_geocode().

The function batch_geocode() takes the voter dataframe, then
1. Drops unnecessary variables
2. Creates batches
3. Geocodes each batch
4. Merges the batches

In [1]:
from data2_geocode import *

""" Step 0 | Setup """
# Number of unique voter ids (idu) placed in each geocoding batch.
batch_size = 500

# A voter file is identified by the part of a pickled chunk's name that
# precedes '_chunk_'; several chunks collapse to one identifier.
future_files = {x.split('_chunk_')[0] for x in os.listdir(path_1) if x.split('.')[-1] == 'pkl'}

# List the output directory once instead of once per candidate file, and keep
# only the files that do not yet have a merged '<file>_geo.pkl'.
geocoded_outputs = set(os.listdir(path_2))
files = [x for x in future_files if x+'_geo.pkl' not in geocoded_outputs]
def _elapsed_minutes(start):
    """Return the time elapsed since `start` (a time.time() value), rounded to whole minutes, as a string."""
    return str(round((time.time() - start) / 60))


# Geocode every voter file that does not yet have a merged '<file>_geo.pkl':
# open its chunks, split the voter ids into batches, geocode the unfinished
# batches in parallel, then merge all finished batches into one pickle.
for file in sorted(files):
    state, year = file.split('_')
    print_log[file] = {'log':[],'sublog':[]}
    printer(print_log)

    already_geocoded = file + '_geo.pkl' in os.listdir(path_2)
    if already_geocoded:
        print_log[file]['log'].append('  Done')
        printer(print_log)

    too_old = int(year) < 2010
    if too_old:
        print_log[file]['log'].append('  Too Old (for now)')
        printer(print_log)

    if already_geocoded or too_old:
        continue

    # NOTE(review): the batch directory is keyed by year only, so two states
    # from the same year would share (and mix) batch files - confirm intended.
    path_2_batch = path_2+str(year)+'_batches/'
    if not os.path.exists(path_2_batch):
        os.makedirs(path_2_batch)

    """ Step 1 | Open Chunked Data """
    t0 = time.time()
    print_log[file]['log'].append('  Step 1 | Open Chunked Data')
    printer(print_log)
    # Only pickled chunks are loaded, matching how the file list is built in setup.
    chunk_file_names = [
        x for x in os.listdir(path_1)
        if x.split('_chunk_')[0] == file and x.split('.')[-1] == 'pkl'
    ]
    chunk_files = []
    for chunk_file_name in chunk_file_names:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(path_1 + chunk_file_name,'rb') as f:
            chunk = pickle.load(f)
        # Keep only the id and the address fields needed for geocoding.
        chunk_files.append(chunk[['idu','address','city','state','zipcode']])
    voters = pd.concat(chunk_files, ignore_index=True)
    print_log[file]['log'][-1] = '  Step 1 | Open Chunked Data (Runtime: ' + _elapsed_minutes(t0) + ' mins)'
    printer(print_log)

    """ Step 2 | Create Batches """
    t0 = time.time()
    print_log[file]['log'].append('  Step 2 | Create Batches')
    printer(print_log)
    id_list = voters.idu.unique()
    batch_ids = [id_list[x:x+batch_size] for x in range(0, len(id_list), batch_size)]
    batches = [(batch_num, ids, print_log) for batch_num, ids in enumerate(batch_ids)]
    # A set keeps the "already finished?" test O(1) per batch.
    finished_batches = set(os.listdir(path_2_batch))
    # Batches whose '<n>.pkl' already exists are skipped, so an interrupted run resumes.
    run_batches = [(b,d,printer,pl) for (b,d,pl) in batches if str(b)+'.pkl' not in finished_batches]
    error_batches = []
    print_log[file]['log'][-1] = '  Step 2 | Create Batches (Runtime: ' + _elapsed_minutes(t0) + ' mins)'
    printer(print_log)

    """ Step 3 | Geocode Batches """
    t0 = time.time()
    print_log[file]['log'].append('  Step 3 | Geocode Batches (' + str(len(run_batches)) + ' of ' + str(len(batches))+')')
    printer(print_log)

    def batch_geocode(batch):
        """ Take a list of idus and geocode them to a file.

        `batch` is a tuple (batch_num, batch_idus, printer, print_log). The
        voters whose idu is in `batch_idus` are written to '<batch_num>.csv'
        in the batch directory, sent to cg.addressbatch(), and the result is
        pickled to '<batch_num>.pkl'. On failure the batch is appended to
        `error_batches` and the CSV is left in place.
        """

        batch_num, batch_idus, printer, print_log = batch
        batch_path = path_2_batch+str(batch_num)
        send_geo = voters[voters.idu.isin(batch_idus)]
        send_geo.to_csv(batch_path+'.csv', index=False, header=False)
        try:
            results = cg.addressbatch(batch_path+'.csv')
            results = pd.DataFrame(results)
            results.lat = results.lat.astype(float)
            results.lon = results.lon.astype(float)
            with open(batch_path+'.pkl','wb') as f:
                pickle.dump(results, f)
            os.remove(batch_path+'.csv')
        except Exception:
            # Record the failure but let the remaining batches continue.
            # `except Exception` (not a bare except) keeps Ctrl-C working.
            error_batches.append(batch)
        # Refresh the progress line each time the finished count hits a multiple of 10.
        done = [x for x in os.listdir(path_2_batch) if x.split('.')[-1] == 'pkl']
        if len(done)%10 == 0:
            print_log[file]['sublog'] = ['   || Finished ' + str(len(done))]
            printer(print_log)

    # NOTE(review): the workers mutate error_batches and print_log directly;
    # that is only visible here if `mp` provides thread-based workers (e.g.
    # multiprocessing.dummy) rather than separate processes - confirm.
    if __name__ == '__main__':
        with mp.Pool(processes = 500) as pool:
            pool.map(batch_geocode, run_batches)

    print_log[file]['sublog'] = ''
    print_log[file]['log'][-1] = '  Step 3 | Geocode Batches (' + str(len(run_batches)) + ' of ' + str(len(batches))+') in ' + _elapsed_minutes(t0) + ' mins'
    print_log[file]['log'].append('    Batch Errors | ' + str(len(error_batches)))
    printer(print_log)

    """ Step 4 | Merge Batches """
    t0 = time.time()
    print_log[file]['log'].append('  Step 4 | Merge Batches')
    printer(print_log)
    geofiles_to_load = [x for x in os.listdir(path_2_batch) if x.split('.')[-1] == 'pkl']
    geodata_to_merge = []
    for geofile in geofiles_to_load:
        # Zero-byte pickles are skipped rather than loaded.
        if os.path.getsize(path_2_batch + geofile) > 0:
            with open(path_2_batch + geofile,'rb') as f:
                geodata_to_merge.append(pickle.load(f))
    if geodata_to_merge:
        geodata = pd.concat(geodata_to_merge, ignore_index=True)
        geodata = geodata.rename(columns = {'id':'idu'})
        with open(path_2 + file + '_geo.pkl','wb') as f:
            pickle.dump(geodata, f)
        # Same whole-minute format as steps 1-3 (was a long unrounded float).
        print_log[file]['log'][-1] = '  Step 4 | Merge Batches (Runtime: ' + _elapsed_minutes(t0) + ' mins)'
    else:
        # pd.concat raises ValueError on an empty list; skip the merge so the
        # remaining files are still processed and this one is retried later.
        print_log[file]['log'][-1] = '  Step 4 | Merge Batches (Skipped: no geocoded batches to merge)'
    printer(print_log)

    """ Saving print_log """
    # One log file per calendar day (YYYYMMDD); later writes on the same day overwrite it.
    savedate = datetime.now().strftime('%Y%m%d')
    with open(path_3 + 'print_log_' + savedate + '.txt', 'w') as log_file:
        log_file.write(string_printer(print_log))

NC_2021
  Step 1 | Open Chunked Data (Runtime: 1 mins)
  Step 2 | Create Batches (Runtime: 0 mins)
  Step 3 | Geocode Batches (255 of 16499) in 6 mins
    Batch Errors | 33
  Step 4 | Merge Batches (Runtime: 0.8166666666666667 mins)


## Analysis

I look at:
1. How successful the geocoding was
2. Whether there are systematic biases in the errors

## Other Methods

Some voters are not successfully geocoded by the CBG API used above. One option for increasing the geocoding rate is to re-geocode these failures with the Google Maps and Zillow geocoders.

### Google Maps

In [None]:
# Geocode every row of pa_data with the Google Maps client and append the
# results (lng, lat, then the original columns) to the points table.
col_names = ['lng','lat']+list(pa_data.columns.values)
# Start from previously saved results; swap in the commented line below to start from an empty table.
points = pd.read_csv('DATA0/pa.csv')
#points = pd.DataFrame(columns = col_names)
geocoded_rows = []
for _, voter in pa_data.iterrows():
    full_address = voter['House Number'] + ' ' + voter['Street Name'] +' ' + voter['City'] + ' ' + voter['Zip']
    # Call the geocoder once per address and reuse the response.
    matches = gmaps.geocode(address=full_address)
    if matches:
        location = matches[0]['geometry']['location']
        # Read by key: the response dict lists 'lat' before 'lng', so taking
        # its values positionally would put them in the wrong columns.
        lng_lat = [location['lng'], location['lat']]
    else:
        # No match: keep the voter row with missing coordinates.
        lng_lat = [np.nan, np.nan]
    geocoded_rows.append(pd.DataFrame([lng_lat + list(voter)], columns=col_names))
# A single concat replaces the per-row DataFrame.append (quadratic, and removed in pandas 2.0).
points = pd.concat([points] + geocoded_rows, sort=False)
points = points.reset_index()
# Build (lng, lat) pairs and wrap each one in a Point.
points['Coordinates'] = list(zip(points.lng, points.lat))
points['Coordinates'] = points['Coordinates'].apply(Point)
#points.to_csv('DATA0/pa.csv')

### Zillow

In [None]:
# Select the voter rows whose (double-quoted) last and first names match.
name_mask = (data1['Last Name'] == '"KRAYBILL"') & (data1['First Name'] == '"MOLLY"')
x = data1[name_mask]

# Build the street address and zip code from the first matching row,
# stripping the double quotes that surround each raw field.
house_number = x['House Number'].iloc[0].strip('""')
street_name = x['Street Name'].iloc[0].strip('""')
address = house_number + ' ' + street_name
zipcode = x['Zip'].iloc[0].strip('""')

# Query Zillow's deep search for that address and print selected fields.
deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
result = pyz.GetDeepSearchResults(deep_search_response)
for field in ('longitude', 'latitude', 'zestimate_valuation_range_high'):
    print(getattr(result, field))