# Data 2 | Geocode Voters

I look at the cleaned files and check whether they're already geocoded. I geocode them using batch_geocode() if they're not geocoded.

The cell below loads the voter dataframe for each file, then (with batch_geocode() handling step 3 for one batch at a time)
1. Drops unnecessary variables
2. Creates batches
3. Geocodes each batch
4. Merges the batches

## Low Key To Do
1. Maybe do everything with the export date instead of year?
2. Use idu as the index?

In [None]:
from data2_geocode import *

""" Step 0 | Setup """
batch_size = 500  # number of unique idus per geocoding batch

# A "file" is the part of a chunk's name before '_chunk_' (split below into state and year).
# Only files that do not yet have a merged '<file>_geo.pkl' in path_2 are processed.
future_files = {x.split('_chunk_')[0] for x in os.listdir(path_1) if 'pkl' in x}
geocoded_files = set(os.listdir(path_2))  # list the output folder once instead of once per candidate
files = [file for file in future_files if f'{file}_geo.pkl' not in geocoded_files]
for file in sorted(files):
    state, year = file.split('_')
    save_file = f'{file}_geo.pkl'
    print_log[file] = {'log':[],'sublog':[]}
    printer(print_log)

    # Re-check the output folder at loop time: earlier files can take a long time,
    # so the merged file may have appeared since the list above was built.
    already_saved = save_file in os.listdir(path_2)
    too_old = int(year) < 2010
    if already_saved:
        print_log[file]['log'].append('  Done')
        printer(print_log)
    if too_old:
        print_log[file]['log'].append('  Too Old (for now)')
        printer(print_log)

    if (not already_saved) and (not too_old):
        # NOTE(review): the batch folder is keyed by year only, so two files from the
        # same year would share (and skip/merge) each other's batch files -- confirm
        # that there is at most one file per year, or key this by `file`.
        path_2_batch = f'{path_2}{year}_batches/'
        os.makedirs(path_2_batch, exist_ok=True)

        """ Step 1 | Open Chunked Data """
        t0 = time.time()
        print_log[file]['log'].append('  Step 1 | Open Chunked Data')
        printer(print_log)
        # Select chunks with the same rule that produced `future_files` (pickles whose
        # '_chunk_' prefix equals this file) rather than a loose substring match.
        chunk_paths = [
            f'{path_1}{chunk}'
            for chunk in os.listdir(path_1)
            if 'pkl' in chunk and chunk.split('_chunk_')[0] == file
        ]
        # Keep only the id and address columns.
        keep_columns = ['idu','address','city','state','zipcode']
        chunk_files = [pd.read_pickle(chunk_path)[keep_columns] for chunk_path in chunk_paths]
        voters = pd.concat(chunk_files, ignore_index=True)
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 1 | Open Chunked Data (Runtime: {runtime} mins)'
        printer(print_log)

        """ Step 2 | Create Batches """
        t0 = time.time()
        print_log[file]['log'].append('  Step 2 | Create Batches')
        printer(print_log)
        id_list = voters.idu.unique()
        batch_ids = [id_list[x:x + batch_size] for x in range(0, len(id_list), batch_size)]
        batches = [(b, d, print_log) for b, d in enumerate(batch_ids)]
        # Batches that already have a '<batch number>.pkl' on disk are not re-run.
        finished_batches = set(os.listdir(path_2_batch))
        run_batches = [(b,d,printer,pl) for (b,d,pl) in batches if f'{b}.pkl' not in finished_batches]
        error_batches = []
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 2 | Create Batches (Runtime: {runtime} mins)'
        printer(print_log)
        
        """ Step 3 | Geocode Batches """
        t0 = time.time()
        print_log[file]['log'].append(f'  Step 3 | Geocode Batches ({len(run_batches)} of {len(batches)})')
        printer(print_log)

        def batch_geocode(batch):
            """ Geocode one batch of idus and pickle the results into path_2_batch.

            batch is a (batch_num, batch_idus, printer, print_log) tuple. The voter
            rows whose idu is in batch_idus are written to '<batch_num>.csv', that
            file is passed to cg.addressbatch(), and the results (lat/lon cast to
            float) are pickled as '<batch_num>.pkl'. The csv is removed only after
            the pickle has been written.

            Returns None on success, or the batch itself when geocoding failed.
            The failure is returned instead of appended to a shared list because
            this function runs in pool worker processes, whose changes to module
            variables are not visible in the parent process.
            """

            batch_num, batch_idus, printer, print_log = batch
            batch_path = f'{path_2_batch}{batch_num}.csv'
            send_geo = voters[voters.idu.isin(batch_idus)]
            send_geo.to_csv(batch_path, index=False, header=False)
            failed_batch = None
            try:
                results = cg.addressbatch(batch_path)
                results = pd.DataFrame(results)
                results.lat = results.lat.astype(float)
                results.lon = results.lon.astype(float)
                with open(batch_path.replace('.csv','.pkl'),'wb') as f:
                    pickle.dump(results, f)
                os.remove(batch_path)
            except Exception:
                # Best effort: one failed batch must not stop the others. Unlike a bare
                # `except:`, this still lets KeyboardInterrupt / SystemExit propagate.
                failed_batch = batch
            # Progress line: shown whenever the number of pickles on disk is a multiple of 10.
            finished_batches = [batch for batch in os.listdir(path_2_batch) if 'pkl' in batch]
            if len(finished_batches)%10 == 0:
                print_log[file]['sublog'] = [f'   || Finished {len(finished_batches)}']
                printer(print_log)
            return failed_batch

        if __name__ == '__main__':
            # NOTE(review): 500 worker processes is a lot for one machine; presumably the
            # work is dominated by waiting on the geocoder -- confirm before changing.
            with mp.Pool(processes = 500) as pool:
                outcomes = pool.map(batch_geocode, run_batches)
            # Collect failures reported by the workers so the error count below is real.
            error_batches.extend(outcome for outcome in outcomes if outcome is not None)

        print_log[file]['sublog'] = ''
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 3 | Geocode Batches ({len(run_batches)} of {len(batches)}) in {runtime} mins'
        print_log[file]['log'].append(f'    Batch Errors | {len(error_batches)}')
        printer(print_log)
        
        """ Step 4 | Merge Batches """
        t0 = time.time()
        print_log[file]['log'].append('  Step 4 | Merge Batches')
        printer(print_log)
        # Load every non-empty batch pickle from the batch folder.
        geo_paths = [f'{path_2_batch}{batch}' for batch in os.listdir(path_2_batch) if 'pkl' in batch]
        geodata_to_merge = []
        for geo_path in geo_paths:
            if os.path.getsize(geo_path) > 0:
                with open(geo_path,'rb') as f:
                    geodata_to_merge.append(pickle.load(f))
        if geodata_to_merge:
            geodata = pd.concat(geodata_to_merge, ignore_index=True)
            geodata = geodata.rename(columns = {'id':'idu'})
            with open(f'{path_2}{file}_geo.pkl','wb') as f:
                pickle.dump(geodata, f)
            runtime = round(( time.time() - t0 ) / 60 )
            print_log[file]['log'][-1] = f'  Step 4 | Merge Batches (Runtime: {runtime} mins)'
        else:
            # pd.concat raises on an empty list; without any batch results there is
            # nothing to save, so the file is left for a later run instead of
            # aborting the remaining files.
            print_log[file]['log'][-1] = '  Step 4 | Merge Batches (Skipped: no geocoded batches)'
        printer(print_log)

        """ Saving print_log """
        # The log is written to 'print_log_<YYYYMMDD>.txt'. The previous f-string put the
        # whole print_log dict into the file name, and the handle was bound to `file`,
        # overwriting the loop variable.
        savedate = datetime.now().strftime('%Y%m%d')
        with open(f'{path_2}print_log_{savedate}.txt', 'w') as log_file:
            log_file.write(string_printer(print_log))

## Analysis

I look at:
1. How successful the geocoding was
2. Whether there are systematic biases in the errors

## Other Methods

Some voters aren't geocoded by the CBG API above. One option to increase the geocoding rate is to re-geocode these failed addresses with the Google Maps and Zillow geocoders.

### Google Maps

In [None]:
# Geocode every row of pa_data with Google Maps and add the results to the
# points already stored in DATA0/pa.csv.
col_names = ['lng','lat']+list(pa_data.columns.values)
points = pd.read_csv('DATA0/pa.csv')
#points = pd.DataFrame(columns = col_names)
geocoded_rows = []
for _, voter in pa_data.iterrows():
    full_address = voter['House Number'] + ' ' + voter['Street Name'] +' ' + voter['City'] + ' ' + voter['Zip']
    # One request per address; the first match (if any) is used.
    matches = gmaps.geocode(address=full_address)
    if matches:
        location = matches[0]['geometry']['location']
        # Read the keys by name so each value lands in the right column; relying on
        # the dict's value order would swap them if the response lists 'lat' first.
        lng_lat = [location['lng'], location['lat']]
    else:
        lng_lat = [np.nan, np.nan]
    geocoded_rows.append(pd.DataFrame([lng_lat + list(voter)], columns=col_names))
# Concatenate once instead of growing the frame row by row with DataFrame.append.
points = pd.concat([points] + geocoded_rows)
points = points.reset_index()
points['Coordinates'] = list(zip(points.lng, points.lat))
points['Coordinates'] = points['Coordinates'].apply(Point)
#points.to_csv('DATA0/pa.csv')

### Zillow

In [None]:
# Look up a single voter in data1 and query Zillow's deep search for that address.
# The values in data1 carry literal double quotes, which are stripped before the query.
last_name_match = data1['Last Name'] == '"KRAYBILL"'
first_name_match = data1['First Name'] == '"MOLLY"'
x = data1[last_name_match & first_name_match]
house_number = list(x['House Number'])[0].strip('""')
street_name = list(x['Street Name'])[0].strip('""')
address = f'{house_number} {street_name}'
zipcode = list(x['Zip'])[0].strip('""')
deep_search_response = zillow_data.get_deep_search_results(address, zipcode)
result = pyz.GetDeepSearchResults(deep_search_response)
# Print longitude, latitude and the upper end of the valuation range, one per line.
for attribute in ('longitude', 'latitude', 'zestimate_valuation_range_high'):
    print(getattr(result, attribute))