# Data 2 | Geocode Voters

### New Version 

The new version solves the problem of low geocoding success in some years and improves the linking rate. This is done by creating a list of all geocoding successes and merging this coder file 1:m with each voter file.

1. Create a file of all addresses: addresses.
2. Geocode as many as possible to create a file of geocoded address: geocoded. Since I've already geocoded every year, I can just use these data without needing to use the Census Bureau API.
3. Link geocoded addresses with their geocoded variables: coder.
4. Iterate over the ungeocoded addresses (unfinished). This is basically like running the old version on the ungeocoded file. 
5. Then merge each voter snapshot with coder 1:m.

### Old Version

I look at the cleaned files and check whether they're already geocoded. I geocode them using batch_geocode() if they're not geocoded.

The function batch_geocode() takes the voter dataframe, then
1. Drops unnecessary variables
2. Creates batches
3. Geocodes each batch
4. Merges the batches


In [None]:
""" No need to rerun this. """

from data2_geocode import *

""" Step 0 | Setup """
batch_size = 250
address_cols = ['parsed', 'address', 'city', 'state', 'zipcode']

""" Step 1 | Create Addresses """

if 'addresses.pkl' not in os.listdir(path_2):
    addresses = pd.DataFrame(columns=address_cols)
    addresses.to_pickle(path_2 + 'addresses.pkl')

files = set([x.split('_chunk_')[0] for x in os.listdir(path_1) if 'pkl' in x])
for file in sorted(files):
    print(file)
    
    v = voters_from_chunks(path_1, file + '_', keep_cols=['address','city','state','zipcode'])
    for col in v.columns.values:
        v[col] = v[col].astype('str')
    v['parsed'] = v['address'] + ', ' + v['city'] + ', ' + v['state'] + ', ' + v['zipcode']
    v = v.drop_duplicates(subset=['parsed'])
    
    addresses = pd.read_pickle(path_2+'addresses.pkl')
    addresses = pd.concat([addresses, v])
    addresses = addresses.drop_duplicates(subset=['parsed'])
    addresses.to_pickle(path_2 + 'addresses.pkl')
    print('',len(addresses))
    
    del addresses, v
    
""" Step 2 | Create Geocoded """

geo_cols = ['matchtype', 'parsed', 'tigerlineid', 'side', 'statefp', 'countyfp', 'tract', 'block', 'lat', 'lon']

if 'geocoded.pkl' in os.listdir(path_2):
    geocoded = pd.DataFrame(columns=geo_cols)
    geocoded.to_pickle(path_2 + 'geocoded.pkl')
    
""" Add Geocoded from Previous Files """

files__ = [x for x in os.listdir(path_2+'/archive/') if '_geo__.pkl' in x]
files_ = [x for x in os.listdir(path_2+'/archive/') if '_geo_.pkl' in x]
for file in sorted(files__ + files_):
    print(file)
    geocoded = pd.read_pickle(path_2+'geocoded.pkl')
    geo = pd.read_pickle(path_2+'/archive/'+file)[geo_cols]
    geo = geo[geo.matchtype.isin(['Exact', 'Non_Exact'])]
    geo = geo.drop_duplicates(subset=['parsed'])
    
    geocoded = pd.concat([geocoded, geo])
    geocoded = geocoded.drop_duplicates(subset=['parsed'])
    geocoded.to_pickle(path_2 + 'geocoded.pkl')
    print('',len(geocoded))
    
    del geo, geocoded
    
""" Step 3 | Create Coder and Uncoded """

from data2_geocode import *

geocoded = pd.read_pickle(path_2+'geocoded.pkl').drop_duplicates(subset=['parsed'])
addresses = pd.read_pickle(path_2+'addresses.pkl').drop_duplicates(subset=['parsed'])

addresses.index = addresses.parsed
geocoded.index = geocoded.parsed
coder = pd.concat([addresses, geocoded], axis=1, join="inner")
coder = coder.drop('parsed', axis=1)
coder.to_pickle(path_2 + 'coder.pkl')

uncoded = addresses[~addresses.parsed.isin(geocoded.parsed.unique())]
uncoded = uncoded.drop('parsed', axis=1)
uncoded.to_pickle(path_2 + 'uncoded.pkl')

In [2]:
""" Step 5 | Use Coder with Years """

from data2_geocode import *

batch_size = 250

coder = pd.read_pickle(path_2 + 'coder.pkl').reset_index()
for col in ['address', 'city', 'state', 'zipcode']:
    coder[col] = coder[col].astype('str').str.split().str.join(' ')
coder['parsed'] = coder['address'] + ', ' + coder['city'] + ', ' + coder['state'] + ', ' + coder['zipcode']
coder = coder.drop(['address', 'city', 'state', 'zipcode'], axis=1)

files = list(set([x.split('_chunk_')[0] for x in os.listdir(path_1) if 'pkl' in x]))
#done_files = [x.split('_')[0] for x in os.listdir(path_2) if '_geo.pkl' in x]
#files = [x for x in files if x not in done_files]
for file in sorted(files):
    year = file
    save_file = f'{file}_geo.pkl'
    print(file)
    
    if save_file in os.listdir(path_2):
        print('  Done')
    
    if save_file not in os.listdir(path_2):
        print('  Merge 1:m')
        
        voters = voters_from_chunks(path_1, file, keep_cols=['idu','address','city','state','zipcode'])
        for col in voters.columns.values:
            voters[col] = voters[col].astype('str').str.split().str.join(' ')
        voters['parsed'] = voters['address'] + ', ' + voters['city'] + ', ' + voters['state'] + ', ' + voters['zipcode']
        
        geodata = pd.merge(coder, voters, left_on='parsed', right_on='parsed', validate='1:m')
        geodata.to_pickle(path_2 + save_file)
        print('  ',len(geodata))

20051125
  Done
20061020
  Done
20070119
  Done
20090101
  Merge 1:m
   4631861
20101102
  Done
20110101
  Merge 1:m
   4584142
20121106
  Done
20130101
  Done
20141231
  Merge 1:m
   5568082
20151103
  Done
20161108
  Done
20171107
  Done
20181106
  Done
20191105
  Merge 1:m
   5774976
20201103
  Merge 1:m
   6217404


### Step 4 | Geocode the Ungeocoded in Addresses (unfinished)

I still don't understand this error:

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
""" Step 4 | Run Batches on Uncoded"""

from data2_geocode import *

""" Create Batches """
batch_size = 500
path_2_batch = path_2 + 'batches/'

uncoded = pd.read_pickle(path_2 + 'uncoded.pkl').reset_index()
uncoded['parsed'] = uncoded.index

id_list = uncoded.index
batch_ids = [id_list[x:x + batch_size] for x in range(0, len(id_list), batch_size)]
#batches = zip(range(len(batch_ids)), batch_ids, [print_log for b in batch_ids])
batches = list(zip(range(len(batch_ids)), batch_ids))[:10]
finished_batches = os.listdir(path_2_batch)

""" Step 4.a | Geocode Batches """

t0 = time.time()
#print_log[file]['log'].append(f'  Step 3 | Geocode Batches ({len(run_batches)} of {len(batches)})')
#printer(print_log)

def batch_geocode(batch):
    """ Take a list of idus and geocode them to a file. """

    #batch_num, batch_idus, printer, print_log = batch
    batch_num, batch_idus = batch
    batch_path = f'{path_2_batch}{batch_num}.csv'
    send_geo = uncoded[uncoded.index.isin(batch_idus)]
    #send_geo['id'] = send_geo.index
    send_geo.to_csv(batch_path, index=False, header=False)
    try:
        results = cg.addressbatch(batch_path)
        results = pd.DataFrame(results)
        results.lat = results.lat.astype(float)
        results.lon = results.lon.astype(float)
        
        results_path = batch_path.replace('.csv','.pkl')
        results.to_pickle(results_path)
        #os.remove(batch_path)
    except:
        #os.remove(batch_path)
        print(batch_num)
        #error_batches.append(batch)
    finished_batches = [batch for batch in os.listdir(path_2_batch) if 'pkl' in batch]
    #if len(finished_batches)%10 == 0:
        #print_log[file]['sublog'] = [f'   || Finished {len(finished_batches)}']
        #printer(print_log)

if __name__ == '__main__':
    with mp.Pool(processes = 500) as pool:
        pool.map(batch_geocode, batches)

#print_log[file]['sublog'] = ''
#runtime = round(( time.time() - t0 ) / 60 )
#print_log[file]['log'][-1] = f'  Step 3 | Geocode Batches ({len(run_batches)} of {len(batches)}) in {runtime} mins'
#print_log[file]['log'].append(f'    Batch Errors | {len(error_batches)}')
#printer(print_log)



In [1]:
""" Merge """

from data2_geocode import *

merge_files = [x for x in os.listdir(path_2 + 'batches/') if '.pkl' in x]

f = []
for file in merge_files:
    try:
        f.append(pd.read_pickle(path_2 + 'batches/' + file))
    except:
        pass
merged = pd.concat(f)
merged = merged[merged.matchtype.isin(['Exact', 'Non_Exact'])]

from datetime import date
today = date.today()
savedate = ''.join([str(today.year),str(today.strftime('%m')),str(today.strftime('%d'))])

merged.to_pickle(path_2 + 'batches/merged_' + savedate)

In [1]:
from data2_geocode import *
from datetime import date

In [2]:
geocoded = pd.read_pickle(path_2+'geocoded.pkl')
geocoded = geocoded.drop_duplicates(subset=['parsed'])

In [3]:
addresses = pd.read_pickle(path_2+'addresses.pkl')
addresses = addresses.drop_duplicates(subset=['parsed'])

In [7]:
today = date.today()
savedate = ''.join([str(today.year),str(today.strftime('%m')),str(today.strftime('%d'))])
merged = pd.read_pickle(path_2 + 'batches/merged_' + savedate)

In [8]:
geo_cols = ['matchtype', 'parsed', 'tigerlineid', 'side', 'statefp', 'countyfp', 'tract', 'block', 'lat', 'lon']
merged = merged[geo_cols].drop_duplicates(subset=['parsed'])

In [14]:
addresses.index = addresses.parsed
merged.index = merged.parsed

In [20]:
recoder = pd.concat([addresses, merged], axis=1, join="inner").reset_index(drop=True)

In [22]:
recoder.to_pickle(path_2 + 'batches/recoded_' + savedate)

In [79]:
recoder = pd.read_pickle(path_2 + 'batches/recoded_' + savedate)

In [81]:
recoder = recoder.reset_index(drop=True)

In [69]:
recoder = recoder.drop_duplicates(subset='parsed')

In [23]:
coder = pd.read_pickle(path_2 + 'coder.pkl')

In [27]:
coder = coder.reset_index()

In [29]:
coder = coder.drop_duplicates(subset=['parsed'])

In [30]:
coder.index = coder.parsed

In [40]:
recoder = recoder[~recoder.index.isin(coder.index)]

In [39]:
recoder.index = recoder.parsed

In [42]:
new_coder = pd.concat([coder, recoder])

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

## Tried and True Version

In [None]:
from data2_geocode import *

""" Step 0 | Setup """
batch_size = 250

future_files = set([x.split('_chunk_')[0] for x in os.listdir(path_1) if 'pkl' in x])
files = [file for file in future_files if f'{file}_geo.pkl' not in os.listdir(path_2)]
for file in sorted(files):
    year = file
    save_file = f'{file}_geo.pkl'
    print_log[file] = {'log':[],'sublog':[]}
    printer(print_log)
    
    if save_file in os.listdir(path_2):
        print_log[file]['log'].append('  Done')
        printer(print_log)
    if int(year) < 2010:
        print_log[file]['log'].append('  Too Old (for now)')
        printer(print_log)
    
    if (save_file not in os.listdir(path_2)) & (int(year) >= 2010):
        path_2_batch = f'{path_2}{year}_batches/'
        if not os.path.exists(path_2_batch):
            os.makedirs(path_2_batch)
        
        """ Step 1 | Open Chunked Data """
        t0 = time.time()
        print_log[file]['log'].append('  Step 1 | Open Chunked Data')
        printer(print_log)
        chunk_paths, chunk_files = [f'{path_1}{chunk}' for chunk in os.listdir(path_1) if file in chunk], []
        for chunk_path in chunk_paths:
            chunk = pd.read_pickle(chunk_path)
            #with open(chunk_path,'rb') as f: 
            #    chunk = pickle.load(f)
            chunk = chunk[['idu','address','city','state','zipcode']]
            chunk_files.append(chunk)
        voters = pd.concat(chunk_files, ignore_index=True)
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 1 | Open Chunked Data (Runtime: {runtime} mins)'
        printer(print_log)
        
        """ Step 2 | Create Batches """
        t0 = time.time()
        print_log[file]['log'].append('  Step 2 | Create Batches')
        printer(print_log)
        id_list = voters.idu.unique()
        batch_ids = [id_list[x:x + batch_size] for x in range(0, len(id_list), batch_size)]
        batches = list(zip(range(len(batch_ids)), batch_ids, [print_log for b in batch_ids]))
        finished_batches = os.listdir(path_2_batch)
        run_batches, error_batches = [(b,d,printer,pl) for (b,d,pl) in batches if f'{b}.pkl' not in finished_batches], []
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 2 | Create Batches (Runtime: {runtime} mins)'
        printer(print_log)
        
        """ Step 3 | Geocode Batches """
        t0 = time.time()
        print_log[file]['log'].append(f'  Step 3 | Geocode Batches ({len(run_batches)} of {len(batches)})')
        printer(print_log)
        
        def batch_geocode(batch):
            """ Take a list of idus and geocode them to a file. """
            
            batch_num, batch_idus, printer, print_log = batch
            batch_path = f'{path_2_batch}{batch_num}.csv'
            send_geo = voters[voters.idu.isin(batch_idus)]
            send_geo.to_csv(batch_path, index=False, header=False)
            try:
                results = cg.addressbatch(batch_path)
                results = pd.DataFrame(results)
                results.lat = results.lat.astype(float)
                results.lon = results.lon.astype(float)
                with open(batch_path.replace('.csv','.pkl'),'wb') as f:
                    pickle.dump(results, f)
                os.remove(batch_path)
            except:
                error_batches.append(batch)
            finished_batches = [batch for batch in os.listdir(path_2_batch) if 'pkl' in batch]
            if len(finished_batches)%10 == 0:
                print_log[file]['sublog'] = [f'   || Finished {len(finished_batches)}']
                printer(print_log)
        
        if __name__ == '__main__':
            with mp.Pool(processes = 500) as pool:
                pool.map(batch_geocode, run_batches)
        
        print_log[file]['sublog'] = ''
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 3 | Geocode Batches ({len(run_batches)} of {len(batches)}) in {runtime} mins'
        print_log[file]['log'].append(f'    Batch Errors | {len(error_batches)}')
        printer(print_log)
        
        """ Step 4 | Merge Batches """
        t0 = time.time()
        print_log[file]['log'].append('  Step 4 | Merge Batches')
        printer(print_log)
        geo_paths, geodata_to_merge = [f'{path_2_batch}{batch}' for batch in os.listdir(path_2_batch) if 'pkl' in batch], []
        for geo_path in geo_paths:
            if os.path.getsize(geo_path) > 0:
                with open(geo_path,'rb') as f: 
                    results = pickle.load(f)
                    geodata_to_merge.append(results)
        geodata = pd.concat(geodata_to_merge, ignore_index=True)
        geodata = geodata.rename(columns = {'id':'idu'})
        with open(f'{path_2}{file}_geo.pkl','wb') as f:
            pickle.dump(geodata, f)
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 4 | Merge Batches (Runtime: {runtime} mins)'
        printer(print_log)

        """ Saving print_log """
        now = datetime.now()
        savedate = ''.join([str(now.year),str(now.strftime('%m')),str(now.strftime('%d'))])
        file = open(f'{path_2}print_log_{savedate}.txt', 'w')
        file.write(string_printer(print_log))
        file.close()

20181106
  Step 1 | Open Chunked Data (Runtime: 0 mins)
  Step 2 | Create Batches (Runtime: 0 mins)
  Step 3 | Geocode Batches (31818 of 31818) in 156 mins
    Batch Errors | 21
  Step 4 | Merge Batches (Runtime: 1 mins)
2019
  Step 1 | Open Chunked Data (Runtime: 7 mins)
  Step 2 | Create Batches (Runtime: 0 mins)
  Step 3 | Geocode Batches (31071 of 31071) in 741 mins
    Batch Errors | 1293
  Step 4 | Merge Batches (Runtime: 1 mins)
20190430
  Step 1 | Open Chunked Data (Runtime: 1 mins)
  Step 2 | Create Batches (Runtime: 0 mins)
  Step 3 | Geocode Batches (29961 of 29961) in 153 mins
    Batch Errors | 59
  Step 4 | Merge Batches (Runtime: 1 mins)
20190514
  Step 1 | Open Chunked Data (Runtime: 1 mins)
  Step 2 | Create Batches (Runtime: 0 mins)
  Step 3 | Geocode Batches (29961 of 29961) in 170 mins
    Batch Errors | 82
  Step 4 | Merge Batches (Runtime: 1 mins)
20190709
  Step 1 | Open Chunked Data (Runtime: 1 mins)
  Step 2 | Create Batches (Runtime: 0 mins)
  Step 3 | Geocode

# Experiments with Fixing Batch Errors

In [105]:
from data2_geocode import *

""" Step 0 | Setup """

future_files = set([x.split('_chunk_')[0] for x in os.listdir(path_1) if 'pkl' in x])
files = [file for file in future_files if f'{file}_geo.pkl']# not in os.listdir(path_2)]
file = sorted(files)[0]
print(file)
year = file

path_2_batch = f'{path_2}{year}_batches/'

chunk_paths, chunk_files = [f'{path_1}{chunk}' for chunk in os.listdir(path_1) if file in chunk], []
for chunk_path in chunk_paths[:1]:
    chunk = pd.read_pickle(chunk_path)
    #with open(chunk_path,'rb') as f: 
    #    chunk = pickle.load(f)
    #chunk = chunk[['idu','address','city','state','zipcode']]
    chunk.index = chunk.idu
    chunk_files.append(chunk)
#voters = pd.concat(chunk_files)

2010


In [110]:
chunk[['house_number', 'street_name', 'street_type', 'state', 'county',
       'city', 'zipcode', 'address']]

Unnamed: 0_level_0,house_number,street_name,street_type,state,county,city,zipcode,address
idu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DS1391414721,900,FERN,ST,NC,ROCKINGHAM,MADISON,27025,900 FERN ST
DS1388914688,818,RIDGE,ST,NC,ROCKINGHAM,MADISON,27025,818 RIDGE ST
DS1389014689,818,RIDGE,ST,NC,ROCKINGHAM,MADISON,27025,818 RIDGE ST
DS1393914755,119,VALLEY FIELD,RD,NC,ROCKINGHAM,STONEVILLE,27048,119 VALLEY FIELD RD
DS1393814754,1229,ELLERBE,CT,NC,ROCKINGHAM,EDEN,27288,1229 ELLERBE CT
...,...,...,...,...,...,...,...,...
EH776021100057436,8320,SHILOH CREEK,CT,NC,WAKE,RALEIGH,27616,8320 SHILOH CREEK CT
EH776578100058107,801,SOUTHAMPTON,DR,NC,WAKE,KNIGHTDALE,27545,801 SOUTHAMPTON DR
EH776579100058109,5412,ORCHARD ORIOLE,TRL,NC,WAKE,WAKE FOREST,27587,5412 ORCHARD ORIOLE TRL
EH776580100058110,2813,FERRETT,CT,NC,WAKE,RALEIGH,27610,2813 FERRETT CT


In [95]:
batch_size = 100

id_list = list(set(voters.index))
batch_ids = [id_list[x:x + batch_size] for x in range(0, len(id_list), batch_size)]
batches = list(zip(range(len(batch_ids)), batch_ids, [print_log for b in batch_ids]))
run_batches, error_batches = [(b,d,printer,pl) for (b,d,pl) in batches if f'{b}.pkl'], []

In [96]:
batch = batches[10]
batch_num, batch_idus, print_log = batch
batch_path = f'{path_2_batch}{batch_num}.csv'
send_geo = voters[voters.index.isin(batch_idus)]
send_geo.to_csv(batch_path, index=False, header=False)
results = cg.addressbatch(batch_path)
results = pd.DataFrame(results)
results

Unnamed: 0,id,address,match,matchtype,parsed,tigerlineid,side,statefp,countyfp,tract,block,lat,lon
0,CZ6373090845,"155 LAKE FOREST DR, PINEHURST, NC, 28374",True,Exact,"155 LAKE FOREST DR, PINEHURST, NC, 28374",20298847,L,37,125,950704,2015,35.184982,-79.485830
1,CJ625023034407,"605 WOOD ST, SELMA, NC, 27576",True,Exact,"605 WOOD ST, SELMA, NC, 27576",26525633,L,37,101,040301,1005,35.540028,-78.273460
2,BR1259217615303,"1204 SOUTH POINT RD, BELMONT, NC, 28012",False,,,,,,,,,,
3,AB2514629796,"180 CEDAR RD, HICKORY, NC, 28601",True,Exact,"180 CEDAR RD, HICKORY, NC, 28601",45793069,R,37,003,040700,3006,35.812172,-81.310840
4,BF2252724498,"103 RED WOOD ST, MOYOCK, NC, 27958",True,Exact,"103 RED WOOD ST, MOYOCK, NC, 27958",81385223,R,37,053,110301,1000,36.478210,-76.047000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,BW931110009,"201 ALLEN ST, CREEDMOOR, NC, 27522",True,Exact,"201 ALLEN ST, CREEDMOOR, NC, 27522",77106736,L,37,077,970606,2006,36.117520,-78.681526
96,BN29499630028675,"3401 OLD VINEYARD RD, WINSTON SALEM, NC, 27103",True,Non_Exact,"3401 OLD VINEYARD RD, WINSTON SALEM, NC, 27103",642174510,R,37,067,003805,1004,36.077100,-80.311190
97,AS6633475725,"311 FAIRVIEW DR, EMERALD ISLE, NC, 28594",True,Exact,"311 FAIRVIEW DR, EMERALD ISLE, NC, 28594",33865571,L,37,031,970904,1017,34.667300,-77.036460
98,AW1180059093384,"1207 21ST AVE, HICKORY, NC, 28601",False,,,,,,,,,,


In [100]:
failed_ids = list(results[results['match'] == False].id)
failed_ids

['BR1259217615303',
 'DZ5109030043544',
 'DH3267088189',
 'AX2590744610',
 'BB3846763856',
 'BL173802239628',
 'BL30683730069560',
 'EP1504618909',
 'BB865581024767',
 'BB1161518574',
 'AK4053648623',
 'AM2134519322',
 'DL3266330694',
 'CM3296242233',
 'DE2152819506',
 'AW555629028918',
 'AW1180059093384']

In [198]:
results.columns.values

array(['id', 'address', 'match', 'matchtype', 'parsed', 'tigerlineid',
       'side', 'statefp', 'countyfp', 'tract', 'block', 'lat', 'lon'],
      dtype=object)

In [129]:
send_geo = voters[voters.index.isin(failed_ids)]
send_geo.to_csv(batch_path.replace('.csv','_.csv'), index=False, header=False)
results = cg.addressbatch(batch_path.replace('.csv','_.csv'))
results = pd.DataFrame(results)
results

Unnamed: 0,id,address,match,matchtype,parsed,tigerlineid,side,statefp,countyfp,tract,block,lat,lon
0,BL173802239628,"403 LAKE HOGAN FARM RD, CHAPEL HILL, NC, 27516",False,,,,,,,,,,
1,BB1161518574,"2615 BALL PARK RD, LAWNDALE, NC, 28090",False,,,,,,,,,,
2,CM3296242233,"303 RURAL RTE 7 , KINSTON, NC, 28501",False,,,,,,,,,,
3,DH3267088189,"300 MAIN ST, BURGAW, NC, 28425",False,,,,,,,,,,
4,BR1259217615303,"1204 SOUTH POINT RD, BELMONT, NC, 28012",False,,,,,,,,,,
5,DE2152819506,"1205 THE OAKS APTS , CHAPEL HILL, NC, 27514",False,,,,,,,,,,
6,DZ5109030043544,"185 CORBAN AVE, CONCORD, NC, 28025",False,,,,,,,,,,
7,BL30683730069560,"1 DUKE UNIVERSITY WEST CAMPUS DORM, DURHAM, NC...",False,,,,,,,,,,
8,AK4053648623,"10161 CREEKSIDE DR, LELAND, NC, 28451",False,,,,,,,,,,
9,AW555629028918,"0 ROUTE 1 , CONOVER, NC, 28613",False,,,,,,,,,,


In [189]:
send_geo

Unnamed: 0_level_0,idu,address,city,state,zipcode
idu,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BR1259217615303,BR1259217615303,1204 SOUTH POINT RD,BELMONT,NC,28012
CM3296242233,CM3296242233,303 RURAL RTE 7,KINSTON,NC,28501
BL30683730069560,BL30683730069560,1 DUKE UNIVERSITY WEST CAMPUS DORM,DURHAM,NC,27708
DE2152819506,DE2152819506,1205 THE OAKS APTS,CHAPEL HILL,NC,27514
AW555629028918,AW555629028918,0 ROUTE 1,CONOVER,NC,28613
AW1180059093384,AW1180059093384,1207 21ST AVE,HICKORY,NC,28601
AX2590744610,AX2590744610,190 C D THOMAS RD,SILER CITY,NC,27344
BB865581024767,BB865581024767,3814 HARRIS CREEK RD,LAWNDALE,NC,28090
BB1161518574,BB1161518574,2615 BALL PARK RD,LAWNDALE,NC,28090
BB3846763856,BB3846763856,1956 PLEASANT HILL CH RD,SHELBY,NC,28152


In [230]:
failed_idu_list = []

In [221]:
dd_list = []

In [231]:
def relabel(x):
    return x.replace('CH','CHURCH')

v = send_geo.iloc[10]
d = cg.address(relabel(v.address), city=v.city, state=v.state, zipcode=v.zipcode)
if len(d)>0: 
    dd = {
        'idu':[v.idu],
        'address':[relabel(v.address)],
        'city':[v.city],
        'state':[v.state],
        'zipcode':[v.zipcode],
        'parsed':[d[0]['matchedAddress']],
        'lat':[d[0]['coordinates']['y']],
        'lon':[d[0]['coordinates']['x']]
    }
    dd_list.append(pd.DataFrame(dd))

In [232]:
dd_list[1]

Unnamed: 0,idu,address,city,state,zipcode,parsed,lat,lon
0,AK4053648623,10161 CREEKSIDE DR,LELAND,NC,28451,"10161 CREEKSIDE DR SE, LELAND, NC, 28451",34.198963,-77.98203


In [None]:
['id', 'address', 'match', 'matchtype', 'parsed', 'tigerlineid',
       'side', 'statefp', 'countyfp', 'tract', 'block', 'lat', 'lon']

In [234]:
dd_con = pd.concat(dd_list)
dd_con

Unnamed: 0,idu,address,city,state,zipcode,parsed,lat,lon
0,DH3267088189,300 MAIN ST,BURGAW,NC,28425,"300 W MAIN ST, BURGAW, NC, 28425",34.514664,-77.919876
0,AK4053648623,10161 CREEKSIDE DR,LELAND,NC,28451,"10161 CREEKSIDE DR SE, LELAND, NC, 28451",34.198963,-77.98203


In [None]:
address_rename_dict = {
    'CH':'CHURCH',
    
}

In [186]:
cg.onelineaddress('403 LAKE HOGAN FARM RD, CHAPEL HILL, NC, 27516', returntype='locations')

[]

In [33]:
voters = pd.read_pickle(path_2 + '2013_geo.pkl')

In [2]:
from data2_geocode import *

""" Step 0 | Setup """
batch_size = 500

future_files = set([x.split('_chunk_')[0] for x in os.listdir(path_1) if 'pkl' in x])
files = [file for file in future_files if f'{file}_geo.pkl' not in os.listdir(path_2)]
for file in sorted(files):
    year = file
    save_file = f'{file}_geo.pkl'
    print_log[file] = {'log':[],'sublog':[]}
    printer(print_log)
    
    if save_file in os.listdir(path_2):
        print_log[file]['log'].append('  Done')
        printer(print_log)
    if int(year) < 2010:
        print_log[file]['log'].append('  Too Old (for now)')
        printer(print_log)
    
    if (save_file not in os.listdir(path_2)) & (int(year) >= 2010):
        path_2_batch = f'{path_2}{year}_batches/'
        if not os.path.exists(path_2_batch):
            os.makedirs(path_2_batch)
        
        """ Step 1 | Open Chunked Data """
        t0 = time.time()
        print_log[file]['log'].append('  Step 1 | Open Chunked Data')
        printer(print_log)
        chunk_paths, chunk_files = [f'{path_1}{chunk}' for chunk in os.listdir(path_1) if file in chunk], []
        for chunk_path in chunk_paths:
            chunk = pd.read_pickle(chunk_path)
            #with open(chunk_path,'rb') as f: 
            #    chunk = pickle.load(f)
            chunk = chunk[['idu','address','city','state','zipcode']]
            chunk_files.append(chunk)
        voters = pd.concat(chunk_files, ignore_index=True)
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 1 | Open Chunked Data (Runtime: {runtime} mins)'
        printer(print_log)
        
        """ Step 2 | Create Batches """
        t0 = time.time()
        print_log[file]['log'].append('  Step 2 | Create Batches')
        printer(print_log)
        id_list = voters.idu.unique()
        batch_ids = [id_list[x:x + batch_size] for x in range(0, len(id_list), batch_size)]
        batches = list(zip(range(len(batch_ids)), batch_ids, [print_log for b in batch_ids]))
        finished_batches = os.listdir(path_2_batch)
        run_batches, error_batches = [(b,d,printer,pl) for (b,d,pl) in batches if f'{b}.pkl' not in finished_batches], []
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 2 | Create Batches (Runtime: {runtime} mins)'
        printer(print_log)
        
        """ Step 3 | Geocode Batches """
        t0 = time.time()
        print_log[file]['log'].append(f'  Step 3 | Geocode Batches ({len(run_batches)} of {len(batches)})')
        printer(print_log)
        
        def batch_geocode(batch):
            """ Take a list of idus and geocode them to a file. """
            
            batch_num, batch_idus, printer, print_log = batch
            batch_path = f'{path_2_batch}{batch_num}.csv'
            send_geo = voters[voters.idu.isin(batch_idus)]
            send_geo.to_csv(batch_path, index=False, header=False)
            try:
                results = cg.addressbatch(batch_path)
                results = pd.DataFrame(results)
                results.lat = results.lat.astype(float)
                results.lon = results.lon.astype(float)
                with open(batch_path.replace('.csv','.pkl'),'wb') as f:
                    pickle.dump(results, f)
                os.remove(batch_path)
            except:
                error_batches.append(batch)
            finished_batches = [batch for batch in os.listdir(path_2_batch) if 'pkl' in batch]
            if len(finished_batches)%10 == 0:
                print_log[file]['sublog'] = [f'   || Finished {len(finished_batches)}']
                printer(print_log)
        
        if __name__ == '__main__':
            with mp.Pool(processes = 500) as pool:
                pool.map(batch_geocode, run_batches)
        
        print_log[file]['sublog'] = ''
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 3 | Geocode Batches ({len(run_batches)} of {len(batches)}) in {runtime} mins'
        print_log[file]['log'].append(f'    Batch Errors | {len(error_batches)}')
        printer(print_log)
        
        """ Step 4 | Merge Batches """
        t0 = time.time()
        print_log[file]['log'].append('  Step 4 | Merge Batches')
        printer(print_log)
        geo_paths, geodata_to_merge = [f'{path_2_batch}{batch}' for batch in os.listdir(path_2_batch) if 'pkl' in batch], []
        for geo_path in geo_paths:
            if os.path.getsize(geo_path) > 0:
                with open(geo_path,'rb') as f: 
                    results = pickle.load(f)
                    geodata_to_merge.append(results)
        geodata = pd.concat(geodata_to_merge, ignore_index=True)
        geodata = geodata.rename(columns = {'id':'idu'})
        with open(f'{path_2}{file}_geo.pkl','wb') as f:
            pickle.dump(geodata, f)
        runtime = round(( time.time() - t0 ) / 60 )
        print_log[file]['log'][-1] = f'  Step 4 | Merge Batches (Runtime: {runtime} mins)'
        printer(print_log)
        
        """ Step 5 | Fix Errors """
        t0 = time.time()
        print_log[file]['log'].append('  Step 5 | Fix Errors')
        printer(print_log)
        
        geo_id_list = geodata.idu.unique()
        fix_list = [idu for idu in id_list if idu not in geo_id_list]
        fix_id_batches = [fix_list[x:x + batch_size] for x in range(0, len(fix_list), batch_size)]
        batches = list(zip(range(len(fix_id_batches)), fix_id_batches, [print_log for b in fix_id_batches]))
        
        def fix_errors(batch):
            batch_num, batch_idus, print_log = batch
            batch_path = f'{path_2_batch}{batch_num}_fix'
            fix_voters = voters[voters.idu.isin(batch_idus)]
        
            def relabel(x):
                return x.replace('CH','CHURCH')

            fix_data_list, fix_error_list = [], []
            for v in fix_voters.iterrows():
                #v = fix_voters.iloc[15]
                result = cg.address(relabel(v.address), city=v.city, state=v.state, zipcode=v.zipcode)
                if len(result)>0: 
                    result_dict = {
                        'idu':[v.idu],
                        'address':[relabel(v.address)],
                        'city':[v.city],
                        'state':[v.state],
                        'zipcode':[v.zipcode],
                        'parsed':[result[0]['matchedAddress']],
                        'lat':[result[0]['coordinates']['y']],
                        'lon':[result[0]['coordinates']['x']]
                    }
                    fix_data_list.append(pd.DataFrame(result_dict))
                else:
                    fix_error_list.append(v.idu)

            fixed_batch = pd.concat(fix_data_list)
            with open(f'{batch_path}.pkl','wb') as f:
                pickle.dump(fixed_batch, f)
            with open(f'{batch_path}_errors.pkl','wb') as f:
                pickle.dump(fix_error_list, f)
                
            finished_fixed_batches = [batch for batch in os.listdir(path_2_batch) if '_fix.pkl' in batch]
            if len(finished_fixed_batches)%10 == 0:
                print_log[file]['sublog'] = [f'   || Finished {len(finished_fixed_batches)}']
                printer(print_log)
                
            
        #runtime = round(( time.time() - t0 ) / 60 )
        #print_log[file]['log'][-1] = f'  Step 5 | Fix Errors (Runtime: {runtime} mins)'
        #printer(print_log)
        
        """ Saving print_log """
        now = datetime.now()
        savedate = ''.join([str(now.year),str(now.strftime('%m')),str(now.strftime('%d'))])
        file = open(f'{path_2}{print_log}_{savedate}.txt', 'w')
        file.write(string_printer(print_log))
        file.close()

20051125
  Step 1 | Open Chunked Data


KeyboardInterrupt: 

In [238]:
geo_id_list = geodata.idu.unique()


In [242]:
len(geo_id_list)

6785676

In [243]:
len(id_list)

6863676

In [244]:
#fix_list = np.isin(geo_id_list, id_list)
# this takes awhile, so use isna instead

KeyboardInterrupt: 

In [None]:
#fix_list = [idu for idu in id_list if idu not in geo_id_list]


In [None]:
fix_id_batches = [fix_list[x:x + batch_size] for x in range(0, len(fix_list), batch_size)]
batches = list(zip(range(len(fix_id_batches)), fix_id_batches, [print_log for b in fix_id_batches]))

In [None]:
def fix_errors(batch):
    batch_num, batch_idus, print_log = batch
    batch_path = f'{path_2_batch}{batch_num}_fix'
    fix_voters = voters[voters.idu.isin(batch_idus)]

    def relabel(x):
        return x.replace('CH','CHURCH')

    fix_data_list, fix_error_list = [], []
    for v in fix_voters.iterrows():
        #v = fix_voters.iloc[15]
        result = cg.address(relabel(v.address), city=v.city, state=v.state, zipcode=v.zipcode)
        if len(result)>0: 
            result_dict = {
                'idu':[v.idu],
                'address':[relabel(v.address)],
                'city':[v.city],
                'state':[v.state],
                'zipcode':[v.zipcode],
                'parsed':[result[0]['matchedAddress']],
                'lat':[result[0]['coordinates']['y']],
                'lon':[result[0]['coordinates']['x']]
            }
            fix_data_list.append(pd.DataFrame(result_dict))
        else:
            fix_error_list.append(v.idu)

    fixed_batch = pd.concat(fix_data_list)
    with open(f'{batch_path}.pkl','wb') as f:
        pickle.dump(fixed_batch, f)
    with open(f'{batch_path}_errors.pkl','wb') as f:
        pickle.dump(fix_error_list, f)

    finished_fixed_batches = [batch for batch in os.listdir(path_2_batch) if '_fix.pkl' in batch]
    if len(finished_fixed_batches)%10 == 0:
        print_log[file]['sublog'] = [f'   || Finished {len(finished_fixed_batches)}']
        printer(print_log)

In [None]:
if __name__ == '__main__':
    with mp.Pool(processes = 500) as pool:
        pool.map(fix_errors, batches)

## Analysis

I look at:
1. How successful the geocoding was
2. Whether there are systematic biases in the errors