# Record Linkage of the NCVR dataset "in the clear"

In [1]:
import pandas as pd
import recordlinkage
import time

We use a subset of the North Carolina voter registration (NCVR) dataset. It is a collection of voter entries that changed over time, which makes it a bit more challenging for record linkage.

In [2]:
# Load both voter snapshots. When a voter_id occurs more than once, only its
# last row is kept, and the rows are then renumbered from 0 so that positional
# row numbers and index labels coincide.
dfa = (
    pd.read_csv('data/ncvoter-20140619-temporal-balanced-ratio-1to1-a.csv', encoding='latin-1')
    .drop_duplicates(subset='voter_id', keep='last')
    .reset_index(drop=True)
)
dfb = (
    pd.read_csv('data/ncvoter-20140619-temporal-balanced-ratio-1to1-b.csv', encoding='latin-1')
    .drop_duplicates(subset='voter_id', keep='last')
    .reset_index(drop=True)
)

In [3]:
# Preview the first rows of dataset B as a quick sanity check of the load.
dfb.head()

Unnamed: 0,voter_id,first_name,middle_name,last_name,age,gender,street_address,city,state,zip_code,full_phone_num
0,5168123,joseph,gregory,colley,53,m,5859 us 421 n,lillington,nc,27546.0,
1,5168120,charles,darin,smith,47,m,6045 charles ave,fayetteville,nc,28311.0,910 818 3015
2,415078,larry,keith,jenkins,62,m,219 mill knob mountain trl,otto,nc,28763.0,828 369 8789
3,5673331,michael,anthony,williams,40,m,320 maryland ave,wilmington,nc,28401.0,
4,4295395,john,daniel,larned,84,m,404 runnymede dr,fayetteville,nc,28314.0,910 999 9999


We only use the first 100 000 records from each dataset, to give an aging laptop a chance to finish in reasonable time.

In [4]:
# Keep only the first 100000 rows (by position) of each dataset to bound
# the runtime of the comparison step.
dfa, dfb = dfa.iloc[:100000], dfb.iloc[:100000]

In [5]:
# Build candidate pairs with sorted-neighbourhood indexing on the last name:
# only records whose last names fall within a window of 9 neighbouring values
# in sorted order are paired, instead of the full 100000 x 100000 cross product.
#
# The recordlinkage.Index API is used because the standalone
# `recordlinkage.SortedNeighbourhoodIndex` class is a deprecated alias that is
# not available in newer recordlinkage releases.
indexer = recordlinkage.Index()
indexer.sortedneighbourhood('last_name', window=9)
pairs = indexer.index(dfa, dfb)
print(f'we have {len(pairs)} pairs to consider')

we have 14231529 pairs to consider


In [6]:
compare_cl = recordlinkage.Compare()

# All fuzzy string features share the same Jaro-Winkler configuration.
jw_options = dict(method='jarowinkler', threshold=0.9)

# Features are registered in a fixed order: names, age, gender, address, zip.
for field in ('first_name', 'middle_name', 'last_name'):
    compare_cl.string(field, field, label=field, **jw_options)
compare_cl.numeric('age', 'age', scale=1, label='age')
compare_cl.exact('gender', 'gender', label='gender')
for field in ('street_address', 'city'):
    compare_cl.string(field, field, label=field, **jw_options)
# Last expression of the cell: its return value is what the notebook displays.
compare_cl.exact('zip_code', 'zip_code', label='zip_code')

<Compare>

In [7]:
# Evaluate every configured comparison for each candidate pair and report
# the wall-clock time the step took.
print('comparing the record pairs. Please stand by...')
start = time.time()
res = compare_cl.compute(pairs, dfa, dfb)
elapsed = time.time() - start
print(f'comparing took {elapsed} seconds.')

comparing the record pairs. Please stand by...
comparing took 312.49057388305664 seconds.


In [8]:
# Fit the ECM classifier on the comparison vectors alone (no labels are
# passed to fit) and report the wall-clock training time.
print('training an ECM classifier. Please stand by...')
start = time.time()
# NOTE(review): binarize=0.9 is presumably the cut-off used to turn similarity
# scores into 0/1 features before fitting -- confirm against the library docs.
cl = recordlinkage.ECMClassifier(binarize=0.9)
cl.fit(res)
elapsed = time.time() - start
print(f'training took {elapsed} seconds.')

training an ECM classifier. Please stand by...
training took 60.556395053863525 seconds.


In [9]:
# Apply the fitted classifier to the comparison vectors; `preds` is used below
# as the set of predicted links when computing the quality measures.
preds = cl.predict(res)

In [10]:
# Number of entries returned by the classifier (presumably the record pairs
# predicted to be links -- compare with the number of candidate pairs above).
len(preds)

68602

In [11]:
# Ground truth: a record in A and a record in B refer to the same voter when
# they share a voter_id. Expose each frame's row index as a column, join on
# voter_id, and turn the resulting (index in A, index in B) pairs into a
# MultiIndex.
left_ids = dfa[['voter_id']].reset_index()
right_ids = dfb[['voter_id']].reset_index()
merge = left_ids.merge(right_ids, how='inner', on='voter_id')
true_links = pd.MultiIndex.from_frame(merge[['index_x', 'index_y']])

In [12]:
# Evaluate the predicted links against the ground truth.
#
# `total` must be the number of record pairs in the comparison space, not the
# number of records in one file: it is only used to derive the true-negative
# count. true_links is drawn from all of A x B, so the full cross product is
# the matching total. Passing len(dfa) gave a meaningless (negative) TN cell.
# F-score, recall and precision do not depend on TN.
total_pairs = len(dfa) * len(dfb)
cm = recordlinkage.confusion_matrix(true_links, preds, total=total_pairs)
fscore = recordlinkage.fscore(cm)
print('fscore', fscore)
recall = recordlinkage.recall(true_links, preds)
print('recall', recall)
precision = recordlinkage.precision(true_links, preds)
print('precision', precision)

fscore 0.9424351238292673
recall 0.9576004694341212
precision 0.927742631410163
