# Privacy-Preserving Record Linkage of the NCVR dataset using Anonlink

In [1]:
import io
import json
import pprint
import pandas as pd
import time

import anonlink
from clkhash.clk import generate_clk_from_csv
from clkhash import schema

In [2]:
# Load the linkage schema that is later handed to the CLK generation step.
with open('data/ncvr_optimal_schema.json') as f:
    linkage_schema = schema.from_json_file(f)

In [3]:
# Read both voter files, keep only the last row seen for each voter_id,
# renumber the rows from zero and cap each dataset at its first 100000 rows.
dfa = (
    pd.read_csv('data/ncvoter-20140619-temporal-balanced-ratio-1to1-a.csv', encoding='latin-1')
    .drop_duplicates(subset='voter_id', keep='last')
    .reset_index(drop=True)[:100000]
)
dfb = (
    pd.read_csv('data/ncvoter-20140619-temporal-balanced-ratio-1to1-b.csv', encoding='latin-1')
    .drop_duplicates(subset='voter_id', keep='last')
    .reset_index(drop=True)[:100000]
)

In [4]:
# Write each dataset to an in-memory CSV without the `voter_id` column,
# then rewind the buffer so it can be read from the beginning.
a_csv = io.StringIO()
b_csv = io.StringIO()
for frame, buffer in ((dfa, a_csv), (dfb, b_csv)):
    frame.drop(columns=['voter_id']).to_csv(buffer, index=False)
    buffer.seek(0)

0

In [5]:
# Both datasets are hashed with the same secret and the same schema.
# NOTE(review): this is a hard-coded demo secret; presumably a real linkage
# would use a strong secret agreed between the data providers -- confirm.
secret = "password1234"
clks_a, clks_b = (
    generate_clk_from_csv(csv_buffer, secret, linkage_schema)
    for csv_buffer in (a_csv, b_csv)
)

generating CLKs: 100%|██████████| 100k/100k [00:29<00:00, 3.39kclk/s, mean=626, std=14.1] 
generating CLKs: 100%|██████████| 100k/100k [00:33<00:00, 2.96kclk/s, mean=626, std=14.1] 


In [6]:
def mapping_from_clks(clks_a, clks_b, threshold):
    """Link two sets of CLKs and return the matched row pairs.

    Candidate pairs are generated with anonlink, using the Dice coefficient
    as the similarity function and `threshold` as the similarity threshold,
    and are then resolved with anonlink's greedy solver. The number of
    matches found is printed.

    Returns a set of `(row_in_a, row_in_b)` tuples.
    """
    candidates = anonlink.candidate_generation.find_candidate_pairs(
        [clks_a, clks_b],
        anonlink.similarities.dice_coefficient,
        threshold,
    )
    matched_groups = anonlink.solving.greedy_solve(candidates)
    print('Found {} matches'.format(len(matched_groups)))
    # An entry of the solver output looks like '((0, 4039), (1, 2689))', i.e.
    # ((dataset_id, row_id), (dataset_id, row_id)). With only two parties the
    # dataset ids carry no information, so only the row ids are kept. A set
    # makes it easy to compare the result against the ground truth later on.
    return {(row_a, row_b) for (_, row_a), (_, row_b) in matched_groups}


In [7]:
# Time the comparison and solving step. perf_counter() is a monotonic,
# high-resolution clock, so the measured duration cannot be distorted by
# system clock adjustments the way a time.time() difference can.
start = time.perf_counter()
found_matches = mapping_from_clks(clks_a, clks_b, 0.86)
elapsed = time.perf_counter() - start
print(f'comparing and solving took {elapsed} seconds.')

Found 66706 matches
comparing and solving took 156.66797494888306 seconds.


In [8]:
# Ground truth: rows of the two datasets that share a voter_id. reset_index()
# turns each frame's row index into a column, so after the inner join every
# row holds (row index in dfa, voter_id, row index in dfb).
merge = dfa['voter_id'].reset_index().merge(
    dfb['voter_id'].reset_index(),
    how='inner',
    on='voter_id',
)

# With voter_id dropped, each remaining row is exactly one (row_a, row_b) pair.
true_matches = set(
    merge.drop(columns=['voter_id']).itertuples(index=False, name=None)
)

In [9]:
def describe_matching_quality(found_matches, true_matches):
    """Print precision and recall of `found_matches` against `true_matches`.

    Both arguments are sets of hashable match identifiers (in this notebook,
    `(row_a, row_b)` tuples).

    Precision is undefined when nothing was found, and recall is undefined
    when there are no true matches. In those cases the value is reported as
    `nan` instead of raising `ZeroDivisionError`.
    """
    tp = len(found_matches & true_matches)
    fp = len(found_matches - true_matches)
    fn = len(true_matches - found_matches)

    undefined = float('nan')
    # tp + fp == len(found_matches); tp + fn == len(true_matches).
    precision = tp / (tp + fp) if tp + fp else undefined
    recall = tp / (tp + fn) if tp + fn else undefined

    print('Precision: {:.3f}, Recall: {:.3f}'.format(precision, recall))

# Report how well the linkage result agrees with the ground truth.
describe_matching_quality(
    found_matches=found_matches,
    true_matches=true_matches,
)

Precision: 0.988, Recall: 0.992
