# Learning Active Learning

## Setup

In [1]:
!pip install numpy --upgrade



In [2]:
# Check if we're running locally, or in Google Colab.
try:
    import google.colab
    COLAB = True
except ModuleNotFoundError:
    COLAB = False
    
# If we're running in Colab, download the tutorial functions file 
# to the Colab session local directory, and install required libraries.
if COLAB:
    import requests
    
    tutorial_functions_url = "https://raw.githubusercontent.com/rachhouse/intro-to-data-linking/main/tutorial_notebooks/linking_tutorial_functions.py"
    r = requests.get(tutorial_functions_url)
    
    with open("linking_tutorial_functions.py", "w") as fh:
        fh.write(r.text)
    
    !pip install -q altair dedupe dedupe-variable-name jellyfish recordlinkage 

In [3]:
import datetime
import itertools
import os
import pathlib
import re
from typing import Any, Dict, Optional

import dedupe
import pandas as pd

import linking_tutorial_functions as tutorial

INFO:root:Generating grammar tables from /usr/lib/python3.7/lib2to3/Grammar.txt
INFO:root:Generating grammar tables from /usr/lib/python3.7/lib2to3/PatternGrammar.txt


In [4]:
WORKING_DIR = pathlib.Path(os.path.abspath(''))
WORKING_DIR

PosixPath('/content')

## Dataset Setup!

In [5]:
df_A, df_B, df_ground_truth = tutorial.load_febrl_training_data(COLAB)

## Data Augmentation

In [6]:
def format_dob(dob: str) -> Optional[str]:
    """ Transform date of birth format from YYYYMMDD to mm/dd/yy.
        If DOB cannot be transformed, return None.
    """
    try:
        if re.match(r"\d{8}", dob):
            return (datetime.datetime.strptime(dob, "%Y%m%d")).strftime("%m/%d/%y")
    except:
        pass

    return None

def strip_and_null(x: Any) -> Optional[str]:
    """ Stringify incoming variable, remove trailing/leading whitespace
        and return resulting string. Return None if resulting string is empty.
    """
    x = str(x).strip()
    
    if x == "":
        return None
    else:
        return x
    
def convert_df_to_dict(df: pd.DataFrame) -> Dict[str, Dict]:
    """ Convert pandas DataFrame to dict keyed by record id.
        Convert all fields to strings or Nones to satisfy dedupe.
        Transform date format of date_of_birth field.
    """    

    for col in df.columns:
        df[col] = df[col].apply(lambda x: strip_and_null(x))

    df["date_of_birth"] = df["date_of_birth"].apply(lambda x: format_dob(x))    

    return df.to_dict("index")

In [7]:
records_A = convert_df_to_dict(df_A)
records_B = convert_df_to_dict(df_B)

## Training Setup

In [8]:
%%time

fields = [
    { "field" : "first_name", "type" : "Name" },
    { "field" : "surname", "type" : "Name" },
    { "field" : "address_1", "type" : "ShortString" },
    { "field" : "address_2", "type" : "ShortString" },
    { "field" : "suburb", "type" : "ShortString" },
    { "field" : "postcode", "type" : "Exact" },
    { "field" : "state", "type" : "Exact" },
    { "field" : "date_of_birth", "type" : "DateTime" },
    { "field" : "soc_sec_id", "type" : "Exact" },
]

linker = dedupe.RecordLink(fields)
linker.prepare_training(records_A, records_B)

INFO:dedupe.canopy_index:Removing stop word re
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)


CPU times: user 57.7 s, sys: 870 ms, total: 58.6 s
Wall time: 58 s


## Active Learning Labeling Session

In [9]:
# This is cool
dedupe.console_label(linker)

first_name : jorja
surname : goldsworthy
address_1 : menzies court
address_2 : rosetta village
suburb : None
postcode : 2061
state : sa
date_of_birth : 05/22/66
soc_sec_id : 8895851

first_name : lachlan
surname : goldsworthy
address_1 : None
address_2 : None
suburb : williamsptown
postcode : None
state : vic
date_of_birth : None
soc_sec_id : 5468655

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


u


first_name : hayden
surname : campbell
address_1 : None
address_2 : None
suburb : sherwood
postcode : 3177
state : nsw
date_of_birth : 11/02/72
soc_sec_id : 7296377

first_name : sarsha
surname : soutggate
address_1 : forbes street
address_2 : None
suburb : sherwood
postcode : 6107
state : nsw
date_of_birth : None
soc_sec_id : 5209645

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


first_name : benjamin
surname : coleman
address_1 : bennelong crescent
address_2 : warrol
suburb : port macquarie
postcode : 2101
state : nsw
date_of_birth : None
soc_sec_id : 4225212

first_name : jakob
surname : mayer
address_1 : None
address_2 : None
suburb : port macquarie
postcode : 4163
state : sa
date_of_birth : 05/25/07
soc_sec_id : 2080665

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : kiandra
surname : dunstone
address_1 : None
address_2 : None
suburb : oaklands park
postcode : 6163
state : wa
date_of_birth : 10/29/11
soc_sec_id : 5277244

first_name : kiandra
surname : dunstone
address_1 : None
address_2 : None
suburb : oaklands park
postcode : 6163
state : wa
date_of_birth : None
soc_sec_id : 5277244

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : shelbey
surname : stanfield
address_1 : duffy street
address_2 : None
suburb : bowen mountain
postcode : 6110
state : vic
date_of_birth : 06/03/94
soc_sec_id : 8681044

first_name : jessiy
surname : paterson
address_1 : mault place
address_2 : None
suburb : belzot
postcode : 4005
state : vic
date_of_birth : 06/05/94
soc_sec_id : 6273963

1/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
first_name : wade
surname : hoepfner
address_1 : perry drive
address_2 : None
suburb : sunshine
postcode : 7306
state : nsw
date_of_birth : 02/27/75
soc_sec_id : 6631640

first_name : wd ae
surname : dixon
address_1 : perrhdive
address_2 : None
suburb : sunshibe
postcode : 7306
state : nsw
date_of_birth : 02/27/75
soc_sec_id : 6631140

1/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : zali
surname : nan
address_1 : marulda street
address_2 : locn 141
suburb : warnbro
postcode : 3059
state : qld
date_of_birth : 06/29/38
soc_sec_id : 6340336

first_name : zalb
surname : nan
address_1 : marulda street
address_2 : locn 141
suburb : None
postcode : 3059
state : qld
date_of_birth : 06/29/38
soc_sec_id : 6340335

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
first_name : isabella
surname : dixon
address_1 : None
address_2 : None
suburb : casino
postcode : 2090
state : vic
date_of_birth : None
soc_sec_id : 5310955

first_name : matisse
surname : dixon
address_1 : macgregor street
address_2 : promenade arcade
suburb : salisbury
postcode : 5606
state : vic
date_of_birth : 12/29/95
soc_sec_id : 8372943

2/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : kane
surname : grimm
address_1 : None
address_2 : None
suburb : moorabbin
postcode : 3767
state : nsw
date_of_birth : 11/01/31
soc_sec_id : 6949084

first_name : edwrd
surname : mcgeegor
address_1 : None
address_2 : None
suburb : nyah
postcode : 7170
state : nsw
date_of_birth : None
soc_sec_id : 2379004

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : alexandra
surname : van rensburg
address_1 : astelia place
address_2 : None
suburb : woodcroft
postcode : 2756
state : nsw
date_of_birth : 01/31/22
soc_sec_id : 3123032

first_name : alexandra
surname : van rensburg
address_1 : None
address_2 : None
suburb : woodcroft
postcode : 2756
state : nsw
date_of_birth : None
soc_sec_id : 3123034

2/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : logan
surname : mac onochie
address_1 : mackellar crescent
address_2 : None
suburb : oatlands
postcode : 4207
state : vic
date_of_birth : 08/13/84
soc_sec_id : 4647965

first_name : logt
surname : mac onochie
address_1 : None
address_2 : None
suburb : oatlands
postcode : 4207
state : vic
date_of_birth : None
soc_sec_id : 4648775

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, surname)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
first_name : georgia
surname : nguyen
address_1 : None
address_2 : brentwood vlge
suburb : sefton
postcode : 3101
state : wa
date_of_birth : None
soc_sec_id : 4084643

first_name : georgia
surname : nguyen
address_1 : None
address_2 : brentwoom vlge
suburb : sefton
postcode : 3101
state : wa
date_of_birth : None
soc_sec_id : 2139336

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : leon
surname : son
address_1 : jacka crescent
address_2 : None
suburb : thornlie
postcode : 2795
state : vic
date_of_birth : 10/12/31
soc_sec_id : 3299242

first_name : leon
surname : son
address_1 : jackacrescent
address_2 : None
suburb : thornlie
postcode : 2795
state : vic
date_of_birth : None
soc_sec_id : 3292924

5/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, surname)
INFO:dedupe.training:TfidfNGramSearchPredicate: (0.8, address_2)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
first_name : stephanie
surname : braithwaite
address_1 : pelsart street
address_2 : kilburnie
suburb : coonabarabran
postcode : 3139
state : nsw
date_of_birth : None
soc_sec_id : 6096045

first_name : stefanie
surname : braitnwagite
address_1 : pelsart street
address_2 : roseleigh farms
suburb : coonabarabran
postcode : 3139
state : nsw
date_of_birth : None
soc_sec_id : 6096054

6/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, surname)
INFO:dedupe.training:TfidfNGramSearchPredicate: (0.8, address_2)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
INFO:dedupe.training:SimplePredicate: (twoGramFingerprint, address_1)
first_name : hugo
surname : thorpe
address_1 : ingham place
address_2 : None
suburb : padstow
postcode : 2195
state : nsw
date_of_birth : 06/07/12
soc_sec_id : 3688367

first_name : hugo
surname : thorpe
address_1 : None
address_2 : None
suburb : padstow
postcode : 2195
state : nsw
date_of_birth : None
soc_sec_id : 3685206

7/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


u


first_name : holly
surname : beelitz
address_1 : jacobs street
address_2 : phillip lodge
suburb : anglesea
postcode : 2560
state : nsw
date_of_birth : 06/21/58
soc_sec_id : 7557245

first_name : jake
surname : rei
address_1 : wambool street
address_2 : ferndale
suburb : None
postcode : 5158
state : wa
date_of_birth : None
soc_sec_id : 6387027

7/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : amber
surname : hobson
address_1 : whitfield circuit
address_2 : new chum
suburb : mount evelyn
postcode : 3196
state : vic
date_of_birth : 04/10/72
soc_sec_id : 5935678

first_name : jade
surname : chorley
address_1 : weddin circuit
address_2 : kurrajong
suburb : None
postcode : 2752
state : nsw
date_of_birth : 04/07/72
soc_sec_id : 8088272

7/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : jayden
surname : mccarthy
address_1 : clift crescent
address_2 : willaroo
suburb : south melbourne
postcode : 6333
state : sa
date_of_birth : None
soc_sec_id : 9763449

first_name : jarvis
surname : wilknis
address_1 : None
address_2 : None
suburb : south melbourne
postcode : None
state : qld
date_of_birth : 03/23/00
soc_sec_id : 9621658

7/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : emiily
surname : campbell
address_1 : None
address_2 : unt 2
suburb : western gardens
postcode : 2422
state : vic
date_of_birth : None
soc_sec_id : 8700179

first_name : sophie
surname : campbell
address_1 : howitt street
address_2 : None
suburb : None
postcode : 3789
state : vic
date_of_birth : None
soc_sec_id : 7764578

7/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [10]:
%%time
linker.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.217154588733865
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, postcode), SimplePredicate: (sameFiveCharStartPredicate, suburb), PartialIndexLevenshteinSearchPredicate: (1, first_name, Surname))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, surname), PartialIndexLevenshteinSearchPredicate: (2, surname, CorporationName), SimplePredicate: (oneGramFingerprint, surname))


CPU times: user 5.32 s, sys: 587 ms, total: 5.91 s
Wall time: 5.3 s


In [11]:
ACTIVE_LEARNING_DIR = WORKING_DIR / "dedupe_active_learning"
ACTIVE_LEARNING_DIR.mkdir(parents=True, exist_ok=True)

SETTINGS_FILE = ACTIVE_LEARNING_DIR / "dedupe_learned_settings"
TRAINING_FILE = ACTIVE_LEARNING_DIR / "dedupe_training.json"

with open(TRAINING_FILE, "w") as fh:
    linker.write_training(fh)
    
with open(SETTINGS_FILE, "wb") as sf:
    linker.write_settings(sf)

## Analyze Blockers Learned 

In [12]:
linker.predicates

((SimplePredicate: (wholeFieldPredicate, postcode),
  SimplePredicate: (sameFiveCharStartPredicate, suburb),
  PartialIndexLevenshteinSearchPredicate: (1, first_name, Surname)),
 (SimplePredicate: (commonTwoTokens, surname),
  PartialIndexLevenshteinSearchPredicate: (2, surname, CorporationName),
  SimplePredicate: (oneGramFingerprint, surname)))

In [13]:
candidate_pairs = [x for x in linker.pairs(records_A, records_B)]
print(f"{len(candidate_pairs):,} candidate pairs generated from blocking.")

1,738 candidate pairs generated from blocking.


In [14]:
candidate_pairs[0]

(('48a56cad-7ba6-45e1-97cd-517ba65bdab5',
  {'address_1': 'kambalda crescent',
   'address_2': 'villa 427',
   'age': '27',
   'date_of_birth': '01/08/26',
   'first_name': 'lachlan',
   'phone_number': None,
   'postcode': '5109',
   'soc_sec_id': '9937958',
   'state': None,
   'street_number': '36',
   'suburb': 'auburn',
   'surname': 'eglinton'}),
 ('c77c2c04-4415-4c4d-b248-18dc28fd63d0',
  {'address_1': 'kambalda crescent',
   'address_2': None,
   'age': None,
   'date_of_birth': '01/08/26',
   'first_name': 'lachlan',
   'phone_number': None,
   'postcode': '5109',
   'soc_sec_id': '9937958',
   'state': None,
   'street_number': '366',
   'suburb': 'auburn',
   'surname': 'eglinton'}))

In [15]:
df_candidate_links = pd.DataFrame(
    [(x[0][0], x[1][0]) for x in candidate_pairs]
).rename(columns={0 : "person_id_A", 1 : "person_id_B"}).set_index(["person_id_A", "person_id_B"])

df_candidate_links.head()

person_id_A,person_id_B
48a56cad-7ba6-45e1-97cd-517ba65bdab5,c77c2c04-4415-4c4d-b248-18dc28fd63d0
050a4ce1-8fc9-410d-bae1-65a70a518e34,e36dc4e4-c33c-4021-9dba-ceed3a4956d7
7264bfb0-bbcb-4f68-b9bf-03619237cfb2,8e5d98b8-9611-480e-8c65-b0e56520307b
67f406b1-ddbe-4dff-b725-f6653f8af0a6,97bbf64b-d893-4af2-91d8-86215fe0a4f7
4091b2cd-f68c-447e-80ff-5ee4dde4f057,de64cc87-e3f4-4546-8e20-1294b19f9cac


In [16]:
max_candidate_pairs = df_A.shape[0]*df_B.shape[0]

print(f"{max_candidate_pairs:,} total possible pairs.")

# Calculate search space reduction.
search_space_reduction = round(1 - len(candidate_pairs)/max_candidate_pairs, 6)
print(f"\n{len(candidate_pairs):,} pairs after full blocking: {search_space_reduction}% search space reduction.")

# Calculate retained true links percentage.
total_true_links = df_ground_truth.shape[0]
true_links_after_blocking = pd.merge(
    df_ground_truth,
    df_candidate_links,
    left_index=True,
    right_index=True,
    how="inner"
).shape[0]

retained_true_link_percent = round((true_links_after_blocking/total_true_links) * 100, 2)
print(f"{retained_true_link_percent}% true links retained after blocking.")

10,562,500 total possible pairs.

1,738 pairs after full blocking: 0.999835% search space reduction.
57.33% true links retained after blocking.


## Analyzing Scores and Classifier

In [17]:
%%time
linked_records = linker.join(records_A, records_B, threshold=0.0, constraint="one-to-one")

CPU times: user 260 ms, sys: 72.9 ms, total: 333 ms
Wall time: 1.58 s


In [18]:
df_predictions = pd.DataFrame(
    [ {"person_id_A" : x[0][0], "person_id_B" : x[0][1], "model_score" : x[1]} for x in linked_records]
)

df_predictions = df_predictions.set_index(["person_id_A", "person_id_B"])

df_predictions = pd.merge(
    df_predictions,
    df_ground_truth,
    left_index=True,
    right_index=True,
    how="left",
)

df_predictions["ground_truth"].fillna(False, inplace=True)
df_predictions

Unnamed: 0_level_0,Unnamed: 1_level_0,model_score,ground_truth
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1
ff35c358-732f-40ed-a686-bd228089323a,79160577-d48f-4434-9ff1-af16a4eebf4c,1.0,True
fe1d4173-363a-4870-83fe-4eab54816bb9,b13aac10-f04c-4ea6-983f-f29067772b45,1.0,True
fcbfdeea-781b-4f16-9986-25504ae1a5fa,e86fc51d-03d9-41ca-8a05-c90715f25ea4,1.0,True
fcb0aaf9-24b0-4831-9086-2c2449d75b3b,548bdbbb-2528-4705-8b2e-76e99d3def77,1.0,True
f817fb18-672c-41d6-8dab-ae787179a58f,ee6311ac-7d28-43b9-84c0-fc0b9f7bd35e,1.0,True
...,...,...,...
11465bee-9867-42b0-ab90-522025b2f103,dac40fb0-9c46-46fd-91e3-a70e0e1f2f34,0.0,True
10f42ddc-d1c4-4666-a3ed-a75ee51f42b8,b87a7962-b039-4b53-adc7-cd7769804ec9,0.0,True
0f88cbd7-a3f2-4dda-a8a5-b560dde6bfda,02173c5c-5c55-4729-b43f-c6fd9eb86531,0.0,True
0f46ef96-d086-4b5e-a406-e4e7fdbb7d48,7c8bb796-4892-40dc-9939-f17f9aa39d0f,0.0,True


## Linking Model Score Threshold

In [20]:
tutorial.plot_model_score_distribution(df_predictions)

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


## Precision/Recall/Model Score

In [21]:
df_eval = tutorial.evaluate_linking(
    df=df_predictions
)

In [22]:
df_eval.head()

Unnamed: 0,threshold,tp,fp,tn,fn,precision,recall,f1
0,0.0,1720,2,0,0,0.998839,1.0,0.999419
1,0.020408,1541,0,2,179,1.0,0.89593,0.945109
2,0.040816,1537,0,2,183,1.0,0.893605,0.943813
3,0.061224,1532,0,2,188,1.0,0.890698,0.942189
4,0.081633,1526,0,2,194,1.0,0.887209,0.940234


In [23]:
tutorial.plot_precision_recall_vs_threshold(df_eval)

## Tweaking the linker and using existing data

In [24]:
%%time

fields = [
    { "field" : "first_name", "type" : "Name" },
    { "field" : "surname", "type" : "Name" },
    { "field" : "suburb", "type" : "ShortString" },
    { "field" : "postcode", "type" : "Exact" },
    { "field" : "state", "type" : "Exact" },
    { "field" : "date_of_birth", "type" : "DateTime" },
    { "field" : "soc_sec_id", "type" : "Exact" },
]

linker2 = dedupe.RecordLink(fields)

with open(TRAINING_FILE, "r") as fh:
    linker2.prepare_training(records_A, records_B, training_file=fh)

INFO:dedupe.api:reading training from file
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, postcode)


CPU times: user 41.9 s, sys: 654 ms, total: 42.5 s
Wall time: 42 s


In [25]:
dedupe.console_label(linker2)

first_name : lauren
surname : de lucia
suburb : bunya mountain
postcode : 5152
state : vic
date_of_birth : 08/10/55
soc_sec_id : 7275131

first_name : lauren
surname : de lucia
suburb : bunya mountain
postcode : 5112
state : vic
date_of_birth : 08/10/55
soc_sec_id : 7275131

7/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


y


first_name : holly
surname : ryan
suburb : mosman
postcode : 2455
state : vic
date_of_birth : 02/17/51
soc_sec_id : 1408056

first_name : holly
surname : ryan
suburb : mosma
postcode : 2549
state : vic
date_of_birth : 02/17/51
soc_sec_id : 1408056

8/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, postcode)
INFO:dedupe.training:SimplePredicate: (firstTwoTokensPredicate, surname)
first_name : sophie
surname : broadby
suburb : None
postcode : 5038
state : nsw
date_of_birth : 04/04/01
soc_sec_id : 6307537

first_name : sophie
surname : broadby
suburb : None
postcode : 5108
state : nsw
date_of_birth : 04/04/01
soc_sec_id : 6307537

9/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (fingerprint, suburb)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)
first_name : chloe
surname : hare
suburb : None
postcode : 3195
state : qld
date_of_birth : 02/03/65
soc_sec_id : 9880358

first_name : chloe
surname : hare
suburb : None
postcode : 3195
state : qld
date_of_birth : None
soc_sec_id : 9880358

10/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : isabella
surname : carbone
suburb : port kembla
postcode : 6018
state : qld
date_of_birth : 06/08/48
soc_sec_id : 8019906

first_name : isabella
surname : carbone
suburb : None
postcode : 6018
state : qkdx
date_of_birth : None
soc_sec_id : 8019906

11/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, postcode)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)
first_name : isabella
surname : tu
suburb : albion park
postcode : 4405
state : nsw
date_of_birth : 09/17/07
soc_sec_id : 2108268

first_name : kyratzoulis
surname : jarrod
suburb : albion park
postcode : 6056
state : nsw
date_of_birth : None
soc_sec_id : 5314660

12/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


first_name : holly
surname : flatman
suburb : seymour
postcode : 4870
state : vic
date_of_birth : 09/25/88
soc_sec_id : 9424419

first_name : holly
surname : flatman
suburb : seymour
postcode : 4730
state : vic
date_of_birth : None
soc_sec_id : 9424419

12/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : tyrone
surname : plane
suburb : lalor
postcode : 4869
state : sa
date_of_birth : None
soc_sec_id : 4461124

first_name : tyrone
surname : plane
suburb : lalor
postcode : 4896
state : sa
date_of_birth : None
soc_sec_id : 4461124

13/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, postcode)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
first_name : emma
surname : greenland
suburb : springvale south
postcode : 3284
state : tas
date_of_birth : 01/08/60
soc_sec_id : 1428020

first_name : emma
surname : greenland
suburb : springvale south
postcode : 3248
state : tas
date_of_birth : 01/08/60
soc_sec_id : 1482200

14/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


first_name : lily
surname : clarke
suburb : woodend
postcode : None
state : vic
date_of_birth : 08/05/80
soc_sec_id : 8508546

first_name : lily
surname : clarke
suburb : woodend
postcode : None
state : vic
date_of_birth : 08/05/80
soc_sec_id : 8501264

15/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:SimplePredicate: (fingerprint, suburb)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, soc_sec_id)
INFO:dedupe.training:SimplePredicate: (wholeFieldPredicate, date_of_birth)
first_name : james
surname : stancliffe
suburb : merino
postcode : 4655
state : nsw
date_of_birth : None
soc_sec_id : 4818725

first_name : james
surname : stancliffe
suburb : merno
postcode : 4655
state : nsw
date_of_birth : None
soc_sec_id : 2931265

16/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


## Retrain linker and examine blocks like before

In [26]:
%%time
linker2.train()

INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.2845801384296647
INFO:dedupe.training:Final predicate set:
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, suburb), TfidfNGramSearchPredicate: (0.8, surname), SimplePredicate: (metaphoneToken, first_name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, soc_sec_id), PartialIndexTfidfNGramSearchPredicate: (0.8, surname, Surname), SimplePredicate: (suffixArray, first_name))
INFO:dedupe.training:(SimplePredicate: (wholeFieldPredicate, date_of_birth), SimplePredicate: (oneGramFingerprint, first_name), LevenshteinSearchPredicate: (1, suburb))
INFO:dedupe.training:(SimplePredicate: (commonTwoTokens, surname), LevenshteinSearchPredicate: (4, first_name))


CPU times: user 5.39 s, sys: 389 ms, total: 5.78 s
Wall time: 5.47 s


In [27]:
candidate_pairs = [x for x in linker2.pairs(records_A, records_B)]
print(f"{len(candidate_pairs):,} candidate pairs generated from blocking.")

df_candidate_links = pd.DataFrame(
    [(x[0][0], x[1][0]) for x in candidate_pairs]
).rename(columns={0 : "person_id_A", 1 : "person_id_B"}).set_index(["person_id_A", "person_id_B"])

max_candidate_pairs = df_A.shape[0]*df_B.shape[0]

print(f"{max_candidate_pairs:,} total possible pairs.")

# Calculate search space reduction.
search_space_reduction = round(1 - len(candidate_pairs)/max_candidate_pairs, 6)
print(f"\n{len(candidate_pairs):,} pairs after full blocking: {search_space_reduction}% search space reduction.")

# Calculate retained true links percentage.
total_true_links = df_ground_truth.shape[0]
true_links_after_blocking = pd.merge(
    df_ground_truth,
    df_candidate_links,
    left_index=True,
    right_index=True,
    how="inner"
).shape[0]

retained_true_link_percent = round((true_links_after_blocking/total_true_links) * 100, 2)
print(f"{retained_true_link_percent}% true links retained after blocking.")

1,531 candidate pairs generated from blocking.
10,562,500 total possible pairs.

1,531 pairs after full blocking: 0.999855% search space reduction.
50.93% true links retained after blocking.


## Evaluate Performance

In [28]:
%%time
linked_records = linker2.join(records_A, records_B, threshold=0.0, constraint="one-to-one")

CPU times: user 2.43 s, sys: 104 ms, total: 2.53 s
Wall time: 3.53 s


In [29]:
df_predictions = pd.DataFrame(
    [ {"person_id_A" : x[0][0], "person_id_B" : x[0][1], "model_score" : x[1]} for x in linked_records]
)

df_predictions = df_predictions.set_index(["person_id_A", "person_id_B"])

df_predictions = pd.merge(
    df_predictions,
    df_ground_truth,
    left_index=True,
    right_index=True,
    how="left",
)

df_predictions["ground_truth"].fillna(False, inplace=True)
df_predictions

Unnamed: 0_level_0,Unnamed: 1_level_0,model_score,ground_truth
person_id_A,person_id_B,Unnamed: 2_level_1,Unnamed: 3_level_1
fff044ab-8dca-4946-bfa4-1675ee7d56b5,99060a0c-e1bf-4869-bf08-2e15389193b6,1.0,True
ffd668ac-2f63-4c05-a6a3-58ebcf1f4a80,3e8c4b67-3611-4a08-84c8-b082b627bb21,1.0,True
ffba4e8b-1f2a-46f7-b18b-41b84de5ec5e,a5fce2df-9fdc-48c5-90ea-9e942e17158c,1.0,True
ffa0b533-d09f-4291-b960-27975a078587,fcd967d6-f8b1-4fe9-80ab-d458e14d5686,1.0,True
ff86e492-166d-4652-bf5b-61b9eef60e51,0e4b371a-3c2b-4e2d-bae3-1e37058855b7,1.0,True
...,...,...,...
435a6ee0-46fc-4bb1-a275-f1bea5d31271,c917c323-3e36-46b7-b28d-351b94a77196,0.0,True
3d3a085a-e06b-4942-aa87-6545a6112b53,1ca7ff61-8fc1-41d0-b6d7-e732595bfc15,0.0,True
3ae0c76a-b655-46ce-9d1c-03045952c528,f5b27611-ffa4-4044-b09b-b51879d0316a,0.0,True
1163ce8d-ace0-4635-9440-4fa51d7896b7,7d62fe95-4065-4000-96fa-9f5b8701a36f,0.0,True


In [30]:
df_predictions["ground_truth"].value_counts()

True     1528
False       2
Name: ground_truth, dtype: int64

In [31]:
tutorial.plot_model_score_distribution(df_predictions)

In [32]:
df_eval = tutorial.evaluate_linking(
    df=df_predictions
)

tutorial.plot_precision_recall_vs_threshold(df_eval)