In [2]:
# !pip install recordlinkage

In [3]:
import recordlinkage
import pandas as pd
from collections import defaultdict
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from ensemble import ActiveLearner
import recordlinkage.datasets as rl_data

In [6]:
# With return_links=True the loader hands back three items: the two record
# tables followed by the index of true matches between them.
data = rl_data.load_febrl4(return_links=True)
df_org, df_dup, df_links = data[:3]

In [7]:
# create dict of true links
def def_value():
    """Return the placeholder string used as the default value of ``links``."""
    missing_marker = "Not Present"
    return missing_marker
      
# Each (original id, duplicate id) pair in df_links becomes one entry; looking
# up an id that has no recorded link falls back to def_value()'s placeholder.
links = defaultdict(def_value, {org_id: dup_id for org_id, dup_id in df_links})

In [8]:
# Blocking: only record pairs that agree on the blocking key are generated as
# candidate pairs for comparison.
BLOCKING_KEY = 'postcode'
indexer = recordlinkage.Index()
indexer.block(left_on=BLOCKING_KEY, right_on=BLOCKING_KEY)
candidate_pairs = indexer.index(df_org, df_dup)

In [9]:
# Feature definitions in output-column order: (comparison kind, field name).
# postcode is left out because it is the blocking key, so it always agrees
# within a candidate pair; soc_sec_id is currently not used as a feature.
COMPARISONS = [
    ('string', 'given_name'),
    ('string', 'surname'),
    ('exact', 'street_number'),
    ('string', 'address_1'),
    ('string', 'address_2'),
    ('string', 'suburb'),
    ('string', 'state'),
    ('string', 'date_of_birth'),
]

comp = recordlinkage.Compare()
for kind, field in COMPARISONS:
    if kind == 'exact':
        comp.exact(field, field)
    else:
        # String fields are scored with the Jaro-Winkler method.
        comp.string(field, field, method='jarowinkler')

# compute() evaluates every registered comparison on the candidate pairs and
# returns the DataFrame of feature vectors.
df_compare = comp.compute(candidate_pairs, df_org, df_dup)

In [10]:
def true_class(org, dup):
    '''Tell whether (org, dup) is a true pair.

    Looks ``org`` up in the module-level ``links`` mapping and compares the
    stored duplicate id with ``dup``.

    Returns True when they match, False otherwise.'''
    return links[org] == dup
     
# Attach the ground-truth label to every candidate pair; each index entry is
# an (original id, duplicate id) tuple that is unpacked into true_class().
df_compare['true_class'] = [true_class(*pair) for pair in df_compare.index]

In [11]:
# Size of the initial labelled seed sample.
n = 50
# One seeded RandomState shared by every retry: the split is reproducible on
# a fresh kernel, yet each pass through the loop still draws a new sample.
sample_rng = np.random.RandomState(42)

# Redraw until the seed sample contains at least one true match.
while True:
    X_train = df_compare.sample(n=n, random_state=sample_rng)
    y_train = X_train['true_class']
    if np.sum(y_train) > 0:
        break
X_train = X_train.drop('true_class', axis=1)

# Every pair that was not sampled goes into the test / unlabelled pool.
X_test = df_compare.drop(X_train.index, axis=0)
y_test = X_test['true_class']
X_test = X_test.drop('true_class', axis=1)

# The two parts must partition df_compare: the sizes add up and no pair is
# shared (checked over the whole index rather than one random row).
assert X_train.shape[0] + X_test.shape[0] == df_compare.shape[0]
assert X_train.index.intersection(X_test.index).empty

In [12]:
# Rule-based baseline: call a pair a match when the sum of its comparison
# features exceeds a threshold (scores are ONLY WITHIN BLOCKING!).
# X_train / X_test hold only feature columns here (true_class was dropped when
# they were built), and the row sums do not depend on the threshold, so they
# are computed once instead of once per threshold.
train_scores = X_train.sum(axis=1).values
test_scores = X_test.sum(axis=1).values

# (F1 on the training sample, threshold) for every threshold in the sweep.
f1_list = [(f1_score(y_train, train_scores > thresh), thresh)
           for thresh in np.linspace(0, 10, 101)]

# Tuples compare by F1 first and threshold second, so ties on F1 resolve to
# the highest threshold.
thresh_best = max(f1_list)[1]

# Training predictions at the chosen threshold, kept for inspection.
df_train = X_train.copy()
df_train['pred_class'] = train_scores > thresh_best

# Apply the chosen threshold to the held-out pairs and report the metrics.
df_test = X_test.copy()
df_test['pred_class'] = test_scores > thresh_best
print('thresh: ', thresh_best)
print('accuracy: ', accuracy_score(y_test, df_test['pred_class']))
print('recall: ',recall_score(y_test, df_test['pred_class']))
print('precision: ', precision_score(y_test, df_test['pred_class']))

thresh:  6.5
accuracy:  0.9692916418642109
recall:  0.7918347970567292
precision:  1.0


In [12]:
# the classes are relatively separable just from summing the similarity scores
# df_test = X_test.copy()
# df_test['sum'] = df_test[[col for col in df_test.columns 
#                                            if col != 'true_class']].sum(axis=1).values
# df_test['true_class'] = y_test

# fig = px.violin(df_test, y="sum", x="true_class", color="true_class", box=True, points="all")
# fig.show()

In [13]:
# Base classifier handed to the active learner: logistic regression with an
# L2 penalty.
lr = LogisticRegression(penalty='l2')
# ActiveLearner comes from the local `ensemble` module, so the meaning of its
# positional arguments is not visible in this notebook.
# NOTE(review): 1000 and 10 look like the number of pairs labelled per train()
# call and the number of pairs shown by clerical_review() (the recorded
# outputs grow the training set by 1000 rows and prompt 10 times) — confirm
# against ActiveLearner.__init__ and consider passing them by keyword.
AL = ActiveLearner(df_org, df_dup, X_train, y_train, X_test, lr, 1000, 10)

In [17]:
# Train the active learner (implementation lives in the local `ensemble`
# module); it prints the labelled / unlabelled shapes before and after.
# NOTE(review): the recorded output starts from 1060 labelled rows rather than
# the 50 sampled for X_train, so this cell was presumably executed more than
# once in the saved session — re-run from a fresh kernel to confirm the numbers.
AL.train()

pre train
 training data shape:  (1060, 8)
 training labels shape:  (1060,)
 unlabeled data shape:  (27549, 8)
post train
 training data shape:  (2060, 8)
 training labels shape:  (2060,)
 unlabeled data shape:  (26549, 8)


In [18]:
# Score the trained model on AL.X_nolabel, using ground truth looked up from
# the pair index. As with the rule-based baseline, these metrics only cover
# pairs that survived blocking.
y_pred = AL.model.predict(AL.X_nolabel)
y_true = [true_class(org, dup) for org, dup in AL.X_nolabel.index]
# scikit-learn metrics take (y_true, y_pred) in that order; keep all three
# calls consistent.
print('accuracy', accuracy_score(y_true, y_pred))
print('recall', recall_score(y_true, y_pred))
print('precision', precision_score(y_true, y_pred))

accuracy 0.994802064107876
recall 0.9752712591471108
precision 0.9897567221510883


In [16]:
# Interactive step: per the recorded output, this shows record pairs and
# prompts "Enter label (True/False)" for each, so it needs a human at the
# keyboard and will block an unattended Restart & Run All.
# NOTE(review): the saved execution count (16) is lower than the train and
# evaluation cells above (17, 18), so the notebook was run out of order —
# confirm the intended position of this cell.
AL.clerical_review()

Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3304-org,adam,pichugin,16,tiwi place,ettrick,yackandandah,3116,qld,19650328,1748977
rec-3304-dup-0,adam,coulls,41,tiwi place,ettrick,yacknadandah,3116,qld,19650328,1748977


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4216-org,lauren,harrington,17,leibnitz place,rosetta village,granville,6066,vic,19550719,2207203
rec-4216-dup-0,lauren,harrington,28,leibnitzplace,rosetta village,newcastle,6066,vic,19550719,2207203


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2772-org,william,george,31.0,kidman close,ocean breeze,labrador,3438,sa,19250214,9555232
rec-2772-dup-0,william,george,,kidman pclose,ocean breeze,yambs,3438,sa,19250214,9555322


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3784-org,connor,hammill,99,goyder street,burnside,angaston,2304,nsw,19700803.0,7089797
rec-3784-dup-0,connor,hammill,99,goyder street,,angsgon,2304,nsw,,7089797


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2354-org,connor,shepherd,7,bulloo place,,kangaroo flat,4621,wa,19681115,1737778
rec-2354-dup-0,connor,shepherd,7,bulloo place,,wyoming,4621,wa,19100906,1737778


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1053-org,jade,quast,4,hodgkinson street,raworth cottage medical practice,east ryde,6210,vic,19390121,5098690
rec-1053-dup-0,lkue,quast,4,,raworth cottage meedical practice,east ryde,6210,vic,19760328,5098690


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1410-org,kyle,colantuono,8,moynihan street,cosy corner,lennox head,6081,vic,19960130,1684057
rec-1410-dup-0,kyle,faull,38,moynihan street,cosy corner,lennox head,6081,vic,19960130,1684057


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1708-org,taliah,mccarthy,,bourne street,locn 3439,greenacre,3198,nsw,19730822,7901066
rec-1708-dup-0,talia,mccarhy,,bournestmreet,locn 3439,deer park,3198,nsw,19730822,7901066


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-4270-org,bethany,finlay,353,capella crescent,haigh park,broken hill,2111,vic,19660601,3594609
rec-4270-dup-0,finlay,beghany,353,haighpark,capella crescent,broken hill,2111,vic,19660601,3594609


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3618-org,erin,hearn,,port jackson circuit,gowan,elwood,2605,nt,19101004,5064504
rec-3618-dup-0,erin,hearn,,port jackson circuit,gowan,preston,2605,nt,19101004,5064504


Enter label (True/False) :True
all data labeled
