In [4]:
# !pip install recordlinkage

In [16]:
import recordlinkage
import pandas as pd
from collections import defaultdict
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
from sklearn.linear_model import LogisticRegression
from ensemble import ActiveLearner

In [7]:
# data: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/JKBULA
# Both files are read as header-less CSVs with the same five column names.
names = ['id', 'first', 'last', 'address', 'city']
start_index = 0
df_org = pd.read_csv('Q.txt', sep=',', names=names)
df_dup = pd.read_csv('A.txt', sep=',', encoding="ISO-8859-1", names=names)
# keep every 10th row of A.txt, starting at start_index
df_dup = df_dup.iloc[start_index::10]
# after thinning, both frames must have the same number of rows and columns
assert df_org.shape == df_dup.shape

In [8]:
# create dict of true links
def def_value():
    """Default value for ids that have no recorded link."""
    return False


links = defaultdict(def_value)

# Rows are paired by position. The assertion checks that the two ids agree once
# the first character of each id and the last two characters of the duplicate
# id are stripped off.
for org_id, dup_id in zip(df_org['id'], df_dup['id']):
    assert org_id[1:] == dup_id[1:-2]
    links[org_id] = dup_id

In [9]:
# set index as id (the 'id' column becomes the row index of both frames)
df_org, df_dup = (frame.set_index('id') for frame in (df_org, df_dup))

In [None]:
# df_org['block'] = df_org['last'].apply(lambda name: name[0].lower())
# df_dup['block'] = df_dup['last'].apply(lambda name: name[0].lower())

In [10]:
# Blocking: sorted-neighbourhood indexing on the 'address' column.
indexer = recordlinkage.SortedNeighbourhoodIndex('address', window=225)
candidate_pairs = indexer.index(df_org, df_dup)

# initialise class
comp = recordlinkage.Compare()

# initialise similarity measurement algorithms: one Jaro-Winkler string
# comparison per field. The 'address' comparison is currently disabled:
# comp.string('address', 'address', method='jarowinkler')
for field in ('first', 'last', 'city'):
    comp.string(field, field, method='jarowinkler')

# the method .compute() returns the DataFrame with the feature vectors.
df_compare = comp.compute(candidate_pairs, df_org, df_dup)

In [11]:
# fraction of true matches that end up in the same block, i.e. whose
# (org id, dup id) pair is among the candidate pairs

n_blocked_links = sum(pair in candidate_pairs for pair in links.items())
n_blocked_links / len(links)

0.8673

In [12]:
def true_class(org, dup):
  '''Return True if (org, dup) is a true pair, False otherwise.

  org: id of a record from the original file (a key of the module-level
       `links` mapping).
  dup: id of a record from the duplicate file.

  An `org` id that is not present in `links` is never a match.
  '''
  # Use .get() rather than links[org]: `links` is a defaultdict, and indexing
  # it with an unknown id would insert a new False entry as a side effect,
  # silently growing the mapping (and changing len(links)).
  return bool(links.get(org, False) == dup)
     
# add true label column to dataframe: one boolean per (org id, dup id)
# candidate pair in the index of df_compare
true_list = [true_class(org, dup) for org, dup in df_compare.index]
df_compare['true_class'] = true_list

In [14]:
# randomly select n rows from df_compare (1,317,575 rows in the recorded run)
# as the initial labelled training set; everything else becomes the test set
n = 200

# Without at least one true match in df_compare the resampling loop below
# could never terminate, so fail fast instead of hanging.
assert df_compare['true_class'].any(), 'df_compare contains no true match'

# Seeded generator so the split is reproducible on Restart & Run All.
# It is passed as an object (not as an int seed) so that every retry of the
# loop draws a *different* sample instead of repeating the same one forever.
sample_rng = np.random.RandomState(0)

# ensure there is at least 1 match in the training sample
while True:
  X_train = df_compare.sample(n=n, random_state=sample_rng)
  y_train = X_train['true_class']
  if y_train.any():
    break
X_train = X_train.drop(columns='true_class')

# all remaining candidate pairs form the test set
X_test = df_compare.drop(index=X_train.index)
y_test = X_test['true_class']
X_test = X_test.drop(columns='true_class')

# check that rows add up and that no X_train index entry is in X_test
# (checks every training row, not just one randomly chosen row)
assert X_train.shape[0] + X_test.shape[0] == df_compare.shape[0]
assert X_train.index.intersection(X_test.index).empty

In [17]:
# test rule based approach (scores are ONLY WITHIN BLOCKING!)
# note: accuracy, precision and recall are all measured only considering
# the matches that ended up in the same block
# this is because currently we are just trying to work on the classifier
# investigating blocking strategies is a next step

# Rule: predict "match" when the sum of the similarity features of a pair
# exceeds a threshold. The row sums do not depend on the threshold, so they
# are computed once up front.
feature_cols = [col for col in X_train.columns if col != 'true_class']
train_score = X_train[feature_cols].sum(axis=1).values

# F1 on the training sample for every candidate threshold in [0, 4]
f1_list = []
for thresh in np.linspace(0, 4, 41):
  df_train = X_train.copy()
  df_train['pred_class'] = train_score > thresh
  f1_list.append((f1_score(y_train, df_train['pred_class']), thresh))

# best F1 wins; among equal F1 values the largest threshold is chosen
thresh_best = max(f1_list)[1]

# apply the selected threshold to the held-out pairs
df_test = X_test.copy()
test_score = df_test[[col for col in df_test.columns
                      if col != 'true_class']].sum(axis=1).values
df_test['pred_class'] = test_score > thresh_best
print('thresh: ', thresh_best)
print('accuracy: ', accuracy_score(y_test, df_test['pred_class']))
print('recall: ', recall_score(y_test, df_test['pred_class']))
print('precision: ', precision_score(y_test, df_test['pred_class']))

thresh:  2.8000000000000003
accuracy:  0.9980445962615049
recall:  0.7136761992619927
precision:  0.9851957975167144


In [20]:
# L2-penalised logistic regression, handed to the active learner below.
lr = LogisticRegression(penalty='l2')
# NOTE(review): ActiveLearner comes from the local `ensemble` module, which is
# not visible here; the meaning of the trailing positional arguments
# (1000, 10) should be confirmed against its definition.
AL = ActiveLearner(df_org, df_dup, X_train, y_train, X_test, lr, 1000, 10)

In [27]:
# Run the learner's training step; in the recorded run it printed the shapes
# of the training data, training labels and unlabeled data before and after.
AL.train()

pre train
 training data shape:  (220, 3)
 training labels shape:  (220,)
 unlabeled data shape:  (1317355, 3)
post train
 training data shape:  (1220, 3)
 training labels shape:  (1220,)
 unlabeled data shape:  (1316355, 3)


In [29]:
# note: accuracy, precision and recall are all measured only considering
# the matches that ended up in the same block
# this is because currently we are just trying to work on the classifier
# investigating blocking strategies is a next step

# predictions of the trained model on the pairs the learner still holds
# as unlabeled, compared against the known true links
y_pred = AL.model.predict(AL.X_nolabel)
y_true = [true_class(org, dup) for org, dup in AL.X_nolabel.index]
# sklearn metrics take (y_true, y_pred); keep that order for every metric so
# the calls stay consistent with each other and with the rule-based cell
print('accuracy', accuracy_score(y_true, y_pred))
print('recall', recall_score(y_true, y_pred))
print('precision', precision_score(y_true, y_pred))

accuracy 0.9997531061149918
recall 0.977911414363363
precision 0.9844004656577415


In [26]:
# Interactive step: in the recorded run this displayed record pairs one at a
# time and prompted "Enter label (True/False)" for each, so it needs a human
# at the keyboard and cannot run unattended.
AL.clerical_review()

Unnamed: 0,first,last,address,city
b3175,ALLEN,JERRY,3122 BRYCEWOOD PL,BURLINGTON
a3175_0,ALLEN,JoERRY,3122x BRYCEWOOD PL,BURLINTON


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b5867,FRANCES,CHRISTINE,3134 TRUITT DR,BURLINGTON
a5867_0,FRANES,CHRISINE,3134 TRUITTtDR,BURLINwGTON


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b8108,ZIMMERMAN,DOROTHY,4841 UNION RIDGE RD,BURLINGTON
a8108_0,ZIMMEMAN,DOROTpHY,4841 UNION RIDGEdRD,BURINGTON


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b5610,LAWRENCE,BENJAMIN,1528 OVERBROOK RD,BURLINGTON
a5610_0,LAWRNECE,BENJAMcIN,1528 OVERBRuOK RD,BURINGTON


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b5059,ANNETTE,CHRISTINE,1616 TROLLINGWOOD RD #9,HAW RIVER
a5059_0,ANNETE,CHRISTcINE,1616 TROLLIhGWOOD RD #9,HAW RIeVER


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b9203,CONNER,PHYLLIS,4233 ROBERT L BROOKS LN,BURLINGTON
a9203_0,CONNER,PHYLbLIS,4233 RBERT L BROOKS LN,BzRLINGTON


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b94,MELINDAMAJID,YASMEEN,1528 S MEBANE ST #603,BURLINGTON
a94_0,MEINDAMAJID,YASMEkEN,1528 S MEaANE ST #603,BURLNGTON


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b2991,JOHNSTON,JENNIFER,1139 KELSO LN,BURLINGTON
a2991_0,JOHNSTN,JENNzIFER,1139 KELSO LNs,BURLfINGTON


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b2780,LOVRRICK,MICHAEL,1205 WHITSETT ST,BURLINGTON
a2780_0,LOVRRICK,uMICHAEL,1205 WHITSlTT ST,BURINGTON


Enter label (True/False) :True


Unnamed: 0,first,last,address,city
b4427,LATTA,CORINNE,606 GEORGIA AVE,BURLINGTON
a4427_0,LATTA,CcORINNE,606 GbORGIA AVE,BRLINGTON


Enter label (True/False) :True
all data labeled
