In [2]:
# !pip install recordlinkage
# !pip install unidecode

In [3]:
import recordlinkage
import recordlinkage.datasets as rl_data
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from collections import defaultdict
import pandas as pd
import numpy as np
import random
import string
from unidecode import unidecode
from ensemble import ActiveLearner, Ensemble

In [4]:
data = rl_data.load_febrl4(return_links=True)
df_org, df_dup, df_links = data[0], data[1], data[2]

In [5]:
# create dict of true links
def def_value():
    return "Not Present"
      
links = defaultdict(def_value)

for org, dup in df_links:
  links[org] = dup

In [6]:
# import name datasets to create dataset
# replace unknown frequencies ('..') with the min (100)
url = 'https://raw.githubusercontent.com/jvalhondo/spanish-names-surnames/master/'
female_df = pd.read_csv(url + 'female_names.csv')
male_df = pd.read_csv(url + 'male_names.csv')
surname_df = pd.read_csv(url +'surnames_freq_ge_100.csv' ).replace('..', 100)

In [7]:
# assuming 50/50 male/female split - sample from first and surnames based on frequency
first_f = female_df.sample(n=df_org.shape[0]//2, weights='frequency', 
                           random_state=1)['name'].reset_index(drop=True)
first_m = male_df.sample(n=df_org.shape[0]-first_f.shape[0], 
                         weights='frequency', random_state=1)['name'].reset_index(drop=True)
first_org = first_f.append(first_m).reset_index(drop=True)
sur_f = surname_df.sample(n=df_org.shape[0], weights='frequency_first', 
                          random_state=1)['surname'].reset_index(drop=True)
sur_l = surname_df.sample(n=df_org.shape[0], weights='frequency_second', 
                          random_state=2)['surname'].reset_index(drop=True)
surname_org = sur_f + ' ' + sur_l

In [8]:
# add new names to df
df_org = df_org.sort_index()
df_dup = df_dup.sort_index()
df_org['given_name'] = list(first_org)
df_org['surname'] = list(surname_org)
df_dup['given_name'] = list(first_org)
df_dup['surname'] = list(surname_org)

In [9]:
# create function to add noise
def add_noise(word, switch, flip):
  '''function to add noise to names - replace n letters w/ 
  prob switch^n and flip order if 2 names, w/prob flip'''
  if word != word:
    return word
  while True:
    if np.random.rand() > switch:
        break
    if word.isnumeric():
      new_char = str(np.random.randint(10))
    else:
      new_char = random.choice(string.ascii_lowercase)
    word = word.replace(word[np.random.randint(len(word))], new_char, 1)

  if ' ' in word and np.random.rand() < flip:
    word = word.split(' ')
    word = word[1] +' ' + word[0]
  return word

In [10]:
# add noise
# parameters in add_noise function can be adjusted to 
# customize the amount of noise 
for col in df_dup.columns:
  if col == 'postcode':
    continue
  df_dup[col] = df_dup[col].apply(lambda word: add_noise(word, .70, .70)) 

In [11]:
# cleaning
df_org['given_name'] = df_org['given_name'].apply(lambda name: unidecode(name).lower())
df_org['surname'] = df_org['surname'].apply(lambda name: unidecode(name).lower())
df_dup['given_name'] = df_dup['given_name'].apply(lambda name: unidecode(name).lower())
df_dup['surname'] = df_dup['surname'].apply(lambda name: unidecode(name).lower())

In [12]:
# compare dfs to examine amount of noise
df_org.head(5)

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-0-org,rocio,pardo merino,1,knox street,lakewood estate,byford,4129,vic,19280722.0,1683994
rec-1-org,alexandra,teran garcia,25,pike place,rowethorpe,marsden,2152,nsw,19110816.0,6653129
rec-10-org,maria carmen,garcia burgos,5,carrington road,legacy vlge,yagoona,2464,nsw,19500531.0,3232033
rec-100-org,concepcion,molina pardo,38,tindale street,villa 2,cromer heights,4125,vic,,4620080
rec-1000-org,maria teresa,sanchez crespo,70,wybalena grove,inverneath,paralowie,5065,nsw,19720503.0,1267612


In [13]:
df_dup.head(5)

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-0-dup-0,rjaso,lerqno pardd,4.0,street knox,lakdwood estate,syfojd,4129,vip,19280722.0,1682994
rec-1-dup-0,aloxandra,teraz garcia,4.0,tlace pike,ryuuthorek,maxsden,2152,ilw,19112816.0,6628109
rec-10-dup-0,carmen maria,garcia burgos,3.0,road carrington,vlge lppacy,yagbona,2446,rsh,19680531.0,9232033
rec-100-dup-0,iegcracion,zolifa tardo,,street tindale,viyoae2,cqoher xeights,4125,viy,,7370020
rec-1000-dup-0,maria teresa,crespo sandhez,53.0,wyvalena erove,ifverbeaph,paralowie,5065,nsw,19723503.0,1267612


In [14]:
# create indices to compare and set blocking variable
indexer = recordlinkage.Index()
indexer.block(left_on='postcode', right_on='postcode')
candidate_pairs = indexer.index(df_org, df_dup)

In [15]:
# compute similarity df using only jarowinkler
# initialise class
comp = recordlinkage.Compare()

# initialise similarity measurement algorithms
comp.string('given_name', 'given_name', method='jarowinkler')
comp.string('surname', 'surname', method='jarowinkler')
comp.exact('street_number', 'street_number')
comp.string('address_1', 'address_1', method='jarowinkler')
comp.string('address_2', 'address_2', method='jarowinkler')
comp.string('suburb', 'suburb', method='jarowinkler')
comp.string('postcode', 'postcode', method='jarowinkler')
comp.string('state', 'state', method='jarowinkler')
comp.string('date_of_birth', 'date_of_birth', method='jarowinkler')
comp.string('soc_sec_id', 'soc_sec_id', method='jarowinkler')

# the method .compute() returns the DataFrame with the feature vectors
df_compare = comp.compute(candidate_pairs, df_org, df_dup)

In [16]:
def true_class(org, dup):
  '''returns true if org and dup are true pair
  false otherwise'''
  if links[org] == dup:
    return True
  else:
     return False
     
# add true label column to dataframe
df_compare['true_class'] = [true_class(org, dup) for org, dup in df_compare.index]

In [25]:
# pull out n labeled pairs for training - 
# ensure at least 1 match
# this would correspond with hand labeling 50 pairs
n=50
while True:
  X_train = df_compare.sample(n=n, random_state=11)
  y_train = X_train['true_class']
  if np.sum(y_train) > 0:
    break
X_train = X_train.drop('true_class', axis=1)

X_test = df_compare.drop(X_train.index, axis=0)
y_test = X_test['true_class']
X_test = X_test.drop('true_class', axis=1)

# check that rows add up and that X_train index not in X_test
assert X_train.shape[0] + X_test.shape[0] == df_compare.shape[0]
assert X_train.index[np.random.randint(n)] not in X_test.index

In [26]:
# test simple model using the n labeled datapoints
lr = LogisticRegression(penalty='l2')
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# note: accuracy, precision and recall are all measured only considering
# the matches that ended up in the same block
# this is because currently we are just trying to work on the classifier,
# investigating blocking strategies is a next step

print('accuracy', accuracy_score(y_test, y_pred))
print('recall', recall_score(y_test, y_pred))
print('precision', precision_score(y_test, y_pred))

accuracy 0.8879862740292027
recall 0.24068359838594824
precision 1.0


In [27]:
# ititialize logistic regresssion model and Active Learner
# we will use the ActiveLearning class to increase our training
# data before using the ensemble class
lr = LogisticRegression(penalty='l2')
AL = ActiveLearner(df_org, df_dup, X_train, y_train, X_test, lr, 1000, 10)

In [31]:
AL.train()

pre train
 training data shape:  (1060, 10)
 training labels shape:  (1060,)
 unlabeled data shape:  (27549, 10)
post train
 training data shape:  (2060, 10)
 training labels shape:  (2060,)
 unlabeled data shape:  (26549, 10)


In [32]:
# note: accuracy, precision and recall are all measured only considering
# the matches that ended up in the same block
# this is because currently we are just trying to work on the classifier
# investigating blocking strategies is a next step

lr_AL = LogisticRegression(penalty='l2')
lr_AL.fit(AL.X_train, AL.y_train)
y_pred = lr_AL.predict(AL.X_nolabel)
y_true = [true_class(org, dup) for org, dup in AL.X_nolabel.index]
print('accuracy', accuracy_score(y_pred, y_true))
print('recall', recall_score(y_true, y_pred))
print('precision', precision_score(y_true, y_pred))

accuracy 0.9525405853327809
recall 0.6903860711582135
precision 0.9880823401950163


In [30]:
# method to perform clerical review
# after completing review iterate between training and clerical_review methods to improve performance
AL.clerical_review()

Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2033-org,carol andrea,illan amer,170,wittenoom crescent,ainslie house,malvern,3161,qld,19321127,7254550
rec-2033-dup-0,andrea cfrol,amev xllpn,170,crescent winteboom,ainsli w ohuse,ialfflb,3161,qjd,19321127,7254550


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3477-org,faustino,canellas jorda,193,frew close,howie circuit,darlington,3875,vic,19370321,1546559
rec-3477-dup-0,daustjnf,canellasujorda,693,zrfwfchcoe,howde cifcuit,darltagtjn,3875,vim,89740321,1346559


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2425-org,maria gertrudis,pedrajas belmonte,12,edman close,targonga,barcaldine,6028,nsw,19000812,3954979
rec-2425-dup-0,gertrulip earia,ledrajas belmonte,52,oue admanecz,badgonga,vale btrkeley,6028,nsw,49410818,3954979


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3224-org,lorena carmen,vilalta minano,2,chevalier street,,wantirna,3150,vic,19740724,6423373
rec-3224-dup-0,carmen lorena,vilalta minano,2,chuvajie r street,,weebevna,3150,vfc,19760724,6423373


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3534-org,jose eulogio,diamanka coto,2,jerrabomberra avenue,yambungan station,cecil hills,4702,vic,19481117,6961701
rec-3534-dup-0,eulogio jose,coto diamanka,2,avente jerrabomberra,station yambongan,recwlegcllj,4702,vif,39431117,6961701


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2584-org,isabel francisca,paez diz,2,lance hill avenue,rosemount,whitfield,4207,qld,19120203,3093917
rec-2584-dup-0,fravcisca isibel,diz paeo,5,avenue mabcsokicn,rosrmount,uhittield,4207,qld,10120203,3493917


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2008-org,antonia elvira,payo trillo,9,saunders street,scts crk,macquarie park,6323,nsw,19040601,8785526
rec-2008-dup-0,elwira antonia,payo trislo,7,scts rck,saundqrs ssreet,park macquarie,6323,nsw,19321604,8785526


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-1824-org,elizabet,pina cardo,559,george street,garden settlement,newmarket,4505,qld,19950329,4718403
rec-1824-dup-0,elizabet,capdo pina,559,afreet gqorgo,,newmarket,4505,qlz,19950329,818403


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-3408-org,esteban,garrigos carranza,13,parkhill street,myola street,mount pleasant,7248,tas,19510311,5036866
rec-3408-dup-0,esteban,varranza gvgrigov,16,pprkhillystreet,myolajdtreet,plepsant mount,7248,tas,19510311,5001566


Enter label (True/False) :True


Unnamed: 0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec-2578-org,hortensia,puerto gascon,113,companion crescent,parkdale lodge,cleveland,3556,vic,19030124,5666451
rec-2578-dup-0,rtrbevsik,gascon puzrto,112,crescent companiozn,parkqals lodge,cgivelnnd,3556,vic,18890126,5666451


Enter label (True/False) :True
all data labeled


In [None]:
# note: After one round of active learning the performance 
# of a simple linear regression was 
# accuracy 0.9525405853327809
# recall 0.6903860711582135
# precision 0.9880823401950163
# the training data has increased from 50 to 2060 with only 10 additional observations labeled by hand during AL
# lets see if we can improve with the ensemble

In [33]:
# initialize ensemble class
Ens = Ensemble(df_org, df_dup, candidate_pairs)

In [34]:
# compute highest diversity sets of distance measures
Ens.measurement_scheme(.5, True)

{'address_1': ['jaro', 'qgram'],
 'address_2': ['jaro', 'qgram'],
 'date_of_birth': ['cosine', 'jaro'],
 'given_name': ['jaro', 'qgram'],
 'postcode': ['jaro', 'jarowinkler'],
 'soc_sec_id': ['cosine', 'jaro'],
 'state': ['jaro', 'qgram'],
 'street_number': ['jaro', 'qgram'],
 'suburb': ['jaro', 'qgram'],
 'surname': ['jaro', 'qgram']}

In [35]:
# compute schemes of all combinations of distance measures
# the second parameter is optional and can be used to trigger early stopping after n schemes are built
# this can be used to save time when experimenting but the parameter should not be passed when
# it is time to make final predictions
# for example run Ens.build_schemes(False) when training final model
Ens.build_schemes(False, 100)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [46]:
# create list of classifiers to pass into ensemble
# model_list = [LogisticRegression(penalty='l2') for _ in range(100)]
model_list = [LogisticRegression(penalty='l2') if i%2 == 0 else RFC() for i in range(100)]
# model_list = [RFC() for _ in range(100)]

In [47]:
# method to train ensembles 
# we will use all the traning data that we now have saved in the ActiveLearner class
indices_train_AL = AL.X_train.index
y_train_AL = AL.y_train
Ens.train_ensemble(indices_train_AL, y_train_AL, model_list)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [48]:
# method to predict 
# we will use all the unlabeled data in the ActiveLearner class
indices_test_AL = AL.X_nolabel.index
y_pred = Ens.pred_ensemble(indices_test_AL)

In [49]:
# list of true labels to evaluate model performance
y_true_AL = [true_class(org, dup) for org, dup in indices_test_AL]

In [50]:
# note: accuracy, precision and recall are all measured only considering
# the matches that ended up in the same block
# this is because currently we are just trying to work on the classifier
# investigating blocking strategies is a next step

print('accuracy', accuracy_score(y_true_AL, y_pred))
print('recall', recall_score(y_true_AL, y_pred))
print('precision', precision_score(y_true_AL, y_pred))

accuracy 0.9803005762928924
recall 0.8682816048448145
precision 0.9997094712376525


In [None]:
# accuracy, precision and recall are all improved
# most notably with the ensemble the recall is improved significantly
# also its important to note that the data is extremely noisy
# likely noisier than real world data

In [None]:
# just lr
# accuracy 0.9710346905721496
# recall 0.8062074186222559
# precision 0.9996871088861077

# half lr half RF
# accuracy 0.9803005762928924
# recall 0.8682816048448145
# precision 0.9997094712376525

# only RF
# accuracy 0.9920524313533466
# recall 0.9475145092101943
# precision 0.9992017030335285