In [5]:
#import package and dataset
import recordlinkage
from recordlinkage.datasets import load_febrl4

In [80]:
# Load the FEBRL4 sample data: the two DataFrames to be linked, plus the known
# true matches (requested with return_links=True) that are used for evaluation.
df1, df2, true_links = load_febrl4(return_links=True)

## From the data preview below, the loaded datasets are already standardized, so no preprocessing is applied here. In practice, data cleaning is usually required to improve the accuracy of the linkage.

In [11]:
# Preview the first dataset (df1); in the rendered output its rec_id values
# end in "-org".
df1

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-1070-org,michaela,neumann,8,stanley street,miami,winston hills,4223,nsw,19151111,5304218
rec-1016-org,courtney,painter,12,pinkerton circuit,bega flats,richlands,4560,vic,19161214,4066625
rec-4405-org,charles,green,38,salkauskas crescent,kela,dapto,4566,nsw,19480930,4365168
rec-1288-org,vanessa,parr,905,macquoid place,broadbridge manor,south grafton,2135,sa,19951119,9239102
rec-3585-org,mikayla,malloney,37,randwick road,avalind,hoppers crossing,4552,vic,19860208,7207688
...,...,...,...,...,...,...,...,...,...,...
rec-2153-org,annabel,grierson,97,mclachlan crescent,lantana lodge,broome,2480,nsw,19840224,7676186
rec-1604-org,sienna,musolino,22,smeaton circuit,pangani,mckinnon,2700,nsw,19890525,4971506
rec-1003-org,bradley,matthews,2,jondol place,horseshoe ck,jacobs well,7018,sa,19481122,8927667
rec-4883-org,brodee,egan,88,axon street,greenslopes,wamberal,2067,qld,19121113,6039042


In [12]:
# Preview the second dataset (df2); in the rendered output its rec_id values
# end in "-dup-0" and several fields contain typos or missing values.
df2

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
rec-561-dup-0,elton,,3,light setreet,pinehill,windermere,3212,vic,19651013,1551941
rec-2642-dup-0,mitchell,maxon,47,edkins street,lochaoair,north ryde,3355,nsw,19390212,8859999
rec-608-dup-0,,white,72,lambrigg street,kelgoola,broadbeach waters,3159,vic,19620216,9731855
rec-3239-dup-0,elk i,menzies,1,lyster place,,northwood,2585,vic,19980624,4970481
rec-2886-dup-0,,garanggar,,may maxwell crescent,springettst arcade,forest hill,2342,vic,19921016,1366884
...,...,...,...,...,...,...,...,...,...,...
rec-4495-dup-0,connor,belperio,15,,,ryde,2570,nsw,19170518,5394641
rec-4211-dup-0,daniel,maspn,9,derrington crescent,el pedro caravan park,sunnybank,4350,vic,19500705,5525378
rec-3131-dup-0,samuel,crofs,613,banjine street,kurrajong vlge,pengzin,2230,qld,19410531,4467228
rec-3815-dup-0,saah,beattih,60,kay's place,oldershaw court,ashfield,2047,vic,19500712,9435148


## In general, there are four steps in data linkage:
### 1 Compute the candidate pairs for linkage. Because the full method produces too many candidate pairs, we use the blocking method, which only includes candidate pairs that have an identical 'surname'.

In [26]:
# Step 1 (baseline): a full index pairs every record in df1 with every record
# in df2. For this example that is 25000000 pairs (5000 * 5000), which is too
# many to compare in practice.
# Only the count is kept: the pairs are deliberately NOT bound to
# `candidate_links`, so re-running this cell out of order cannot replace the
# blocked candidates that the later cells rely on, and the very large index
# is not kept alive in a variable.
full_pair_count = len(recordlinkage.Index().full().index(df1, df2))
full_pair_count



25000000

In [38]:
# Blocking keeps only the pairs whose 'surname' values are identical in both
# datasets, which shrinks the candidate set dramatically
# (from 25000000 pairs down to 84831 in this example).
indexer = recordlinkage.Index()
indexer.block('surname')
candidate_links = indexer.index(df1, df2)
len(candidate_links)

84831

### 2 Compute the comparison vector for each candidate pair. Here, different comparison methods are used for different columns. For each candidate pair, every element of the comparison vector is either 1 (match) or 0 (no match), according to the comparison method for that column. Since we restrict the candidates to those that have an identical 'surname', the comparison-vector element for 'surname' is always 1.

In [61]:
# Build the comparison vectors for the candidate pairs.
# Features are registered in the same order as the resulting columns:
# surname, given_name, date_of_birth, suburb, state, address_1.
compare = recordlinkage.Compare()

# 'surname' is compared exactly (it always agrees because of the blocking).
compare.exact('surname', 'surname', label='surname')

# 'given_name' tolerates small typos: Jaro-Winkler similarity, cut at 0.85.
compare.string(
    'given_name', 'given_name',
    method='jarowinkler', threshold=0.85, label='given_name',
)

# These fields must agree exactly to count as a match.
for field in ('date_of_birth', 'suburb', 'state'):
    compare.exact(field, field, label=field)

# 'address_1' uses the library's default string method, cut at 0.85.
compare.string('address_1', 'address_1', threshold=0.85, label='address_1')

compare_vector = compare.compute(candidate_links, df1, df2)

In [62]:
# Display the comparison vectors: one row per candidate pair
# (rec_id_1, rec_id_2) and one column per compared field.
compare_vector

Unnamed: 0_level_0,Unnamed: 1_level_0,surname,given_name,date_of_birth,suburb,state,address_1
rec_id_1,rec_id_2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
rec-1070-org,rec-2672-dup-0,1,0.0,0,0,0,0.0
rec-1070-org,rec-4387-dup-0,1,0.0,0,0,0,0.0
rec-1070-org,rec-787-dup-0,1,0.0,0,0,0,0.0
rec-1070-org,rec-2158-dup-0,1,0.0,0,0,0,0.0
rec-2797-org,rec-2672-dup-0,1,0.0,0,0,0,0.0
...,...,...,...,...,...,...,...
rec-2606-org,rec-3006-dup-0,1,0.0,0,0,1,0.0
rec-2694-org,rec-2694-dup-0,1,1.0,1,0,1,1.0
rec-2814-org,rec-2814-dup-0,1,1.0,0,1,1,1.0
rec-7-org,rec-7-dup-0,1,0.0,1,1,1,1.0


### 3 Classify the candidate pairs based on the comparison vector. Here we use two classifiers:
#### 1 Threshold classifier: a candidate pair is classified as a link only if at least 4 attributes match.
#### 2 Expectation Maximization classifier: this probabilistic record linkage algorithm is used in combination with the Fellegi and Sunter model. This classifier doesn’t need training data (unsupervised).

In [88]:
# Threshold classifier: count the agreeing fields per candidate pair and keep
# the pairs where at least 4 of them agree.
match_counts = compare_vector.sum(axis=1)
pred_links_thres = match_counts[match_counts >= 4].index

# Expectation/Conditional Maximisation classifier: fitted on the comparison
# vectors themselves (no labelled data is passed) and used to predict the links.
ecm = recordlinkage.ECMClassifier()
pred_links_EM = ecm.fit_predict(compare_vector)

### 4 Evaluate performance: the F score and accuracy are calculated. According to the results, under the blocking method the performance of the EM classifier and the threshold classifier is very similar.

In [129]:
# F score of the EM classifier's predicted links against the known true links.
em_fscore = recordlinkage.fscore(true_links, pred_links_EM)
'EM classifier F score: {:.2%}'.format(em_fscore)

'EM classifier F score: 79.53%'

In [130]:
# Accuracy of the EM classifier, using the number of blocked candidate pairs
# as the total number of pairs.
# NOTE(review): true_links may contain pairs that blocking excluded from
# candidate_links - confirm that len(candidate_links) is the intended total.
em_accuracy = recordlinkage.accuracy(
    true_links, pred_links_EM, total=len(candidate_links)
)
'EM classifier accuracy: {:.2%}'.format(em_accuracy)

'EM classifier accuracy: 97.99%'

In [133]:
# F score of the threshold classifier's predicted links against the known
# true links.
thres_fscore = recordlinkage.fscore(true_links, pred_links_thres)
'Threshold F score: {:.2%}'.format(thres_fscore)

'Threshold F score: 79.05%'

In [132]:
# Accuracy of the threshold classifier, using the number of blocked candidate
# pairs as the total number of pairs.
# NOTE(review): true_links may contain pairs that blocking excluded from
# candidate_links - confirm that len(candidate_links) is the intended total.
thres_accuracy = recordlinkage.accuracy(
    true_links, pred_links_thres, total=len(candidate_links)
)
'Threshold classifier accuracy: {:.2%}'.format(thres_accuracy)

'Threshold classifier accuracy: 97.96%'