In [98]:
%load_ext autoreload
%autoreload 2
import time
import re
import string
import pandas as pd
from simhash import compute_simhash
from recordlinkage.datasets import load_febrl2, load_febrl3

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
# Load the FEBRL3 records and build one free-text field per record.
df = load_febrl3()

# Columns that are left out of the concatenated text.
exclude_columns = ['soc_sec_id', 'street_number']

# Missing values become empty strings so they join without printing "nan".
df_processed = df.fillna('')
cols_to_drop = [name for name in exclude_columns if name in df_processed.columns]

# Join the remaining fields of each row into a single space-separated string.
text_source = df_processed.drop(columns=cols_to_drop)
merged_column = text_source.apply(lambda row: ' '.join(row.astype(str)), axis=1)
df["text"] = merged_column

# Persist the frame, including its index and the new "text" column, as CSV.
csv_file_path = 'febrl3_processed_with_text.csv'
df.to_csv(csv_file_path, index=True, encoding='utf-8')

In [100]:
N = 2  # default character n-gram length

def get_ngrams(text, n=N):
    """Split ``text`` into overlapping character n-grams.

    The input is converted with ``str()``, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is stripped
    before slicing.

    Args:
        text: Value to tokenize.
        n: Length of each gram; defaults to the module-level ``N``.

    Returns:
        List of the length-``n`` substrings of the normalized text, in
        left-to-right order. For positive ``n`` the list is empty when the
        normalized text is shorter than ``n``.
    """
    normalized = re.sub(r'\s+', ' ', str(text)).strip()
    last_start = len(normalized) - n
    return [normalized[start:start + n] for start in range(last_start + 1)]

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call; '@' and '.' are already part of
# string.punctuation, so they need no separate mention.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def preprocess(text, n=N):
    """Normalize ``text`` and tokenize it into character n-grams.

    The input is converted with ``str()``, lower-cased, and stripped of all
    ``string.punctuation`` characters before being handed to ``get_ngrams``.

    Args:
        text: Value to preprocess.
        n: N-gram length forwarded to ``get_ngrams``; defaults to the
            module-level ``N``.

    Returns:
        The list of n-grams returned by ``get_ngrams`` for the cleaned text.
    """
    cleaned = str(text).lower().translate(_PUNCTUATION_TABLE)
    return get_ngrams(cleaned, n)


In [101]:
# Tokenize every record's text, fingerprint the resulting n-gram list, and
# time both steps together.
print(f"Computing SimHash with {N}-grams...")
start_time = time.time()
# Series.apply forwards extra keyword arguments to the callable.
df['tokens'] = df['text'].apply(preprocess, n=N)
df['simhash'] = df['tokens'].apply(compute_simhash)
end_time = time.time()
print(f"SimHash computation completed in {end_time - start_time:.2f} seconds.")

Computing SimHash with 2-grams...
SimHash computation completed in 4.89 seconds.


In [102]:
# Preview the first rows, now including the derived text, tokens and simhash columns.
df.head()

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,text,tokens,simhash
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
rec-1496-org,mitchell,green,7,wallaby place,delmar,cleveland,2119,sa,19560409,1804974,mitchell green wallaby place delmar cleveland ...,"[mi, it, tc, ch, he, el, ll, l , g, gr, re, e...",16974818697062130366
rec-552-dup-3,harley,mccarthy,177,pridhamstreet,milton,marsden,3165,nsw,19080419,6089216,harley mccarthy pridhamstreet milton marsden 3...,"[ha, ar, rl, le, ey, y , m, mc, cc, ca, ar, r...",18343589810354950704
rec-988-dup-1,madeline,mason,54,hoseason street,lakefront retrmnt vlge,granville,4881,nsw,19081128,2185997,madeline mason hoseason street lakefront retrm...,"[ma, ad, de, el, li, in, ne, e , m, ma, as, s...",6621136958305624816
rec-1716-dup-1,isabelle,,23,gundulu place,currin ga,utakarra,2193,wa,19921119,4314184,isabelle gundulu place currin ga utakarra 219...,"[is, sa, ab, be, el, ll, le, e , g, gu, un, n...",18421294960864420282
rec-1213-org,taylor,hathaway,7,yuranigh court,brentwood vlge,,4220,nsw,19991207,9144092,taylor hathaway yuranigh court brentwood vlge ...,"[ta, ay, yl, lo, or, r , h, ha, at, th, ha, a...",4869050301418030256


In [103]:
K = 12  # largest Hamming distance at which two hashes are reported as similar

def hamming_distance(hash1, hash2):
    """Count the bit positions at which two integer hashes differ.

    Args:
        hash1: First integer hash.
        hash2: Second integer hash.

    Returns:
        Number of '1' digits in the binary rendering of ``hash1 ^ hash2``.
    """
    differing_bits = hash1 ^ hash2
    # Each '1' in the binary rendering of the XOR marks one differing position.
    return bin(differing_bits).count('1')

# Brute-force candidate search: compare every unordered pair of fingerprints
# and keep those whose Hamming distance is at most K.
search_start_time = time.time()
found_pairs = set()
hashes = df['simhash'].tolist()
indices = df.index.tolist()
num_records = len(hashes)
for i, hash_i in enumerate(hashes):
    for j in range(i + 1, num_records):
        dist = hamming_distance(hash_i, hashes[j])
        if dist > K:
            continue
        # i < j by construction, so (i, j) is already in sorted order.
        found_pairs.add((i, j))
search_end_time = time.time()
print(f"Found {len(found_pairs)} similar pairs within Hamming distance {K}.")
print(f"Search completed in {search_end_time - search_start_time:.2f} seconds.")

Found 8747 similar pairs within Hamming distance 12.
Search completed in 11.49 seconds.


In [104]:
def generate_ground_truth_from_ids(df):
    """Derive the set of true duplicate pairs from the record IDs in ``df.index``.

    Each index label is split on ``'-'`` and its second field is taken as the
    entity identifier (e.g. ``'rec-552-dup-3'`` -> ``'552'``). Labels with no
    second field are ignored. Records are referred to by their integer
    position in the index, not by their label.

    Args:
        df: Object whose ``index`` iterates over string record IDs.

    Returns:
        Set of ``(i, j)`` position tuples with ``i < j``, one for every pair
        of records that share an entity identifier.
    """
    print("Generating Ground Truth from IDs...")
    start_time = time.time()

    # entity identifier -> positions of the records that belong to it
    positions_by_entity = {}
    for position, rec_id in enumerate(df.index):
        fields = rec_id.split('-')
        if len(fields) < 2:
            # No entity field to read; skip this label.
            continue
        positions_by_entity.setdefault(fields[1], []).append(position)

    # Every unordered pair inside a group is a true match.
    true_pairs = {
        tuple(sorted((members[a], members[b])))
        for members in positions_by_entity.values()
        for a in range(len(members))
        for b in range(a + 1, len(members))
    }

    print(f"Done! Found {len(true_pairs)} actual duplicate pairs in {time.time() - start_time:.4f}s")
    return true_pairs

# Score the SimHash candidate pairs against the ID-derived ground truth.
true_pairs_from_id = generate_ground_truth_from_ids(df)
true_positives = len(found_pairs & true_pairs_from_id)

# Guard both ratios against an empty denominator.
if found_pairs:
    precision = true_positives / len(found_pairs)
else:
    precision = 0
if true_pairs_from_id:
    recall = true_positives / len(true_pairs_from_id)
else:
    recall = 0

print(f"Ground Truth: {len(true_pairs_from_id)}")
print(f"SimHash:        {len(found_pairs)}")
print(f"True Positives:{true_positives}")
print(f"Missed:           {len(true_pairs_from_id) - true_positives}")
print(f"PRECISION: {precision:.4f}")
print(f"RECALL:    {recall:.4f}")

Generating Ground Truth from IDs...
Done! Found 6538 actual duplicate pairs in 0.0107s
Ground Truth: 6538
SimHash:        8747
True Positives:5111
Missed:           1427
PRECISION: 0.5843
RECALL:    0.7817
