In [9]:
import time
import re
import string
import pandas as pd
from simhash import compute_simhash
from recordlinkage.datasets import load_febrl2, load_febrl3, load_febrl1

In [10]:
# Load the FEBRL1 dataset shipped with the recordlinkage package.
df = load_febrl1()
# Work on a copy with missing values replaced by '' so that str() of a missing
# field contributes an empty string (not "nan") to the joined text.
df_processed = df.fillna('')
# Build one space-separated string per row from all columns, cast to str.
merged_column = df_processed.apply(lambda x: ' '.join(x.astype(str)), axis=1)
# Attach the text to the original frame; its other columns are left as loaded.
df["text"] = merged_column
# Write the frame, including its index, to a UTF-8 CSV in the working directory.
csv_file_path = 'febrl1_processed_with_text.csv'
df.to_csv(csv_file_path, index=True, encoding='utf-8')

In [11]:
# Default character n-gram length used for tokenisation.
N = 2
def get_ngrams(text, n=N):
    """Split ``text`` into overlapping character n-grams.

    The input is converted with ``str()``, every run of whitespace is collapsed
    to a single space and leading/trailing whitespace is removed before slicing,
    so interior spaces take part in the n-grams.

    Args:
        text: Value to tokenise; anything accepted by ``str()``.
        n: Length of each n-gram; must be at least 1.

    Returns:
        list[str]: The n-grams in order of appearance. Empty when the
        normalised text is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Without this check ``n == 0``
            yields a list of empty strings and negative values yield slices of
            the wrong length.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    text = re.sub(r'\s+', ' ', str(text)).strip()
    return [text[i:i + n] for i in range(len(text) - n + 1)]

def preprocess(text, n=N):
    """Normalise ``text`` and return its character n-grams.

    The value is converted with ``str()``, lower-cased and stripped of all
    punctuation characters; the cleaned string is then handed to
    ``get_ngrams``.

    Args:
        text: Value to tokenise.
        n: N-gram length forwarded to ``get_ngrams``.

    Returns:
        The list of n-grams produced by ``get_ngrams``.
    """
    # '@' and '.' are already contained in string.punctuation; listing them
    # again has no effect on the resulting translation table.
    strip_table = str.maketrans('', '', string.punctuation + '@.')
    cleaned = str(text).lower().translate(strip_table)
    return get_ngrams(cleaned, n)


In [12]:
# Tokenise every record's text into character N-grams and fingerprint each token
# list; the reported wall-clock time covers both steps together.
print(f"Computing SimHash with {N}-grams...")
start_time = time.time()
# preprocess() lower-cases, strips punctuation and returns the list of N-grams.
df['tokens'] = df['text'].apply(lambda x: preprocess(x, N))
# compute_simhash comes from the external `simhash` module; later cells XOR its
# results, so it presumably returns an integer fingerprint — TODO confirm bit width.
df['simhash'] = df['tokens'].apply(lambda x: compute_simhash(x))
end_time = time.time()
print(f"SimHash computation completed in {end_time - start_time:.2f} seconds.")

Computing SimHash with 2-grams...
SimHash computation completed in 1.33 seconds.


In [13]:
# Preview the first rows, now including the derived text, tokens and simhash columns.
df.head()

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,text,tokens,simhash
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
rec-223-org,,waller,6.0,tullaroop street,willaroo,st james,4011,wa,19081209,6988048,waller 6 tullaroop street willaroo st james 4...,"[wa, al, ll, le, er, r , 6, 6 , t, tu, ul, l...",286570266123114720421303426911524659372
rec-122-org,lachlan,berry,69.0,giblin street,killarney,bittern,4814,qld,19990219,7364009,lachlan berry 69 giblin street killarney bitte...,"[la, ac, ch, hl, la, an, n , b, be, er, rr, r...",38699459394609613432095594765472481467
rec-373-org,deakin,sondergeld,48.0,goldfinch circuit,kooltuo,canterbury,2776,vic,19600210,2635962,deakin sondergeld 48 goldfinch circuit kooltuo...,"[de, ea, ak, ki, in, n , s, so, on, nd, de, e...",266922920000695473676335319035485293810
rec-10-dup-0,kayla,harrington,,maltby circuit,coaling,coolaroo,3465,nsw,19150612,9004242,kayla harrington maltby circuit coaling coola...,"[ka, ay, yl, la, a , h, ha, ar, rr, ri, in, n...",294577098606146773960144782058193339828
rec-227-org,luke,purdon,23.0,ramsay place,mirani,garbutt,2260,vic,19831024,8099933,luke purdon 23 ramsay place mirani garbutt 226...,"[lu, uk, ke, e , p, pu, ur, rd, do, on, n , ...",330594832292163942997035362013976454713


In [14]:
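# NOTE: superseded by hamming_distance() / brute_force() in the next cell; kept only for reference.
# K = 30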
# def hamming_distance(hash1, hash2):
#     x =hash1 ^ hash2
#     bin_x = bin(x)
#     return bin_x.count('1')

# search_start_time = time.time()
# found_pairs = set()
# hashes = df['simhash'].tolist()
# indices = df.index.tolist()
# num_records = len(hashes)
# for i in range(num_records):
#     for j in range(i + 1, num_records):
#         dist = hamming_distance(hashes[i], hashes[j])
#         if dist <= K:
#             found_pairs.add(tuple(sorted((i, j))))
# search_end_time = time.time()
# print(f"Found {len(found_pairs)} similar pairs within Hamming distance {K}.")
# print(f"Search completed in {search_end_time - search_start_time:.2f} seconds.")

In [15]:
# Largest Hamming distance at which two fingerprints are still treated as similar.
K = 30
def hamming_distance(hash1, hash2):
    """Return the number of '1' digits in the binary form of ``hash1 ^ hash2``.

    For non-negative integers this is the count of bit positions in which the
    two hashes differ.
    """
    xor_bits = bin(hash1 ^ hash2)
    return xor_bits.count('1')
def brute_force(df, K=K):
    """Compare every pair of fingerprints and keep those within distance ``K``.

    Args:
        df: Frame with a ``simhash`` column of integer fingerprints.
        K: Maximum Hamming distance (inclusive) for a pair to be kept.

    Returns:
        set[tuple[int, int]]: Pairs ``(i, j)`` of positional row numbers with
        ``i < j`` whose fingerprints differ in at most ``K`` bits.

    Prints the number of pairs found and the elapsed search time.
    """
    fingerprints = df['simhash'].tolist()
    total = len(fingerprints)
    found_pairs = set()

    search_start_time = time.time()
    for left in range(total):
        left_hash = fingerprints[left]
        for right in range(left + 1, total):
            if hamming_distance(left_hash, fingerprints[right]) <= K:
                # left < right by construction, so the pair is already ordered.
                found_pairs.add((left, right))
    search_end_time = time.time()

    print(f"Found {len(found_pairs)} similar pairs within Hamming distance {K}.")
    print(f"Search completed in {search_end_time - search_start_time:.2f} seconds.")
    return found_pairs
# Exhaustive baseline. Pass the shared threshold K rather than a literal so this run
# and the LSH verification below can never be evaluated at different distances.
found_pairs_bf = brute_force(df, K=K)


Found 547 similar pairs within Hamming distance 30.
Search completed in 0.34 seconds.


In [16]:
def generate_ground_truth_from_ids(df):
    """Derive the true duplicate pairs from the record identifiers in ``df.index``.

    Each identifier is split on '-' and its second component is used as the
    entity key (``rec-223-org`` -> ``223``, ``rec-10-dup-0`` -> ``10``).
    Identifiers without a '-' have no such component and are skipped. All
    records sharing an entity key are paired with each other.

    Args:
        df: Frame whose index holds string record identifiers.

    Returns:
        set[tuple[int, int]]: Pairs ``(i, j)`` of positional row numbers with
        ``i < j`` that belong to the same entity.

    Prints a start message and the number of pairs with the elapsed time.
    """
    print("Generating Ground Truth from IDs...")
    start_time = time.time()

    # entity key -> positional row numbers of its records, in ascending order
    rows_by_entity = {}
    for position, record_id in enumerate(df.index):
        id_parts = record_id.split('-')
        if len(id_parts) < 2:
            # No entity component in this identifier; leave the record out.
            continue
        rows_by_entity.setdefault(id_parts[1], []).append(position)

    true_pairs = set()
    for rows in rows_by_entity.values():
        # Rows are stored in ascending order, so (first, second) is already sorted.
        for offset, first in enumerate(rows):
            for second in rows[offset + 1:]:
                true_pairs.add((first, second))

    print(f"Found {len(true_pairs)} actual duplicate pairs in {time.time() - start_time:.4f}s")
    return true_pairs

def lsh_candidates(simhashes, band_size=4, num_bands=31):
    """Return candidate pairs whose fingerprints agree on at least one band.

    Band ``b`` is the ``band_size``-bit slice of a fingerprint starting at bit
    ``b * band_size`` (counted from the least significant bit). Bits at or above
    ``band_size * num_bands`` are not looked at. Fingerprints that share the
    value of a band land in the same bucket, and every pair inside a bucket
    becomes a candidate.

    Args:
        simhashes: Sequence of integer fingerprints.
        band_size: Number of bits per band.
        num_bands: Number of bands to extract.

    Returns:
        set[tuple[int, int]]: Pairs ``(i, j)`` of positions in ``simhashes``
        with ``i < j``.
    """
    def band_key(value, band_no):
        # Shift the band down to bit 0, then keep only band_size bits.
        return (value >> (band_no * band_size)) & ((1 << band_size) - 1)

    # One bucket table per band: band value -> positions, in ascending order.
    band_tables = [{} for _ in range(num_bands)]
    for position, fingerprint in enumerate(simhashes):
        for band_no, table in enumerate(band_tables):
            table.setdefault(band_key(fingerprint, band_no), []).append(position)

    candidates = set()
    for table in band_tables:
        for members in table.values():
            # Members are ascending, so (first, second) is already ordered.
            for offset, first in enumerate(members):
                for second in members[offset + 1:]:
                    candidates.add((first, second))

    return candidates

# Approximate search: bucket the fingerprints by band, then confirm each candidate
# pair with an exact Hamming-distance check against the threshold K.
hashes = df['simhash'].tolist()
search_start_time = time.time()
candidates = lsh_candidates(hashes, band_size=4, num_bands=31)

found_pairs = set()
for i, j in candidates:
    dist = hamming_distance(hashes[i], hashes[j])
    if dist <= K:
        found_pairs.add((i, j))
# Stop the clock before scoring so the timing covers candidate generation plus
# verification only, matching what brute_force() reports for its own search.
search_end_time = time.time()
print(f"Search completed in {search_end_time - search_start_time:.2f} seconds.")

# Score the verified pairs against the duplicates implied by the record ids.
true_pairs_from_id = generate_ground_truth_from_ids(df)
true_positives = len(found_pairs.intersection(true_pairs_from_id))
# Guard both ratios against an empty denominator.
precision = true_positives / len(found_pairs) if len(found_pairs) > 0 else 0
recall = true_positives / len(true_pairs_from_id) if len(true_pairs_from_id) > 0 else 0
print(f"Ground truth pairs:{len(true_pairs_from_id)}")
# Note: this count is the verified pairs (distance <= K), not the raw LSH candidates.
print(f"Total candidates found by SimHash:{len(found_pairs)}")
print(f"True positives:{true_positives}")
print(f"Missed:{len(true_pairs_from_id) - true_positives}")
print(f"Precision:{precision:.4f}")
print(f"Recall:{recall:.4f}")

Generating Ground Truth from IDs...
Found 500 actual duplicate pairs in 0.0010s
Ground truth pairs:500
Total candidates found by SimHash:547
True positives:487
Missed:13
Precision:0.8903
Recall:0.9740
