In [19]:
import time
import re
import string
import pandas as pd
from simhash import SimHash
from recordlinkage.datasets import load_febrl2, load_febrl3, load_febrl1

In [20]:
# Load the FEBRL3 frame and derive one free-text field per record by joining
# every column value with single spaces. Missing values are replaced with ''
# in a copy first, so the original columns of `df` keep their NaNs.
df = load_febrl3()
df_processed = df.fillna('')
merged_column = df_processed.apply(
    lambda row: ' '.join(map(str, row)),
    axis=1,
)
df["text"] = merged_column

# Persist the frame, index included, alongside the new text column.
csv_file_path = 'febrl3_processed_with_text.csv'
df.to_csv(csv_file_path, index=True, encoding='utf-8')

In [21]:
N = 2
def get_ngrams(text, n=N):
    """Return all overlapping character n-grams of ``text``.

    The input is converted with ``str()``, runs of whitespace are collapsed
    to a single space, and leading/trailing whitespace is trimmed before
    slicing.

    Args:
        text: Value to tokenise.
        n: Length of each n-gram; defaults to the module-level ``N``.

    Returns:
        A list of substrings of length ``n`` in left-to-right order. The list
        is empty when the normalised text is shorter than ``n`` characters.
    """
    normalized = re.sub(r'\s+', ' ', str(text)).strip()
    last_start = len(normalized) - n
    return [normalized[pos:pos + n] for pos in range(last_start + 1)]

def preprocess(text, n=N):
    """Normalise ``text`` and split it into character n-grams.

    The value is converted with ``str()``, lower-cased, and stripped of every
    character in ``string.punctuation``; the result is handed to
    ``get_ngrams``.

    Args:
        text: Value to tokenise.
        n: Length of each n-gram; defaults to the module-level ``N``.

    Returns:
        The list of n-grams produced by ``get_ngrams`` for the cleaned text.
    """
    # '@' and '.' already belong to string.punctuation; the suffix is kept
    # so the deletion set is spelled exactly as before.
    deletion_table = str.maketrans('', '', string.punctuation + '@.')
    cleaned = str(text).lower().translate(deletion_table)
    return get_ngrams(cleaned, n)


In [22]:
# Tokenise each record's text into character n-grams, then build one SimHash
# object per token list. Both results are stored as new columns on `df`.
print(f"Computing SimHash with {N}-grams...")
start_time = time.time()
df['tokens'] = df['text'].apply(preprocess, args=(N,))
df['simhash'] = df['tokens'].apply(SimHash)
end_time = time.time()
print(f"SimHash computation completed in {end_time - start_time:.2f} seconds.")

Computing SimHash with 2-grams...
SimHash computation completed in 4.91 seconds.


In [23]:
# Preview the first rows, which now include the text, tokens and simhash columns.
df.head()

Unnamed: 0_level_0,given_name,surname,street_number,address_1,address_2,suburb,postcode,state,date_of_birth,soc_sec_id,text,tokens,simhash
rec_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
rec-1496-org,mitchell,green,7,wallaby place,delmar,cleveland,2119,sa,19560409,1804974,mitchell green 7 wallaby place delmar clevelan...,"[mi, it, tc, ch, he, el, ll, l , g, gr, re, e...",<Simhash value=2765990234110169005363551208914...
rec-552-dup-3,harley,mccarthy,177,pridhamstreet,milton,marsden,3165,nsw,19080419,6089216,harley mccarthy 177 pridhamstreet milton marsd...,"[ha, ar, rl, le, ey, y , m, mc, cc, ca, ar, r...",<Simhash value=2800069328978646748230465635171...
rec-988-dup-1,madeline,mason,54,hoseason street,lakefront retrmnt vlge,granville,4881,nsw,19081128,2185997,madeline mason 54 hoseason street lakefront re...,"[ma, ad, de, el, li, in, ne, e , m, ma, as, s...",<Simhash value=3929446027905468450972541104822...
rec-1716-dup-1,isabelle,,23,gundulu place,currin ga,utakarra,2193,wa,19921119,4314184,isabelle 23 gundulu place currin ga utakarra ...,"[is, sa, ab, be, el, ll, le, e , 2, 23, 3 , ...",<Simhash value=1071649693158806521435652878504...
rec-1213-org,taylor,hathaway,7,yuranigh court,brentwood vlge,,4220,nsw,19991207,9144092,taylor hathaway 7 yuranigh court brentwood vlg...,"[ta, ay, yl, lo, or, r , h, ha, at, th, ha, a...",<Simhash value=2029888763142476102721413535560...


In [24]:
K = 30
def brute_force(df, K=K):
    """Compare every pair of fingerprints and keep those within ``K``.

    Reads the ``simhash`` column of ``df`` and measures each unordered pair
    exactly once through the hash object's ``distance`` method. The number of
    matching pairs and the elapsed wall-clock time are printed.

    Args:
        df: Frame with a ``simhash`` column of objects exposing
            ``distance(other)``.
        K: Largest distance still counted as a match; defaults to the
            module-level ``K``.

    Returns:
        A set of ``(i, j)`` tuples of positional row indices with ``i < j``.
    """
    fingerprints = df['simhash'].tolist()
    total = len(fingerprints)
    found_pairs = set()

    search_start_time = time.time()
    for left in range(total):
        left_hash = fingerprints[left]
        for right in range(left + 1, total):
            if left_hash.distance(fingerprints[right]) <= K:
                # left < right by construction, so the tuple is already ordered.
                found_pairs.add((left, right))
    search_end_time = time.time()

    print(f"Found {len(found_pairs)} similar pairs within Hamming distance {K}.")
    print(f"Search completed in {search_end_time - search_start_time:.2f} seconds.")
    return found_pairs
# Exhaustive baseline. Pass the shared threshold constant instead of repeating
# the literal, so this search and the later `dist <= K` filter cannot drift apart.
found_pairs_bf = brute_force(df, K=K)

Found 7434 similar pairs within Hamming distance 30.
Search completed in 7.51 seconds.


In [25]:
def generate_ground_truth(df):
    """Derive the set of true duplicate pairs from the record-id index.

    Each index label is split on '-' and its second field is used as the
    entity key; labels with fewer than two fields are skipped. Every pair of
    rows sharing an entity key is reported. Progress and the elapsed time are
    printed.

    Args:
        df: Frame whose index labels are strings such as 'rec-1496-org'.

    Returns:
        A set of ``(i, j)`` tuples of positional row indices with ``i < j``.
    """
    print("Generating Ground Truth from IDs...")
    start_time = time.time()

    # entity key -> positional row indices, in ascending order of appearance
    entity_groups = {}
    for position, record_id in enumerate(df.index):
        fields = record_id.split('-')
        if len(fields) < 2:
            # No second field to use as the entity key; ignore this label.
            continue
        entity_groups.setdefault(fields[1], []).append(position)

    true_pairs = set()
    for members in entity_groups.values():
        for offset, first in enumerate(members):
            for second in members[offset + 1:]:
                # Positions were appended in ascending order, so first < second.
                true_pairs.add((first, second))

    print(f"Found {len(true_pairs)} actual duplicate pairs in {time.time() - start_time:.4f}s")
    return true_pairs

def lsh_candidates(simhashes, band_size=4, num_bands=31):
    """Collect index pairs whose fingerprints agree on at least one band.

    The integer ``value`` of every fingerprint is cut into ``num_bands``
    consecutive groups of ``band_size`` bits, starting at the least
    significant bit. Two fingerprints become a candidate pair when they hold
    the same bit pattern in the same band. Bits above
    ``band_size * num_bands`` are not inspected.

    Args:
        simhashes: Sequence of objects exposing an integer ``value``.
        band_size: Number of bits per band.
        num_bands: Number of bands to examine.

    Returns:
        A set of ``(i, j)`` tuples of positions in ``simhashes`` with ``i < j``.
    """
    # One lookup table per band: band bit pattern -> positions holding it.
    buckets = [{} for _ in range(num_bands)]

    for position, fingerprint in enumerate(simhashes):
        bits = fingerprint.value
        for band, table in enumerate(buckets):
            key = (bits >> (band * band_size)) & ((1 << band_size) - 1)
            table.setdefault(key, []).append(position)

    candidates = set()
    for table in buckets:
        for members in table.values():
            for offset, first in enumerate(members):
                for second in members[offset + 1:]:
                    # Positions were appended in ascending order, so first < second.
                    candidates.add((first, second))

    return candidates

# LSH search: bucket fingerprints by 4-bit bands, then confirm every candidate
# pair with an exact distance check against the threshold K.
hashes = df['simhash'].tolist()
search_start_time = time.time()
candidates = lsh_candidates(hashes, band_size=4, num_bands=31)

found_pairs = set()
for i, j in candidates:
    dist = hashes[i].distance(hashes[j])
    if dist <= K:
        found_pairs.add((i, j))
search_end_time = time.time()
# The timer above was started but never read; report it so this run can be
# compared with the timing printed by brute_force.
print(f"LSH search completed in {search_end_time - search_start_time:.2f} seconds.")

# Evaluate the verified pairs against the ground truth derived from record ids.
true_pairs_from_id = generate_ground_truth(df)
true_positives = len(found_pairs.intersection(true_pairs_from_id))
# Guard both ratios against an empty denominator.
precision = true_positives / len(found_pairs) if len(found_pairs) > 0 else 0
recall = true_positives / len(true_pairs_from_id) if len(true_pairs_from_id) > 0 else 0
print(f"Ground truth pairs:{len(true_pairs_from_id)}")
# Note: this figure counts verified pairs (found_pairs), not raw LSH candidates.
print(f"Total candidates found by SimHash:{len(found_pairs)}")
print(f"True positives:{true_positives}")
print(f"Missed:{len(true_pairs_from_id) - true_positives}")
print(f"Precision:{precision:.4f}")
print(f"Recall:{recall:.4f}")

Generating Ground Truth from IDs...
Found 6538 actual duplicate pairs in 0.0046s
Ground truth pairs:6538
Total candidates found by SimHash:7434
True positives:6102
Missed:436
Precision:0.8208
Recall:0.9333
