In [103]:

import pandas as pd
from thefuzz import fuzz
import numpy as np
import time

start_time = time.perf_counter()
# Load the data
df = pd.read_csv("data.csv")
df = df.drop(columns=['id'])

# Preprocessing - normalize string case and fill missing value.
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.lower().fillna('')  


#  calculate the similarity between two rows
def row_similarity(row1, row2):
    similarity_sum = 0
    for col in df.columns:
        similarity_sum += fuzz.ratio(str(row1[col]), str(row2[col]))
    return similarity_sum / len(df.columns)

# Grouping based on row similarity
threshold = 75  # Similarity threshold 
group_ids = np.full(len(df), -1)
current_group = 0

for i in range(len(df)):
    if group_ids[i] == -1:
        group_ids[i] = current_group
        for j in range(i + 1, len(df)):
            if group_ids[j] == -1 and row_similarity(df.loc[i], df.loc[j]) >= threshold:
                group_ids[j] = current_group
        current_group += 1

df['fuzz_group'] = group_ids

end_time = time.perf_counter()   

elapsed = end_time - start_time
print(f"Elapsed time for Levenshtein distance : {elapsed:.4f} seconds")



df = df.sort_values(by='fuzz_group')
df.head(40)




Elapsed time for Levenshtein distance : 32.5106 seconds


Unnamed: 0,first_name,surname,dob,city,email,fuzz_group
0,,brown,2005-07-15,london,sarahbron@mckinney.com,0
843,emily,brwn,2005-07-15,london,sarahbrown@mckinney.com,0
611,emily,brown,2005-07-15,lndon,sarahbrown@mckinney.com,0
609,,brown,2005-07-15,london,sbraharown@ckinney.com,0
997,,brown,2005-07-15,london,sarahbroon@mckinneycwm,0
260,,brown,2005-10-06,london,sarahbrown@mckinney.com,0
695,isablle,brown,1979-01-22,ttke-on-srent,josephwatson@smith.com,1
657,isabelle,brown,1979-01-22,stoke-on-trent,josephwatson@smith.com,1
342,isabelle,brown,1979-01-22,stoke-on-trent,josephwatson@smith.com,1
319,isabelle,born,1979-01-22,stte-on-trenk,josephwatson@smith.com,1


In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import NearestNeighbors
import time

start_time = time.perf_counter()

# Load the data
df = pd.read_csv("data.csv")

# normalize strings to lowercase
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.lower()

# Preprocessing: add placeholder for mising value. Id is not needed for string matching
df = df.fillna("missing")
df = df.drop(columns=['id'])


# Identify column types
text_cols = [col for col in df.columns if df[col].dtype == 'object']

# Build column transformer
transformers = []
if text_cols:
    transformers.append(('text', TfidfVectorizer(), text_cols))


# For text columns, TfidfVectorizer expects 1D input, so process each separately and concatenate
from scipy.sparse import hstack

X_parts = []
for col in text_cols:
    vect = TfidfVectorizer()
    X_parts.append(vect.fit_transform(df[col]))


X = hstack(X_parts)
# Now use NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=15, metric='cosine').fit(X)
distances, indices = nbrs.kneighbors(X)

# Grouping based on nearest neighbors
group_ids = np.full(X.shape[0], -1)
current_group = 0
for i in range(X.shape[0]):
    if group_ids[i] == -1:
        group_ids[i] = current_group
        for neighbor in indices[i]:
            group_ids[neighbor] = current_group
        current_group += 1

df['KNN_group'] = group_ids

end_time = time.perf_counter()   

elapsed = end_time - start_time
print(f"Elapsed time for KNN  : {elapsed:.4f} seconds")

df = df.sort_values(by='KNN_group')
df.head(40)


Elapsed time for KNN  : 0.0939 seconds


Unnamed: 0,first_name,surname,dob,city,email,KNN_group
319,isabelle,born,1979-01-22,stte-on-trenk,josephwatson@smith.com,1
385,isabelle,orbw,1979-01-22,stoke-on-trent,joshphwatson@smite.co,1
2,iilwam,missing,2000-06-27,missing,taylor70@fisher.nfo,2
3,carter,alfie,1997-03-13,london,jennifermarshall@browning.com,3
967,carter,alfie,1997-03-13,nonol,jennifermarshal@brownilg.com,3
128,carter,alfie,1997-06-06,london,jennifermarshall@browning.com,3
867,alfi,carter,1997-03-13,london,jennifermarshall@browning.com,3
437,watson,noah,2008-03-23,bolton,matthbw78eallard-mcdonald.net,4
109,watson,noah,2008-01-21,bolno,matthea78@bwllar-mcdonald.net,4
596,onah,watson,2008-03-23,bolton,matthew78@ballard-mcdonald.net,4
