In [13]:
import pandas as pd
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, Q

## All information is randomly generated, contains no PII whatsoever

# Import CSV to df
df_clients_with_duplicates = pd.read_csv('csv/clients_with_duplicates.csv')

# Replace all NaN values with an empty string
df_clients_with_duplicates.fillna('', inplace=True)

# Reduce number of entries
df_clients_with_duplicates = df_clients_with_duplicates.head(2000)

def setup_elasticsearch_index(df, index_name):
    # Initialize Elasticsearch client
    es = Elasticsearch(
        'https://elastic:9200',
        basic_auth=('elastic', 'password'),
        verify_certs=False,
        ssl_show_warn=False
    )
    # Delete the index if it exists
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
        print(f"Deleted existing index {index_name}")

    # Create the index with optimized settings for text analysis
    settings = {
        "settings": {
            "analysis": {
                "analyzer": {
                    "custom_text_analyzer": {
                        "type": "custom",
                        "tokenizer": "whitespace",
                        "filter": ["lowercase", "asciifolding"]
                    }
                },
                "normalizer": {
                    "custom_normalizer": {
                        "type": "custom",
                        "filter": ["lowercase", "asciifolding"]
                    }
                }
            }
        },
        "mappings": {
            "properties": {
                "name": {"type": "text", "analyzer": "custom_text_analyzer"},
                "email1": {"type": "text", "analyzer": "custom_text_analyzer"},
                "email2": {"type": "text", "analyzer": "custom_text_analyzer"},
                "legal": {"type": "keyword", "normalizer": "custom_normalizer"}
            }
        }
    }
    es.indices.create(index=index_name, body=settings)
    print(f"Created index {index_name} with custom settings and mappings")

    # Create an iterable of actions to be executed in bulk
    actions = [
        {
            "_index": index_name,
            "_id": str(record['id']),
            "_source": record,
        }
        for record in df.to_dict(orient='records')
    ]
    
    # Perform bulk insert
    helpers.bulk(es, actions)
    # Refresh the index to make the changes searchable
    es.indices.refresh(index=index_name)
    print(f"Data uploaded to index {index_name}")

setup_elasticsearch_index(df_clients_with_duplicates, 'es_clients_with_duplicates')

Deleted existing index es_clients_with_duplicates
Created index es_clients_with_duplicates with custom settings and mappings
Data uploaded to index es_clients_with_duplicates


In [14]:
# Perform a cross-comparison among all documents in the index for any given field
def find_potential_duplicates(index_name):
    es = Elasticsearch(
        'https://elastic:9200',
        basic_auth=('elastic', 'password'),
        verify_certs=False,
        ssl_show_warn=False
    )
    
    # Fetch all documents to use each as a basis for comparison
    search = Search(using=es, index=index_name).source(['name', 'legal', 'email1', 'email2', 'id', 'date'])
    all_docs = [doc.to_dict() for doc in search.scan()]
    results_df = pd.DataFrame()

    # Define the fields to be searched for duplicates and the type of search
    fields_to_search = {
        'name': 'fuzzy',
        'email1': 'exact',
        'email2': 'exact',
        'legal': 'exact'
    }

    # Check each document against all others for each field
    for doc in all_docs:
        for field, match_type in fields_to_search.items():
            if match_type == 'fuzzy':
                query = Q('fuzzy', **{field: {'value': doc[field], 'fuzziness': 'AUTO'}})
            else:  # Exact match for emails
                query = Q('match', **{field: doc[field]})

            search_results = Search(using=es, index=index_name).query(query).exclude('ids', values=[doc['id']])
            duplicates = search_results.execute()

            # Create a dataframe from hits and concatenate it with the results DataFrame
            hits_df = pd.DataFrame([hit.to_dict() for hit in duplicates])
            if not hits_df.empty:
                # Add a grouping identifier
                hits_df['group_id'] = f"{field}_{doc[field]}"
                results_df = pd.concat([results_df, hits_df]).drop_duplicates().reset_index(drop=True)

    # Sort by group_id to cluster related entries together
    if not results_df.empty:
        results_df = results_df.sort_values(by='group_id')
        return results_df
    else:
        return pd.DataFrame()  # Return an empty dataframe if no duplicates found

duplicate_results = find_potential_duplicates('es_clients_with_duplicates')
duplicate_results

Unnamed: 0,id,name,date,email1,email2,legal,group_id
82,44779,Alma Miranda,15/6/2022,amiranda@523344123.net,amiranda@outlook.456,Y9245502,email1_amiranda@523344123.net
9,47578,Ariel Miranda,13/6/2019,amiranda@523344123.net,amiranda@yahoo.123,Y8885086,email1_amiranda@523344123.net
73,46148,Evan Griffith,5/2/2023,egriffith@yahoo.123,eva_gri@523344123.net,Y9601865,email1_egriffith@yahoo.123
10,51148,Evan Grìffith,05/02/2020,egriffith@yahoo.123,egriffith@yahoo.123,X4792429,email1_egriffith@yahoo.123
128,49676,Enoch Hendérson,16/03/2018,ehenderson@523344123.net,,Y1923489,email1_ehenderson@523344123.net
...,...,...,...,...,...,...,...
125,44144,Ezra Stephens,19/2/2024,estephens@hotmail.abc,ezrastephens90@outlook.456,Z6441458,legal_Z6441458
92,47502,Amari Khan,9/2/2023,akhan@yahoo.123,akhan@gmx.bbb,Z7069861,legal_Z7069861
55,44041,Denisse Li,6/9/2024,dli@yahoo.123,den_li@gmail.xyz,Z7069861,legal_Z7069861
163,43301,Whitley McLaughlin,28/11/2020,whitleymclaughlin54@523344123.net,whi_mcl@523344123.net,Z9120239,legal_Z9120239
