In [1]:
import datetime as dt
import difflib
import random

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz

# Create Data

In [2]:
M = 100 # number of synthetic records to generate (one date and one name each)

## Set dates


In [3]:
np.random.seed(1)  # seed NumPy's legacy global RNG so the shuffle below is repeatable

# M evenly spaced timestamps covering 2020, rendered as e.g. 'May 12, 2020' and
# then shuffled. ISO-8601 literals are used because the previous pair
# ('01-01-2020', '31-12-2020') mixed month-first and day-first conventions and
# relied on the parser guessing the format of each one.
DATES = np.random.permutation(
    pd.date_range(start='2020-01-01', end='2020-12-31', periods=M)
    .strftime('%B %d, %Y')
    .tolist()
)

## Set Names

In [4]:
import names

# np.random.seed() above only seeds NumPy. The `names` package appears to draw
# from the standard-library `random` module (NOTE(review): confirm), so seed
# that generator as well to get the same NAMES on every Restart & Run All.
random.seed(1)
NAMES = [names.get_full_name() for _ in range(M)]

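## Set address

_TODO: no address data is generated yet; the DataFrame built below contains only `Name` and `DOB`._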

## Create dataframe

In [5]:
# One row per synthetic person: generated full name plus formatted date string.
df = pd.DataFrame(dict(Name=NAMES, DOB=DATES))

# Preview the first ten rows.
df.head(10)



Unnamed: 0,Name,DOB
0,Thomas Maeda,"October 21, 2020"
1,Anthony Loud,"November 05, 2020"
2,Mary Stewart,"May 01, 2020"
3,John Barrera,"October 25, 2020"
4,Leonard Freeman,"December 08, 2020"
5,Amy Mcknight,"March 03, 2020"
6,Salvatore Lott,"May 12, 2020"
7,Leone Guy,"October 29, 2020"
8,Mary Trollinger,"September 11, 2020"
9,Eduardo Jones,"August 27, 2020"


# Search

In [45]:
# Example query: a near-miss spelling of the 'Salvatore Lott' row shown in the
# preview table above, together with that row's exact DOB string.
SEARCH_NAME = 'Salvato Lot'
SEARCH_DATE = 'May 12, 2020'


# NOTE(review): not referenced by any cell visible in this notebook; presumably
# a cut-off on fuzzywuzzy's 0-100 score scale. Confirm before relying on it.
FUZZY_THRESHOLD = 80


In [248]:
def sequence_match(a, b):
    """Return the difflib similarity ratio between two sequences.

    Args:
        a: First sequence (the example below passes strings).
        b: Second sequence.

    Returns:
        float: ``difflib.SequenceMatcher(None, a, b).ratio()``, i.e.
        ``2 * matches / (len(a) + len(b))``; 1.0 means the sequences are
        identical. No junk filter is applied (``isjunk`` is ``None``).
    """
    return difflib.SequenceMatcher(None, a, b).ratio()

sequence_match('10 Boverton Road', '10 Rawling')

0.3076923076923077

In [448]:
def name_sim(doc_name, search_name):
    """Score how alike two names are, on a 0-1 scale.

    Delegates to ``fuzz.token_set_ratio`` and divides its result by 100.

    Args:
        doc_name: Name stored on the candidate document.
        search_name: Name supplied in the query.

    Returns:
        The token-set ratio divided by 100.
    """
    percent_score = fuzz.token_set_ratio(doc_name, search_name)
    return percent_score / 100


def address_sim(doc_address, search_address):
    """Score how alike two addresses are, on a 0-1 scale.

    A missing address (``None`` or an empty string) on either side is treated
    as "no evidence against a match" and scores 1.0. This now also covers the
    case where both sides are missing but spelled differently (``None`` vs
    ``''``), which previously fell through to the fuzzy scorer.

    Args:
        doc_address: Address stored on the candidate document, or a falsy
            value when unknown.
        search_address: Address supplied in the query, or a falsy value when
            unknown.

    Returns:
        float: 1.0 when either address is missing or both are equal, otherwise
        ``fuzz.partial_ratio`` of the two values divided by 100.
    """
    # Missing information must not penalise the candidate.
    if not doc_address or not search_address:
        return 1.0
    # Exact match: skip the fuzzy scorer entirely.
    if doc_address == search_address:
        return 1.0
    return fuzz.partial_ratio(doc_address, search_address) / 100


def dob_sim(doc_birth, search_birth):
    """All-or-nothing date-of-birth check.

    A missing DOB (any falsy value such as ``None`` or ``''``) on either side
    is treated as "no evidence against a match" and scores 1. This now also
    covers both sides being missing but spelled differently (``None`` vs
    ``''``), which previously scored 0 even though ``None`` vs ``None``
    scored 1.

    Args:
        doc_birth: DOB stored on the candidate document, or a falsy value when
            unknown.
        search_birth: DOB supplied in the query, or a falsy value when unknown.

    Returns:
        int: 1 when either DOB is missing or both compare equal, else 0.
    """
    # Missing information must not veto the candidate.
    if not doc_birth or not search_birth:
        return 1
    return 1 if doc_birth == search_birth else 0
        
    

In [449]:
# We need to have some hierarchy when we compare addresses.


def metric(SEARCH, DOC):
    """Equal-weight mean of name and address similarity, gated by the DOB check.

    Args:
        SEARCH: Mapping for the query; read via ``.get`` for the keys
            ``'name'``, ``'address'`` and ``'dob'``.
        DOC: Mapping for the candidate document, same keys.

    Returns:
        ``(name + address) / 2`` multiplied by the ``dob_sim`` result, so a DOB
        score of 0 zeroes the whole metric.
    """
    name_sim_weight, address_sim_weight = 1, 1
    total_weight = name_sim_weight + address_sim_weight

    # NOTE(review): the helpers are declared as (doc, search) but are called
    # here as (search, doc); harmless only if the scorers are symmetric.
    name_score = name_sim(SEARCH.get('name'), DOC.get('name'))
    address_score = address_sim(SEARCH.get('address'), DOC.get('address'))
    dob_gate = dob_sim(SEARCH.get('dob'), DOC.get('dob'))

    weighted_mean = (
        name_sim_weight * name_score + address_sim_weight * address_score
    ) / total_weight
    return weighted_mean * dob_gate


# NOTE(review): SEARCH and DOC are first assigned in a later cell (In [563]), so
# on a fresh kernel run top-to-bottom this call raises NameError. The 1.0 shown
# below also differs from the 0.825 that metric() displays for the SEARCH/DOC
# pair defined in that later cell, so this output is stale.
metric(SEARCH, DOC)

1.0

In [450]:
def metric2(SEARCH, DOC, name_sim_weight=0.9, address_sim_weight=0.1):
    """Weighted sum in which the DOB check gates only the name term.

    ``score = name_w * (name * dob) + address_w * address``

    Unlike ``metric``, a DOB score of 0 removes the name contribution but
    leaves the address contribution in place.

    Args:
        SEARCH: Mapping for the query; read via ``.get`` for the keys
            ``'name'``, ``'address'`` and ``'dob'``.
        DOC: Mapping for the candidate document, same keys.
        name_sim_weight: Weight of the DOB-gated name term (default 0.9).
        address_sim_weight: Weight of the address term (default 0.1).

    Returns:
        The weighted score. The weights are applied as given and are not
        normalised by their sum.
    """
    name_similarity = name_sim(SEARCH.get('name'), DOC.get('name'))
    dob_similarity = dob_sim(SEARCH.get('dob'), DOC.get('dob'))
    address_similarity = address_sim(SEARCH.get('address'), DOC.get('address'))

    return (name_sim_weight * (name_similarity * dob_similarity)
            + address_sim_weight * address_similarity)

In [537]:
def metric3(SEARCH, DOC, dob_weight=0.75, address_weight=0.25):
    """Name-scaled blend of the DOB check and address similarity.

    ``score = dob_w * name * dob + address_w * name * address``

    Name similarity multiplies both terms, so a poor name match lowers the
    whole score regardless of DOB or address agreement.

    The previous body declared 0.9 / 0.1 weights (and their sum) that were
    never used, while the formula applied 0.75 / 0.25. The dead locals are
    removed and the weights actually applied are exposed as parameters.

    Args:
        SEARCH: Mapping for the query; read via ``.get`` for the keys
            ``'name'``, ``'address'`` and ``'dob'``.
        DOC: Mapping for the candidate document, same keys.
        dob_weight: Weight of the ``name * dob`` term (default 0.75).
        address_weight: Weight of the ``name * address`` term (default 0.25).

    Returns:
        The blended score. The weights are applied as given and are not
        normalised by their sum.
    """
    name_similarity = name_sim(SEARCH.get('name'), DOC.get('name'))
    dob_similarity = dob_sim(SEARCH.get('dob'), DOC.get('dob'))
    address_similarity = address_sim(SEARCH.get('address'), DOC.get('address'))

    return (dob_weight * name_similarity * dob_similarity
            + address_weight * name_similarity * address_similarity)

In [563]:
# We need to have some hierarchy when we compare addresses.

# Worked example: identical names, DOB absent on the document side, and two
# different street names.
SEARCH = dict(
    name='Andy Dolores',
    dob=1251,
    address='10 Boverton Road, Bristol',
)

DOC = dict(
    name='Andy Dolores',
    dob=None,
    address='10 Rawling Road Bristol',
)

# Show the three candidate metrics for the same pair, one output each.
display(metric(SEARCH, DOC))
display(metric2(SEARCH, DOC))
display(metric3(SEARCH, DOC))

0.825

0.9650000000000001

0.9125

In [559]:
# Spot check: same house number and 'Road' suffix, different street name.
address_sim('10 Boverton Road', '10 Bristol Road')

0.73