# Name Matching Algorithm Test Bench

In [1]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

from imports import *
from name_variation_generator import nameVariationGenerator, augment_database
from fairnm import FairNM
from testbench import TestBench

# Locations of the model, vocabulary and name-weight files handed to FairNM below.
model_path = "./models/SNM.model"
vocab_path = "./models/SNM.vocab"
name_weights_path = "./data/name_weights.csv"

# Shared instances used by the later cells: `fairnm` backs the example
# `name_matcher` and is passed to `testbench.run`; `testbench` runs the
# evaluation, computes the measures and draws the plots.
fairnm = FairNM(model_path, vocab_path, name_weights_path)
testbench = TestBench()

### Import Name Database

In [2]:
# The name database is tab-separated despite its .csv extension.
names_df = pd.read_csv('./data/name_database.csv', sep = '\t')
# Preview a few random rows. The fixed random_state keeps the displayed sample
# identical across "Restart & Run All" (same seed value as the augmentation cell).
names_df.sample(5, random_state = 12)

Unnamed: 0,full_name,country_code,language_code,first_name,middle_name,last_name
3348,Virgiliu Postolachi,FRA,FRA,Virgiliu,,Postolachi
148503,Inigo Thomas,BRI,BRI,Inigo,,Thomas
29712,Louis Georges de Contades,FRA,FRA,Louis,Georges,de Contades
213282,Joeline Möbius,GER,GER,Joeline,,Möbius
136492,Gus Casely-Hayford,BRI,BRI,Gus,,Casely-Hayford


### Customizable Name Matching System
In the following cell you can define your own name matching algorithm. You can fill in anything you like, as long as the resulting score is a float bounded between 0 and 1 which defines the similarity of the two names.

In [4]:
# As an example we set our FairNM algorithm

def name_matcher(name1, name2):
    """Score how similar two names are.

    Thin wrapper that forwards both names to ``fairnm.nameMatcher``; replace
    the body to plug in a different matching algorithm.

    Parameters
    ----------
    name1 (str) : first name to compare
    name2 (str) : second name to compare

    Returns
    -------
    (float) : similarity score, where 1 is an exact match and 0 is a no-match
    """
    return fairnm.nameMatcher(name1, name2)

### Initialize the Test Bench Configuration
Options to exclude certain language codes (`excluded_langs`) or variations from the test bench, and to set the sample size and/or the random seed.

Default variations include:
- full_name (no alterations)
- fat_finger_replace
- random_deletion
- swapped_names

In [5]:
# Build the test set from names_df. The preview below shows one column per
# name variation next to the original full_name.
# - excluded_variations = []  : keep every variation
# - excluded_langs = ['VIE']  : drop the 'VIE' language code
# - sample_size / random_seed : size and seed of the sample drawn by
#   augment_database (NOTE(review): whether 3000 is per language or overall
#   is defined in augment_database — confirm there)
test_df = augment_database(names_df, 
                           name_variation_generator = nameVariationGenerator(),
                           excluded_variations = [],
                           excluded_langs = ['VIE'],
                           sample_size = 3000,
                           random_seed = 12
                          )
test_df.head(5)

Unnamed: 0,person_id,language_code,full_name,fat_finger,random_del,swapped_names
0,0,ARAB,Rasoul Yunan,Rasoul Yynan,Rasul Yunan,Yunan Rasoul
1,1,ARAB,Ahmad ibn Rustah,Ahmad ibn Rusfah,Ahmad ibn Rstah,ibn Rustah Ahmad
2,2,ARAB,Mostafa Khomeini,Mostafa Kjomeini,Mostaa Khomeini,Khomeini Mostafa
3,3,ARAB,Sheikh Ahmad-e Jami,Sheikh Ahmad-e Jsmi,Sheikh Ahma-e Jami,Jami Sheikh
4,4,ARAB,Mostafa El-Sayed,Mosfafa El-Sayed,Mostaf El-Sayed,El-Sayed Mostafa


### Run Test Bench
Here we run the test bench and translate the results into performance measures:

- precision
- recall
- F1
- Fairness 

In [7]:
# Run the test bench on test_df with the name_matcher defined above.
# NOTE: this is the slow cell — the recorded run below took roughly 16 minutes
# in total across the four variations (swapped_names alone about 8 minutes).
# presumably TP_FP_res holds true/false-positive results per variation — verify in TestBench.run
TP_FP_res = testbench.run(test_df, fairnm, name_matcher)

full_name


Processing: 100%|███████████████████████████████████| 12/12 [02:55<00:00, 14.62s/it]


fat_finger


Processing: 100%|███████████████████████████████████| 12/12 [02:09<00:00, 10.82s/it]


random_del


Processing: 100%|███████████████████████████████████| 12/12 [02:41<00:00, 13.43s/it]


swapped_names


Processing: 100%|███████████████████████████████████| 12/12 [08:03<00:00, 40.29s/it]


In [10]:
# Translate the raw test-bench results into the performance measures that the
# visualisation cell below plots.
measuresResults = testbench.TP_FP_to_Measures(TP_FP_res)

### Visualize Results

Select which Measure of Interest (MoI) you want to see, we can look at:

- Precision
- Recall
- F1
- Fairness
- Overview

You can also choose between the performance per language code and the overall performance by toggling the $\texttt{sep}$ boolean (True means separate, False means overall).

In [11]:
# Define the options for the dropdown menus
# MoIs: the measures that can be plotted.
# sep:  True plots each language code separately, False plots the overall result.
MoIs = ['Precision', 'Recall', 'F1', 'Fairness', 'Overview']
sep = [True, False]

# Create the dropdown widgets (their current .value is read by generate_plot)
dropdown_MoI = widgets.Dropdown(
    options=MoIs,
    description='Measure of Interest:'
);

dropdown_sep = widgets.Dropdown(
    options=sep,
    description='Separate:'
);

# Define the output widget that generate_plot draws into
output = widgets.Output();

# Define the function to generate the plot
def generate_plot(change):
    """Redraw the selected measure inside the ``output`` widget.

    Used as the ``observe`` callback of both dropdowns and called once
    directly for the initial render.

    Parameters
    ----------
    change : change notification passed by ``observe`` (``None`` for the
        initial call). It is not used; the current dropdown values are
        read directly from the widgets.
    """
    # Get the selected values from the dropdown menus
    selected_option_MoI = dropdown_MoI.value
    selected_option_sep = dropdown_sep.value

    with output:
        # wait=True postpones the clear until the new plot is available,
        # which avoids flicker between two selections.
        output.clear_output(wait=True)

        # Apply the seaborn theme *before* creating the figure, so the very
        # first figure is built with the same rcParams as every later one.
        sns.set_theme()
        plt.figure()
        testbench.visualizer(measuresResults, MoI=selected_option_MoI, sep=selected_option_sep)
        plt.show()

# Attach the callback function to the dropdowns' event:
# re-draw whenever either dropdown's `value` changes
dropdown_MoI.observe(generate_plot, 'value')
dropdown_sep.observe(generate_plot, 'value')

# Generate the initial plot (no change event exists yet, hence None)
generate_plot(None);

# Display the dropdown menus and output widget
display(dropdown_MoI, dropdown_sep, output);

Dropdown(description='Measure of Interest:', options=('Precision', 'Recall', 'F1', 'Fairness', 'Overview'), va…

Dropdown(description='Separate:', options=(True, False), value=True)

Output()