In [None]:
from tkinter.font import names

import pandas
import numpy as np
import re
from itertools import product
from nltk.metrics import edit_distance

In [None]:
def snake_tokenize(string):
    """Split ``string`` at every underscore."""
    return re.split(r'[_]', string)


def space_tokenize(string):
    """Split ``string`` at every single space character."""
    return re.split(r'[ ]', string)


def snake_space_tokenize(string):
    """Split ``string`` at every underscore or space.

    Consecutive separators are not collapsed, so they yield empty tokens.
    """
    return re.split(r'[ _]', string)

def variable_tokenize(string):
    """Split a camelCase / PascalCase identifier into its component words.

    A boundary is placed (a) between a lowercase letter and the uppercase
    letter that follows it, and (b) before any uppercase letter that is
    followed by a lowercase letter, except at the very start of the string.
    The characters themselves are kept; only zero-width positions are split.
    """
    boundary = re.compile(r'(?<=[a-z])(?=[A-Z])|(?<!^)(?=[A-Z][a-z])')
    return boundary.split(string)

def tokenize(string):
    """Break a raw field name or glossary term into tokens.

    Splitting happens only at underscores and spaces; camelCase boundaries
    are left alone (``variable_tokenize`` is not applied here).
    """
    return snake_space_tokenize(string)

In [None]:
# (abbreviation, expansion) pairs used by replace_common_abbreviated_tokens.
# A token is replaced only when it equals the abbreviation exactly, and the
# first matching pair wins. Because the comparison is on the whole token, the
# plain word 'no' is also expanded to 'number'.
token_replacements = [
    ('pol', 'policy'),
    ('plcy', 'policy'),
    ('no', 'number'),
    ('cov', 'coverage')
]

def clean(string):
    """Return ``string`` with double-quote characters removed, lower-cased."""
    unquoted = string.replace('"', '')
    return unquoted.lower()

def replace_common_abbreviated_tokens(tokens):
    """Expand known abbreviations in a token list.

    Each token is compared for exact equality against the abbreviations in
    ``token_replacements``; the first matching pair supplies the replacement.
    Tokens with no match are returned unchanged. A new list is returned.
    """
    return [
        next(
            (expansion
             for abbreviation, expansion in token_replacements
             if token == abbreviation),
            token,
        )
        for token in tokens
    ]

def clean_and_tokenize(string):
    """Turn a raw name into a list of normalized tokens.

    The string is split first, while its original casing is still intact;
    each token is then stripped of double quotes and lower-cased, and finally
    common abbreviations are expanded.
    """
    cleaned_tokens = [clean(raw_token) for raw_token in tokenize(string)]
    return replace_common_abbreviated_tokens(cleaned_tokens)


In [None]:
"""
Make a fake data dictionary for testing
"""
# Sample field names standing in for a real data dictionary.
dictionary_fields = [
    "POLICY_NUMBERs",
    "POL_NUMBER",
    # "policyNumber",
    # "polNo",
    # "PolicyNumber",
    "COVERAGE",
    "ANNUAL_PREMIUM"
]

# One row per field name, plus the normalized token list for each name.
dictionary_df = (
    pandas.DataFrame({"field_names": dictionary_fields})
    .assign(
        tokenized_field_names=lambda frame: frame["field_names"].apply(clean_and_tokenize)
    )
)

dictionary_df

Unnamed: 0,field_names,tokenized_field_names
0,POLICY_NUMBERs,"[policy, numbers]"
1,POL_NUMBER,"[policy, number]"
2,COVERAGE,[coverage]
3,ANNUAL_PREMIUM,"[annual, premium]"


In [None]:
"""
Read the glossary data
"""
# Load the glossary CSV (column names come from its first row) and add a
# column holding the normalized token list of every glossary term.
acord_df = (
    pandas.read_csv('test-data/ACORD-Business-Glossary Model 2.13.csv', header=0)
    .assign(
        tokenized_glossary=lambda frame: frame['Glossary Terms'].apply(clean_and_tokenize)
    )
)

In [None]:
"""
attempts to handle out of order words in each token list.
"""
def source_token_within_search_edit_distance(tokenized_term1, tokenized_term2):
    """Order-insensitive distance between two token lists.

    Every distinct token of ``tokenized_term1`` is paired with the distinct
    token of ``tokenized_term2`` that has the smallest edit distance to it,
    and those smallest distances are summed.

    When ``tokenized_term2`` has more tokens than ``tokenized_term1``, a
    penalty of ``extra + extra * total`` is added, where ``extra`` is the
    difference in list lengths. The lengths are taken from the lists as
    passed in (duplicates included), while the matching itself works on
    distinct tokens.

    TODO: tokens that are not anyone's best match are only accounted for by
    the length penalty above, and only when the second list is the longer
    one; e.g. "policy number" vs. "the policy number" has an extra "the".

    Lower is closer; 0 means every token of term 1 appears in term 2 and
    term 2 is not longer.
    """
    # Smallest edit distance seen so far for each distinct token of term 1.
    lowest_by_token = {}
    for source_token, candidate_token in product(set(tokenized_term1), set(tokenized_term2)):
        score = edit_distance(source_token, candidate_token)
        if source_token not in lowest_by_token or score < lowest_by_token[source_token]:
            lowest_by_token[source_token] = score

    total_distance = sum(lowest_by_token.values())

    surplus_terms = len(tokenized_term2) - len(tokenized_term1)
    if surplus_terms <= 0:
        return total_distance

    return total_distance + (surplus_terms + surplus_terms * total_distance)

In [None]:
def ordered_token_edit_distance(tokenized_term1, tokenized_term2):
    """Order-sensitive distance between two token lists.

    Lists of different lengths are joined back into space-separated strings
    (so token boundaries still count) and compared with a single edit
    distance.

    Lists of equal length are compared position by position. A pair's edit
    distance is halved when the second token starts with the first, because
    a prefix such as `pol` is far more likely to mean `policy` than an
    unrelated word at the same raw distance (e.g. `form`). The per-pair
    values are summed, so the result may be a float in this branch.
    """
    if len(tokenized_term1) != len(tokenized_term2):
        joined1 = " ".join(tokenized_term1).strip()
        joined2 = " ".join(tokenized_term2).strip()
        return edit_distance(joined1, joined2)

    total_distance = 0
    for left_token, right_token in zip(tokenized_term1, tokenized_term2):
        pair_distance = edit_distance(left_token, right_token)
        if right_token.startswith(left_token):
            # Prefix match: discount the distance by half.
            pair_distance = pair_distance / 2
        total_distance = total_distance + pair_distance

    return total_distance

In [None]:
# Spot-check the ordered distance of POL_NUMBER against a few glossary-style terms.
for label, glossary_term in (
    ("a", "Policy Number"),
    ("b", "From Number"),
    ("c", "The Policy Number"),
    ("d", "The Form Number"),
):
    print(label, ordered_token_edit_distance(clean_and_tokenize("POL_NUMBER"), clean_and_tokenize(glossary_term)))

a 0.0
b 6.0
c 4
d 8


In [None]:
def find_closest_match(target_df=None, target_field=None, match_df=None, match_field=None, match_algorithm=None, match_score_field=None, matches_df=None):
    """Find, for every target row, the closest row of ``match_df``.

    Parameters:
        target_df / target_field: frame and column holding the values to match.
        match_df / match_field: frame and column holding the candidates.
        match_algorithm: callable ``(target_value, candidate_value) -> score``
            where a lower score means a closer match.
        match_score_field: name of the score column in the result; the
            candidate-index column is named ``<match_score_field>_match_index``.
        matches_df: accepted but not used by this function.

    Returns:
        A DataFrame with one row per target row and the columns
        ``target_index``, ``<match_score_field>_match_index`` and
        ``<match_score_field>``. Among equally scored candidates the first
        one encountered is kept. If no candidate scores below infinity, the
        index is ``None`` and the score is ``inf``.
    """
    rows = []

    for target_idx, target_tokens in target_df[target_field].items():
        lowest_score = float('inf')
        lowest_idx = None

        for candidate_idx, candidate_tokens in match_df[match_field].items():
            score = match_algorithm(target_tokens, candidate_tokens)
            if not score < lowest_score:
                continue
            lowest_score, lowest_idx = score, candidate_idx
            if lowest_score == 0:
                # A score of 0 cannot be beaten; stop scanning candidates.
                break

        rows.append([target_idx, lowest_idx, lowest_score])

    result_columns = [
        "target_index",
        match_score_field + "_match_index",
        match_score_field,
    ]
    return pandas.DataFrame(rows, columns=result_columns)

In [None]:
# Each entry pairs a score-column name with the distance function that fills it.
match_algorithms = [
    ("similarity", ordered_token_edit_distance),
    ("best_tokens_similarity", source_token_within_search_edit_distance),
    ("similarity3", ordered_token_edit_distance),
]

# Run every algorithm over the same inputs and line the results up side by
# side, one row per dictionary field, by merging on `target_index`.
matches_df = None
for match_score_field, match_algorithm in match_algorithms:
    my_matches_df = find_closest_match(
        target_df=dictionary_df,
        target_field='tokenized_field_names',
        match_df=acord_df,
        match_field='tokenized_glossary',
        match_algorithm=match_algorithm,
        match_score_field=match_score_field
    )
    matches_df = (
        my_matches_df
        if matches_df is None
        else matches_df.merge(my_matches_df, on='target_index')
    )

# (score column, match-index column, algorithm name) for each algorithm run.
match_metadata = [
    (score_field, score_field + '_match_index', algo.__name__)
    for score_field, algo in match_algorithms
]
print(match_metadata)

# Pick the closer of the `similarity` and `best_tokens_similarity` results per
# row (lower score wins; on a tie the `similarity` result is kept). The
# `similarity3` columns do not take part in this choice.
best_tokens_is_closer = matches_df['similarity'] > matches_df['best_tokens_similarity']

matches_df['closest_similar_index'] = np.where(
    best_tokens_is_closer,
    matches_df['best_tokens_similarity_match_index'],
    matches_df['similarity_match_index']
)

matches_df['closest_similarity_score'] = np.where(
    best_tokens_is_closer,
    matches_df['best_tokens_similarity'],
    matches_df['similarity']
)

matches_df['closest_similarity_algorithm'] = np.where(
    best_tokens_is_closer,
    'source_token_within_search_edit_distance',
    'ordered_token_edit_distance'
)

[('similarity', 'similarity_match_index', 'ordered_token_edit_distance'), ('best_tokens_similarity', 'best_tokens_similarity_match_index', 'source_token_within_search_edit_distance'), ('similarity3', 'similarity3_match_index', 'ordered_token_edit_distance')]
similarity


In [None]:
# Dictionary rows in match order, renumbered from 0 so they line up with matches_df.
reset_dictionary_df = dictionary_df.loc[matches_df['target_index']].reset_index(drop=True)

# Glossary rows chosen as the closest match, renumbered the same way.
reset_acord_df = acord_df.loc[matches_df['closest_similar_index']].reset_index(drop=True)

# Put each field next to its proposed glossary term, attach the winning score
# and algorithm, and keep only the columns needed for the report.
joined_df = (
    reset_dictionary_df
    .join(reset_acord_df)
    .assign(
        closest_similarity_score=matches_df['closest_similarity_score'],
        closest_similarity_algorithm=matches_df['closest_similarity_algorithm'],
    )[[
        'field_names',
        'Glossary Terms',
        # 'Definition',
        'closest_similarity_score',
        'closest_similarity_algorithm'
    ]]
)

print(joined_df.head(10))
joined_df.to_csv('test-data/out.csv', index=False)

      field_names            Glossary Terms  closest_similarity_score  \
0  POLICY_NUMBERs             Policy Number                       2.0   
1      POL_NUMBER             Policy Number                       0.0   
2    policyNumber             Policy Number                       0.0   
3           polNo             Policy Number                       0.0   
4    PolicyNumber             Policy Number                       0.0   
5        COVERAGE                  Coverage                       0.0   
6  ANNUAL_PREMIUM  Guideline Annual Premium                       1.0   

               closest_similarity_algorithm  
0               ordered_token_edit_distance  
1               ordered_token_edit_distance  
2               ordered_token_edit_distance  
3               ordered_token_edit_distance  
4               ordered_token_edit_distance  
5               ordered_token_edit_distance  
6  source_token_within_search_edit_distance  
