In [55]:
from tkinter.font import names
import re
import pandas
import numpy as np
from itertools import product
from nltk.metrics import edit_distance
from gensim.models import Word2Vec
from bs4 import BeautifulSoup #pip install beautifulsoup4


In [36]:
def snake_tokenize(string):
    """Split ``string`` on every underscore.

    Consecutive underscores yield empty-string tokens, and an input with no
    underscore comes back as a one-element list.
    """
    return re.split(r'[_]', string)


def space_tokenize(string):
    """Split ``string`` on every single space character.

    Only the literal space is a delimiter (tabs/newlines are left alone);
    consecutive spaces yield empty-string tokens.
    """
    return re.split(r'[ ]', string)


def snake_space_tokenize(string):
    """Split ``string`` on every space or underscore.

    Returns the list of pieces in order; adjacent delimiters yield
    empty-string tokens.
    """
    return re.split(r'[ _]', string)

def camel_case_tokenize(string):
    """Split a camelCase / PascalCase identifier into its word pieces.

    A boundary is placed (a) between a lowercase letter and the uppercase
    letter that follows it, and (b) before any uppercase letter that is
    followed by a lowercase letter, except at the very start of the string.
    The original casing of each piece is kept.

    Returns:
        list of str: the pieces in order; a string with no boundary comes
        back as a one-element list.
    """
    word_boundary = re.compile(r'(?<=[a-z])(?=[A-Z])|(?<!^)(?=[A-Z][a-z])')
    return word_boundary.split(string)

def tokenize(string):
    """Break a field name or phrase into tokens on spaces and underscores.

    This is a thin wrapper around ``snake_space_tokenize``; camelCase
    boundaries are NOT split here.

    Returns:
        list of str: the tokens, in their original case.
    """
    # A finer-grained pipeline (snake split -> space split -> per-variable
    # split) was sketched here earlier; it is not needed right now.
    return snake_space_tokenize(string)

In [37]:
# (abbreviation, full word) pairs consumed by normalize(): a token that is
# exactly equal to the first element is replaced by the second.
# Pairs are checked in list order and the first hit wins.
# NOTE(review): 'no' -> 'number' will also rewrite the English word "no"
#   when it appears as its own token -- confirm that is intended.
replacement_tokens = [
    ('pol', 'policy'),
    ('plcy', 'policy'),
    ('no', 'number'),
    ('cov', 'coverage')
]

def clean(string):
    """Return ``string`` lower-cased with every double-quote character removed."""
    unquoted = ''.join(string.split('"'))
    return unquoted.lower()

def normalize(tokens):
    """Expand known abbreviations in a list of tokens.

    Each token that exactly equals the first element of a pair in the
    module-level ``replacement_tokens`` list is swapped for that pair's
    second element (first matching pair wins); every other token is passed
    through unchanged.

    Returns:
        list: a new list with the same length and order as ``tokens``.
    """
    def expand(token):
        # first pair whose abbreviation equals the token, else the token itself
        return next(
            (new for old, new in replacement_tokens if token == old),
            token)

    return list(map(expand, tokens))

def tokenize_then_normalize(string):
    """Turn a raw field name or phrase into a list of normalized tokens.

    Pipeline, in order:
      1. ``tokenize``  - split the raw string into tokens,
      2. ``clean``     - applied to every token (strip quotes, lower-case),
      3. ``normalize`` - expand common abbreviations.

    Returns:
        list of str: the normalized tokens.
    """
    cleaned_tokens = [clean(token) for token in tokenize(string)]
    return normalize(cleaned_tokens)


In [60]:
def prepare_df(df, field, result_prefix=None):
    """Add a column of normalized token lists computed from ``df[field]``.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame that contains the column ``field``.
        field: name of the column whose values are run through
            ``tokenize_then_normalize``.
        result_prefix: despite its name, this is used as the complete name
            of the new column. When None, the column is named
            ``'tokenized_' + field``.

    Returns:
        The same ``df``, now carrying the token column.
    """
    token_column = 'tokenized_' + field if result_prefix is None else result_prefix
    df[token_column] = df[field].apply(tokenize_then_normalize)
    return df


"""
Make a fake data dictionary for testing
"""
# Sample field names standing in for a real data dictionary.
# The camelCase variants are switched off; tokenize() only splits on spaces
# and underscores.
# NOTE(review): "POLICY_NUMBERs" is presumably a deliberate near-miss
#   (trailing "s") to exercise fuzzy matching -- confirm.
dictionary_fields = [
    "POLICY_NUMBERs",
    "POL_NUMBER",
    # "policyNumber",
    # "polNo",
    # "PolicyNumber",
    "COVERAGE",
    "ANNUAL_PREMIUM"
]

# one row per field name, plus its normalized token list
dictionary_df = prepare_df(
    pandas.DataFrame({"field_names": dictionary_fields}),
    "field_names")
dictionary_df

Unnamed: 0,field_names,tokenized_field_names
0,POLICY_NUMBERs,"[policy, numbers]"
1,POL_NUMBER,"[policy, number]"
2,COVERAGE,[coverage]
3,ANNUAL_PREMIUM,"[annual, premium]"


In [61]:
"""
Read the glossary data
"""
# Load the glossary CSV (first row is the header) and add a
# 'tokenized_glossary' column holding the normalized tokens of each term.
acord_df = prepare_df(
    pandas.read_csv(
        'test-data/ACORD-Business-Glossary Model 2.13.csv',
        header=0),
    "Glossary Terms",
    'tokenized_glossary')

# show each term next to its tokens
acord_df[['Glossary Terms', 'tokenized_glossary']]

Unnamed: 0,Glossary Terms,tokenized_glossary
0,"A"" rates""","[a, rates]"
1,A I Or Robotics,"[a, i, or, robotics]"
2,A&E,[a&e]
3,A-Share Variable Annuities,"[a-share, variable, annuities]"
4,A.M. Best rating,"[a.m., best, rating]"
...,...,...
6227,eEg7,[eeg7]
6228,excess and surplus (E&S) lines insurance,"[excess, and, surplus, (e&s), lines, insurance]"
6229,fringe benefits,"[fringe, benefits]"
6230,hSOD,[hsod]


In [43]:
"""
attempts to handle out of order words in each token list.
"""
def best_paired_tokens_edit_distance(tokenized_term1, tokenized_term2):
    """Order-insensitive distance between two token lists (lower is closer).

    Every distinct token of ``tokenized_term1`` is paired with whichever
    distinct token of ``tokenized_term2`` has the smallest edit distance to
    it, and those smallest distances are summed. Because each token picks
    its own best partner, the position of a word inside either list does not
    matter.

    When ``tokenized_term2`` has more tokens than ``tokenized_term1`` (counted
    on the lists as given, duplicates included) a penalty of
    ``extra + extra * summed_distance`` is added, where ``extra`` is the
    difference in length.

    TODO: tokens of ``tokenized_term2`` that are nobody's best partner are
    only accounted for through the length penalty, and surplus tokens in
    ``tokenized_term1`` are not penalised at all (e.g. "policy number" vs
    "the policy number", and vice versa).

    Returns:
        int: the summed distance plus any length penalty. If either list is
        empty the summed distance is 0.
    """
    targets = set(tokenized_term1)
    candidates = set(tokenized_term2)

    # sum, over the distinct target tokens, of the distance to the closest candidate
    total_distance = 0
    if candidates:
        for target in targets:
            total_distance += min(
                edit_distance(target, candidate) for candidate in candidates)

    surplus = len(tokenized_term2) - len(tokenized_term1)
    if surplus <= 0:
        # second term is not longer: no penalty
        return total_distance

    return total_distance + (surplus + surplus * total_distance)

In [44]:
def modified_edit_distance(tokenized_term1, tokenized_term2):
    """Position-sensitive distance between two token lists (lower is closer).

    * Different lengths: both lists are joined with single spaces (so token
      boundaries still count), stripped, and compared with plain edit
      distance.
    * Same length: tokens are compared position by position and the
      per-token edit distances are added up. A pair's distance is halved
      when the second token starts with the first one, so that an
      abbreviation such as `pol` lands closer to `policy` than to `form`,
      even though both are the same raw distance away.

    Returns:
        The total distance: an int from the joined-string branch, or a sum
        that may be a float (because of the halving) from the pairwise
        branch.
    """
    if len(tokenized_term1) != len(tokenized_term2):
        phrase1 = " ".join(tokenized_term1).strip()
        phrase2 = " ".join(tokenized_term2).strip()
        return edit_distance(phrase1, phrase2)

    total_distance = 0
    for left, right in zip(tokenized_term1, tokenized_term2):
        pair_distance = edit_distance(left, right)
        if right.startswith(left):
            # prefix match: discount the distance
            pair_distance = pair_distance / 2
        total_distance += pair_distance
    return total_distance

In [45]:
# Spot-check modified_edit_distance: score POL_NUMBER against a few phrases.
pol_number_tokens = tokenize_then_normalize("POL_NUMBER")
for label, phrase in (
    ("a", "Policy Number"),
    ("b", "From Number"),
    ("c", "The Policy Number"),
    ("d", "The Form Number"),
):
    print(label, modified_edit_distance(pol_number_tokens, tokenize_then_normalize(phrase)))

a 0.0
b 6.0
c 4
d 8


In [46]:
def find_closest_match(target_df=None, target_field=None, match_df=None, match_field=None, match_algorithm=None, match_score_field=None, matches_df=None):
    """For every target row, find the candidate row with the lowest score.

    Args:
        target_df: DataFrame holding the items to be matched.
        target_field: column of ``target_df`` passed as the first argument
            to ``match_algorithm``.
        match_df: DataFrame holding the candidates.
        match_field: column of ``match_df`` passed as the second argument
            to ``match_algorithm``.
        match_algorithm: callable ``(target_value, candidate_value) -> score``
            where a LOWER score means a closer match.
        match_score_field: name of the score column in the result; the
            candidate index column is named
            ``match_score_field + "_match_index"``.
        matches_df: accepted but not used by this function.

    Returns:
        DataFrame with one row per target row and the columns
        ``target_index``, ``<match_score_field>_match_index`` and
        ``<match_score_field>``. Ties go to the candidate seen first. When
        no candidate scores below infinity the index is None and the score
        is ``inf``.
    """
    rows = []

    for target_idx, target_value in target_df[target_field].items():
        lowest_score = float('inf')
        lowest_idx = None

        for candidate_idx, candidate_value in match_df[match_field].items():
            score = match_algorithm(target_value, candidate_value)
            if not score < lowest_score:
                continue
            lowest_score, lowest_idx = score, candidate_idx
            if lowest_score == 0:
                # a score of 0 cannot be beaten; stop scanning candidates
                break

        rows.append([target_idx, lowest_idx, lowest_score])

    result_columns = [
        "target_index",
        match_score_field + "_match_index",
        match_score_field
    ]
    return pandas.DataFrame(rows, columns=result_columns)

In [50]:
# (score column name, scoring function) pairs to run; lower scores are better.
match_algorithms = [
    ("modified_edit_distance", modified_edit_distance),
    ("best_paired_tokens_edit_distance", best_paired_tokens_edit_distance),
    # ("similarity3", modified_edit_distance),
]

# Run every algorithm over the data dictionary vs. the glossary and collect
# the per-algorithm results side by side, keyed on 'target_index'.
matches_df = None
for match_score_field, match_algorithm in match_algorithms:
    my_matches_df = find_closest_match(
        target_df=dictionary_df,
        target_field='tokenized_field_names',
        match_df=acord_df,
        match_field='tokenized_glossary',
        match_algorithm=match_algorithm,
        match_score_field=match_score_field
    )

    # the first result seeds matches_df; later ones are merged onto it
    matches_df = (
        my_matches_df
        if matches_df is None
        else matches_df.merge(my_matches_df, on='target_index'))


# (score column, match-index column, function name) for each algorithm.
# Not consumed by the selection below, which is hard-wired to the two
# algorithms by column name.
match_metadata = [
    (score_field, score_field + '_match_index', algo.__name__)
    for score_field, algo in match_algorithms
]

# True where the token-pairing score is strictly lower than the modified edit
# distance; on a tie the modified edit distance result is kept.
paired_is_closer = (
    matches_df['best_paired_tokens_edit_distance']
    < matches_df['modified_edit_distance'])

# glossary row index of the winning algorithm
matches_df['closest_similar_index'] = np.where(
    paired_is_closer,
    matches_df['best_paired_tokens_edit_distance_match_index'],
    matches_df['modified_edit_distance_match_index']
)

# score of the winning algorithm
matches_df['closest_similarity_score'] = np.where(
    paired_is_closer,
    matches_df['best_paired_tokens_edit_distance'],
    matches_df['modified_edit_distance']
)

# name of the winning algorithm
matches_df['closest_similarity_algorithm'] = np.where(
    paired_is_closer,
    'best_paired_tokens_edit_distance',
    'modified_edit_distance'
)

matches_df

Unnamed: 0,target_index,modified_edit_distance_match_index,modified_edit_distance,best_paired_tokens_edit_distance_match_index,best_paired_tokens_edit_distance,closest_similar_index,closest_similarity_score,closest_similarity_algorithm
0,0,4811,1.0,4811,1,4811,1.0,modified_edit_distance
1,1,4811,0.0,4811,0,4811,0.0,modified_edit_distance
2,2,1649,0.0,1649,0,1649,0.0,modified_edit_distance
3,3,2222,5.0,3021,1,3021,1.0,best_paired_tokens_edit_distance


In [57]:
# Data-dictionary rows in matches_df order, renumbered 0..n-1.
target_rows = dictionary_df.loc[matches_df['target_index']]
reset_dictionary_df = target_rows.reset_index(drop=True)

# Glossary rows picked as the closest match, renumbered the same way so the
# two frames line up row for row.
matched_rows = acord_df.loc[matches_df['closest_similar_index']]
reset_acord_df = matched_rows.reset_index(drop=True)

# Side-by-side: each dictionary field next to its proposed glossary entry.
joined_df = reset_dictionary_df.join(reset_acord_df)

# helper to strip HTML from definitions (similarity score columns are appended further below)
def stripHtml(html):
    """Return the visible text of an HTML fragment with newlines removed.

    Args:
        html: HTML markup. ``None`` and float NaN (how pandas represents a
            missing cell) are returned unchanged instead of being parsed.

    Returns:
        str: the text content of ``html`` with every newline character
        deleted, or the original value when it is ``None`` / NaN.
    """
    # Missing values cannot be parsed; hand them back untouched so a blank
    # cell does not abort a whole-column .apply().
    if html is None or (isinstance(html, float) and html != html):
        return html

    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser is installed, so results can vary between machines.
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    # newlines are deleted outright (not replaced by a space)
    return text.replace('\n', '')
    
# Bring the winning score and algorithm name over from matches_df
# (assignment aligns on the row index).
for score_column in ('closest_similarity_score', 'closest_similarity_algorithm'):
    joined_df[score_column] = matches_df[score_column]

# Definitions contain HTML markup; keep only the text.
joined_df['Definition'] = joined_df['Definition'].apply(stripHtml)

# Columns to keep in the report, in display order.
report_columns = [
    'field_names',
    'Glossary Terms',
    'Definition',
    'closest_similarity_score',
    'closest_similarity_algorithm'
]
joined_df = joined_df[report_columns]

joined_df.to_csv('test-data/out.csv', index=False)
joined_df

  soup = BeautifulSoup(html)


Unnamed: 0,field_names,Glossary Terms,Definition,closest_similarity_score,closest_similarity_algorithm
0,POLICY_NUMBERs,Policy Number,A unique identifier assigned to a policy (e.g....,1.0,modified_edit_distance
1,POL_NUMBER,Policy Number,A unique identifier assigned to a policy (e.g....,0.0,modified_edit_distance
2,COVERAGE,Coverage,A financial services agreement component detai...,0.0,modified_edit_distance
3,ANNUAL_PREMIUM,Guideline Annual Premium,This is the premium that needs to be paid for ...,1.0,best_paired_tokens_edit_distance
