In [1]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [1]:
from tkinter.font import names
import re
import pandas
import spacy
from bs4 import BeautifulSoup 

# Workaround for a venv quirk: load the model from its site-packages folder by path.
# Other model sizes that can be swapped in: 'en_core_web_sm', 'en_core_web_md'
model = 'en_core_web_lg'
model_location = f"..\\.venv\\Lib\\site-packages\\{model}\\{model}-3.8.0"

# the model package is installed with: python -m spacy download en_core_web_lg
nlp = spacy.load(model_location)
spacy.info()


{'spacy_version': '3.8.2',
 'location': 'C:\\Users\\tbadmin\\Documents\\projects\\machine_learning_musings\\.venv\\Lib\\site-packages\\spacy',
 'platform': 'Windows-11-10.0.22631-SP0',
 'python_version': '3.12.3',
 'pipelines': {'en_core_web_lg': '3.8.0',
  'en_core_web_md': '3.8.0',
  'en_core_web_sm': '3.8.0'}}

In [30]:
def snake_tokenize(string):
    """Split ``string`` at every underscore."""
    return re.split(r'[_]', string)


def space_tokenize(string):
    """Split ``string`` at every space character."""
    return re.split(r'[ ]', string)


def snake_space_tokenize(string):
    """Split ``string`` at every space or underscore."""
    return re.split(r'[ _]', string)

def camel_case_tokenize(string):
    """
    Split a camelCase / PascalCase identifier into its words.

    A split happens at two kinds of zero-width positions:
    right after a lowercase letter that is followed by an uppercase letter,
    and (anywhere except the start of the string) right before an uppercase
    letter that is followed by a lowercase letter.
    The characters themselves are kept; only the boundaries are cut.
    """
    boundary = re.compile(r'(?<=[a-z])(?=[A-Z])|(?<!^)(?=[A-Z][a-z])')
    return boundary.split(string)

def tokenize(string):
    """
    Break an identifier or phrase into word tokens.

    The input is split on underscores (python case), then on spaces
    (natural language), then on camelCase boundaries (java case).
    Token order follows the order of the input.

    Empty pieces are dropped. They appear when separators are doubled or sit
    at the start or end of the input (e.g. ``"POLICY__NUMBER"`` or
    ``" policy"``) and carry no information; left in, they become extra
    whitespace once the tokens are joined back into text.

    Returns a list of non-empty strings (an empty list for ``""``).
    """
    tokens = []
    for snake_part in snake_tokenize(string):
        for word in space_tokenize(snake_part):
            # camel_case_tokenize('') yields [''], so filter after the last split
            tokens.extend(piece for piece in camel_case_tokenize(word) if piece)
    return tokens

In [31]:
# (abbreviation, expansion) pairs consumed by normalize(): a token is swapped
# for the expansion only when it is exactly equal to the abbreviation.
# NOTE(review): 'no' -> 'number' also rewrites the plain English word "no" — confirm that is intended.
replacement_tokens = [
    ('pol', 'policy'),
    ('plcy', 'policy'),
    ('no', 'number'),
    ('cov', 'coverage')
]

def clean(string):
    """Remove every double-quote character and lower-case what is left."""
    unquoted = string.replace('"', '')
    return unquoted.lower()

def normalize(tokens):
    """
    Expand known abbreviations in a token list.

    Each token that exactly equals an abbreviation in ``replacement_tokens``
    is replaced by its expansion; every other token is passed through as is.
    Returns a new list of the same length.
    """
    def replace_token(token):
        # the first matching pair wins; fall back to the token itself
        return next(
            (new for old, new in replacement_tokens if token == old),
            token)

    return list(map(replace_token, tokens))

def tokenize_then_normalize(string):
    """
    Turn a raw field name or phrase into normalized tokens.

    Tokenizing runs first, while the original casing is still available for
    camelCase splitting; each token is then cleaned (quotes removed,
    lower-cased) and finally common abbreviations are expanded.
    """
    cleaned = [clean(token) for token in tokenize(string)]
    return normalize(cleaned)

def stripHtml (html):
    """
    Return the text content of an HTML fragment with newlines removed.

    html: markup string. ``None`` or a float NaN (a missing cell) yields ``''``.
    """
    # A missing value has no markup to parse; report it as empty text.
    if html is None or (isinstance(html, float) and html != html):
        return ''
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser is installed and emits a warning, so results can vary by machine.
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    # NOTE(review): newlines are removed, not replaced by a space, so words on
    # adjacent lines are glued together — confirm that is intended.
    text = text.replace('\n', '')
    return text

In [32]:
def to_nlp(tokens):
    """Join the tokens with single spaces and run the text through the loaded ``nlp`` pipeline."""
    text = " ".join(tokens)
    return nlp(text)

def prepare_df(df, field, chunk_size=100):
    """
    Add tokenized and spaCy-processed versions of a text column to ``df``.

    Two columns are written to ``df`` in place:
    ``'tokenized_' + field`` holds the result of ``tokenize_then_normalize``
    for each value, and ``'nlp_tokenized_' + field`` holds the result of
    ``to_nlp`` applied to those tokens.

    df: DataFrame to extend (mutated and also returned).
    field: name of the text column to process.
    chunk_size: number of rows processed between progress messages.

    Raises ValueError if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    result_prefix = 'tokenized_' + field
    nlp_column = 'nlp_' + result_prefix

    # Create the columns up front so they exist even for an empty frame.
    df[result_prefix] = None
    df[nlp_column] = None

    tokenized_chunks = []
    nlp_chunks = []

    for start in range(0, len(df), chunk_size):
        end = start + chunk_size

        print(f'processing chunk {start}...{end}')

        # Feed the fresh tokens straight into to_nlp. Reading them back from a
        # slice taken before the write only works if that slice is a live view
        # of df, which pandas does not guarantee (and Copy-on-Write forbids).
        tokenized = df[field].iloc[start:end].apply(tokenize_then_normalize)
        tokenized_chunks.append(tokenized)
        nlp_chunks.append(tokenized.apply(to_nlp))

    if tokenized_chunks:
        # The pieces keep df's own index labels in order, so assignment lines
        # up for any index, not only the default 0..n-1 range.
        df[result_prefix] = pandas.concat(tokenized_chunks)
        df[nlp_column] = pandas.concat(nlp_chunks)

    return df

In [None]:
def get_data_dictionary_df():
    """
    Build a small hand-written data dictionary for testing.

    Returns a DataFrame with a ``field_names`` column plus the columns
    added by ``prepare_df``.
    """
    dictionary_df = pandas.DataFrame({
        "field_names": [
            "POLICY_NUMBERs",
            "POL_NUMBER",
            # camelCase / PascalCase variants, currently disabled:
            # "policyNumber",
            # "polNo",
            # "PolicyNumber",
            "COVERAGE",
            "ANNUAL_PREMIUM",
        ]
    })

    return prepare_df(dictionary_df, "field_names")
    
# Build the test dictionary once for the cells below, then display it.
dictionary_df = get_data_dictionary_df()
dictionary_df

processing chunk 0...100


In [6]:
def get_acord_df():
    """
    Load the glossary CSV and prepare it for matching.

    The 'Glossary Terms' column is renamed to 'glossary' and the frame is
    then passed through ``prepare_df`` on that column.
    """
    glossary_df = pandas.read_csv(
        'test-data/ACORD-Business-Glossary Model 2.13.csv',
        header=0,
    ).rename(columns={'Glossary Terms': 'glossary'})

    return prepare_df(glossary_df, "glossary")

# Occasionally this hits a Windows access violation. Those are not caught by
# the except below; the process just hangs.
try:
    acord_df = get_acord_df()
except Exception as e:
    print(e)
    # Stop the cell here: carrying on would either fail with a NameError or
    # silently display an acord_df left over from an earlier run.
    raise

acord_df[[
    'glossary',
    'tokenized_glossary',
    'nlp_tokenized_glossary']]


processing chunk 0...100
processing chunk 100...200
processing chunk 200...300
processing chunk 300...400
processing chunk 400...500
processing chunk 500...600
processing chunk 600...700
processing chunk 700...800
processing chunk 800...900
processing chunk 900...1000
processing chunk 1000...1100
processing chunk 1100...1200
processing chunk 1200...1300
processing chunk 1300...1400
processing chunk 1400...1500
processing chunk 1500...1600
processing chunk 1600...1700
processing chunk 1700...1800
processing chunk 1800...1900
processing chunk 1900...2000
processing chunk 2000...2100
processing chunk 2100...2200
processing chunk 2200...2300
processing chunk 2300...2400
processing chunk 2400...2500
processing chunk 2500...2600
processing chunk 2600...2700
processing chunk 2700...2800
processing chunk 2800...2900
processing chunk 2900...3000
processing chunk 3000...3100
processing chunk 3100...3200
processing chunk 3200...3300
processing chunk 3300...3400
processing chunk 3400...3500
proces

KeyError: "['Glossary Terms'] not in index"

In [15]:
def spacy_similarity(doc1, doc2):
    """Return ``doc1.similarity(doc2)``, the score of ``doc2`` as seen from ``doc1``."""
    score = doc1.similarity(doc2)
    return score

In [16]:
def get_matches_df(
        a_df=None,
        a_field='nlp_tokenized_field_names',
        b_df=None,
        b_field='nlp_tokenized_glossary',
        similarity_fn=None):
    """
    Find, for every row of ``a_df``, the most similar row of ``b_df``.

    a_df / a_field: frame and column holding the items to match.
        ``a_df`` defaults to the notebook's current ``dictionary_df``.
    b_df / b_field: frame and column holding the candidates.
        ``b_df`` defaults to the notebook's current ``acord_df``.
    similarity_fn: callable ``(a_item, b_item) -> number``; defaults to
        ``spacy_similarity``.

    Returns a DataFrame with one row per ``a_df`` row and the columns
    ``a_index``, ``b_index`` (index label of the best candidate, ``None``
    when ``b_df`` is empty) and ``spacy_similarity`` (the best score).
    On ties the first candidate encountered wins.
    """
    # Look the defaults up at call time. Defaults written in the signature are
    # bound once when the def cell runs, so they would keep pointing at old
    # frames after an earlier cell is re-run.
    if a_df is None:
        a_df = dictionary_df
    if b_df is None:
        b_df = acord_df
    if similarity_fn is None:
        similarity_fn = spacy_similarity

    matches = []

    for a_idx, a_encoding in a_df[a_field].items():
        best_similarity = float('-inf')
        best_match_idx = None

        # Scan every candidate. Stopping as soon as the running best is exactly
        # 0 (as an earlier version did) would hide any better candidate that
        # comes after a zero-similarity one.
        for b_idx, b_encoding in b_df[b_field].items():
            similarity = similarity_fn(a_encoding, b_encoding)
            if similarity > best_similarity:
                best_similarity = similarity
                best_match_idx = b_idx

        matches.append([a_idx, best_match_idx, best_similarity])

    matches_df = pandas.DataFrame(matches, columns=[
        'a_index',
        'b_index',
        'spacy_similarity'
    ])

    return matches_df

# Match every dictionary field against the glossary, then display the best hit per field.
matches_df = get_matches_df()
matches_df

  return doc1.similarity(doc2)


Unnamed: 0,a_index,b_index,spacy_similarity
0,0,4811,0.521119
1,1,4811,1.0
2,2,1649,1.0
3,3,3021,0.899214


In [29]:
def get_joined(a_df=None, a_field='a_index', b_df=None, b_field='b_index'):
    """
    Put each dictionary field next to its best glossary match.

    Reads the notebook-level ``matches_df`` and, for each of its rows, pulls
    the referenced row from ``a_df`` and from ``b_df`` and lays them side by
    side.

    a_df: frame the ``a_field`` labels point into; defaults to the
        notebook's current ``dictionary_df``.
    a_field: column of ``matches_df`` holding index labels of ``a_df``.
    b_df: frame the ``b_field`` labels point into; defaults to the
        notebook's current ``acord_df``.
    b_field: column of ``matches_df`` holding index labels of ``b_df``.

    Returns a DataFrame with the columns ``field_names``, ``glossary``,
    ``definition`` (the ``Definition`` column with HTML stripped) and
    ``spacy_similarity``.
    """
    # Look the defaults up at call time so re-running an earlier cell is
    # picked up without re-running this def cell.
    if a_df is None:
        a_df = dictionary_df
    if b_df is None:
        b_df = acord_df

    # Select rows in matches_df order, using the columns named by the caller,
    # and reset both indexes so the two frames line up row for row.
    reset_a_df = a_df.loc[matches_df[a_field]].reset_index(drop=True)
    reset_b_df = b_df.loc[matches_df[b_field]].reset_index(drop=True)

    joined_df = reset_a_df.join(reset_b_df)
    # Copy by position: after the resets, row i here is row i of matches_df.
    joined_df['spacy_similarity'] = matches_df['spacy_similarity'].to_numpy()

    joined_df['definition'] = joined_df['Definition'].apply(stripHtml)

    return joined_df[[
        'field_names',
        'glossary',
        'definition',
        'spacy_similarity'
    ]]

# Build the joined view, write it to CSV without the index column, then display it.
joined_df = get_joined()
joined_df.to_csv('test-data/out.csv', index=False)
joined_df

  soup = BeautifulSoup(html)


Unnamed: 0,field_names,glossary,definition,spacy_similarity
0,POLICY_NUMBERs,Policy Number,A unique identifier assigned to a policy (e.g....,0.521119
1,POL_NUMBER,Policy Number,A unique identifier assigned to a policy (e.g....,1.0
2,COVERAGE,Coverage,A financial services agreement component detai...,1.0
3,ANNUAL_PREMIUM,Guideline Annual Premium,This is the premium that needs to be paid for ...,0.899214
