In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import os

# Load the MDR dataset
# This dataset contains existing variable names and their descriptions
mdrfile = pd.read_excel('data\mdr Variables 1.xlsx')
mdr_defs = mdrfile[['name', 'definition']]

  mdrfile = pd.read_excel('data\mdr Variables 1.xlsx')


In [2]:
#Fill NaNs with empty strings - ' '

mdr_defs['definition'] = mdr_defs['definition'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mdr_defs['definition'] = mdr_defs['definition'].fillna('')


In [3]:
# Load a pre-trained Sentence-BERT (SBERT) model
# 'all-MiniLM-L6-v2' is a lightweight and efficient model for generating sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the existing descriptions from the MDR dataset into embeddings
definition_embeddings = model.encode(mdr_defs['definition'].tolist())

In [4]:
# Load the ABS-MOPS dataset
abs_mops = pd.read_excel('data\ABS-MOPS Variables - December 11 2024.xlsm', sheet_name=['Data Sheet'], header=13)
abs_mops = abs_mops['Data Sheet']
abs_mops.head(2)

# Rename columns for clarity
abs_defs = abs_mops[['Provide a brief description of the variable, this will alert staff entering content of its intended purpose\n\nNOTE: Maximum number of characters should be 500', 
                     'Legacy Variable',
                     'Unique Name for Variable \nOn upload, will verify with those already in database to ensure unique and alert to those that are not\n\nNOTE: \n1) Variable Names should be all caps with no spaces\n2) Variables can not end with _# or _## as those are reserved for handling of repeating persons.']]
abs_defs.rename(columns={
    'Legacy Variable': 'legacy_variable',
    'Provide a brief description of the variable, this will alert staff entering content of its intended purpose\n\nNOTE: Maximum number of characters should be 500': 'description',
    'Unique Name for Variable \nOn upload, will verify with those already in database to ensure unique and alert to those that are not\n\nNOTE: \n1) Variable Names should be all caps with no spaces\n2) Variables can not end with _# or _## as those are reserved for handling of repeating persons.': 'ABS-MOPS_New_Variable_Name'
}, inplace=True)

# Encode the new descriptions from the ABS-MOPS dataset into embeddings
new_definition_embeddings = model.encode(abs_defs['description'].tolist())

  abs_mops = pd.read_excel('data\ABS-MOPS Variables - December 11 2024.xlsm', sheet_name=['Data Sheet'], header=13)
  for idx, row in parser.parse():
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abs_defs.rename(columns={


In [5]:
abs_defs.head()

Unnamed: 0,description,legacy_variable,ABS-MOPS_New_Variable_Name
0,Yes/No if business has ceased operation,B_OPR_CEASED,OPSTAT_CEASED_STAT
1,Month the business ceased operation,B_CEASED_MONTH,OPSTAT_CEASED_MONTH_DATE
2,Year the business ceased operation,B_CEASED_DATE_ YEAR,OPSTAT_CEASED_YEAR_DATE
3,Yes/No if business is a majority-owned subsidi...,B_PARENTCO_FOREIGN,OWNED_FOREIGN_STAT
4,Yes/No if business is more than 50% owned by a...,B_PARENTCO_DOMESTIC,OWNED_DOMESTIC_STAT


In [6]:
# Initialize a list to store results
results = []

# Calculate cosine similarities for each new description
for i, new_embedding in enumerate(new_definition_embeddings):
    # Compute cosine similarity between the new embedding and all existing embeddings
    similarities = util.pytorch_cos_sim(new_embedding, definition_embeddings)
    
    # Get the indices of the top 3 most similar embeddings
    top_3_indices = similarities.argsort(descending=True)[0][:3].tolist()
    
    # Store the top 3 matches along with their similarity scores
    top_matches = []
    for idx in top_3_indices:
        top_matches.append({
            'most_similar_variable_name_from_MDR': mdr_defs['name'][idx],
            'similarity_score': similarities[0][idx].item()            
        })
    
    # If there are less than 3 matches, pad with low similarity scores
    while len(top_matches) < 3:
        top_matches.append({
            'most_similar_variable_name_from_MDR': 'N/A',
            'similarity_score': 0.0
        })
    
    # Append the results for the current new variable
    results.append({
        'legacy_variable_name_from_ABS_MOPS': abs_defs['legacy_variable'][i],
        'ABS-MOPS_New_Variable_Name': abs_defs['ABS-MOPS_New_Variable_Name'][i],
        'top_matches': top_matches
    })

In [7]:
# Create a new DataFrame from the results
results_df = pd.DataFrame(results)

# Explode the 'top_matches' column to create separate rows for each top match
exploded_df = results_df.explode('top_matches')

# Extract the 'top_matches' dictionary into separate columns
exploded_df = pd.concat([exploded_df.drop(['top_matches'], axis=1), exploded_df['top_matches'].apply(pd.Series)], axis=1)
exploded_df.rename(columns ={
    'legacy_variable_name_from_ABS_MOPS': 'ABS MOPS - Legacy Variables',
    'most_similar_variable_name_from_MDR': 'MDR - Matching Variable Name',
    'ABS-MOPS_New_Variable_Name': 'ABS MOPS - New Variable Names'
}, inplace = True)
exploded_df = exploded_df.set_index(['ABS MOPS - Legacy Variables', 'MDR - Matching Variable Name', 'ABS MOPS - New Variable Names'])
print(exploded_df.head())


                                                                                        similarity_score
ABS MOPS - Legacy Variables MDR - Matching Variable Name ABS MOPS - New Variable Names                  
B_OPR_CEASED                BUS_TECH_ADD_MFG_N_ARCHIVE   OPSTAT_CEASED_STAT                     0.586060
                            BUS_TECH_ADD_MFG_Y_ARCHIVE   OPSTAT_CEASED_STAT                     0.586060
                            BUS_TECH_ADD_MFG             OPSTAT_CEASED_STAT                     0.586060
B_CEASED_MONTH              OPSTAT_DATE                  OPSTAT_CEASED_MONTH_DATE               0.674443
                                                         OPSTAT_CEASED_MONTH_DATE               0.674443


In [12]:
# Sort the DataFrame by the highest similarity score
sorted_df = exploded_df.sort_values(by='similarity_score', ascending=False)
# Rename columns for better readability
sorted_df.rename(columns={
    'similarity_score': 'Similarity Score'
}, inplace=True)

sorted_df['Similarity Score'] = sorted_df['Similarity Score'].round(2)
# Display the top 15 rows of the sorted DataFrame
print(sorted_df.head(15))

                                                                                                               Similarity Score
ABS MOPS - Legacy Variables                        MDR - Matching Variable Name ABS MOPS - New Variable Names                  
\nRESP_TITLE                                       CERT_TITLE_TXT               CERT_TITLE_TXT                             1.00
RESP_PHONE_AREA_CODE\nRESP_PHONE_PREFIX, RESP_P... CERT_PHONE_NUM               CERT_PHONE_NUM                             1.00
RESP_PHONE_EXT                                     CERT_EXTENSION_NUM           CERT_EXTENSION_NUM                         1.00
RESP_CONTACT_NAME                                  CERT_NAME_TXT                CERT_NAME_TXT                              1.00
                                                                                CERT_NAME_TXT                              1.00
RESP_EMAIL                                         CERT_EMAIL_TXT               CERT_EMAIL_TXT          

In [13]:
# Save the DataFrame to an Excel file with adjusted column widths
output_path = 'result/Matching ABS MOPS Legacy to MDR Variable Names output.xlsx'

sorted_df.to_excel(output_path)

print("File saved as 'output.xlsx")
os.system(output_path)

File saved as 'output.xlsx


1