In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [2]:

# Load MDR and BERD data
mdr_file_path = "data/mdr Variables 1.xlsx"
berd_file_path = "data/BERD data dictionary working BWS 2.xlsx"

mdr_data = pd.ExcelFile(mdr_file_path)
berd_data = pd.ExcelFile(berd_file_path)

mdr_df = mdr_data.parse('Sheet1')
berd_df = berd_data.parse('Sheet1')
print("Before filtering Business Frame: " + str(mdr_df.shape))

# Filtering everything but Business Frame
mdr_df = mdr_df[mdr_df['statistical_program_cycle_frame_type'] == 'Business Frame']

print("After filtering Business Frame: " + str(mdr_df.shape))
# Extract relevant columns
mdr_descriptions = mdr_df[['name', 'definition']].dropna().reset_index(drop=True)
berd_descriptions = berd_df[['New Variable Name', '2023 Description']].dropna().reset_index(drop=True)


Before filtering Business Frame: (41626, 9)
After filtering Business Frame: (9410, 9)


In [3]:
# Convert descriptions to lists
mdr_texts = mdr_descriptions['definition'].tolist()
berd_texts = berd_descriptions['2023 Description'].tolist()

# Load the pre-trained SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings using SBERT
mdr_embeddings = model.encode(mdr_texts, convert_to_tensor=True)
berd_embeddings = model.encode(berd_texts, convert_to_tensor=True)

# Compute cosine similarity
similarity_scores = util.pytorch_cos_sim(mdr_embeddings, berd_embeddings)

# Convert to a DataFrame
similarity_df = pd.DataFrame(similarity_scores.cpu().numpy(), index=mdr_descriptions['name'], columns=berd_descriptions['New Variable Name'])



In [4]:
# Extract the best matches
top_matches = similarity_df.idxmax(axis=0).reset_index()
top_matches.columns = ['BERD_NAME', 'MDR_NAME']
top_matches['Similarity_Score'] = similarity_df.max(axis=0).values

In [5]:
top_matches = top_matches.sort_values(by='Similarity_Score', ascending=False)

In [6]:

top_matches['Similarity_Score'] = (round(top_matches['Similarity_Score'] * 100, 1)).astype(str) + '%'
# Merging
top_matches = top_matches.merge(berd_df[['New Variable Name', 'Question Number', '2023 Description']], left_on='BERD_NAME', right_on='New Variable Name', how='left').drop(columns=['New Variable Name'])
top_matches = top_matches.merge(mdr_descriptions[['name', 'definition']], left_on='MDR_NAME', right_on='name', how='left').drop(columns=['name'])

# Rename
top_matches.rename(columns={
    'definition': 'Definition (MDR)',
    'Question Number': 'Question Number (BERD)',
    '2023 Description': '2023 Description (BERD)'
}, inplace=True)

In [7]:
top_matches.tail()

Unnamed: 0,BERD_NAME,MDR_NAME,Similarity_Score,Question Number (BERD),2023 Description (BERD),Definition (MDR)
1441,2ND_LRG_DOM_VAL,PHYSLOC_ADDR_CITY,32.5%,2-16,R&D performed at second largest domestic location,Physical Location City
1442,2ND_LRG_DOM_VAL,PHYSLOC_ADDR_CITY,32.5%,2-16,R&D performed at second largest domestic location,Physical Location City
1443,2ND_LRG_DOM_VAL,PHYSLOC_ADDR_CITY,32.5%,2-16,R&D performed at second largest domestic location,Physical Location City
1444,2ND_LRG_DOM_VAL,PHYSLOC_ADDR_CITY,32.5%,2-16,R&D performed at second largest domestic location,Physical Location City
1445,LRG_DOM_LOC_VAL,MERCH_OTHER_EST_PCT,32.4%,2-14,R&D performed at largest domestic location,"Merchandise Sales, Products manufactured or a..."


In [9]:
top_matches.to_excel('result/mdr_berd_semantic_matching.xlsx')