In [107]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

In [108]:


# Prefer CUDA when torch reports a usable GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Read the three workbooks (adjust the paths as needed).
mdr_df = pd.read_excel("data/mdr Variables 1.xlsx")
absmops_df = pd.read_excel("data/QFR_DataSet for Matching to MDR.xlsx")
answer_df = pd.read_excel("data/QFR_Variable_Mapping_20250205.xlsx")

# Attach the old->new variable-name mapping to every dataset row whose legacy
# name appears in the mapping (inner join), then keep the first row per legacy name.
absmops_df = (
    absmops_df
    .merge(
        answer_df[['Old Variable Name', 'New Variable Name']],
        how="inner",
        left_on="Legacy Variable Name",
        right_on="Old Variable Name",
    )
    .drop_duplicates(subset=['Legacy Variable Name'])
)

# Restrict the MDR variables to rows whose frame type is 'Business Frame'.
mdr_df = mdr_df.loc[mdr_df['statistical_program_cycle_frame_type'] == 'Business Frame']


Using device: cuda


In [109]:
# Rename the three columns used for matching, addressed by position.
# NOTE: absmops_df is overwritten with a 3-column frame just below, so re-running
# this cell without first re-running the loading cell fails (position 4 is gone).
absmops_df = absmops_df.rename(columns={
    absmops_df.columns[1]: "Legacy Variable",   # position 1
    absmops_df.columns[-1]: "Variable Name",    # last column; presumably the merged-in 'New Variable Name' -- confirm
    absmops_df.columns[4]: "Description",       # position 4
})

# Select only needed columns
absmops_df = absmops_df[["Legacy Variable", "Variable Name", "Description"]]

# Prepare text: missing values become empty strings so every entry passed to
# encode() is a str.
mdr_descriptions = mdr_df['definition'].fillna("").astype(str).tolist()
absmops_descriptions = absmops_df['Description'].fillna("").astype(str).tolist()

# Load model
model = SentenceTransformer('all-mpnet-base-v2', device=device)

# Generate embeddings
mdr_embeddings = model.encode(mdr_descriptions, convert_to_tensor=True, device=device)
absmops_embeddings = model.encode(absmops_descriptions, convert_to_tensor=True, device=device)

# Cosine similarity: one row per MDR description, one column per ABS-MOPS description.
cosine_scores = util.cos_sim(mdr_embeddings, absmops_embeddings)

# A single reduction yields both the best score and its column index for every
# MDR row, instead of indexing the tensor element by element in a Python loop.
best_scores, best_indices = cosine_scores.max(dim=1)
best_matches = best_indices.cpu().numpy()

# Look the matched ABS-MOPS rows up once (positional, so the frame's index labels
# do not matter) and reuse them for both output columns.
matched_absmops = absmops_df.iloc[best_matches]

# Build result DataFrame. The index comes from mdr_df['name']; the list/array
# columns are aligned to it by position.
results = pd.DataFrame({
    'MDR Variable': mdr_df['name'],
    'MDR Description': mdr_descriptions,
    'ABS-MOPS Variable Name': matched_absmops['Variable Name'].values,
    'ABS-MOPS Description': matched_absmops['Description'].values,
    'Similarity Score': best_scores.cpu().tolist(),  # Python floats, as .item() produced
})

# Keep only rows whose MDR variable name (case/whitespace-insensitive) also
# occurs somewhere in the ABS-MOPS variable names.
absmops_variable_set = set(absmops_df["Variable Name"].dropna().str.lower().str.strip())
mdr_name_keys = results['MDR Variable'].str.lower().str.strip()
keep_mask = mdr_name_keys.isin(absmops_variable_set)
# .copy() makes the filtered frame independent, so the column assignments below
# never write into a slice of the unfiltered frame.
results = results[keep_mask].copy()

# Match flags (1 = equal after lower-casing and stripping, 0 = otherwise)
results['Name Match'] = (
    mdr_name_keys[keep_mask] ==
    results['ABS-MOPS Variable Name'].str.lower().str.strip()
).astype(int)

results['Description Match'] = (
    results['MDR Description'].str.lower().str.strip() ==
    results['ABS-MOPS Description'].str.lower().str.strip()
).astype(int)

# Sort by similarity, best matches first
results = results.sort_values(by='Similarity Score', ascending=False)

# Save
results.to_csv("mdr_absmops_filtered_matches.csv", index=False)
print("Filtered and sorted results saved to 'mdr_absmops_filtered_matches.csv'")


Filtered and sorted results saved to 'mdr_absmops_filtered_matches.csv'


In [110]:
# Recompute the name-match flag on normalized names (lower-cased, surrounding
# whitespace stripped). The comparison is done directly, so no temporary
# "Clean" columns have to be added to the frame and dropped again.
results['Name Match'] = (
    results['MDR Variable'].str.lower().str.strip()
    == results['ABS-MOPS Variable Name'].str.lower().str.strip()
).astype(int)

# Collapse rows that repeat the same variable/description pairing (the first
# occurrence is kept), then order by similarity, best matches first.
results = (
    results
    .drop_duplicates(
        subset=['MDR Variable', 'ABS-MOPS Variable Name', 'MDR Description', 'ABS-MOPS Description']
    )
    .sort_values(by='Similarity Score', ascending=False)
)

# Save results
results.to_csv("mdr_absmops_matches_with_flags.csv", index=False)
print("Comparison results saved to 'mdr_absmops_matches_with_flags.csv'")


Comparison results saved to 'mdr_absmops_matches_with_flags.csv'


In [111]:
# Display the de-duplicated match table, ordered by descending similarity score.
results

Unnamed: 0,MDR Variable,MDR Description,ABS-MOPS Variable Name,ABS-MOPS Description,Similarity Score,Name Match,Description Match
8115,RCPT_TOT_VAL,"Sales, Shipments, Receipts, or Revenue",RCPT_TOT_VAL,Sales Receipts and Operating Revenue-101,0.753976,1,0
7590,EXPS_DEPR_VAL,Depreciation and Amortization Charges,EXPS_DEPR_VAL,"Depreciation,Depletion, Amortization-102",0.741466,1,0
7600,EXPS_INTEREST_VAL,Operating Interest Expense,EXPS_OP_COST_OTH_VAL,Other Operating Cost and Expenses 103,0.665146,0,0
7452,AFFIL_OWNER_EIN_NUM,Ownership or Controlling Company: EIN,AFFIL_OWNER_EIN_NUM,EI of company who owns,0.660261,1,0
7564,EIN_NUM,EIN to which the establishment is linked,EIN_NUM,EI of corp,0.577308,1,0
3167,OPSTAT,"Operational Status, What is this establishment...",OPSTAT,Corp status active/discontinued/merged,0.564284,1,0
2357,EIN_NUM,Employer Identification Number,EIN_NUM,EI of corp,0.450451,1,0


In [112]:
# Tally the remaining rows by name-match flag (1 = names equal after normalization, 0 = not).
results["Name Match"].value_counts()

Name Match
1    6
0    1
Name: count, dtype: int64