In [1]:
%pip install pandas
%pip install numpy
%pip install torch
%pip install sentence_transformers
%pip install scipy
%pip install scikit-learn
%pip install openpyxl

Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Downloading numpy-2.0.2-cp39-cp39-win_amd64.whl.metadata (59 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp39-cp39-win_amd64.whl (11.6 MB)
   ---------------------------------------- 0.0/11.6 MB ? eta -:--:--
   -------------- ------------------------- 4.2/11.6 MB 22.9 MB/s eta 0:00:01
   ----------------------------- ---------- 8.7/11.6 MB 22.4 MB/s eta 0:00:01
   ---------------------------------------- 11.6/11.6 MB 21.4 MB/s eta 0:00:00
Downloading numpy-2.0.2-cp39-cp39-win_amd64.whl (15.9 MB)
   ---------------------------------------- 0.0/15.9 MB ? eta -:--:--
   ------------- -------------------------- 5.2/15.9 MB 24.5 MB/s eta 0:00:01
   ----------------------- --

In [7]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
model = SentenceTransformer("all-mpnet-base-v2")

# Load datasets
file_path_berd = "data/BERD data dictionary working BWS 2.xlsx"
file_path_mdr = "data/mdr Variables 1.xlsx"

# Load BERD dataset
berd_df = pd.read_excel(file_path_berd)
# Extract relevant columns from BERD dataset
berd_df = berd_df[["Question Number", "2023 Description", "New Variable Name"]].dropna(subset=["2023 Description"])
berd_df["Question Number"] = berd_df["Question Number"].astype(str)
berd_df["Question Number"] = berd_df["Question Number"].str.replace("DERIVED", "")
berd_df["Question Number"] = berd_df["Question Number"].str.replace("remarks page", "")

# Load MDR dataset
mdr_df = pd.read_excel(file_path_mdr)
# Filter MDR dataset to only include rows where statistical_program_cycle_frame_type = "Business Frame"
mdr_df = mdr_df[mdr_df["statistical_program_cycle_frame_type"] == "Business Frame"]
# Extract relevant columns from MDR dataset
mdr_df = mdr_df[["name", "definition"]].dropna()

# Convert descriptions to lists
berd_descriptions = berd_df["2023 Description"].fillna("").tolist()
mdr_descriptions = mdr_df["definition"].fillna("").tolist()

In [9]:
# Generate embeddings using BERT
print("Generating BERT embeddings...")
berd_embeddings = model.encode(berd_descriptions, convert_to_tensor=True)
mdr_embeddings = model.encode(mdr_descriptions, convert_to_tensor=True)
print("Embeddings generated.")

Generating BERT embeddings...
Embeddings generated.


In [10]:
# Compute cosine similarity
similarity_matrix = util.cos_sim(mdr_embeddings, berd_embeddings)

# Ensure only 1 match per BERD description
num_matches = 1  # Only find 1 match per description
top_n_match_indices = torch.argsort(similarity_matrix, dim=0, descending=True)[:num_matches, :]

# Initialize CountVectorizer for Jaccard Similarity
vectorizer = CountVectorizer(binary=True, stop_words="english")

# Convert BERD descriptions into a sparse binary matrix
berd_sparse = vectorizer.fit_transform(berd_df["2023 Description"])

# Convert MDR descriptions into a sparse binary matrix
mdr_sparse = vectorizer.transform(mdr_df["definition"])

In [11]:
def fast_jaccard_similarity(X1, X2):
    """ Compute Jaccard similarity for all rows in X2 against all rows in X1 using sparse matrices. """
    intersection = X2 @ X1.T  # Fast sparse matrix multiplication
    union = (X2.sum(axis=1)[:, None] + X1.sum(axis=1)[None, :] - intersection)
    return intersection / union  # Keep it as a sparse matrix

In [12]:
# Optimal weight balance
cosine_weight = 0.85  # Cosine similarity captures deep semantic meaning
jaccard_weight = 0.15  # Jaccard similarity helps with token overlap

# Set a threshold for similarity filtering
similarity_threshold = 0.3

In [13]:
# Store results
expanded_rows = []

# Process each row in BERD dataset
for row_idx in range(berd_sparse.shape[0]):
    row = berd_df.iloc[row_idx]
    description = row["2023 Description"]
    
    # Find the best match for this BERD description in the MDR dataset
    match_idx = top_n_match_indices[0, row_idx].item()

    # Initialize MDR match details
    matched_mdr_name = "No Match Found"
    matched_mdr_definition = "No Match Found"
    combined_score = 0.0
    cosine_score = 0.0
    jaccard_score_value = 0.0

    # If a valid match is found
    if match_idx < mdr_df.shape[0]:
        matched_mdr_name = mdr_df.iloc[match_idx]["name"]
        matched_mdr_definition = mdr_df.iloc[match_idx]["definition"]
        cosine_score = similarity_matrix[match_idx, row_idx].item()
        
        # Compute Jaccard similarity for this match
        jaccard_score_value = fast_jaccard_similarity(berd_sparse[row_idx:row_idx+1], mdr_sparse[match_idx:match_idx+1]).toarray()[0, 0]

#                                            0.85                                       0.15 
        combined_score = (cosine_score * cosine_weight) + (jaccard_score_value * jaccard_weight)

        # Apply threshold
        if combined_score < similarity_threshold:
            matched_mdr_name = "No Match Found"
            matched_mdr_definition = "No Match Found"

    # Combine all data into a single row
    full_row = [
        row["Question Number"],
        row["2023 Description"],
        row["New Variable Name"],
        round(combined_score * 100, 2),
        round(cosine_score * 100, 2),
        round(jaccard_score_value * 100, 2),
        matched_mdr_name,
        matched_mdr_definition,
    ]
    expanded_rows.append(full_row)

print("Processing completed.")


Processing completed.


In [15]:
# Convert results to DataFrame
columns = [
    "Question Number", "2023 Description", "New Variable Name",
    "Combined Score", "Cosine Similarity", "Jaccard Similarity",
    "MDR Variable Name", "MDR Definition",
]
df_final = pd.DataFrame(expanded_rows, columns=columns)

# Sort results by highest similarity score
df_final = df_final.sort_values(by=["Combined Score"], ascending=False).reset_index(drop=True)

# Save to CSV
df_final.to_csv("result/Final_BERD_MDR_Matched_Dataset.csv", index=False)
print("Final dataset saved as 'Final_BERD_MDR_Matched_Dataset.csv'.")

Final dataset saved as 'Final_BERD_MDR_Matched_Dataset.csv'.


In [16]:
# Display the first few rows of the final dataset
df_final

Unnamed: 0,Question Number,2023 Description,New Variable Name,Combined Score,Cosine Similarity,Jaccard Similarity,MDR Variable Name,MDR Definition
0,1-3,Reporting period,REFPER_CAL_YEAR_STAT,80.10,82.48,66.67,SURV_YEAR_STAT,Reporting Period Type
1,,Total worldwide research and development (Per...,,76.72,80.18,57.14,RD_VAL_LT3MIL_ARCHIVE,Company's worldwide expenses for research and ...
2,4-6c,Worldwide capital expenditures: total,CAPEX_WW_TOT_VAL,76.70,79.65,60.00,CAPEX_NEW_TOT_DVAL,Capital Expenditures: All new - total
3,4-5c,Expenditures: total,CAPEX_WW_TOT_VAL,76.59,84.23,33.33,EXPS_CON_PCT_OTH,"Expenditures incurred, Other"
4,1-2,Was company owned by a US company?,PARENTCO_DOM_STAT,75.47,77.03,66.67,AFFIL_SPL_OWN_DOM_NO_ARCHIVE,Is the company owned or controlled by another ...
...,...,...,...,...,...,...,...,...
878,2-25,Who completed this section?,,33.95,39.94,0.00,CERT_DATE_FROM_YEAR,"Certification, Year covered by this report (f..."
879,7-5,Who completed this section?,,33.95,39.94,0.00,CERT_DATE_FROM_YEAR,"Certification, Year covered by this report (f..."
880,2-20f,"Universities, colleges, and academic researche...",EXPS_DOM_RD_EXT_USUNI_VAL,33.92,39.91,0.00,FUNDS_DEV_FEDGOV,"Source of Research and Development Funds, Fed..."
881,2-5,IR&D,EXPS_WORLD_RD_RECOUP_VAL,31.83,37.45,0.00,ARU_INDUSTRY,Alternate Reporting Unit: Industry description


In [17]:
df_final.head()

Unnamed: 0,Question Number,2023 Description,New Variable Name,Combined Score,Cosine Similarity,Jaccard Similarity,MDR Variable Name,MDR Definition
0,1-3,Reporting period,REFPER_CAL_YEAR_STAT,80.1,82.48,66.67,SURV_YEAR_STAT,Reporting Period Type
1,,Total worldwide research and development (Per...,,76.72,80.18,57.14,RD_VAL_LT3MIL_ARCHIVE,Company's worldwide expenses for research and ...
2,4-6c,Worldwide capital expenditures: total,CAPEX_WW_TOT_VAL,76.7,79.65,60.0,CAPEX_NEW_TOT_DVAL,Capital Expenditures: All new - total
3,4-5c,Expenditures: total,CAPEX_WW_TOT_VAL,76.59,84.23,33.33,EXPS_CON_PCT_OTH,"Expenditures incurred, Other"
4,1-2,Was company owned by a US company?,PARENTCO_DOM_STAT,75.47,77.03,66.67,AFFIL_SPL_OWN_DOM_NO_ARCHIVE,Is the company owned or controlled by another ...
