In [89]:
#Install necessary Python Libraries

#%pip install sentence-transformers

In [90]:
import pandas as pd

# Load DataFrame with existing names + definitions

mdr_file_path = "mdr Variables 1.xlsx"
mdr_data = pd.ExcelFile(mdr_file_path)
mdr_df = mdr_data.parse('Sheet1')
print("Before filtering Business Frame: " + str(mdr_df.shape))

# Filtering everything but Business Frame
mdr_df = mdr_df[mdr_df['statistical_program_cycle_frame_type'] == 'Business Frame']

print("After filtering Business Frame: " + str(mdr_df.shape))
# Remove rows where 'definition' or 'name' is empty or NaN
mdr_defs = mdr_df[['name', 'definition']].dropna().reset_index(drop=True)


Before filtering Business Frame: (41626, 9)
After filtering Business Frame: (9410, 9)


In [91]:
mdr_defs.loc[20]

name                                          PRCH_PROFTECH
definition    Purchased professional and technical services
Name: 20, dtype: object

In [92]:
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [93]:
# Load Dataframe with new names + definitions
qfr = pd.read_excel('QFR_DataSet for Matching to MDR.xlsx', sheet_name=['Sheet1'], header=0)
qfr = qfr['Sheet1']
qfr.head()

Unnamed: 0,SURVEY,Legacy Variable Name,ISOURCE,KEYCDE,LABEL,ROW,COL,REQFLG,RSPFLG,ADJFLG,...,DTIMES,ERR_LIST,HITEM,HITM_UPD,HITM_SRT,CONFLG,PRTFLG,USRNME,PRGNME,PRGDTM
0,QFR,1,F,1,Is this your Company's Industry -YES/NO,,,,,,...,,,N,,,,,seabo001,ITEMDICT REV,2009-05-08 14:52:39.708
1,QFR,21,F,21,Closing date,,,,,,...,,,N,,,,,seabo001,ITEMDICT REV,2006-09-20 12:37:40.227
2,QFR,22,F,22,EI of corp,,,,,,...,,,N,,,,,seabo001,ITEMDICT REV,2006-09-20 12:37:46.843
3,QFR,31,F,31,Corp status active/discontinued/merged,,,,,,...,,,N,,,,,seabo001,ITEMDICT REV,2006-09-20 12:37:54.996
4,QFR,34,F,34,Discontinued Date,,,,,,...,,,N,,,,,seabo001,ITEMDICT REV,2006-09-20 12:38:02.600


In [94]:
abs_defs = qfr[['LABEL', 'DESCRP1', 'QUESTION', 'SFELBL1']]
abs_defs['definition'] = abs_defs[['LABEL', 'DESCRP1', 'QUESTION', 'SFELBL1']].astype(str).apply(lambda row: ' '.join(row.values), axis=1)
abs_defs['Legacy Variable Name'] = qfr['Legacy Variable Name']
new_definition_embeddings = model.encode(abs_defs['definition'], convert_to_tensor=True)

# Encode existing definitions
definition_embeddings = model.encode(mdr_defs['definition'].tolist(), convert_to_tensor=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abs_defs['definition'] = abs_defs[['LABEL', 'DESCRP1', 'QUESTION', 'SFELBL1']].astype(str).apply(lambda row: ' '.join(row.values), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abs_defs['Legacy Variable Name'] = qfr['Legacy Variable Name']


In [95]:
results = []

# Calculate cosine similarities for each new description
for i, new_embedding in enumerate(new_definition_embeddings):
    similarities = util.pytorch_cos_sim(new_embedding, definition_embeddings)
    most_similar_idx = similarities.argmax().item()
    similarity_score = similarities[0][most_similar_idx].item()
    similarity_score = round(similarity_score * 100, 1)
    results.append({
        'legacy_name_from_qfr': abs_defs['Legacy Variable Name'][i],
        'most_similar_name_from_MDR': mdr_defs['name'][most_similar_idx],
        'similarity_score': similarity_score
    })

# Create a new DataFrame from the results
results_df = pd.DataFrame(results)

# Display the new DataFrame
print(results_df)

    legacy_name_from_qfr most_similar_name_from_MDR  similarity_score
0                  00001            NAICS_WRTIN_TXT              49.3
1                  00021              YEAR_END_DATE              47.3
2                  00022                    EIN_NUM              76.3
3                  00031                OPSTAT_DATE              56.6
4                  00034                OPSTAT_DATE              63.1
..                   ...                        ...               ...
144                 CPAY         EXPS_INTEREST_PAID              38.1
145                 PPAY                PAY_QTR1_PY              56.5
146                P2PAY                PAY_QTR1_PY              57.0
147                P3PAY         PAY_QTR1_PRDWRK_PY              58.6
148                NDNET                NET_REV_LSD              46.6

[149 rows x 3 columns]


In [96]:
abs_defs.head()

Unnamed: 0,LABEL,DESCRP1,QUESTION,SFELBL1,definition,Legacy Variable Name
0,Is this your Company's Industry -YES/NO,Is this your company's industry - yes/no,Is this your company's industry - yes/no,Does the industry description describe your ...,Is this your Company's Industry -YES/NO Is thi...,1
1,Closing date,Closing date,Closing date,Annual closing date (MMDD) ...,Closing date Closing date Closing date Annual ...,21
2,EI of corp,Employer Identification Number,Employer Identification Number,Federal Employer Identification Number (EI) ...,EI of corp Employer Identification Number Empl...,22
3,Corp status active/discontinued/merged,Corporation Status Active/Discontinued/Merged,Corporation Status Active/Discontinued/Merged,Corporation Status ...,Corp status active/discontinued/merged Corpora...,31
4,Discontinued Date,Discontinued Date,Discontinued Date,Corporation Status of DISCONTINUED Date ...,Discontinued Date Discontinued Date Discontinu...,34


In [97]:
mdr_defs.head()

Unnamed: 0,name,definition
0,BENEFIT_HEALTH,Employer's cost for health insurance
1,BENEFIT_HEALTH_PY,Employer's cost for health insurance Prior Year
2,BENEFIT_OTH,Employer's Cost for other benefits
3,BENEFIT_OTH_PY,Employer's Cost for other benefits Prior Year
4,BENEFIT_PENSION_BEN,Employer's Cost for defined benefit pension plans


In [98]:
sorted_df = results_df.sort_values(by=['similarity_score'], ascending=[False])
sorted_df['similarity_score'] = sorted_df['similarity_score'].apply(lambda x: str(x) + '%')
sorted_df.rename(columns={'legacy_name_from_qfr': 'QFR - Legacy Variable', 'most_similar_name_from_MDR': 'MDR - name'}, inplace=True)

print(sorted_df.head(15))


    QFR - Legacy Variable            MDR - name similarity_score
2                   00022               EIN_NUM            76.3%
59                  00218              DEPR_VAL            71.7%
20                  00101          RCPT_TOT_VAL            69.1%
55                  00214              INV_STAT            65.9%
8                   00051      AFFIL_OWNER_GT50            64.0%
24                  00105     EXPS_INTEREST_VAL            63.7%
22                  00103        EXPS_OTHER_VAL            63.1%
4                   00034           OPSTAT_DATE            63.1%
29                  00110  EXPS_OTHER_WRTIN_TXT            62.2%
21                  00102              DEPR_VAL            61.9%
6                   00041      AFFIL_OWNER_GT50            60.5%
121                  RPTE         YEAR_END_DATE            59.0%
23                  00104      PROFIT_LOSS_DVAL            58.7%
147                 P3PAY    PAY_QTR1_PRDWRK_PY            58.6%
26                  00107

In [99]:
print(sorted_df.tail(15))

    QFR - Legacy Variable                    MDR - name similarity_score
110                   CB8                SMOKE-VARIABLE            33.4%
124                 XRSPC                GEO_STATE_FIPS            33.2%
108                 CB118                  FGINFO_PHONE            33.1%
61                  00220                      DEPR_VAL            33.0%
45                  00204    JOINT_COMPANY_ADDR_STATE_4            33.0%
16                  00091                        COMPID            32.7%
140                 D327S  NC-9902C_TYPOP_SELFDSG_WRTIN            32.3%
39                  00120      JOINT_COMPANY_ADDR_ZIP_5            32.0%
109                 CB119                  FGINFO_PHONE            31.5%
17                  00092                PREV_OWNER_EIN            30.4%
139                 D223S              VALUE_ADDED_DVAL            30.1%
138                 D223L              VALUE_ADDED_DVAL            30.0%
19                  00094       AIES_TYPOP_WHSL_OLD

In [102]:
sorted_df.to_csv('results_for_single_column_between_QFR_and_new_MDR.csv')