In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Synthetic case records; a lab is assigned to each row further down.
SYNTHETIC_CSV_PATH = '/content/d_all_synth.csv'
df_synthetic = pd.read_csv(SYNTHETIC_CSV_PATH)

In [None]:
# Lab reference table used as the candidate pool for matching.
LAB_CSV_PATH = '/content/d_lab_400.csv'
df_lab = pd.read_csv(LAB_CSV_PATH)

In [None]:
!pip install tqdm



In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Register tqdm's pandas integration; assign_final_lab_match below calls
# DataFrame.progress_apply, which is only available after this call.
tqdm.pandas()

def retrieve_id_postal_for_match(postal_code, spec, bsns, df_lab,
                                 step=100, max_radius=6130, random_state=None):
    """
    Pick one eligible lab for a case, preferring labs with a nearby postal code.

    A lab is eligible when its ``LAB_SPECIALIZATION`` equals ``spec`` and its
    ``LAB_TYPE`` equals ``bsns``. The search window around ``postal_code`` grows
    in multiples of ``step`` (numeric difference of postal codes, bounds
    inclusive) and stops at the first window that contains at least one
    eligible lab; one lab is then drawn uniformly at random from that window.
    If no window up to ``max_radius`` contains a lab, a random lab from the
    whole eligible pool is returned instead.

    Parameters
    ----------
    postal_code : int-like or missing
        Postal code of the case; converted with ``int()``.
    spec, bsns : scalar or missing
        Required lab specialization and lab type.
    df_lab : pandas.DataFrame
        Must contain ``LAB_ID``, ``LAB_POSTAL_CODE``, ``LAB_SPECIALIZATION``,
        ``LAB_TYPE`` and the numeric helper column ``LAB_POSTAL_CODE_INT``.
        It is not modified.
    step : int, default 100
        Width by which the search window grows. Must be positive.
    max_radius : int, default 6130
        Largest window considered (with ``step=100`` the last window is 6100).
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a shared
        ``numpy.random.Generator``/``RandomState`` for reproducible runs; a
        fixed integer would re-seed identically on every call.

    Returns
    -------
    dict
        ``{'FINAL_LAB_ID': [id], 'FINAL_LAB_POSTAL_CODE': [code]}`` with
        single-element lists. Values are ``np.nan`` when any input is missing
        and ``-1`` / ``0`` when no lab is eligible at all.

    Raises
    ------
    ValueError
        If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be a positive number")

    if pd.isna(postal_code) or pd.isna(spec) or pd.isna(bsns):
        return {'FINAL_LAB_ID': [np.nan], 'FINAL_LAB_POSTAL_CODE': [np.nan]}

    postal_code = int(postal_code)

    # Boolean indexing already yields a new frame and nothing below writes to
    # it, so no defensive .copy() is needed on this per-row hot path.
    eligible = df_lab[
        (df_lab['LAB_SPECIALIZATION'] == spec) &
        (df_lab['LAB_TYPE'] == bsns)
    ]

    if eligible.empty:
        return {'FINAL_LAB_ID': [-1], 'FINAL_LAB_POSTAL_CODE': [0]}

    # Instead of re-filtering the pool once per window, compute every distance
    # once and derive the first non-empty window from the nearest lab.
    distance = (eligible['LAB_POSTAL_CODE_INT'] - postal_code).abs()
    nearest = distance.min()

    candidates = eligible  # fallback: any eligible lab, regardless of distance
    if pd.notna(nearest):
        # Smallest multiple of `step` (at least one step) that reaches the
        # nearest lab; every lab inside that window is an equally good match.
        radius = max(step, int(np.ceil(nearest / step)) * step)
        if radius <= max_radius:
            candidates = eligible[distance <= radius]

    chosen = (
        candidates[['LAB_ID', 'LAB_POSTAL_CODE']]
        .sample(n=1, random_state=random_state)
        .iloc[0]
    )

    return {
        'FINAL_LAB_ID': [int(chosen['LAB_ID'])],
        'FINAL_LAB_POSTAL_CODE': [int(chosen['LAB_POSTAL_CODE'])]
    }


def assign_final_lab_match(d, df_lab):
    """
    Assign a lab to every row of ``d`` via ``retrieve_id_postal_for_match``.

    Parameters
    ----------
    d : pandas.DataFrame
        Case table with ``POSTAL_CODE``, ``REQUIRED_SPEC`` and
        ``REQUIRED_LAB_TYPE`` columns. Modified in place: ``POSTAL_CODE`` is
        coerced to nullable ``Int64`` (unparseable values become ``<NA>``) and
        the columns ``LAB_ID`` and ``LAB_POSTAL_CODE`` are added or overwritten.
    df_lab : pandas.DataFrame
        Lab table with ``LAB_ID``, ``LAB_POSTAL_CODE``, ``LAB_SPECIALIZATION``
        and ``LAB_TYPE``. Left untouched; matching runs on a private copy.

    Returns
    -------
    pandas.DataFrame
        The same object as ``d``. ``LAB_ID`` and ``LAB_POSTAL_CODE`` are
        nullable ``Int64`` so rows whose inputs were missing carry ``<NA>``
        instead of making the integer cast fail.
    """
    # Zero-padding before a numeric parse does nothing for digit strings and
    # would turn an empty string into postal code 0, so parse directly.
    d['POSTAL_CODE'] = pd.to_numeric(
        d['POSTAL_CODE'].astype(str), errors='coerce'
    ).astype('Int64')

    # Work on a copy so the caller's lab table keeps its dtypes and never ends
    # up with a stray helper column if matching raises part-way through.
    labs = df_lab.copy()
    labs['LAB_POSTAL_CODE'] = labs['LAB_POSTAL_CODE'].astype(str).str.zfill(5)
    labs['LAB_POSTAL_CODE_INT'] = labs['LAB_POSTAL_CODE'].astype(int)

    def match_row(row):
        return retrieve_id_postal_for_match(
            row['POSTAL_CODE'],
            row['REQUIRED_SPEC'],
            row['REQUIRED_LAB_TYPE'],
            labs
        )

    if len(d) == 0:
        # A row-wise apply on an empty frame does not yield per-row results.
        matches = []
    else:
        # progress_apply only exists after tqdm.pandas() has been called; fall
        # back to a plain apply so the function also works without it.
        row_apply = getattr(d, 'progress_apply', d.apply)
        matches = list(row_apply(match_row, axis=1))

    # Pull the single element out of each result list directly; expanding the
    # dicts with apply(pd.Series) builds one Series per row and is very slow.
    d['LAB_ID'] = pd.array(
        [match['FINAL_LAB_ID'][0] for match in matches], dtype='Int64'
    )
    d['LAB_POSTAL_CODE'] = pd.array(
        [match['FINAL_LAB_POSTAL_CODE'][0] for match in matches], dtype='Int64'
    )

    return d

In [12]:
# Assign a lab to every synthetic case. This is the expensive step: the run
# recorded below processed 1,792,615 rows in roughly 1h53m.
df_synthetic = assign_final_lab_match(df_synthetic, df_lab)

100%|██████████| 1792615/1792615 [1:53:16<00:00, 263.75it/s]


In [13]:
# Spot-check one row to confirm LAB_ID and LAB_POSTAL_CODE were populated.
df_synthetic.head(1)

Unnamed: 0.1,Unnamed: 0,RENDERING_NPI,POSTAL_CODE,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE_ENCODED,AGE_GROUP_ENCODED,PREV,TXMT,ADV,CASE_ID,REQUIRED_SPEC,EXPEDITED,REQUIRED_LAB_TYPE,LAB_ID,LAB_POSTAL_CODE
0,0,1952483117,94538,0,0,0,1,0,0,1379044,1,0,0,488,94461


In [20]:
# Remove the leftover index columns from earlier CSV round-trips. Both are
# listed because the preview above still shows 'Unnamed: 0.1' while the column
# listing below no longer does. Reassigning with errors='ignore' (instead of
# inplace=True) keeps this cell safe to re-run.
df_synthetic = df_synthetic.drop(
    columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore'
)

In [21]:
# Confirm the column set that will be written to the training file.
df_synthetic.columns

Index(['RENDERING_NPI', 'POSTAL_CODE', 'DELIVERY_SYSTEM_ENCODED',
       'PROVIDER_TYPE_ENCODED', 'AGE_GROUP_ENCODED', 'PREV', 'TXMT', 'ADV',
       'CASE_ID', 'REQUIRED_SPEC', 'EXPEDITED', 'REQUIRED_LAB_TYPE', 'LAB_ID',
       'LAB_POSTAL_CODE'],
      dtype='object')

In [None]:
# index=False: the default RangeIndex carries no information and would come
# back as an extra 'Unnamed: 0' column the next time this file is read.
df_synthetic.to_csv('/content/d_all_synthetic_training.csv', index=False)