In [1]:
import pandas as pd

df_main = pd.read_csv('./data/csv/d_training_main.csv')
df_lab = pd.read_csv('./data/artificial_lab_data.csv')

In [18]:
len(df_main['RENDERING_NPI'].unique())

8149

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

TARGET_COLUMN = 'LAB_ID'

EXCLUDE_COLUMNS = [
    'RENDERING_NPI', 
    TARGET_COLUMN
]

features = [col for col in df_main.columns if col not in EXCLUDE_COLUMNS]

X = df_main[features].copy()
y = df_main[TARGET_COLUMN].copy()

postal_encoder = LabelEncoder()
X['POSTAL_CODE_ENCODED'] = postal_encoder.fit_transform(X['POSTAL_CODE'])
X.drop(columns=['POSTAL_CODE'], inplace=True) 

lab_postal_encoder = LabelEncoder()
X['LAB_POSTAL_CODE_ENCODED'] = lab_postal_encoder.fit_transform(X['LAB_POSTAL_CODE'])
X.drop(columns=['LAB_POSTAL_CODE'], inplace=True) 

X_train, X_test, y_train, y_test_original = train_test_split(
    X, 
    y,
    test_size=0.2, 
    random_state=42
)

train_label_encoder = LabelEncoder()
y_train_encoded = train_label_encoder.fit_transform(y_train)

try:
    y_test_encoded = train_label_encoder.transform(y_test_original)
except ValueError as e:

    print("Warning: Test set contains labels not in training set. Handling manually.")
    mapping = {label: index for index, label in enumerate(train_label_encoder.classes_)}
    y_test_encoded = np.array([mapping.get(label, -1) for label in y_test_original])

n_classes_train = len(train_label_encoder.classes_)

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=n_classes_train,
    n_estimators=100,
    learning_rate=0.1,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

print(f"\nXGBoost Training on {n_classes_train} unique LAB_IDs")
xgb_model.fit(X_train, y_train_encoded) 
print("training complete")

valid_test_indices = y_test_encoded != -1
y_test_valid = y_test_encoded[valid_test_indices]
X_test_valid = X_test.iloc[valid_test_indices]

y_proba = xgb_model.predict_proba(X_test_valid) 
y_pred = xgb_model.predict(X_test_valid)

accuracy = accuracy_score(y_test_valid, y_pred)

print(f"accuracy: {accuracy:.4f}")


XGBoost Training on 2689 unique LAB_IDs


Parameters: { "use_label_encoder" } are not used.



training complete
accuracy: 0.2959


Just realized the low accuracy could be because of not enough instances present for each NPI to train on, there's hardly 1-2/NPI that's then being classified into 10,000 different laboratories, which is highly imbalanced.

**Alternative Approach:** \
Instead of taking total service counts for each NPI, we can separate each instance into the type of service required by choosing between ADV / TXMT / PREV using binary encoding, that can then be classified into the most suitable lab_id based on the clinic's postal code and the type of service required and business model. So each instance would represent only one entry for one service under one NPI, instead of taking a service count we would divide this into individual entries of service required and assigned labs.

In [2]:
df_main.columns

Index(['RENDERING_NPI', 'POSTAL_CODE', 'DELIVERY_SYSTEM_ENCODED',
       'PROVIDER_TYPE_ENCODED', 'AGE_GROUP_ENCODED', 'ADV_USER_CNT',
       'ADV_SVC_CNT', 'PREV_USER_CNT', 'PREV_SVC_CNT', 'TXMT_USER_CNT',
       'TXMT_SVC_CNT', 'EXAM_USER_CNT', 'EXAM_SVC_CNT', 'PREV_TO_EXAM_RATIO',
       'TXMT_TO_EXAM_RATIO', 'ADV_TO_EXAM_RATIO', 'REQUIRED_SPEC',
       'REQUIRED_BSNS_MODEL', 'LAB_ID', 'LAB_POSTAL_CODE'],
      dtype='object')

In [21]:
import pandas as pd
import numpy as np
from faker import Faker
import random

fake = Faker('en_US')

df_main['POSTAL_CODE'] = df_main['POSTAL_CODE'].astype(str).str.zfill(5)


npi_base_features = df_main[[
    'RENDERING_NPI', 
    'POSTAL_CODE', 
    'DELIVERY_SYSTEM_ENCODED',
    'PROVIDER_TYPE_ENCODED'
]].drop_duplicates().reset_index(drop=True)

MIN_INSTANCES = 100
MAX_INSTANCES = 300

def generate_npi_instances(row):
    """Generates a random number of case entries for a single NPI."""
    n_entries = random.randint(MIN_INSTANCES, MAX_INSTANCES)

    data = {col: [row[col]] * n_entries for col in npi_base_features.columns}
    
    data['AGE_GROUP_ENCODED'] = np.random.choice(
        [0, 1],
        size=n_entries, 
        p=[0.5, 0.5]
    )

    service_indicators = np.zeros((n_entries, 3), dtype=int)
    
    random_indices = np.random.randint(0, 3, size=n_entries) 
    service_indicators[np.arange(n_entries), random_indices] = 1

    data['PREV'] = service_indicators[:, 0]
    data['TXMT'] = service_indicators[:, 1]
    data['ADV'] = service_indicators[:, 2]

    return pd.DataFrame(data)

list_of_dfs = npi_base_features.apply(generate_npi_instances, axis=1).tolist()
df_synthetic = pd.concat(list_of_dfs, ignore_index=True)

df_synthetic['CASE_ID'] = range(1, len(df_synthetic) + 1)

df_synthetic = df_synthetic.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Total rows created: {len(df_synthetic):,}")
print(f"Number of unique NPIs used: {df_synthetic['RENDERING_NPI'].nunique()}")

Total rows created: 1,793,602
Number of unique NPIs used: 8149


In [None]:
df_synthetic.head(5)

Unnamed: 0,RENDERING_NPI,POSTAL_CODE,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE_ENCODED,AGE_GROUP_ENCODED,PREV,TXMT,ADV,CASE_ID
0,1407801871,90248,0,0,0,0,1,0,579359
1,1437184769,92071,0,1,1,0,1,0,1601414
2,1255495354,90266,0,0,1,0,1,0,368706
3,1710016514,90723,0,0,1,0,0,1,1016175
4,1649406638,90035,0,0,1,0,0,1,918446


In [26]:
def calculate_required_spec(d):
    ratio_cols = ["PREV", "TXMT", "ADV"]
    
    d['DOMINANT_RATIO'] = d[ratio_cols].idxmax(axis=1)

    encoded_vals = {
        'PREV': 1,
        'TXMT': 0,
        'ADV': 2
    }

    d['REQUIRED_SPEC'] = d['DOMINANT_RATIO'].map(encoded_vals)

    d.drop(columns=['DOMINANT_RATIO'], inplace=True)

    return d

In [None]:
def calculate_required_bsns_model(d):
    
    svc_count_cols = ['PREV_SVC_CNT', 'TXMT_SVC_CNT', 'ADV_SVC_CNT', 'EXAM_SVC_CNT']

    for col in svc_count_cols:
        d[col] = pd.to_numeric(d[col], errors='coerce').fillna(0)
        
    d['TOTAL_SVC_VOL'] = d[svc_count_cols].sum(axis=1)

    lower_q = 0.33
    higher_q = 1 - lower_q
    LOW_VOL_T = d['TOTAL_SVC_VOL'].quantile(lower_q) 
    HIGH_VOL_T = d['TOTAL_SVC_VOL'].quantile(higher_q)

    
    d['REQUIRED_BSNS_MODEL'] = np.where(

        d['TOTAL_SVC_VOL'] >= HIGH_VOL_T,
        
        np.where(

            (d['REQUIRED_SPEC'] == 2) | (d['REQUIRED_SPEC'] == 1) | (d['REQUIRED_SPEC'] == 0),
            1,
            0
        )
    )

    d.drop(columns=['TOTAL_SVC_VOL'], inplace=True)

    return d

In [None]:
df_synthetic = calculate_required_spec(df_synthetic)

Unnamed: 0,RENDERING_NPI,POSTAL_CODE,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE_ENCODED,AGE_GROUP_ENCODED,PREV,TXMT,ADV,CASE_ID,REQUIRED_SPEC
0,1407801871,90248,0,0,0,0,1,0,579359,0
1,1437184769,92071,0,1,1,0,1,0,1601414,0
2,1255495354,90266,0,0,1,0,1,0,368706,0
3,1710016514,90723,0,0,1,0,0,1,1016175,2
4,1649406638,90035,0,0,1,0,0,1,918446,2


In [32]:
len(df_synthetic[df_synthetic['RENDERING_NPI'] == 1407801871])

275

In [38]:
df_synthetic['TXMT'].unique()

array([1, 0])

In [None]:
def assign_expedited(d, expedited_rate=0.21):
    # Use np.random.choice to randomly assign 1 (True) or 0 (False)
    # The probability p=[0.79, 0.21] means 85% of cases are standard, 15% are expedited.
    d['Expedited'] = np.random.choice(
        [0, 1], 
        size=len(d), 
        p=[1 - expedited_rate, expedited_rate]
    ).astype(int)
    
    return d

In [None]:
def assign_lab_type(d):
    REST = 0
    PREV = 1
    PROS = 2
    
    FULL_SERVICE = 0
    SPECIALTY = 1
    MILLING_CENTER = 2

    d['Expedited'] = pd.to_numeric(d['Expedited'], errors='coerce').fillna(0).astype(int)
    
    conditions = [

        (d['Expedited'] == 1) & ((d['REQUIRED_SPEC'] == REST) | (d['REQUIRED_SPEC'] == PROS)),
        
        (d['Expedited'] == 0) & (d['REQUIRED_SPEC'] == PROS),

        (d['Expedited'] == 0) & (d['REQUIRED_SPEC'] == REST),

        (d['REQUIRED_SPEC'] == PREV)
    ]

    choices = [
        MILLING_CENTER,     # P1: REST/PROS + Expedited --> Milling Center (2)
        SPECIALTY,          # P2: PROS + Standard Speed --> Specialty Lab (1)
        FULL_SERVICE,       # P3: REST + Standard Speed --> Full-Service Lab (0)
        FULL_SERVICE        # P4: PREVENTIVE (Expedited or not) --> Full-Service Lab (0)
    ]

    d['REQUIRED_LAB_TYPE'] = np.select(
        conditions, 
        choices, 
        default=FULL_SERVICE
    )
    
    return d

In [47]:
df_synthetic = assign_expedited(df_synthetic)
df_synthetic = assign_lab_type(df_synthetic)

In [48]:
df_synthetic.head(100)

Unnamed: 0,RENDERING_NPI,POSTAL_CODE,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE_ENCODED,AGE_GROUP_ENCODED,PREV,TXMT,ADV,CASE_ID,REQUIRED_SPEC,Expedited,REQUIRED_LAB_TYPE
0,1407801871,90248,0,0,0,0,1,0,579359,0,0,0
1,1437184769,92071,0,1,1,0,1,0,1601414,0,1,2
2,1255495354,90266,0,0,1,0,1,0,368706,0,1,2
3,1710016514,90723,0,0,1,0,0,1,1016175,2,1,2
4,1649406638,90035,0,0,1,0,0,1,918446,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1750459723,90025,2,0,1,0,0,1,1086030,2,0,1
96,1487665030,92505,0,0,0,0,0,1,690027,2,0,1
97,1457482069,92324,0,0,0,0,1,0,650457,0,0,0
98,1104088178,91789,0,0,1,0,0,1,150977,2,0,1
