In [5]:
import pandas as pd

df_main = pd.read_csv('./data/csv/d_training_main.csv')
df_lab = pd.read_csv('./data/artificial_lab_data.csv')

In [13]:
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15108 entries, 0 to 15107
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   RENDERING_NPI            15108 non-null  int64  
 1   POSTAL_CODE              15108 non-null  int64  
 2   DELIVERY_SYSTEM_ENCODED  15108 non-null  int64  
 3   PROVIDER_TYPE_ENCODED    15108 non-null  int64  
 4   AGE_GROUP_ENCODED        15108 non-null  int64  
 5   ADV_USER_CNT             15108 non-null  float64
 6   ADV_SVC_CNT              15108 non-null  float64
 7   PREV_USER_CNT            15108 non-null  float64
 8   PREV_SVC_CNT             15108 non-null  float64
 9   TXMT_USER_CNT            15108 non-null  float64
 10  TXMT_SVC_CNT             15108 non-null  float64
 11  EXAM_USER_CNT            15108 non-null  float64
 12  EXAM_SVC_CNT             15108 non-null  float64
 13  PREV_TO_EXAM_RATIO       15108 non-null  float64
 14  TXMT_TO_EXAM_RATIO    

In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

TARGET_COLUMN = 'LAB_ID'

EXCLUDE_COLUMNS = [
    'RENDERING_NPI', 
    TARGET_COLUMN
]

features = [col for col in df_main.columns if col not in EXCLUDE_COLUMNS]

X = df_main[features].copy()
y = df_main[TARGET_COLUMN].copy()

postal_encoder = LabelEncoder()
X['POSTAL_CODE_ENCODED'] = postal_encoder.fit_transform(X['POSTAL_CODE'])
X.drop(columns=['POSTAL_CODE'], inplace=True) 

lab_postal_encoder = LabelEncoder()
X['LAB_POSTAL_CODE_ENCODED'] = lab_postal_encoder.fit_transform(X['LAB_POSTAL_CODE'])
X.drop(columns=['LAB_POSTAL_CODE'], inplace=True) 

X_train, X_test, y_train, y_test_original = train_test_split(
    X, 
    y,
    test_size=0.2, 
    random_state=42
)

train_label_encoder = LabelEncoder()
y_train_encoded = train_label_encoder.fit_transform(y_train)

try:
    y_test_encoded = train_label_encoder.transform(y_test_original)
except ValueError as e:

    print("Warning: Test set contains labels not in training set. Handling manually.")
    mapping = {label: index for index, label in enumerate(train_label_encoder.classes_)}
    y_test_encoded = np.array([mapping.get(label, -1) for label in y_test_original])

n_classes_train = len(train_label_encoder.classes_)

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=n_classes_train,
    n_estimators=100,
    learning_rate=0.1,
    n_jobs=-1,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

print(f"\nXGBoost Training on {n_classes_train} unique LAB_IDs")
xgb_model.fit(X_train, y_train_encoded) 
print("training complete")

valid_test_indices = y_test_encoded != -1
y_test_valid = y_test_encoded[valid_test_indices]
X_test_valid = X_test.iloc[valid_test_indices]

y_proba = xgb_model.predict_proba(X_test_valid) 
y_pred = xgb_model.predict(X_test_valid)

accuracy = accuracy_score(y_test_valid, y_pred)

print(f"accuracy: {accuracy:.4f}")
# print(f"Shape of Probability Output (y_proba): {y_proba.shape}")


XGBoost Training on 2689 unique LAB_IDs


Parameters: { "use_label_encoder" } are not used.



training complete
accuracy: 0.2959


In [14]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import joblib
test_case_data = {
    # Direct Input Features:
    'RENDERING_NPI': 1234567890,           # Placeholder NPI (will be dropped)
    'POSTAL_CODE': '90210',                # Clinic Location (will be encoded)
    'LAB_POSTAL_CODE': '90001',            # Default Lab Location (will be encoded)
    'DELIVERY_SYSTEM_ENCODED': 0,
    'PROVIDER_TYPE_ENCODED': 1,
    'REQUIRED_BSNS_MODEL': 0,              # Full-Service Lab (Direct input)

    # Imputed/Calculated Features:
    'AGE_GROUP_ENCODED': 2,                # Imputed default
    
    # Imputed Raw Service Counts (Assuming ADV is the focus, others are minimal)
    'ADV_USER_CNT': 10,
    'ADV_SVC_CNT': 5,
    'PREV_USER_CNT': 0,
    'PREV_SVC_CNT': 0,
    'TXMT_USER_CNT': 0,
    'TXMT_SVC_CNT': 0,
    'EXAM_USER_CNT': 10,                   # Baseline for denominator
    'EXAM_SVC_CNT': 5,                     # Baseline for denominator

    # Calculated Ratios (Must be calculated by the backend before encoding)
    'PREV_TO_EXAM_RATIO': 0.0,             # 0 / 5
    'TXMT_TO_EXAM_RATIO': 0.0,             # 0 / 5
    'ADV_TO_EXAM_RATIO': 1.0,              # 5 / 5
    
    # Inferred Specialization (From 'ADV' service requirement)
    'REQUIRED_SPEC': 2,                    # Prosthodontic (Inferred from ADV service need)
}

X_new_raw = pd.DataFrame([test_case_raw_data])

X_new = X_new_raw.drop(columns=['RENDERING_NPI']).copy()

X_new['POSTAL_CODE_ENCODED'] = postal_encoder.transform(X_new['POSTAL_CODE'])
X_new.drop(columns=['POSTAL_CODE'], inplace=True)

X_new['LAB_POSTAL_CODE_ENCODED'] = lab_postal_encoder.transform(X_new['LAB_POSTAL_CODE'])
X_new.drop(columns=['LAB_POSTAL_CODE'], inplace=True)

X_new_processed = X_new.reindex(columns=X_train.columns, fill_value=0)


probabilities = xgb_model.predict_proba(X_new_processed)

top_5_indices = np.argsort(probabilities[0])[-5:][::-1]

top_5_lab_ids = train_label_encoder.inverse_transform(top_5_indices)
top_5_probabilities = probabilities[0][top_5_indices]


print("\nRecommendation Results for New Clinic Case")
print(f"Top 5 Recommended LAB_IDs (Out of {n_classes_train} total):\n")

for lab_id, prob in zip(top_5_lab_ids, top_5_probabilities):
    print(f"LAB_ID {lab_id:<10}: Probability = {prob * 100:.2f}%")

df_lab[df_lab['LAB_ID'] == top_5_lab_ids[0]]

ValueError: y contains previously unseen labels: [np.int64(94210)]