In [41]:
import pandas as pd

# Load the two input tables from local CSV files: the main training table
# (d_training_main.csv) and the lab table (artificial_lab_data.csv).
df_main = pd.read_csv('./data/csv/d_training_main.csv')
df_lab = pd.read_csv('./data/artificial_lab_data.csv')

In [42]:
# Number of distinct RENDERING_NPI values in the main table
# (dropna=False keeps the count identical to len(unique())).
df_main['RENDERING_NPI'].nunique(dropna=False)

8149

The low accuracy is probably caused by having too few training instances per NPI: there are only 1–2 rows per NPI, and these are then classified into roughly 10,000 different laboratories, which makes the problem highly imbalanced.

**Alternative Approach:** \
Instead of using the total service counts for each NPI, we can split the data into individual instances, each flagged with the type of service required (ADV / TXMT / PREV) using binary indicator columns. Each instance can then be classified into the most suitable lab_id based on the clinic's postal code, the type of service required, and the business model. Each instance therefore represents a single service entry under one NPI: rather than one row of service counts per NPI, we get one row per required service together with its assigned lab.

In [43]:
# Show the column names of the main training table.
df_main.columns

Index(['RENDERING_NPI', 'POSTAL_CODE', 'DELIVERY_SYSTEM_ENCODED',
       'PROVIDER_TYPE_ENCODED', 'AGE_GROUP_ENCODED', 'ADV_USER_CNT',
       'ADV_SVC_CNT', 'PREV_USER_CNT', 'PREV_SVC_CNT', 'TXMT_USER_CNT',
       'TXMT_SVC_CNT', 'EXAM_USER_CNT', 'EXAM_SVC_CNT', 'PREV_TO_EXAM_RATIO',
       'TXMT_TO_EXAM_RATIO', 'ADV_TO_EXAM_RATIO', 'REQUIRED_SPEC',
       'REQUIRED_BSNS_MODEL', 'LAB_ID', 'LAB_POSTAL_CODE'],
      dtype='object')

In [44]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Seed both random sources used by generate_npi_instances (the stdlib `random`
# module for the per-NPI case count, NumPy's global RNG for the sampled
# columns) so that a fresh Restart & Run All reproduces the same dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

fake = Faker('en_US')

# Normalise postal codes to zero-padded strings of at least 5 characters.
df_main['POSTAL_CODE'] = df_main['POSTAL_CODE'].astype(str).str.zfill(5)


# One row per distinct combination of the NPI-level columns below; every
# synthetic case generated for an NPI repeats these values.
npi_base_features = df_main[[
    'RENDERING_NPI', 
    'POSTAL_CODE', 
    'DELIVERY_SYSTEM_ENCODED',
    'PROVIDER_TYPE_ENCODED'
]].drop_duplicates().reset_index(drop=True)

# Inclusive bounds for the number of synthetic cases generated per base row.
MIN_INSTANCES = 100
MAX_INSTANCES = 300

def generate_npi_instances(row):
    """Build a frame of randomly generated case rows for one base-feature row.

    The number of cases is drawn uniformly from MIN_INSTANCES..MAX_INSTANCES
    (inclusive). Every case repeats the values of ``row`` for each column of
    ``npi_base_features`` and additionally gets:

    * ``AGE_GROUP_ENCODED`` - 0 or 1 with equal probability;
    * ``PREV`` / ``TXMT`` / ``ADV`` - one-hot indicators, exactly one of which
      is 1 per case, chosen uniformly at random.

    Returns a new ``pandas.DataFrame`` with one row per generated case.
    """
    case_count = random.randint(MIN_INSTANCES, MAX_INSTANCES)

    # Repeat the NPI-level values once per generated case.
    columns = {}
    for base_col in npi_base_features.columns:
        columns[base_col] = [row[base_col]] * case_count

    columns['AGE_GROUP_ENCODED'] = np.random.choice(
        [0, 1], size=case_count, p=[0.5, 0.5]
    )

    # Draw one service slot (0, 1 or 2) per case and expand it into three
    # mutually exclusive 0/1 indicator columns.
    chosen_slot = np.random.randint(0, 3, size=case_count)
    for slot, service_col in enumerate(('PREV', 'TXMT', 'ADV')):
        columns[service_col] = (chosen_slot == slot).astype(int)

    return pd.DataFrame(columns)

# Generate one frame of synthetic cases per base row.
list_of_dfs = npi_base_features.apply(generate_npi_instances, axis=1).tolist()

# Stack the per-NPI frames, number the cases 1..N in generation order, then
# shuffle the rows (fixed random_state) and renumber the index.
df_synthetic = (
    pd.concat(list_of_dfs, ignore_index=True)
    .assign(CASE_ID=lambda stacked: range(1, len(stacked) + 1))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

total_rows = len(df_synthetic)
npi_count = df_synthetic['RENDERING_NPI'].nunique()
print(f"Total rows created: {total_rows:,}")
print(f"Number of unique NPIs used: {npi_count}")

Total rows created: 1,792,615
Number of unique NPIs used: 8149


In [45]:
# Preview the first five synthetic cases (head() defaults to 5 rows).
df_synthetic.head()

Unnamed: 0,RENDERING_NPI,POSTAL_CODE,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE_ENCODED,AGE_GROUP_ENCODED,PREV,TXMT,ADV,CASE_ID
0,1952483117,94538,0,0,0,1,0,0,1379044
1,1669575783,94601,0,1,0,0,1,0,1673754
2,1780781450,90002,0,0,0,0,1,0,1131355
3,1689050296,92842,0,1,0,1,0,0,1679072
4,1134573462,92026,0,0,1,1,0,0,203204


In [46]:
def calculate_required_spec(d):
    """Add a ``REQUIRED_SPEC`` code derived from the PREV/TXMT/ADV columns.

    For every row, the column holding the largest value among ``PREV``,
    ``TXMT`` and ``ADV`` (first one wins on ties, in that order) is mapped to
    a code: PREV -> 1, TXMT -> 0, ADV -> 2.

    ``d`` is modified in place and also returned. A temporary
    ``DOMINANT_RATIO`` column is created and removed again along the way.
    """
    # Insertion order doubles as the tie-break order used by idxmax.
    spec_code_by_service = {'PREV': 1, 'TXMT': 0, 'ADV': 2}

    d['DOMINANT_RATIO'] = d[list(spec_code_by_service)].idxmax(axis=1)

    # pop() removes the helper column from `d` while handing back its values.
    d['REQUIRED_SPEC'] = d.pop('DOMINANT_RATIO').map(spec_code_by_service)

    return d

In [47]:
# Attach the REQUIRED_SPEC code to every synthetic case.
df_synthetic = df_synthetic.pipe(calculate_required_spec)

In [48]:
# Spot check: how many synthetic rows belong to this one NPI.
int((df_synthetic['RENDERING_NPI'] == 1407801871).sum())

259

In [49]:
# Distinct values present in the TXMT indicator column.
pd.unique(df_synthetic['TXMT'])

array([0, 1])

In [50]:
def assign_expedited(d, expedited_rate=0.21, random_state=None):
    """Add a random 0/1 ``EXPEDITED`` flag to every row of ``d``.

    Each row is independently set to 1 with probability ``expedited_rate``
    and to 0 otherwise. ``d`` is modified in place and also returned.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame that receives (or has overwritten) the ``EXPEDITED`` column.
    expedited_rate : float, default 0.21
        Probability that a row is flagged as expedited; must lie in [0, 1].
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator for a reproducible draw. With ``None`` the values
        come from NumPy's global RNG, exactly as before, so an earlier
        ``np.random.seed(...)`` call still controls the result.

    Returns
    -------
    pandas.DataFrame
        The same object as ``d``.

    Raises
    ------
    ValueError
        If ``expedited_rate`` is outside [0, 1] (including NaN).
    """
    # Fail early with a readable message instead of NumPy's generic
    # "probabilities are not non-negative" error.
    if not 0.0 <= expedited_rate <= 1.0:
        raise ValueError(
            f"expedited_rate must be between 0 and 1, got {expedited_rate!r}"
        )

    # The np.random module and a Generator both expose a compatible choice().
    sampler = np.random if random_state is None else np.random.default_rng(random_state)

    d['EXPEDITED'] = sampler.choice(
        [0, 1],
        size=len(d),
        p=[1 - expedited_rate, expedited_rate]
    ).astype(int)

    return d

In [51]:
def assign_lab_type(d):
    """Derive ``REQUIRED_LAB_TYPE`` from ``EXPEDITED`` and ``REQUIRED_SPEC``.

    ``EXPEDITED`` is first coerced to integers (values that cannot be parsed
    as numbers become 0). The lab type is then chosen by the first matching
    rule, using REQUIRED_SPEC codes REST=0, PREV=1, PROS=2 and lab-type codes
    full-service=0, specialty=1, milling center=2:

    1. expedited and spec is REST or PROS -> milling center (2)
    2. not expedited and spec is PROS     -> specialty lab (1)
    3. not expedited and spec is REST     -> full-service lab (0)
    4. spec is PREV (expedited or not)    -> full-service lab (0)

    Rows matching no rule fall back to full-service (0). ``d`` is modified in
    place and also returned.
    """
    spec_rest, spec_prev, spec_pros = 0, 1, 2
    lab_full_service, lab_specialty, lab_milling_center = 0, 1, 2

    d['EXPEDITED'] = pd.to_numeric(d['EXPEDITED'], errors='coerce').fillna(0).astype(int)

    is_rushed = d['EXPEDITED'] == 1
    is_standard = d['EXPEDITED'] == 0
    wants_rest = d['REQUIRED_SPEC'] == spec_rest
    wants_prev = d['REQUIRED_SPEC'] == spec_prev
    wants_pros = d['REQUIRED_SPEC'] == spec_pros

    # (mask, lab type) pairs in priority order; np.select takes the first hit.
    rules = [
        (is_rushed & (wants_rest | wants_pros), lab_milling_center),
        (is_standard & wants_pros, lab_specialty),
        (is_standard & wants_rest, lab_full_service),
        (wants_prev, lab_full_service),
    ]

    d['REQUIRED_LAB_TYPE'] = np.select(
        [mask for mask, _ in rules],
        [lab_type for _, lab_type in rules],
        default=lab_full_service
    )

    return d

In [52]:
# Flag expedited cases first, then derive the required lab type from that flag.
df_synthetic = df_synthetic.pipe(assign_expedited).pipe(assign_lab_type)

In [53]:
# Inspect the first 100 rows with the newly added columns.
df_synthetic.iloc[:100]

Unnamed: 0,RENDERING_NPI,POSTAL_CODE,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE_ENCODED,AGE_GROUP_ENCODED,PREV,TXMT,ADV,CASE_ID,REQUIRED_SPEC,EXPEDITED,REQUIRED_LAB_TYPE
0,1952483117,94538,0,0,0,1,0,0,1379044,1,0,0
1,1669575783,94601,0,1,0,0,1,0,1673754,0,1,2
2,1780781450,90002,0,0,0,0,1,0,1131355,0,0,0
3,1689050296,92842,0,1,0,1,0,0,1679072,1,0,0
4,1134573462,92026,0,0,1,1,0,0,203204,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1124198296,95129,0,0,1,0,1,0,178002,0,0,0
96,1982085650,95628,0,0,1,0,1,0,1417783,0,0,0
97,1992079586,91710,0,0,1,0,1,0,1435297,0,1,2
98,1174803803,93274,0,1,0,1,0,0,1513021,1,0,0


In [54]:
# Write the synthetic dataset to CSV. `index=` is left at its default, so the
# row index is written as the first column of the file.
df_synthetic.to_csv('./data/csv/d_all_synth.csv')

In [55]:
# Reload the lab table from disk and expose LAB_BSNS_MODEL as LAB_TYPE.
df_lab = (
    pd.read_csv('./data/artificial_lab_data.csv')
    .rename(columns={'LAB_BSNS_MODEL': 'LAB_TYPE'})
)
df_lab.head(1)

Unnamed: 0,LAB_ID,LAB_NAME,LAB_POSTAL_CODE,LAB_SPECIALIZATION,LAB_TYPE
0,1,"Pope, Ray and Hudson",90601,2,1


In [56]:
# Number of distinct NPIs in the synthetic data
# (dropna=False keeps the count identical to len(unique())).
df_synthetic['RENDERING_NPI'].nunique(dropna=False)

8149

**Taking only 400 labs into consideration to classify 8149 unique NPIs**

In [57]:
# Number of labs kept as classification targets.
n_consider = 400

# Draw a fixed-seed random subset of labs; ignore_index=True renumbers the
# sampled rows 0..n-1.
df_lab = df_lab.sample(n=n_consider, random_state=42, ignore_index=True)

In [58]:
# Report how many distinct lab ids the sample contains, then preview it.
print(f"number of unique lab ids: {df_lab['LAB_ID'].nunique(dropna=False)}")
df_lab.head()

number of unique lab ids: 400


Unnamed: 0,LAB_ID,LAB_NAME,LAB_POSTAL_CODE,LAB_SPECIALIZATION,LAB_TYPE
0,6253,"Williams, Daugherty and Alvarez",93993,1,2
1,4685,Salinas-Andersen,95195,1,0
2,1732,"Pittman, Peterson and Hall",90920,0,0
3,4743,Hernandez LLC,93252,1,2
4,4522,Green Ltd,90121,1,2


In [59]:
# Write the sampled lab table to CSV. `index=` is left at its default, so the
# row index is written as the first column of the file.
df_lab.to_csv('./data/csv/d_lab_400.csv')

In [60]:
# Spread (max - min) of the sampled labs' postal codes, taken from a single
# describe() summary.
df_lab['LAB_POSTAL_CODE'].describe().pipe(lambda summary: summary['max'] - summary['min'])

6131.0

***Further steps are done in [smart-dental-fe1.ipynb](smart-dental-fe1.ipynb) on Google Colab, because assigning lab IDs and postal codes through the complex logic is computationally expensive to run locally: the process takes about 4 hours on my i9-14900HX but only about 2 hours on Colab. Colab Pro would definitely help in case your session crashes.***