In [24]:
import pandas as pd

In [25]:
# Read the raw provider workbook (path is relative to the notebook's working directory).
file_path = "./data/ddw1.xlsx"
df = pd.read_excel(io=file_path)

In [26]:
# Number of rows in the raw data.
df.shape[0]

24300

**Features**

In [27]:
# List the column names exactly as they were read from the workbook.
print(df.columns)

Index(['RENDERING_NPI', 'PROVIDER_LEGAL_NAME', 'CALENDAR_YEAR',
       'DELIVERY_SYSTEM', 'PROVIDER_TYPE', 'AGE_GROUP', 'ADV_USER_CNT',
       'ADV_USER_ANNOTATION_CODE', 'ADV_SVC_CNT', 'ADV_SVC_ANNOTATION_CODE',
       'PREV_USER_CNT', 'PREV_USER_ANNOTATION_CODE', 'PREV_SVC_CNT',
       'PREV_SVC_ANNOTATION_CODE', 'TXMT_USER_CNT',
       'TXMT_USER_ ANNOTATION_CODE', 'TXMT_SVC_CNT',
       'TXMT_SVC_ANNOTATION_CODE', 'EXAM_USER_CNT',
       'EXAM_USER_ANNOTATION_CODE', 'EXAM_SVC_CNT',
       'EXAM_SVC_ANNOTATION_CODE'],
      dtype='object')


**Number of missing values under each feature**

In [28]:
# Count the missing cells in every column.
missing_val = df.isna().sum()
print(missing_val)

RENDERING_NPI                     0
PROVIDER_LEGAL_NAME               2
CALENDAR_YEAR                     0
DELIVERY_SYSTEM                   0
PROVIDER_TYPE                     0
AGE_GROUP                         0
ADV_USER_CNT                   5848
ADV_USER_ANNOTATION_CODE      18452
ADV_SVC_CNT                    5848
ADV_SVC_ANNOTATION_CODE       18452
PREV_USER_CNT                  8744
PREV_USER_ANNOTATION_CODE     15556
PREV_SVC_CNT                   4592
PREV_SVC_ANNOTATION_CODE      19708
TXMT_USER_CNT                  5792
TXMT_USER_ ANNOTATION_CODE    18508
TXMT_SVC_CNT                   5792
TXMT_SVC_ANNOTATION_CODE      18508
EXAM_USER_CNT                  4712
EXAM_USER_ANNOTATION_CODE     19588
EXAM_SVC_CNT                   4712
EXAM_SVC_ANNOTATION_CODE      19588
dtype: int64


**Delivery System**

In [29]:
# Show the distinct delivery-system labels and how many rows lack one.
feature_name = 'DELIVERY_SYSTEM'
print(pd.unique(df[feature_name]))
print(f"missing values: {df[feature_name].isnull().sum()}")

['FFS' 'PHP' 'GMC']
missing values: 0


**FFS**: (Fee-For-Service), providers are paid a fee for each specific service they perform.

**GMC**: (Geographic Managed Care), patients are enrolled in a health plan that serves a specific geographic area.

**PHP**: (Prepaid Health Plan), the health plan is paid a fixed, regular fee up front and in return provides a pre-determined set of healthcare services as needed. It works much like a subscription service for healthcare.

**Suggestions**:
* transform (cat -> num)

In [30]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Learn the label -> integer mapping first, then apply it; the new column is appended at the end.
le = LabelEncoder()
le.fit(df['DELIVERY_SYSTEM'])

df['DELIVERY_SYSTEM_ENCODED'] = le.transform(df['DELIVERY_SYSTEM'])
df.head(n=1)

Unnamed: 0,RENDERING_NPI,PROVIDER_LEGAL_NAME,CALENDAR_YEAR,DELIVERY_SYSTEM,PROVIDER_TYPE,AGE_GROUP,ADV_USER_CNT,ADV_USER_ANNOTATION_CODE,ADV_SVC_CNT,ADV_SVC_ANNOTATION_CODE,...,PREV_SVC_ANNOTATION_CODE,TXMT_USER_CNT,TXMT_USER_ ANNOTATION_CODE,TXMT_SVC_CNT,TXMT_SVC_ANNOTATION_CODE,EXAM_USER_CNT,EXAM_USER_ANNOTATION_CODE,EXAM_SVC_CNT,EXAM_SVC_ANNOTATION_CODE,DELIVERY_SYSTEM_ENCODED
0,1003003781,CHOTI SUPAK,2018,FFS,RENDERING,AGE 0-20,101.0,,847.0,,...,,37.0,,115.0,,83.0,,98.0,,0


reordering columns

In [31]:
# Column layout that places DELIVERY_SYSTEM_ENCODED directly after DELIVERY_SYSTEM.
new_order = [
    # identifiers
    'RENDERING_NPI', 'PROVIDER_LEGAL_NAME', 'CALENDAR_YEAR',
    # categorical descriptors
    'DELIVERY_SYSTEM', 'DELIVERY_SYSTEM_ENCODED',
    'PROVIDER_TYPE',
    'AGE_GROUP',
    # ADV_* columns
    'ADV_USER_CNT', 'ADV_USER_ANNOTATION_CODE', 'ADV_SVC_CNT', 'ADV_SVC_ANNOTATION_CODE',
    # PREV_* columns
    'PREV_USER_CNT', 'PREV_USER_ANNOTATION_CODE', 'PREV_SVC_CNT', 'PREV_SVC_ANNOTATION_CODE',
    # TXMT_* columns (the space inside 'TXMT_USER_ ANNOTATION_CODE' is part of the source column name)
    'TXMT_USER_CNT', 'TXMT_USER_ ANNOTATION_CODE', 'TXMT_SVC_CNT', 'TXMT_SVC_ANNOTATION_CODE',
    # EXAM_* columns
    'EXAM_USER_CNT', 'EXAM_USER_ANNOTATION_CODE', 'EXAM_SVC_CNT', 'EXAM_SVC_ANNOTATION_CODE',
]

df = df.loc[:, new_order]
df.head(n=1)

Unnamed: 0,RENDERING_NPI,PROVIDER_LEGAL_NAME,CALENDAR_YEAR,DELIVERY_SYSTEM,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE,AGE_GROUP,ADV_USER_CNT,ADV_USER_ANNOTATION_CODE,ADV_SVC_CNT,...,PREV_SVC_CNT,PREV_SVC_ANNOTATION_CODE,TXMT_USER_CNT,TXMT_USER_ ANNOTATION_CODE,TXMT_SVC_CNT,TXMT_SVC_ANNOTATION_CODE,EXAM_USER_CNT,EXAM_USER_ANNOTATION_CODE,EXAM_SVC_CNT,EXAM_SVC_ANNOTATION_CODE
0,1003003781,CHOTI SUPAK,2018,FFS,0,RENDERING,AGE 0-20,101.0,,847.0,...,216.0,,37.0,,115.0,,83.0,,98.0,


**More about Provider Type**

In [32]:
# Show the distinct provider-type labels and how many rows lack one.
feature_name = 'PROVIDER_TYPE'
print(pd.unique(df[feature_name]))
print(f"missing values: {df[feature_name].isnull().sum()}")

['RENDERING' 'RENDERING SNC']
missing values: 0


**RENDERING**: providers that generally charge higher prices, operate **for-profit**, and serve patients with commercial insurance.

**RENDERING SNC**: providers in a Safety Net Clinic (SNC). They operate within a mission-driven or **non-profit framework** and generally offer affordable prices for low- to average-income patients.

**Suggestions**:
* transform (cat -> num)

In [33]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Integer-encode the provider type, then move the new column next to its source column.
le = LabelEncoder()
le.fit(df['PROVIDER_TYPE'])

df['PROVIDER_TYPE_ENCODED'] = le.transform(df['PROVIDER_TYPE'])
new_order = [
    # identifiers
    'RENDERING_NPI', 'PROVIDER_LEGAL_NAME', 'CALENDAR_YEAR',
    # categorical descriptors, each followed by its integer encoding where one exists
    'DELIVERY_SYSTEM', 'DELIVERY_SYSTEM_ENCODED',
    'PROVIDER_TYPE', 'PROVIDER_TYPE_ENCODED',
    'AGE_GROUP',
    # ADV_* columns
    'ADV_USER_CNT', 'ADV_USER_ANNOTATION_CODE', 'ADV_SVC_CNT', 'ADV_SVC_ANNOTATION_CODE',
    # PREV_* columns
    'PREV_USER_CNT', 'PREV_USER_ANNOTATION_CODE', 'PREV_SVC_CNT', 'PREV_SVC_ANNOTATION_CODE',
    # TXMT_* columns (the space inside 'TXMT_USER_ ANNOTATION_CODE' is part of the source column name)
    'TXMT_USER_CNT', 'TXMT_USER_ ANNOTATION_CODE', 'TXMT_SVC_CNT', 'TXMT_SVC_ANNOTATION_CODE',
    # EXAM_* columns
    'EXAM_USER_CNT', 'EXAM_USER_ANNOTATION_CODE', 'EXAM_SVC_CNT', 'EXAM_SVC_ANNOTATION_CODE',
]
df = df.loc[:, new_order]
df.head(n=1)

Unnamed: 0,RENDERING_NPI,PROVIDER_LEGAL_NAME,CALENDAR_YEAR,DELIVERY_SYSTEM,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE,PROVIDER_TYPE_ENCODED,AGE_GROUP,ADV_USER_CNT,ADV_USER_ANNOTATION_CODE,...,PREV_SVC_CNT,PREV_SVC_ANNOTATION_CODE,TXMT_USER_CNT,TXMT_USER_ ANNOTATION_CODE,TXMT_SVC_CNT,TXMT_SVC_ANNOTATION_CODE,EXAM_USER_CNT,EXAM_USER_ANNOTATION_CODE,EXAM_SVC_CNT,EXAM_SVC_ANNOTATION_CODE
0,1003003781,CHOTI SUPAK,2018,FFS,0,RENDERING,0,AGE 0-20,101.0,,...,216.0,,37.0,,115.0,,83.0,,98.0,


**Age Groups**

In [34]:
# Show the distinct age-group labels and how many rows lack one.
feature_name = 'AGE_GROUP'
print(pd.unique(df[feature_name]))
print(f"missing values: {df[feature_name].isnull().sum()}")

['AGE 0-20' 'AGE 21+']
missing values: 0


In [35]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Integer-encode the age group, then move the new column next to its source column.
le = LabelEncoder()
le.fit(df['AGE_GROUP'])

df['AGE_GROUP_ENCODED'] = le.transform(df['AGE_GROUP'])
new_order = [
    # identifiers
    'RENDERING_NPI', 'PROVIDER_LEGAL_NAME', 'CALENDAR_YEAR',
    # categorical descriptors, each followed by its integer encoding
    'DELIVERY_SYSTEM', 'DELIVERY_SYSTEM_ENCODED',
    'PROVIDER_TYPE', 'PROVIDER_TYPE_ENCODED',
    'AGE_GROUP', 'AGE_GROUP_ENCODED',
    # ADV_* columns
    'ADV_USER_CNT', 'ADV_USER_ANNOTATION_CODE', 'ADV_SVC_CNT', 'ADV_SVC_ANNOTATION_CODE',
    # PREV_* columns
    'PREV_USER_CNT', 'PREV_USER_ANNOTATION_CODE', 'PREV_SVC_CNT', 'PREV_SVC_ANNOTATION_CODE',
    # TXMT_* columns (the space inside 'TXMT_USER_ ANNOTATION_CODE' is part of the source column name)
    'TXMT_USER_CNT', 'TXMT_USER_ ANNOTATION_CODE', 'TXMT_SVC_CNT', 'TXMT_SVC_ANNOTATION_CODE',
    # EXAM_* columns
    'EXAM_USER_CNT', 'EXAM_USER_ANNOTATION_CODE', 'EXAM_SVC_CNT', 'EXAM_SVC_ANNOTATION_CODE',
]
df = df.loc[:, new_order]
df.head(n=1)

Unnamed: 0,RENDERING_NPI,PROVIDER_LEGAL_NAME,CALENDAR_YEAR,DELIVERY_SYSTEM,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE,PROVIDER_TYPE_ENCODED,AGE_GROUP,AGE_GROUP_ENCODED,ADV_USER_CNT,...,PREV_SVC_CNT,PREV_SVC_ANNOTATION_CODE,TXMT_USER_CNT,TXMT_USER_ ANNOTATION_CODE,TXMT_SVC_CNT,TXMT_SVC_ANNOTATION_CODE,EXAM_USER_CNT,EXAM_USER_ANNOTATION_CODE,EXAM_SVC_CNT,EXAM_SVC_ANNOTATION_CODE
0,1003003781,CHOTI SUPAK,2018,FFS,0,RENDERING,0,AGE 0-20,0,101.0,...,216.0,,37.0,,115.0,,83.0,,98.0,


The rest of the features (from my understanding) are divided into:
* **ADV**: advanced procedures
* **PREV**: preventive procedures
* **TXMT**: treatment and surgical procedures
* **EXAM**: examination

If we had to order this in terms of criticality (as in increasing order of critical-ness of a procedure), it would be: **EXAM** < **PREV** < **ADV** or **TXMT**

Each of these has a **User Count**, a **User Annotation Code**, a **Service Count**, and a **Service Annotation Code**.

Though I could not narrow down what "Annotation Code" really is, I could notice the Annotation Codes were only given to those rows where the corresponding "Count" was missing. 
This could mean annotation codes are given to special cases where the data is not present but is a special case scenario that refers to something.



In [36]:
# Distinct values of the ADV user annotation code (NaN included).
print(pd.unique(df['ADV_USER_ANNOTATION_CODE']))

[nan  1.]


In [37]:
# Distinct values of the ADV service annotation code (NaN included).
print(pd.unique(df['ADV_SVC_ANNOTATION_CODE']))

[nan  2.  1.]


In [38]:
# Every column whose name contains "ANNOTATION_CODE", in frame order.
list_anno_code = [f_name for f_name in df.columns if "ANNOTATION_CODE" in f_name]

# One row per annotation column, paired with the distinct values found in it.
anno_table = {
    'feature': list_anno_code,
    'unique values': [df[anno_col].unique() for anno_col in list_anno_code],
}

df_anno_table = pd.DataFrame(anno_table)
print(df_anno_table)

                      feature    unique values
0    ADV_USER_ANNOTATION_CODE       [nan, 1.0]
1     ADV_SVC_ANNOTATION_CODE  [nan, 2.0, 1.0]
2   PREV_USER_ANNOTATION_CODE       [nan, 1.0]
3    PREV_SVC_ANNOTATION_CODE  [nan, 1.0, 2.0]
4  TXMT_USER_ ANNOTATION_CODE       [nan, 1.0]
5    TXMT_SVC_ANNOTATION_CODE  [nan, 2.0, 1.0]
6   EXAM_USER_ANNOTATION_CODE       [nan, 1.0]
7    EXAM_SVC_ANNOTATION_CODE  [nan, 1.0, 2.0]


Since straight up imputing methods (mean/median/mode) wouldn't really make sense in our case as these annotation codes mean a specific case, I'm gonna add -1 in place of NANs to handle missing values.

In [39]:
# Replace missing annotation codes with the sentinel -1 in all annotation columns at once.
df[list_anno_code] = df[list_anno_code].fillna(-1)

In [40]:
# Rebuild the summary to confirm that NaN has been replaced by -1 in every annotation column.
anno_table = {
    'feature': list_anno_code,
    'unique values': [df[anno_col].unique() for anno_col in list_anno_code],
}

df_anno_table = pd.DataFrame(anno_table)
print(df_anno_table)

                      feature     unique values
0    ADV_USER_ANNOTATION_CODE       [-1.0, 1.0]
1     ADV_SVC_ANNOTATION_CODE  [-1.0, 2.0, 1.0]
2   PREV_USER_ANNOTATION_CODE       [-1.0, 1.0]
3    PREV_SVC_ANNOTATION_CODE  [-1.0, 1.0, 2.0]
4  TXMT_USER_ ANNOTATION_CODE       [-1.0, 1.0]
5    TXMT_SVC_ANNOTATION_CODE  [-1.0, 2.0, 1.0]
6   EXAM_USER_ANNOTATION_CODE       [-1.0, 1.0]
7    EXAM_SVC_ANNOTATION_CODE  [-1.0, 1.0, 2.0]


Likewise for the count features we replace missing values with -1.

In [41]:
# Re-count missing cells: the annotation columns should now report 0.
missing_val = df.isna().sum()
print(missing_val)

RENDERING_NPI                    0
PROVIDER_LEGAL_NAME              2
CALENDAR_YEAR                    0
DELIVERY_SYSTEM                  0
DELIVERY_SYSTEM_ENCODED          0
PROVIDER_TYPE                    0
PROVIDER_TYPE_ENCODED            0
AGE_GROUP                        0
AGE_GROUP_ENCODED                0
ADV_USER_CNT                  5848
ADV_USER_ANNOTATION_CODE         0
ADV_SVC_CNT                   5848
ADV_SVC_ANNOTATION_CODE          0
PREV_USER_CNT                 8744
PREV_USER_ANNOTATION_CODE        0
PREV_SVC_CNT                  4592
PREV_SVC_ANNOTATION_CODE         0
TXMT_USER_CNT                 5792
TXMT_USER_ ANNOTATION_CODE       0
TXMT_SVC_CNT                  5792
TXMT_SVC_ANNOTATION_CODE         0
EXAM_USER_CNT                 4712
EXAM_USER_ANNOTATION_CODE        0
EXAM_SVC_CNT                  4712
EXAM_SVC_ANNOTATION_CODE         0
dtype: int64


In [42]:
# Every column whose name contains "CNT", in frame order.
list_cnt_code = [f_name for f_name in df.columns if "CNT" in f_name]

# Replace missing counts with the sentinel -1 in all count columns at once.
df[list_cnt_code] = df[list_cnt_code].fillna(-1)

In [43]:
# Re-count missing cells: only PROVIDER_LEGAL_NAME should still have gaps.
missing_val = df.isna().sum()
print(missing_val)

RENDERING_NPI                 0
PROVIDER_LEGAL_NAME           2
CALENDAR_YEAR                 0
DELIVERY_SYSTEM               0
DELIVERY_SYSTEM_ENCODED       0
PROVIDER_TYPE                 0
PROVIDER_TYPE_ENCODED         0
AGE_GROUP                     0
AGE_GROUP_ENCODED             0
ADV_USER_CNT                  0
ADV_USER_ANNOTATION_CODE      0
ADV_SVC_CNT                   0
ADV_SVC_ANNOTATION_CODE       0
PREV_USER_CNT                 0
PREV_USER_ANNOTATION_CODE     0
PREV_SVC_CNT                  0
PREV_SVC_ANNOTATION_CODE      0
TXMT_USER_CNT                 0
TXMT_USER_ ANNOTATION_CODE    0
TXMT_SVC_CNT                  0
TXMT_SVC_ANNOTATION_CODE      0
EXAM_USER_CNT                 0
EXAM_USER_ANNOTATION_CODE     0
EXAM_SVC_CNT                  0
EXAM_SVC_ANNOTATION_CODE      0
dtype: int64


We can still see that there are 2 missing values under PROVIDER_LEGAL_NAME; we can simply drop those rows, since removing them is unlikely to have a significant impact on the data.

**It was also later determined that the NPIs of the 2 rows with a missing PROVIDER_LEGAL_NAME are no longer valid in the NPPES registry, so dropping those rows makes even more sense.**

In [44]:
# Keep only the rows that have a provider legal name, then verify nothing is missing any more.
df = df.dropna(subset=['PROVIDER_LEGAL_NAME'])
missing_val = df.isna().sum()
print(missing_val)

RENDERING_NPI                 0
PROVIDER_LEGAL_NAME           0
CALENDAR_YEAR                 0
DELIVERY_SYSTEM               0
DELIVERY_SYSTEM_ENCODED       0
PROVIDER_TYPE                 0
PROVIDER_TYPE_ENCODED         0
AGE_GROUP                     0
AGE_GROUP_ENCODED             0
ADV_USER_CNT                  0
ADV_USER_ANNOTATION_CODE      0
ADV_SVC_CNT                   0
ADV_SVC_ANNOTATION_CODE       0
PREV_USER_CNT                 0
PREV_USER_ANNOTATION_CODE     0
PREV_SVC_CNT                  0
PREV_SVC_ANNOTATION_CODE      0
TXMT_USER_CNT                 0
TXMT_USER_ ANNOTATION_CODE    0
TXMT_SVC_CNT                  0
TXMT_SVC_ANNOTATION_CODE      0
EXAM_USER_CNT                 0
EXAM_USER_ANNOTATION_CODE     0
EXAM_SVC_CNT                  0
EXAM_SVC_ANNOTATION_CODE      0
dtype: int64


In [45]:
# Save the cleaned frame as CSV and as Excel; the row index is not written.
df.to_csv(path_or_buf='./data/trial/csv/cleaned_v1.csv', index=False)
df.to_excel(excel_writer='./data/trial/xlsx/cleaned_v1.xlsx', index=False)

In [46]:
# Preview the cleaned data.
df.head(n=5)

Unnamed: 0,RENDERING_NPI,PROVIDER_LEGAL_NAME,CALENDAR_YEAR,DELIVERY_SYSTEM,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE,PROVIDER_TYPE_ENCODED,AGE_GROUP,AGE_GROUP_ENCODED,ADV_USER_CNT,...,PREV_SVC_CNT,PREV_SVC_ANNOTATION_CODE,TXMT_USER_CNT,TXMT_USER_ ANNOTATION_CODE,TXMT_SVC_CNT,TXMT_SVC_ANNOTATION_CODE,EXAM_USER_CNT,EXAM_USER_ANNOTATION_CODE,EXAM_SVC_CNT,EXAM_SVC_ANNOTATION_CODE
0,1003003781,CHOTI SUPAK,2018,FFS,0,RENDERING,0,AGE 0-20,0,101.0,...,216.0,-1.0,37.0,-1.0,115.0,-1.0,83.0,-1.0,98.0,-1.0
1,1003003781,CHOTI SUPAK,2018,FFS,0,RENDERING,0,AGE 21+,1,67.0,...,30.0,-1.0,43.0,-1.0,73.0,-1.0,15.0,-1.0,15.0,-1.0
2,1003004698,"GODFREY III, MERLE FRANKLIN,",2018,FFS,0,RENDERING,0,AGE 21+,1,357.0,...,131.0,-1.0,357.0,-1.0,712.0,-1.0,316.0,-1.0,316.0,-1.0
3,1003009440,VONTELA REKHA,2018,FFS,0,RENDERING,0,AGE 0-20,0,975.0,...,3732.0,-1.0,208.0,-1.0,456.0,-1.0,919.0,-1.0,1081.0,-1.0
4,1003009440,VONTELA REKHA,2018,FFS,0,RENDERING,0,AGE 21+,1,837.0,...,1396.0,-1.0,335.0,-1.0,824.0,-1.0,744.0,-1.0,744.0,-1.0


Because it's unclear as to how this data is going to be used in the final product, for now let's go further based on few assumptions in the best of my understanding about the data.

If a clinic has high **ADV_SVC_CNT** (Advanced Service Count) and **TXMT_SVC_CNT** (Treatment Service Count) values, it presumably needs labs with strong expertise that are capable of complex procedures.

In [47]:
# Number of distinct provider legal names (dropna=False mirrors len(unique()), which also counts NaN).
df['PROVIDER_LEGAL_NAME'].nunique(dropna=False)

10945

Attempting to retrieve address details for a NPI number via the NPPES NPI Registry API, this could be later used to narrow down to the closest lab available for a clinic.

In [48]:
import requests

In [49]:
# Example NPI, taken from the first row of the dataset.
NPI_NUMBER = 1003003781

# Registry lookup URL: only `number` and `version` carry a value, every other filter is sent empty.
api_url = (
    "https://npiregistry.cms.hhs.gov/api/"
    f"?number={NPI_NUMBER}"
    "&enumeration_type=&taxonomy_description=&name_purpose="
    "&first_name=&use_first_name_alias=&last_name=&organization_name="
    "&address_purpose=&city=&state=&postal_code=&country_code="
    "&limit=&skip=&pretty=&version=2.1"
)
print(api_url)

https://npiregistry.cms.hhs.gov/api/?number=1003003781&enumeration_type=&taxonomy_description=&name_purpose=&first_name=&use_first_name_alias=&last_name=&organization_name=&address_purpose=&city=&state=&postal_code=&country_code=&limit=&skip=&pretty=&version=2.1


In [50]:
# Look up the example NPI in the NPPES registry.
# A timeout keeps the cell from hanging on a stalled connection, and raise_for_status() reports an
# HTTP error directly instead of failing later on a missing 'results' key.
response = requests.get(api_url, timeout=30)
response.raise_for_status()
data = response.json()

# NOTE(review): index 1 is assumed to be the address of interest - confirm against the entry's
# 'address_purpose' field.
# gets first 5 digits of the postal code, going by the ZIP + 4 digit format
# Slicing the text (instead of int(...) // 10000) also gives the right answer when the registry
# returns a plain 5-digit ZIP, where the integer division would leave a single digit.
int(str(data['results'][0]['addresses'][1]['postal_code'])[:5])

96013

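This postal code can be retrieved for all unique NPIs and then used to pick labs based on proximity. For now, the next cell does not query the registry for every NPI; it assigns each NPI a synthetic California ZIP code generated with Faker as a placeholder.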

In [53]:
from faker import Faker

# Placeholder ZIP codes: each unique NPI gets a synthetic California ZIP generated by Faker.
# No registry request is made in this cell, so the former try/except for request errors (and the
# `finally` that could append an undefined or stale value) is not needed.
Faker.seed(42)  # fixed seed so a re-run reproduces the same NPI -> ZIP assignment
fake = Faker('en_US')

df_npi_unique = df['RENDERING_NPI'].unique()

# One record per NPI, built in a single pass.
npi_postal_codes = [
    {'RENDERING_NPI': npi, 'POSTAL_CODE': fake.zipcode_in_state(state_abbr='CA')}
    for npi in df_npi_unique
]

df_npi_postal_codes = pd.DataFrame(npi_postal_codes)
df_npi_postal_codes.head(5)

Unnamed: 0,RENDERING_NPI,POSTAL_CODE
0,1003003781,95131
1,1003004698,91067
2,1003009440,93432
3,1003010372,95632
4,1003011172,95761


Besides postal code, I think it'd be safe to assume the following:
* **TXMT_TO_EXAM_RATIO**: treatment services per exam service, computed as $\frac{\text{Treatment Service Count}}{\text{Exam Service Count}}$; a higher value indicates that the clinic focuses more on advanced treatment.
* **PREV_TO_EXAM_RATIO**: preventive services per exam service, computed as $\frac{\text{Preventive Service Count}}{\text{Exam Service Count}}$; a higher value indicates the opposite, i.e. that the clinic puts more emphasis on preventive care.
* **ADV_TO_EXAM_RATIO**: advanced services per exam service, computed as $\frac{\text{Advanced Service Count}}{\text{Exam Service Count}}$ (also added by the code below).

Adding these columns to the data

In [None]:
import pandas as pd
import numpy as np

service_prefixes = ['PREV', 'TXMT', 'ADV']

# Missing counts were replaced with the sentinel -1 earlier in this notebook. A ratio is therefore
# only meaningful when the exam count is positive AND the service count is a real (non-negative)
# value; without the second check, a missing service count divided by a valid exam count produces
# a small negative "ratio". Rows failing either check get 0, the same fallback that was already
# used for a non-positive exam count.
for prefix in service_prefixes:
    feature = f'{prefix}_SVC_CNT'
    has_both_counts = (df['EXAM_SVC_CNT'] > 0) & (df[feature] >= 0)
    df[f'{prefix}_TO_EXAM_RATIO'] = np.where(has_both_counts,
                                        df[feature] / df['EXAM_SVC_CNT'],
                                        0)

df.head(5)

Unnamed: 0,RENDERING_NPI,PROVIDER_LEGAL_NAME,CALENDAR_YEAR,DELIVERY_SYSTEM,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE,PROVIDER_TYPE_ENCODED,AGE_GROUP,AGE_GROUP_ENCODED,ADV_USER_CNT,...,TXMT_USER_ ANNOTATION_CODE,TXMT_SVC_CNT,TXMT_SVC_ANNOTATION_CODE,EXAM_USER_CNT,EXAM_USER_ANNOTATION_CODE,EXAM_SVC_CNT,EXAM_SVC_ANNOTATION_CODE,PREV_TO_EXAM_RATIO,TXMT_TO_EXAM_RATIO,ADV_TO_EXAM_RATIO
0,1003003781,CHOTI SUPAK,2018,FFS,0,RENDERING,0,AGE 0-20,0,101.0,...,-1.0,115.0,-1.0,83.0,-1.0,98.0,-1.0,2.204082,1.173469,8.642857
1,1003003781,CHOTI SUPAK,2018,FFS,0,RENDERING,0,AGE 21+,1,67.0,...,-1.0,73.0,-1.0,15.0,-1.0,15.0,-1.0,2.0,4.866667,16.866667
2,1003004698,"GODFREY III, MERLE FRANKLIN,",2018,FFS,0,RENDERING,0,AGE 21+,1,357.0,...,-1.0,712.0,-1.0,316.0,-1.0,316.0,-1.0,0.414557,2.253165,3.838608
3,1003009440,VONTELA REKHA,2018,FFS,0,RENDERING,0,AGE 0-20,0,975.0,...,-1.0,456.0,-1.0,919.0,-1.0,1081.0,-1.0,3.452359,0.421832,8.124884
4,1003009440,VONTELA REKHA,2018,FFS,0,RENDERING,0,AGE 21+,1,837.0,...,-1.0,824.0,-1.0,744.0,-1.0,744.0,-1.0,1.876344,1.107527,7.173387


In theory, here's how it could work:

**Data Collection & Profiling**

**Clinic Profile**:\
This would include
* Postal Code
* Demographic: Age group
* Business Model: **RENDERING** or **RENDERING SNC** as provider types, and **FFS**, **GMC**, **PHP** as delivery system types.
* Service Demand: **PREV_TO_EXAM_RATIO**, **TXMT_TO_EXAM_RATIO**, **ADV_TO_EXAM_RATIO**

**Lab Profile**:\
This would include:
* Postal Code
* Specialization: *Restorative*, *Cosmetic*, *Preventive*
* Business Model: Pricing structure, example: flat fee, or volume discounts

**The Matching Algorithm**\
Once both sets of data are profiled, the AI model performs a similarity-based matching process. The goal is to calculate a score for each potential clinic-lab pair based on how well they align.

**Proximity Scoring**: The model calculates a proximity score by comparing the clinic's postal code to each lab's postal code. The closer the two are, the higher the score. A direct zip code match would be the highest score.

**Specialization Scoring**: The model compares the clinic's service demand profile with the lab's specializations.

If a clinic has a high **TXMT_TO_EXAM_RATIO**, the model will give a higher score to a lab that specializes in crowns and bridges.

If a clinic has a high **PREV_TO_EXAM_RATIO**, the model will give a high score to a lab that focuses on mouthguards and retainers.

**Business Model Scoring**: The model will align business models based on provider type (**RENDERING / RENDERING SNC**) or the delivery method (**FFS, GMC, PHP**).

**Performing splits**

In [None]:
# Full column list after cleaning, encoding and adding the three *_TO_EXAM_RATIO features.
df.columns

Index(['RENDERING_NPI', 'PROVIDER_LEGAL_NAME', 'CALENDAR_YEAR',
       'DELIVERY_SYSTEM', 'DELIVERY_SYSTEM_ENCODED', 'PROVIDER_TYPE',
       'PROVIDER_TYPE_ENCODED', 'AGE_GROUP', 'AGE_GROUP_ENCODED',
       'ADV_USER_CNT', 'ADV_USER_ANNOTATION_CODE', 'ADV_SVC_CNT',
       'ADV_SVC_ANNOTATION_CODE', 'PREV_USER_CNT', 'PREV_USER_ANNOTATION_CODE',
       'PREV_SVC_CNT', 'PREV_SVC_ANNOTATION_CODE', 'TXMT_USER_CNT',
       'TXMT_USER_ ANNOTATION_CODE', 'TXMT_SVC_CNT',
       'TXMT_SVC_ANNOTATION_CODE', 'EXAM_USER_CNT',
       'EXAM_USER_ANNOTATION_CODE', 'EXAM_SVC_CNT', 'EXAM_SVC_ANNOTATION_CODE',
       'PREV_TO_EXAM_RATIO', 'TXMT_TO_EXAM_RATIO', 'ADV_TO_EXAM_RATIO'],
      dtype='object')

In [None]:
# Peek at a single row before splitting the frame into sub-tables.
df.head(n=1)

Unnamed: 0,RENDERING_NPI,PROVIDER_LEGAL_NAME,CALENDAR_YEAR,DELIVERY_SYSTEM,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE,PROVIDER_TYPE_ENCODED,AGE_GROUP,AGE_GROUP_ENCODED,ADV_USER_CNT,...,TXMT_USER_ ANNOTATION_CODE,TXMT_SVC_CNT,TXMT_SVC_ANNOTATION_CODE,EXAM_USER_CNT,EXAM_USER_ANNOTATION_CODE,EXAM_SVC_CNT,EXAM_SVC_ANNOTATION_CODE,PREV_TO_EXAM_RATIO,TXMT_TO_EXAM_RATIO,ADV_TO_EXAM_RATIO
0,1003003781,CHOTI SUPAK,2018,FFS,0,RENDERING,0,AGE 0-20,0,101.0,...,-1.0,115.0,-1.0,83.0,-1.0,98.0,-1.0,2.204082,1.173469,8.642857


**D-Numeric: all-numeric data**

In [None]:
# Independent copy holding only the numeric columns (identifier, encoded categories, counts,
# annotation codes and ratios); the text columns are left out.
df_num_main = df.loc[:, [
    'RENDERING_NPI',
    'DELIVERY_SYSTEM_ENCODED', 'PROVIDER_TYPE_ENCODED', 'AGE_GROUP_ENCODED',
    # ADV_* columns
    'ADV_USER_CNT', 'ADV_USER_ANNOTATION_CODE', 'ADV_SVC_CNT', 'ADV_SVC_ANNOTATION_CODE',
    # PREV_* columns
    'PREV_USER_CNT', 'PREV_USER_ANNOTATION_CODE', 'PREV_SVC_CNT', 'PREV_SVC_ANNOTATION_CODE',
    # TXMT_* columns (the space inside 'TXMT_USER_ ANNOTATION_CODE' is part of the source column name)
    'TXMT_USER_CNT', 'TXMT_USER_ ANNOTATION_CODE', 'TXMT_SVC_CNT', 'TXMT_SVC_ANNOTATION_CODE',
    # EXAM_* columns
    'EXAM_USER_CNT', 'EXAM_USER_ANNOTATION_CODE', 'EXAM_SVC_CNT', 'EXAM_SVC_ANNOTATION_CODE',
    # derived ratios
    'PREV_TO_EXAM_RATIO', 'TXMT_TO_EXAM_RATIO', 'ADV_TO_EXAM_RATIO',
]].copy()
df_num_main.head(n=1)

Unnamed: 0,RENDERING_NPI,DELIVERY_SYSTEM_ENCODED,PROVIDER_TYPE_ENCODED,AGE_GROUP_ENCODED,ADV_USER_CNT,ADV_USER_ANNOTATION_CODE,ADV_SVC_CNT,ADV_SVC_ANNOTATION_CODE,PREV_USER_CNT,PREV_USER_ANNOTATION_CODE,...,TXMT_USER_ ANNOTATION_CODE,TXMT_SVC_CNT,TXMT_SVC_ANNOTATION_CODE,EXAM_USER_CNT,EXAM_USER_ANNOTATION_CODE,EXAM_SVC_CNT,EXAM_SVC_ANNOTATION_CODE,PREV_TO_EXAM_RATIO,TXMT_TO_EXAM_RATIO,ADV_TO_EXAM_RATIO
0,1003003781,0,0,0,101.0,-1.0,847.0,-1.0,89.0,-1.0,...,-1.0,115.0,-1.0,83.0,-1.0,98.0,-1.0,2.204082,1.173469,8.642857


In [None]:
# Save the all-numeric table as CSV and as Excel; the row index is not written.
df_num_main.to_csv(path_or_buf='./data/trial/csv/all_numerical_v1.csv', index=False)
df_num_main.to_excel(excel_writer='./data/trial/xlsx/all_numerical_v1.xlsx', index=False)

**D-General: NPI, Legal Name**

In [None]:
# Independent copy with just the provider identifier and legal name.
df_general = df.loc[:, ['RENDERING_NPI', 'PROVIDER_LEGAL_NAME']].copy()

In [None]:
# Save the NPI / legal-name table as CSV and as Excel; the row index is not written.
df_general.to_csv('./data/trial/csv/d_general.csv', index=False)
# The Excel copy gets its own file name: writing to cleaned_v1.xlsx here would overwrite the
# cleaned dataset exported earlier in the notebook with this two-column table.
df_general.to_excel('./data/trial/xlsx/d_general.xlsx', index=False)

**D-Encoded: Delivery System, Provider Type, Age Group**

In [None]:
# Lookup table: one row per delivery-system label with its integer code, ordered by code.
d_encoded_delivery = (
    df.loc[:, ['DELIVERY_SYSTEM', 'DELIVERY_SYSTEM_ENCODED']]
    .drop_duplicates()
    .sort_values(by='DELIVERY_SYSTEM_ENCODED')
    .copy()
)
d_encoded_delivery

Unnamed: 0,DELIVERY_SYSTEM,DELIVERY_SYSTEM_ENCODED
0,FFS,0
34,GMC,1
9,PHP,2


In [None]:
# Save the delivery-system lookup table; the row index is not written.
d_encoded_delivery.to_csv(path_or_buf='./data/trial/csv/d_encoded_delivery.csv', index=False)

In [None]:
# Lookup table: one row per provider-type label with its integer code, ordered by code.
d_encoded_provider = (
    df.loc[:, ['PROVIDER_TYPE', 'PROVIDER_TYPE_ENCODED']]
    .drop_duplicates()
    .sort_values(by='PROVIDER_TYPE_ENCODED')
    .copy()
)
d_encoded_provider

Unnamed: 0,PROVIDER_TYPE,PROVIDER_TYPE_ENCODED
0,RENDERING,0
19698,RENDERING SNC,1


In [None]:
# Save the provider-type lookup table; the row index is not written.
d_encoded_provider.to_csv(path_or_buf='./data/trial/csv/d_encoded_provider.csv', index=False)

In [None]:
# Lookup table: one row per age-group label with its integer code, ordered by code.
d_encoded_age = (
    df.loc[:, ['AGE_GROUP', 'AGE_GROUP_ENCODED']]
    .drop_duplicates()
    .sort_values(by='AGE_GROUP_ENCODED')
    .copy()
)
d_encoded_age

Unnamed: 0,AGE_GROUP,AGE_GROUP_ENCODED
0,AGE 0-20,0
1,AGE 21+,1


In [None]:
# Save the age-group lookup table; the row index is not written.
d_encoded_age.to_csv(path_or_buf='./data/trial/csv/d_encoded_age.csv', index=False)

**Generating artificial data for the lab-side using Faker library**

In [None]:
import pandas as pd
from faker import Faker
import random

# Seed both random sources so the synthetic lab table is the same on every run.
random.seed(42)
Faker.seed(42)
fake = Faker('en_US')

# Lab specialization label -> integer code.
specialization_types = {
    'restorative': 0,
    'preventive': 1,
    'cosmetic': 2
}

# Lab business-model label -> integer code.
business_models = {
    'full_service_lab': 0,
    'specialty_lab': 1,
    'milling_center': 2
}

num_records = 10000

# Built once here instead of once per generated record inside the comprehensions below.
specialization_codes = list(specialization_types.values())
business_model_codes = list(business_models.values())

lab_ids = list(range(1, num_records + 1))
lab_names = [fake.company() for _ in range(num_records)]
postal_codes = [fake.zipcode_in_state(state_abbr='CA') for _ in range(num_records)]
specializations = [random.choice(specialization_codes) for _ in range(num_records)]
business_model_types = [random.choice(business_model_codes) for _ in range(num_records)]

data = {
    'lab_id': lab_ids,
    'lab_name': lab_names,
    'postal_code': postal_codes,
    'specialization_type': specializations,
    'business_model': business_model_types
}

df_lab = pd.DataFrame(data)
# Preview the generated lab table (this previously printed the clinic frame `df` by mistake).
print(df_lab.head(5))

Successfully generated 10000 artificial lab records and saved to 'artificial_lab_data.csv'

First 5 rows of the generated data:
   lab_id                     lab_name postal_code  specialization_type  \
0       1  Cooper, Carter and Harrison       92023                    0   
1       2    Wolf, Tanner and Chandler       93421                    2   
2       3                  Jones Group       93701                    2   
3       4                    Brown Inc       94655                    1   
4       5             Trevino and Sons       91310                    2   

   business_model  
0               1  
1               0  
2               0  
3               2  
4               0  


In [None]:
# Save the synthetic lab table; the row index is not written.
df_lab.to_csv(path_or_buf='./data/trial/artificial_lab_data.csv', index=False)

**Combining clinic data and lab data to build a training dataset for machine learning algorithm - through custom logic**