## Add provider groups to our SQL database

Lets first import some of the basic packages we will be using.

In [1]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from datetime import date, datetime, timedelta
import sqlalchemy
import pymysql
import openpyxl
import glob
from ast import literal_eval
from collections import Counter

# Files we need to obtain

We need to first obtain the files that have our provider groups. I have stored the parent directory in a dotenv locally, but feel free to create your own variables to replace the ones here.

In [2]:
load_dotenv()

hyperlink_path = 'json_completed_hyperlinks_update.csv'
parent_dir = os.getenv('dir')
data_dir = os.path.join(parent_dir,'data_update')

df = pd.read_csv(hyperlink_path, header=None)
df.head()
df.columns = ['ParseID','Hyperlink']
hyperlinks = df['Hyperlink'].tolist()

def foldername(hyperlink):
    hyperlink = hyperlink.split('/')[-1]
    return hyperlink[0:-8]
def providers_path(folder):
    return os.path.join(data_dir,folder,folder+'_providers.csv')

folder_names= [foldername(hyperlink) for hyperlink in hyperlinks]
provider_files = [providers_path(folder_name) for folder_name in folder_names]

provider_files[0]

'D:/Vignesh/Capstone\\data_update\\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates\\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates_providers.csv'

# Lets read one:

Let us read one file to see what we are dealing with.

In [3]:
df = pd.read_csv(provider_files[0], usecols=['tin','npi_provider_groups'], converters={'npi_provider_groups': literal_eval})
folder = folder_names[0]

In [4]:
df.dtypes
df.head()

Unnamed: 0,tin,npi_provider_groups
0,593582520,[1225090087]
1,272050459,[1639508567]
2,160743209,[1609314343]
3,561844651,"[1215134309, 1043649635, 1851721047, 1285715185]"
4,371756970,[1174710636]


Notice how we have an array of npi numbers, lets explode this.

In [5]:
df_explode = df.explode(column= 'npi_provider_groups',ignore_index=True)
df_explode.head()

Unnamed: 0,tin,npi_provider_groups
0,593582520,1225090087
1,272050459,1639508567
2,160743209,1609314343
3,561844651,1215134309
4,561844651,1043649635


# Lets add all the files to a single dataframe

In [6]:
df = pd.concat((pd.read_csv(f, usecols=['tin','npi_provider_groups'], converters={'npi_provider_groups': literal_eval}) for f in provider_files), ignore_index=True)

In [10]:
df[df.isna().any(axis=1)]

Unnamed: 0,tin,npi_provider_groups
1993,,"[1487195756, 1942589122]"
21891,,[]
62098,,[]
101913,,[]
141728,,[]
...,...,...
10861117,,[]
10900932,,[]
10940747,,[]
10980562,,[]


In [11]:
df.dropna(inplace=True)
df.drop_duplicates(subset=['tin'], inplace=True, ignore_index=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46854 entries, 0 to 46853
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tin                  46854 non-null  float64
 1   npi_provider_groups  46854 non-null  object 
dtypes: float64(1), object(1)
memory usage: 732.2+ KB


In [None]:
# df = df.explode(column= 'npi_provider_groups',ignore_index=True)

In [13]:
npi_path = 'Facility_Affiliation.csv'
npi = pd.read_csv(npi_path,usecols=['NPI','facility_afl_ccn','parent_ccn'], encoding='windows-1252')

In [53]:
npi.head()

Unnamed: 0,NPI,facility_afl_ccn,parent_ccn
0,1003000126,210003,
1,1003000126,210022,
2,1003000126,490063,
3,1003000126,490145,
4,1003000134,140010,


In [14]:
npi['ccn'] = npi.apply(lambda row: row['facility_afl_ccn'] if np.isnan(row['parent_ccn']) else row['parent_ccn'], axis=1)

In [62]:
npi.dtypes

NPI                   int64
facility_afl_ccn     object
parent_ccn          float64
ccn                  object
dtype: object

In [15]:
def find_npi(x):
    ccns = []
    for npi_value in x['npi_provider_groups']:
        queried = npi[npi['NPI']==npi_value]
        lst = queried['ccn'].to_list()
        if lst: 
            ccns = ccns + lst
    if ccns:
        count= Counter(ccns)
        x['ccn'] = count.most_common()[0][0]
    else:
        x['ccn'] = np.nan
    return x

df = df.apply(find_npi, axis=1)

In [17]:
sum(df['ccn'].notna())
sum(df['tin'].notna())

46854

In [18]:
df.to_parquet('tin_to_ccn.parquet')