## Add provider groups to our SQL database

Let's first import the basic packages we will be using.

In [28]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from datetime import date, datetime, timedelta
import sqlalchemy
import pymysql
import openpyxl
import glob
from ast import literal_eval

# Files we need to obtain

We first need to obtain the files that contain our provider groups. I have stored the parent directory in a local `.env` file (read with dotenv as the `dir` variable), but feel free to replace the variables here with your own.

In [2]:
# Pull variables from the local .env file into the process environment.
load_dotenv()

hyperlink_path = 'json_completed_hyperlinks_update.csv'

# Fail early with a readable message: os.getenv returns None for a missing
# variable, and os.path.join(None, ...) would only raise an opaque TypeError.
parent_dir = os.getenv('dir')
if parent_dir is None:
    raise RuntimeError(
        "Environment variable 'dir' is not set; define it in your .env file "
        "or replace parent_dir with your own path."
    )
data_dir = os.path.join(parent_dir, 'data_update')

# The CSV has no header row, so the two column names are supplied here.
df = pd.read_csv(hyperlink_path, header=None, names=['ParseID', 'Hyperlink'])
hyperlinks = df['Hyperlink'].tolist()

def foldername(hyperlink):
    """Return the last '/'-separated segment of `hyperlink` minus its final 8 characters.

    The 8 trailing characters are presumably a '.json.gz' extension -- TODO
    confirm against the hyperlink list. The strip is unconditional, so a
    segment of 8 characters or fewer yields an empty string.
    """
    last_segment = hyperlink.rsplit('/', 1)[-1]
    return last_segment[:-8]
def providers_path(folder):
    """Build the path `<data_dir>/<folder>/<folder>_providers.csv`.

    Reads the notebook-level `data_dir`; `folder` is used both as the
    sub-directory name and as the prefix of the CSV file name.
    """
    csv_name = folder + '_providers.csv'
    return os.path.join(data_dir, folder, csv_name)

# One folder name per hyperlink, then the providers CSV path inside each folder.
folder_names = list(map(foldername, hyperlinks))
provider_files = list(map(providers_path, folder_names))

# Show the first path as a quick sanity check.
provider_files[0]

'D:/Vignesh/Capstone\\data_update\\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates\\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates_providers.csv'

# Let's read one:

Let us read one file to see what we are dealing with.

In [23]:
# Remember which folder the sample file belongs to.
folder = folder_names[0]

# Load just the two columns of interest from the first providers file.
# literal_eval parses the text stored in npi_provider_groups into a Python
# object (the rendered sample shows lists of NPI numbers).
df = pd.read_csv(
    provider_files[0],
    usecols=['tin', 'npi_provider_groups'],
    converters={'npi_provider_groups': literal_eval},
)

In [25]:
# A cell only renders its last expression, so a bare `df.dtypes` on the first
# line would be silently discarded; display() it explicitly.
# (`display` is provided by the IPython/Jupyter kernel.)
display(df.dtypes)
df.head()

Unnamed: 0,tin,npi_provider_groups
0,593582520,[1225090087]
1,272050459,[1639508567]
2,160743209,[1609314343]
3,561844651,"[1215134309, 1043649635, 1851721047, 1285715185]"
4,371756970,[1174710636]


Notice that each row holds a list of NPI numbers. Let's explode it so that every NPI gets its own row.

In [26]:
# Turn each element of the npi_provider_groups lists into its own row; the
# other columns are repeated and the index is renumbered from 0.
df_explode = df.explode('npi_provider_groups', ignore_index=True)

df_explode.head()

Unnamed: 0,tin,npi_provider_groups
0,593582520,1225090087
1,272050459,1639508567
2,160743209,1609314343
3,561844651,1215134309
4,561844651,1043649635


# Let's add all the files to a single DataFrame

In [33]:
# Read every providers file, keeping only the TIN and its parsed NPI list,
# and stack them into a single frame.
df = pd.concat(
    (
        pd.read_csv(
            f,
            usecols=['tin', 'npi_provider_groups'],
            converters={'npi_provider_groups': literal_eval},
        )
        for f in provider_files
    ),
    ignore_index=True,
)

# Explode BEFORE de-duplicating: drop_duplicates hashes the cell values, and
# the list objects held in npi_provider_groups are unhashable (TypeError).
# After the explode each row is a scalar (tin, npi) pair, so duplicates that
# come from different files or different groupings are removed as well.
df = (
    df.explode(column='npi_provider_groups', ignore_index=True)
      .drop_duplicates(ignore_index=True)
)

npi_path = 'Facility_Affiliation.csv'
npi = pd.read_csv(npi_path, usecols=['NPI', 'facility_afl_cnn', 'parent_cnn'])

# Fall back to the facility's own number when no parent is recorded.
# fillna catches every missing value (an identity test against np.nan only
# matches that one singleton object) and is vectorized instead of a
# row-by-row apply.
npi['parent_cnn'] = npi['parent_cnn'].fillna(npi['facility_afl_cnn'])




KeyboardInterrupt: 