In [6]:
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from datetime import date, datetime, timedelta
import sqlalchemy
import pymysql
import openpyxl
import glob
from ast import literal_eval
from collections import Counter
from tqdm.auto import tqdm
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
import pyarrow as pa

In [7]:
load_dotenv()

hyperlink_path = 'json_completed_hyperlinks_update.csv'
parent_dir = os.getenv('dir')
data_dir = os.path.join(parent_dir,'data_update')

df = pd.read_csv(hyperlink_path, header=None)
df.head()
df.columns = ['ParseID','Hyperlink']
hyperlinks = df['Hyperlink'].tolist()

def foldername(hyperlink):
    hyperlink = hyperlink.split('/')[-1]
    return hyperlink[0:-8]
def providers_path(folder):
    return os.path.join(data_dir,folder,folder+'_providers.csv')

folder_names= [foldername(hyperlink) for hyperlink in hyperlinks]
provider_files = [providers_path(folder_name) for folder_name in folder_names]

ccn = pd.read_parquet('tin_to_ccn_nonan.parquet')

In [22]:
ccn = ccn.astype({'tin':'int64'})
ccn.dtypes

tin                     int64
npi_provider_groups    object
ccn                    object
dtype: object

In [23]:
ccn.drop_duplicates(subset = ['tin','ccn'], inplace=True, ignore_index=True)
ccn.info()
ccn.head(100)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3895 entries, 0 to 3894
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   tin                  3895 non-null   int64 
 1   npi_provider_groups  3895 non-null   object
 2   ccn                  3895 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.4+ KB


Unnamed: 0,tin,npi_provider_groups,ccn
0,1356638811,[1356638811],297112
1,1285698381,[1285698381],030101
2,1508899253,[1508899253],290007
3,1932106853,[1932106853],030055
4,1215107347,[1215107347],060006
...,...,...,...
95,880232840,"[1770553760, 1679551642, 1407833395]",290045
96,880500863,"[1003174368, 1184736308, 1477521193, 114429807...",290007
97,561919753,"[1427022755, 1073803631, 1083699805, 102308729...",030117
98,870516264,"[1639279698, 1770676793, 1518418623, 141790618...",460001


In [24]:
df = pd.read_csv(provider_files[2], usecols=['provider_reference','tin','npi_provider_groups'], converters={'npi_provider_groups': literal_eval})
provider_files[0]

'D:/Vignesh/Capstone\\data_update\\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates\\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates_providers.csv'

In [26]:
merged = df.merge(ccn[['tin','ccn']], how='inner', on=['tin'])
merged.dropna(subset=['ccn'], inplace=True)
merged.head()

Unnamed: 0,provider_reference,tin,npi_provider_groups,ccn
0,0,1356638811,[1356638811],297112
1,1,1285698381,[1285698381],30101
2,1106,1285698381,[1285698381],30101
3,11,1508899253,[1508899253],290007
4,34,1932106853,[1932106853],30055


In [27]:
def merged_prov_ccn(file_path):
    df = pd.read_csv(file_path, usecols=['provider_reference','tin','npi_provider_groups'], converters={'npi_provider_groups': literal_eval})
    df.dropna(subset=['tin'],inplace=True)
    df = df.astype({'tin':'int64'})
    merged = df.merge(ccn[['tin','ccn']], how='inner', on=['tin'])
    return merged

def save_file(df, path):
    df.to_parquet(path)
    print('File saved to parquet: \n' + path)
    

In [28]:
for i, provider in enumerate(provider_files):
    merged = merged_prov_ccn(provider)
    save_file(merged, os.path.join(data_dir,folder_names[i],folder_names[i]+'_ccn.parquet') )

File saved to parquet: 
D:/Vignesh/Capstone\data_update\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates_ccn.parquet
File saved to parquet: 
D:/Vignesh/Capstone\data_update\2023-01-01_Golden-Rule-Insurance-Company_Insurer_PPO---NDC_PPO-NDC_in-network-rates\2023-01-01_Golden-Rule-Insurance-Company_Insurer_PPO---NDC_PPO-NDC_in-network-rates_ccn.parquet
File saved to parquet: 
D:/Vignesh/Capstone\data_update\2023-01-01_Health-Plan-of-Nevada-Inc-_Insurer_Commercial-HMO_Commercial-HMO_in-network-rates\2023-01-01_Health-Plan-of-Nevada-Inc-_Insurer_Commercial-HMO_Commercial-HMO_in-network-rates_ccn.parquet
File saved to parquet: 
D:/Vignesh/Capstone\data_update\2023-01-01_HEALTHSCOPE-BENEFITS_Third-Party-Administrator_SHO_U1_3342_in-network-rates\2023-01-01_HEALTHSCOPE-BENEFITS_Third-Party-Administrator_SHO_U1_3342_in-network-rates_ccn.parquet
File saved to parquet: 
D:/Vignesh

In [8]:
for i, folder in tqdm(enumerate(folder_names)):
    ref_path = os.path.join(data_dir,folder,folder+'_ccn.parquet')
    rate_path = os.path.join(data_dir,folder,folder+'_rates.csv')
    merge_file = os.path.join(data_dir,folder,folder+'_merge.parquet')
    print(ref_path)
    ref = pd.read_parquet(ref_path, columns = ['provider_reference', 'ccn'])
    ref = ref.astype({'provider_reference': 'int64','ccn': 'str'})
    rate = pd.read_csv(rate_path, usecols=['provider_reference','billing_type','billing_code','negotiated_rates'], 
                       dtype={'provider_reference': 'int64','billing_type':'str','billing_code':'str','negotiated_rates':'float64'})
    merged = rate.merge(ref, how='inner', on=['provider_reference'])
    merged.to_parquet(merge_file)

0it [00:00, ?it/s]

D:/Vignesh/Capstone\data_update\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates\2023-01-01_ALL-SAVERS-INSURANCE-COMPANY_Insurer_PPO---NDC_PPO-NDC_in-network-rates_ccn.parquet
D:/Vignesh/Capstone\data_update\2023-01-01_Golden-Rule-Insurance-Company_Insurer_PPO---NDC_PPO-NDC_in-network-rates\2023-01-01_Golden-Rule-Insurance-Company_Insurer_PPO---NDC_PPO-NDC_in-network-rates_ccn.parquet
D:/Vignesh/Capstone\data_update\2023-01-01_Health-Plan-of-Nevada-Inc-_Insurer_Commercial-HMO_Commercial-HMO_in-network-rates\2023-01-01_Health-Plan-of-Nevada-Inc-_Insurer_Commercial-HMO_Commercial-HMO_in-network-rates_ccn.parquet
D:/Vignesh/Capstone\data_update\2023-01-01_HEALTHSCOPE-BENEFITS_Third-Party-Administrator_SHO_U1_3342_in-network-rates\2023-01-01_HEALTHSCOPE-BENEFITS_Third-Party-Administrator_SHO_U1_3342_in-network-rates_ccn.parquet
D:/Vignesh/Capstone\data_update\2023-01-01_HealthSCOPE-Benefits--Inc-_Third-Party-Administrator_BAPTIST-HEALTH-JACKSONVILLE-WITH-R