In [1]:
import requests
import shutil
import ijson.backends.python as ijson
import json
import gzip
from tqdm.auto import tqdm
from csv import writer
import os
from io import StringIO
import pandas as pd
import numpy as np

In [2]:
# Define paths to important files and places where you want to store files.
# hyperlink_path: spreadsheet whose 'Hyperlinks' column lists the URLs to process.
hyperlink_path = 'json_files_hyperlinks.xlsx'
hyperlinks = pd.read_excel(hyperlink_path)['Hyperlinks'].tolist()
# Raw string: '\V' and '\C' are not valid escape sequences, so the plain
# literal only worked by accident and triggers a warning on newer Python.
parent_dir = r'D:\Vignesh\Capstone'
# Downloaded .json.gz files go here.
dir_json = os.path.join(parent_dir, 'JSON')
# Per-file output folders with the generated CSVs go here.
dir_data = os.path.join(parent_dir, 'data')

In [3]:
def download_file(url, path):
    """Download ``url`` into directory ``path`` unless it is already present.

    The local file name is the last '/'-separated component of ``url``.

    Args:
        url: Address of the file to fetch.
        path: Existing directory that receives the download.

    Returns:
        Tuple ``(local_filename, download_path)``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    local_filename = url.split('/')[-1]
    download_path = os.path.join(path, local_filename)
    if os.path.exists(download_path):
        print(download_path + '\nFile already Exists!')
        return (local_filename, download_path)

    # Stream into a sibling temporary file and rename on success, so an
    # interrupted transfer never leaves a truncated file that the
    # exists-check above would later accept as a finished download.
    partial_path = download_path + '.part'
    try:
        with requests.get(url, stream=True) as r:
            # Do not save an HTTP error page under the data file's name.
            r.raise_for_status()
            content_length = r.headers.get('content-length')
            # The header is optional; tqdm accepts total=None (unknown size).
            total_length = int(content_length) if content_length is not None else None
            with tqdm.wrapattr(r.raw, "read", total=total_length, desc="") as raw:
                with open(partial_path, 'wb') as output:
                    shutil.copyfileobj(raw, output)
        os.replace(partial_path, download_path)
    finally:
        # Only still present when the transfer or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(local_filename + '\nDownload Complete')
    return (local_filename, download_path)


def check_file_size(url):
    """Return the size in bytes that the server advertises for ``url``.

    The request is opened with ``stream=True`` and only the response headers
    are read here.

    Args:
        url: Address of the file to inspect.

    Returns:
        The ``Content-Length`` header value as an ``int``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the response carries no ``Content-Length`` header.
    """
    with requests.get(url, stream=True) as r:
        # Without this, the length of an error page would be reported as the
        # size of the data file.
        r.raise_for_status()
        content_length = r.headers.get('content-length')
        if content_length is None:
            raise ValueError('No Content-Length header returned for ' + url)
        return int(content_length)

def download_multiple_files(urls, path):
    """Download every URL in ``urls`` into directory ``path``.

    Files that already exist in ``path`` are not fetched again. The local
    file name is the last '/'-separated component of each URL.

    Args:
        urls: Iterable of addresses to fetch.
        path: Existing directory that receives the downloads.

    Returns:
        Tuple ``(local_filenames, download_paths)`` of two lists in the same
        order as ``urls``.

    Raises:
        requests.HTTPError: If a server answers with an error status.
    """
    local_filenames = []
    download_paths = []
    for url in urls:
        local_filename = url.split('/')[-1]
        download_path = os.path.join(path, local_filename)
        if os.path.exists(download_path):
            print(download_path + '\nFile already Exists!')
        else:
            # Stream into a temporary sibling and rename on success so an
            # interrupted transfer cannot be mistaken for a finished file.
            partial_path = download_path + '.part'
            try:
                with requests.get(url, stream=True) as r:
                    # Do not save an HTTP error page as if it were the data.
                    r.raise_for_status()
                    content_length = r.headers.get('content-length')
                    # The header is optional; tqdm accepts total=None.
                    total_length = int(content_length) if content_length is not None else None
                    with tqdm.wrapattr(r.raw, "read", total=total_length, desc="") as raw:
                        with open(partial_path, 'wb') as output:
                            shutil.copyfileobj(raw, output)
                os.replace(partial_path, download_path)
            finally:
                # Only still present when the transfer or the rename failed.
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            print(local_filename + '\nDownload Complete!')
        local_filenames.append(local_filename)
        download_paths.append(download_path)
    return (local_filenames, download_paths)

def delete_file(path):
    """Remove the file at ``path`` and print a confirmation message."""
    os.unlink(path)
    confirmation = path + '\nFile Deleted'
    print(confirmation)

def write_large_file(filename, index, hyperlink):
    """Append the row ``[index, hyperlink]`` to the CSV file ``filename``.

    Args:
        filename: Path of the CSV file; created if it does not exist.
        index: Position of the hyperlink in the source list.
        hyperlink: URL being recorded.
    """
    # newline='' leaves line endings to the csv writer; without it, text-mode
    # newline translation on Windows inserts a blank line after every row.
    # The with-block closes the file, so no explicit close() is needed.
    with open(filename, 'a', newline='') as f:
        writer(f).writerow([index, hyperlink])
    print('File Too Large, written to Large File CSV')

def write_completed_file(filename, index, hyperlink):
    """Append the row ``[index, hyperlink]`` to the CSV file ``filename``.

    Args:
        filename: Path of the CSV file; created if it does not exist.
        index: Position of the hyperlink in the source list.
        hyperlink: URL being recorded.
    """
    # newline='' leaves line endings to the csv writer; without it, text-mode
    # newline translation on Windows inserts a blank line after every row.
    # The with-block closes the file, so no explicit close() is needed.
    with open(filename, 'a', newline='') as f:
        writer(f).writerow([index, hyperlink])
    print('File has been completed')

def write_provider_csv(filename, reference, tin, npi_provider_groups):
    """Append one CSV row per entry of ``reference`` to ``filename``.

    Row ``i`` is ``[reference[i], tin[i], npi_provider_groups[i]]``.

    Args:
        filename: Path of the CSV file; created if it does not exist.
        reference: Sequence that drives the number of rows written.
        tin: Sequence indexed in parallel with ``reference``.
        npi_provider_groups: Sequence indexed in parallel with ``reference``.

    Raises:
        IndexError: If ``tin`` or ``npi_provider_groups`` is shorter than
            ``reference``.
    """
    # newline='' leaves line endings to the csv writer; without it, text-mode
    # newline translation on Windows inserts a blank line after every row.
    # The with-block closes the file, so no explicit close() is needed.
    with open(filename, 'a', newline='') as f:
        writer(f).writerows(
            [ref, tin[i], npi_provider_groups[i]]
            for i, ref in enumerate(reference)
        )


def write_rates_csv(filename, billing_code, provider_reference, rate):
    """Append one CSV row per entry of ``provider_reference`` to ``filename``.

    Row ``i`` is ``[billing_code[i], provider_reference[i], rate[i]]``.

    Args:
        filename: Path of the CSV file; created if it does not exist.
        billing_code: Sequence indexed in parallel with ``provider_reference``.
        provider_reference: Sequence that drives the number of rows written.
        rate: Sequence indexed in parallel with ``provider_reference``.

    Raises:
        IndexError: If ``billing_code`` or ``rate`` is shorter than
            ``provider_reference``.
    """
    # newline='' leaves line endings to the csv writer; without it, text-mode
    # newline translation on Windows inserts a blank line after every row.
    # The with-block closes the file, so no explicit close() is needed.
    with open(filename, 'a', newline='') as f:
        writer(f).writerows(
            [billing_code[i], ref, rate[i]]
            for i, ref in enumerate(provider_reference)
        )

def make_paths_folders(filename, json_file):
    """Ensure the output folder for ``filename`` exists under ``dir_data``.

    The folder name is ``filename`` without its last 8 characters (the
    length of a '.json.gz' suffix).

    Args:
        filename: Name of the downloaded file.
        json_file: Not used by this function; kept so existing callers keep
            working.

    Returns:
        Path of the output folder.
    """
    folder_name = filename[0:-8]
    path = os.path.join(dir_data, folder_name)
    if os.path.exists(path):
        print(path + '\nFolder Already Exists')
    else:
        # makedirs also creates dir_data itself when it is missing, where
        # os.mkdir would fail with FileNotFoundError.
        os.makedirs(path)
    return path

def parse_file(filename, json_file):
    """Stream a gzipped JSON file into a providers CSV and a rates CSV.

    Events are read one at a time with ``ijson.parse`` and buffered in lists
    that are appended to the CSV files in batches (100 provider rows, 1000
    rate rows), so the whole document is never held in memory.

    Output files live in the folder returned by ``make_paths_folders`` and
    are named after ``filename`` without its last 8 characters, with
    '_providers.csv' and '_rates.csv' appended.

    Args:
        filename: Name of the downloaded file.
        json_file: Path of the gzip-compressed JSON file to read.

    NOTE(review): one provider row is emitted per ``provider_group_id`` event
    using the most recently seen npi array and tin value, and one rate row
    per ``negotiated_rate`` event using the most recently seen billing code
    and provider reference. This assumes those values precede the triggering
    event in the stream and that the last one seen is the one wanted; confirm
    against the input files.
    NOTE(review): the header rows are appended on every call, so parsing the
    same file twice repeats the header and the data in the existing CSVs.
    """
    path = make_paths_folders(filename, json_file)
    providers_csv = os.path.join(path, filename[0:-8]) + '_providers.csv'
    rates_csv = os.path.join(path, filename[0:-8]) + '_rates.csv'
    # Header rows are written through the same append-mode helpers as data.
    write_provider_csv(providers_csv, ['provider_reference'], ['tin'], ['npi_provider_groups'])
    write_rates_csv(rates_csv, ['billing_code'], ['provider_reference'], ['negotiated_rates'])

    # Pending provider rows (three parallel lists).
    npi_provider_groups = []
    tin = []
    reference = []

    # Pending rate rows (three parallel lists).
    billing_code = []
    ref_group = []
    rates = []

    # Decode explicitly as UTF-8; the platform default encoding is not
    # guaranteed to be UTF-8 (notably on Windows).
    with gzip.open(json_file, mode="rt", encoding="utf-8") as f:
        parser = ijson.parse(f)
        for prefix, event, value in parser:
            # Flush full batches before handling the next event.
            if len(npi_provider_groups) >= 100:
                write_provider_csv(providers_csv, reference, tin, npi_provider_groups)
                npi_provider_groups = []
                tin = []
                reference = []

            if len(rates) >= 1000:
                write_rates_csv(rates_csv, billing_code, ref_group, rates)
                billing_code = []
                ref_group = []
                rates = []

            if prefix == 'provider_references.item.provider_groups.item.npi' and event == 'start_array' and value is None:
                # A fresh list per npi array, so rows already buffered keep
                # their own list object.
                temp_npi = []
            elif prefix == 'provider_references.item.provider_groups.item.npi.item' and event == 'number':
                temp_npi.append(value)
            elif prefix == 'provider_references.item.provider_groups.item.tin.value' and event == 'string':
                temp_tin = value
            elif prefix == 'provider_references.item.provider_group_id' and event == 'number':
                npi_provider_groups.append(temp_npi)
                tin.append(temp_tin)
                reference.append(value)

            elif prefix == 'provider_references' and event == 'end_array':
                write_provider_csv(providers_csv, reference, tin, npi_provider_groups)
                npi_provider_groups = []
                tin = []
                reference = []

            elif prefix == 'in_network.item.billing_code' and event == 'string':
                temp_code = value
            elif prefix == 'in_network.item.negotiated_rates.item.provider_references.item' and event == 'number':
                temp_ref = value
            elif prefix == 'in_network.item.negotiated_rates.item.negotiated_prices.item.negotiated_rate' and event == 'number':
                billing_code.append(temp_code)
                ref_group.append(temp_ref)
                rates.append(value)
            elif prefix == 'in_network' and event == 'end_array':
                write_rates_csv(rates_csv, billing_code, ref_group, rates)
                billing_code = []
                ref_group = []
                rates = []

    # Rows buffered after the last flush would otherwise be lost when the
    # event stream ends without the closing end_array handled above.
    if reference:
        write_provider_csv(providers_csv, reference, tin, npi_provider_groups)
    if rates:
        write_rates_csv(rates_csv, billing_code, ref_group, rates)
    print(json_file + '\nParse Complete')

def download_parse(url, path):
    """Download ``url`` into ``path``, then parse the downloaded file."""
    name, local_path = download_file(url, path)
    parse_file(name, local_path)


def download_parse_delete(url, path):
    """Download ``url`` into ``path``, parse it, then delete the download."""
    name, local_path = download_file(url, path)
    parse_file(name, local_path)
    # The downloaded archive is removed once it has been parsed.
    delete_file(local_path)

In [38]:
# Spot-check one entry of the URL list; the value is shown as the cell output.
hyperlinks[50]

'https://uhc-tic-mrf.azureedge.net/public-mrf/2023-01-01/2023-01-01_Neighborhood-Health-Partnership--Inc_Insurer_D0008974_UHC-Dental_in-network-rates.json.gz'

In [4]:
# Batch settings: process `number_to_parse` hyperlinks beginning at index `start`.
number_to_parse = 100
start = 131
# Files whose advertised size exceeds this many bytes are only logged, not downloaded.
max_file_bytes = 50000000
large_files = 'json_large_hyperlinks.csv'
completed = 'json_completed_hyperlinks.csv'
# Clamp the batch to the list length so a batch near the end of the list
# stops cleanly instead of raising IndexError.
stop = min(start + number_to_parse, len(hyperlinks))
for i in range(start, stop):
    if check_file_size(hyperlinks[i]) <= max_file_bytes:
        print('Hyperlink File: ' + str(i) + ' Started!')
        download_parse_delete(hyperlinks[i], dir_json)
        write_completed_file(completed, i, hyperlinks[i])
    else:
        write_large_file(large_files, i, hyperlinks[i])
        print('File number: ' + str(i))


Hyperlink File: 91 Started!


  0%|          | 0/15757805 [00:00<?, ?it/s]

2023-01-01_Neighborhood-Health-Partnership--Inc-_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Neighborhood-Health-Partnership--Inc-_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Neighborhood-Health-Partnership--Inc-_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 92 Started!


  0%|          | 0/1072 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice-Inc-_Insurer_Amwell-Provider-Network_AWNETWORK_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_Amwell-Provider-Network_AWNETWORK_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_Amwell-Provider-Network_AWNETWORK_in-network-rates.json.gz
File Deleted
File has been completed
File Too Large, written to Large File CSV
File number: 93
File Too Large, written to Large File CSV
File number: 94
File Too Large, written to Large File CSV
File number: 95
File Too Large, written to Large File CSV
File number: 96
File Too Large, written to Large File CSV
File number: 97
File Too Large, written to Large File CSV
File number: 98
File Too Large, written to Large File CSV
File number: 99
File Too Large, written to Large File CSV
File number: 100
File Too Large, written to Large File CSV
File number: 101
File Too Large, written to Large File CSV
File number: 102
Fil

  0%|          | 0/705936 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice-Inc-_Insurer_Galileo-Provider-Network_GLNETWORK_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_Galileo-Provider-Network_GLNETWORK_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_Galileo-Provider-Network_GLNETWORK_in-network-rates.json.gz
File Deleted
File has been completed
File Too Large, written to Large File CSV
File number: 110
Hyperlink File: 111 Started!


  0%|          | 0/1085015 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice-Inc-_Insurer_National-Ancillary-Network_NANETWORK_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_National-Ancillary-Network_NANETWORK_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_National-Ancillary-Network_NANETWORK_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 112 Started!


  0%|          | 0/6168 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice--Inc-_Insurer_OBPM---Optum-Bundle-Payment_OBPM_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc-_Insurer_OBPM---Optum-Bundle-Payment_OBPM_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc-_Insurer_OBPM---Optum-Bundle-Payment_OBPM_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 113 Started!


  0%|          | 0/15757776 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice--Inc-_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc-_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc-_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 114 Started!


  0%|          | 0/15757778 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice--Inc-_Insurer_PPO---NDC_PPO-NDC_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc-_Insurer_PPO---NDC_PPO-NDC_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc-_Insurer_PPO---NDC_PPO-NDC_in-network-rates.json.gz
File Deleted
File has been completed
File Too Large, written to Large File CSV
File number: 115
Hyperlink File: 116 Started!


  0%|          | 0/509258 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice-Inc-_Insurer_UHC---Embedded-Vision_UHC-Vision_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_UHC---Embedded-Vision_UHC-Vision_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_UHC---Embedded-Vision_UHC-Vision_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 117 Started!


  0%|          | 0/46672100 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice-Inc-_Insurer_Virginia-Provider-Network_VANETWORK_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_Virginia-Provider-Network_VANETWORK_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice-Inc-_Insurer_Virginia-Provider-Network_VANETWORK_in-network-rates.json.gz
File Deleted
File has been completed
File Too Large, written to Large File CSV
File number: 118
Hyperlink File: 119 Started!


  0%|          | 0/6243 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice--Inc--and-MAMSI-Life-and-Health-Insurance-Company_Insurer_OBPM---Optum-Bundle-Payment_OBPM_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc--and-MAMSI-Life-and-Health-Insurance-Company_Insurer_OBPM---Optum-Bundle-Payment_OBPM_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc--and-MAMSI-Life-and-Health-Insurance-Company_Insurer_OBPM---Optum-Bundle-Payment_OBPM_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 120 Started!


  0%|          | 0/15757851 [00:00<?, ?it/s]

2023-01-01_Optimum-Choice--Inc--and-MAMSI-Life-and-Health-Insurance-Company_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc--and-MAMSI-Life-and-Health-Insurance-Company_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Optimum-Choice--Inc--and-MAMSI-Life-and-Health-Insurance-Company_Insurer_PPO-NDC_PPO-NDC_in-network-rates.json.gz
File Deleted
File has been completed
File Too Large, written to Large File CSV
File number: 121
Hyperlink File: 122 Started!


  0%|          | 0/3779259 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Freedom-Network_24_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Freedom-Network_24_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Freedom-Network_24_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 123 Started!


  0%|          | 0/3684743 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Liberty-Network_25_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Liberty-Network_25_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Liberty-Network_25_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 124 Started!


  0%|          | 0/2552045 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Metro-Network_27_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Metro-Network_27_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Metro-Network_27_in-network-rates.json.gz
File Deleted
File has been completed
File Too Large, written to Large File CSV
File number: 125
Hyperlink File: 126 Started!


  0%|          | 0/222697 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-Acupuncture-Massage-Naturopath_31_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-Acupuncture-Massage-Naturopath_31_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-Acupuncture-Massage-Naturopath_31_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 127 Started!


  0%|          | 0/2727185 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-Chiro_28_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-Chiro_28_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-Chiro_28_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 128 Started!


  0%|          | 0/406337 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-ST_30_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-ST_30_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OHPH-ST_30_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 129 Started!


  0%|          | 0/229547 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OPH---Optum-Physical-Health_OPH-160_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OPH---Optum-Physical-Health_OPH-160_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_OPH---Optum-Physical-Health_OPH-160_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 130 Started!


  0%|          | 0/4003228 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Optum-Health-Behavioral-Services--OHBS-_5_in-network-rates.json.gz
Download Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Optum-Health-Behavioral-Services--OHBS-_5_in-network-rates.json.gz
Parse Complete
D:\Vignesh\Capstone\JSON\2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_Optum-Health-Behavioral-Services--OHBS-_5_in-network-rates.json.gz
File Deleted
File has been completed
Hyperlink File: 131 Started!


  0%|          | 0/15757791 [00:00<?, ?it/s]

2023-01-01_Oxford-Health-Insurance--Inc-_Insurer_PPO---NDC_PPO-NDC_in-network-rates.json.gz
Download Complete


OSError: [Errno 22] Invalid argument