In [1]:

import os
import requests
import json
import datetime
import shutil
from bs4 import BeautifulSoup
import pandas as pd
from random import choice
from selenium.common.exceptions import TimeoutException
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse
from time import sleep, time
from random import uniform, randint
import json

def create_driver(download_path, driver_path):

    """
    Create selenium chrome browser so we can pull the page source passed in url
    """

    # set up Chrome browser for selenium

    options = webdriver.ChromeOptions()

    # add headless option 

    options.add_argument("headless")

    # simulate maxing out the browser window

    options.add_argument("start-maximized")

    # remove selenium log level 

    options.add_argument("--log-level=3")

    # disable blink features to get around captcha

    options.add_argument("--disable-blink-features")

    options.add_argument("--disable-blink-features=AutomationControlled")

    options.add_experimental_option("excludeSwitches", ["enable-automation"])

    # add browser notifications

    options.add_experimental_option("prefs", { 
        "profile.default_content_setting_values.notifications": 1 
    })

    # set download path

    chrome_prefs = {"download.default_directory": download_path}

    options.experimental_options["prefs"] = chrome_prefs

    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(executable_path=driver_path, options=options)

    # set user agent to avoid being blocked by websites

    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

    driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'})

    # get generic headers

    driver.get('https://www.httpbin.org/headers')

    return driver


def get_url_data(driver, url, is_download=False, download_path=None, wait=False):

    """Use driver to get page source or download data"""

    # if is_download is true, get page or download data

    def _download_file(driver, url, download_path, wait):
    
        """Download file from url"""

        # download file and wait for download to complete

        if wait == True:
            driver.get(url)
            sleep(15)
        else:
            driver.get(url)

    if is_download:
        
        _download_file(driver, url, download_path, wait)
        
    else:
        try:
            driver.get(url)
        except TimeoutException:
            print("Loading took too much time!")
        

    return driver



def get_unc_data(driver, url):

    """Create drivers to bypass captcha for UNC data"""

    def _wait_between(a,b):
        rand=uniform(a, b) 
        sleep(rand)


    try:
        driver.get(url)
    except TimeoutException:
        print("Loading took too much time!")
    
    driver.find_element_by_xpath('/html/body/div[1]/div/div/div[2]/div/div[1]/div[1]/div[2]/div/a[2]').click()

    sleep(10)

    mainWin = driver.current_window_handle  

    # move the driver to the first iFrame 
    driver.switch_to_frame(driver.find_elements_by_tag_name("iframe")[0])

    # *************  locate CheckBox  **************
    CheckBox = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID ,"recaptcha-anchor"))
            ) 

    # *************  click CheckBox  ***************
    _wait_between(0.5, 0.7)  
    
    # making click on captcha CheckBox 
    CheckBox.click()

    # switch back to main window

    driver.switch_to.window(mainWin)

    driver.find_element_by_xpath('/html/body/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div/div/div/div[2]/div/div/div/div/fieldset/div/div/div/div[4]/div/div/div/div/div/span/input').click()

    driver.find_element_by_xpath('/html/body/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div/div[2]/div/div/div/div/div[2]/div/div/div/div/fieldset/div/div/div/a').click()

    return driver


def wait_for_downloads(download_path):

    """Wait for downloads to complete"""

    print("Waiting for downloads..", end="")

    
    timeout = 600   # [seconds]

    timeout_start = time()

    # while runtime is greater than 10 minutes then exit
    
    while any([filename.endswith(".crdownload") for filename in os.listdir(download_path)]):
        if time() > timeout_start + timeout:
            sleep(1)
            print("..", end="")
            break

    print("done!")


c:\Users\remot\OneDrive\Desktop\Personal\nc-hospital-transparency\drivers\chromedriver.exe


In [14]:
# read in hospitals csv

df = pd.read_csv('../hospitals.csv')

In [15]:
# loop through each hospital and ping standard charges page
driver = create_driver(download_path)

hospital_data_urls = {}

unc_urls = []

ext = ['.json','.csv', 'wpfb_dl', '.xlsx', 'ptapp']

for index, row in df.iterrows():

    records = []
    
    browser = get_url_data(driver, row['hospital_url'])

    # get the page source and parse itS

    source = browser.page_source

    soup = BeautifulSoup(source, 'lxml')
    
    for entry in soup.find_all(['a'], href=True): 

        download_url = entry.get('href')

        if any(file_type in download_url for file_type in ext):
            # determine if the base hospital_url is in download_url or not

            if row['hospital_id'] in 'vidant-health' and '.csv' in download_url:
                continue

            if download_url.startswith('/'):
                # if download_url starts with '/' then add the base hospital_url to the download_url

                data_url = urlparse(row['hospital_url']).scheme + "://" + urlparse(row['hospital_url']).netloc + '/'  + download_url

                records.append(data_url)

            else:
                # if download_url does not start with '/' then add the base hospital_url to the download_url
                records.append(download_url)

            
    if 'first-health' in row['hospital_id']:

        limit = 250

        page = 0

        first_health_url = row['hospital_url'].format(0)

        print(first_health_url)

        output = requests.get(first_health_url).json()

        page_limit_max = int((output['count'] / limit) + 1)

        first_health_urls = [records.append(row['hospital_url'].format(page)) for page in range(1, page_limit_max)]
        
    if 'wakemed' in row['hospital_id']:

        records.append(row['hospital_url'])
        
    hospital_data_urls[row['hospital_id']] = list(set(records))

https://apim.services.craneware.com/api-pricing-transparency/api/public/148edeae6e4a28704905be6a5f4e3039/charges/standardCharges?page=0&limit=250&search=&codeType=
https://apim.services.craneware.com/api-pricing-transparency/api/public/2540dcb17c9d1497c17eacc26ed9c9ec/charges/standardCharges?page=0&limit=250&search=&codeType=


In [12]:
# write to a json file
hospital_data_urls['northern-regional'] = list(set(hospital_data_urls['northern-regional']))

hospital_data_urls

{'duke-university-hospital': ['https://www.dukehealth.org//sites/default/files/general_page/56-2070036_DUH_standardcharges_cdm.csv',
  'https://www.dukehealth.org//sites/default/files/general_page/56-2070036_DRaH_standardcharges_cdm.csv',
  'https://www.dukehealth.org//sites/default/files/general_page/56-2070036_DRH_standardcharges_cdm.csv',
  'https://www.dukehealth.org//sites/default/files/general_page/56-2070036_DUH_standardcharges_drg.csv',
  'https://www.dukehealth.org//sites/default/files/general_page/56-2070036_DRaH_standardcharges_drg.csv',
  'https://www.dukehealth.org//sites/default/files/general_page/56-2070036_DRH_standardcharges_drg.csv'],
 'north-carolina-baptist-hospital': ['https://www.wakehealth.edu//-/media/WakeForest/Clinical/Files/Patient-and-Family-Resources/Transparency-Files/North-Carolina-Baptist-Hospital--Transparency-File-FINAL.csv?la=en',
  'https://www.wakehealth.edu//-/media/WakeForest/Clinical/Files/Patient-and-Family-Resources/Transparency-Files/Lexington

In [23]:
# remove hospital_data_urls keys where values are empty

hospital_empty_urls = {k: v for k, v in hospital_data_urls.items() if v == []}

hospital_contain_urls = {k: v for k, v in hospital_data_urls.items() if v != []}

hospitals_collected = list(hospital_contain_urls.keys())

hospital_not_collected = list(hospital_empty_urls.keys())

In [24]:

# url_dictionary: {'hospital_id': [url1, url2, url3]} 

# loop through each hospital_id in url dictionary and iterate through url list and download data

export_data = {}

structured_ext = ['.csv', 'wpfb_dl', '.xlsx', '.json']

skip_hospital = ['university-of-north-carolina-hospital', 'atrium-health', 'wakemed']

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36'} # This is chrome, you can set whatever browser you like

# remove all files in download_path

for file in os.listdir(download_path):
    file_path = os.path.join(download_path, file)
    try:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    except Exception as e:
        print(e)


for hospital_id, url_list in hospital_data_urls.items():

    # if hospital_id is atrium then go to next item in hospital_data_urls dictionary

    if any(ids in hospital_id for ids in skip_hospital): 

        export_data[hospital_id] = url_list

        continue

    print(hospital_id)

    # iterate through lists in url_list and download data

    for index, url in enumerate(url_list):

        # if url ends with .csv, .xlsx, .json, or .wpfb_dl then download data

        if index == 0:

            wait = True
        
        else:

            wait = False

        if any(file_type in url for file_type in structured_ext):

            # get the data from the url

            get_url_data(driver, url, is_download=True, download_path=download_path, wait=wait)


start_time  = time()

# while runtime is greater than 10 minutes then exit

while time() - start_time < 600:

    # wait for download to complete in selenium driver

    wait_for_downloads(download_path)




duke-university-hospital
north-carolina-baptist-hospital
app-regional-health-system
catawba-valley-medical-center
cateret-health-care
cone-health
first-health-moore
first-health-montgomery
iredell-health
mission-health
nhrmc-health
northern-regional
novant-health
vidant-health
Waiting for downloads.............................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

KeyboardInterrupt: 

In [None]:
    
for hospital_id, url_list in export_data.items():
    
    if 'wakemed' in row['hospital_id']:

        xpath = '/html/body/app-root/app-allservices/div[1]/div/div[3]/div/app-paginator/div[2]/div/div/button'

        browser.execute_script("arguments[0].click();", WebDriverWait(browser, 20).until(EC.element_to_be_clickable((By.XPATH, xpath))))

        browser.switch_to.alert.accept()
    
    if 'university-of-north-carolina-hospital' in row['hospital_id']:
                    
        for unc in unc_urls:

            driver = get_unc_data(driver, unc)

            source = driver.page_source

            soup = BeautifulSoup(source, 'lxml')

            for entry in soup.find_all(['iframe'], href=True):

                download_url = entry.get('src')

                if 'CSV' in download_url:
        
                    base_url = 'https://portalapprev.com/ptapp/#'

                    download_url = base_url + '/' + download_url

                    records.append(download_url)

In [7]:
import os
import requests
import json
import datetime
import shutil
from bs4 import BeautifulSoup
import pandas as pd
from random import choice
from selenium.common.exceptions import TimeoutException
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from urllib.parse import urlparse
from time import sleep, time
from random import uniform, randint
import json

abspath = os.path.normpath(os.path.dirname(os.path.dirname('')))

download_path = os.path.normpath(os.path.join(abspath, 'data', 'raw'))

print(abspath)

def wait_for_downloads(download_path):

    """Wait for downloads to complete"""

    print("Waiting for downloads..")

    
    timeout = 10   # [seconds]

    timeout_start = time()

    # while runtime is greater than 10 minutes then exit
    
    while any([filename.endswith(".crdownload") for filename in os.listdir(download_path)]):
        if time() > timeout_start + timeout:
            sleep(1)
            print("..", end="")
            break

    print("done!")


.


In [8]:

download_path = r'C:\Users\remot\OneDrive\Desktop\hospital-chargemaster\data\raw'

wait_for_downloads(download_path)



Waiting for downloads....done!


In [1]:
# read json file and convert to dataframe

northern-regional

NameError: name 'download_path' is not defined

In [2]:

def wait_for_downloads(download_path):

    """Wait for downloads to complete"""
    
    timeout = 10   # [seconds]

    timeout_start = time()

    # while runtime is greater than 10 minutes then exit

    print("Waiting for downloads..", end="")
    
    while any([filename.endswith(".crdownload") for filename in os.listdir(download_path)]):
        
        sleep(1)
        print("..", end="")

    print("done!")


In [35]:
import pandas as pd

path = r"C:\Users\remot\OneDrive\Desktop\Personal\nc-hospital-transparency\data\raw\northern-regional-hospital_standardcharges.json"

In [36]:
# read json file and convert to dataframe

df = pd.read_json(path, lines=True)

In [46]:
# drop percent_occurence_within_primary_code column

df_all = df.drop(columns=['PACKAGE_TYPE', 'PERCENT_OCCURRENCE_WITHIN_PRIMARY_CODE','SUPPORTING_SERVICE_CODE' ,'SUPPORTING_SERVICE_CODE_DESCRIPTION'])

In [47]:
for column in df_all:
    if df_all[column].dtype == 'float64':
        df_all[column]=pd.to_numeric(df_all[column], downcast='float')
    if df_all[column].dtype == 'int64':
        df_all[column]=pd.to_numeric(df_all[column], downcast='integer')


In [48]:
df_all.info(memory_usage = "deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2752925 entries, 0 to 2752924
Data columns (total 16 columns):
 #   Column                        Dtype  
---  ------                        -----  
 0   PAYER_GROUP                   object 
 1   PAYER_NAME                    object 
 2   LOCATION                      object 
 3   BILL_TYPE                     object 
 4   PT_SUMMARY                    object 
 5   PRIMARY_CODE                  object 
 6   PRIMARY_CODE_DESCRIPTION      object 
 7   PRIMARY_REV_CODE              object 
 8   PRIMARY_REV_CODE_DESCRIPTION  object 
 9   SUPPORTING_REV_CODE           int16  
 10  PRICE                         float32
 11  GROSS_CHARGES                 float32
 12  CASH_PRICE                    float32
 13  PAYER_NEGOTIATED_RATE         float32
 14  MIN_NEGOTIATED_RATE           float32
 15  MAX_NEGOTIATED_RATE           float32
dtypes: float32(6), int16(1), object(9)
memory usage: 1.9 GB


In [50]:
df.to_csv(r'C:\Users\remot\OneDrive\Desktop\Personal\nc-hospital-transparency\data\raw\northern-regional-hospital_standardcharges.csv', index=False)

In [3]:
import glob

abspath = os.path.normpath(os.path.dirname(os.path.dirname('')))

download_path = os.path.normpath(os.path.join(abspath, 'data', 'raw'))

print(abspath)

def json_to_csv(json, file):
    
    df = pd.read_json(json)

    df.to_csv(file, index=False)


.


In [2]:
glob.glob(download_path + '/*.json')

[]