In [1]:
# load libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import sys
import os
from IPython.display import clear_output

# load functions
# The parent of the current working directory is appended to sys.path so that the
# `lib` package can be imported (presumably `lib/` sits next to the notebook folder
# — TODO confirm against the repository layout).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)
# NOTE(review): the star import hides where names come from. The cells below call
# load_hospital_site, get_departments, get_certificates, get_details and
# get_treatments, which presumably come from this module — confirm and consider
# importing them explicitly.
from lib.functions_webscrape_atlas import *

# set display options
# None removes the column limit, so displayed DataFrames show all columns.
pd.set_option('display.max_columns', None)

## Load hospital locations

In [2]:
# Load the raw hospital locations file into a DataFrame.
# The 'zip' column is read as str (presumably to keep leading zeros — TODO confirm).
df_locations = pd.read_json(
    '../data/in/raw/atlas/locations.json',
    dtype={'zip': str},
)

In [3]:
# Build the list of hospital ids: for every entry of the 'link' column the id is
# the second-to-last '/'-separated segment of the link.
hospital_id_list = [link.split('/')[-2] for link in df_locations['link']]

## Clean hospital locations and add column for hospital id
* Create column for hospital id
* Clean empty cells
* Save data to file

In [3]:
# Add a 'hospital_id' column holding the second-to-last '/'-separated segment of 'link'
df_locations['hospital_id'] = df_locations['link'].apply(lambda link: link.split('/')[-2])
# Cells that contain an empty string are marked as 'not reported'
df_locations = df_locations.replace('', 'not reported')
# Write the preprocessed locations to the staging folder (without the index column)
df_locations.to_csv('../data/in/staging/hospital_locations.csv', index=False)

## Webscrape departments, certificates and other details of the hospitals

In [4]:

# accumulators for the departments table
hospital_ids_departments, department_names, department_counts = [], [], []

# accumulators for the certificates table
hospital_ids_certificates, certificates = [], []

# accumulators for the details table (one value is appended per scraped hospital)
hospital_ids_details = []
total_treatments_counts, total_treatments_labels = [], []
nursing_quotient_counts, nursing_quotient_labels = [], []
nursing_counts = []
provider_types = []
bed_counts = []
semi_residential_counts = []
emergency_services = []

# visit every hospital page once and collect all three data sets from it
n_hospitals = len(hospital_id_list)
for position, hospital_id in enumerate(hospital_id_list, start=1):
    progress = round(position/n_hospitals*100, 1)
    print(f'{hospital_id} - {progress} %')

    # fetch the page content of this hospital
    soup = load_hospital_site(hospital_id)

    # departments: three lists that are appended to the running accumulators
    dept_ids, dept_names, dept_counts = get_departments(soup, hospital_id)
    hospital_ids_departments += dept_ids
    department_names += dept_names
    department_counts += dept_counts

    # certificates: two lists that are appended to the running accumulators
    cert_ids, cert_names = get_certificates(soup, hospital_id)
    hospital_ids_certificates += cert_ids
    certificates += cert_names

    # details: exactly ten values per hospital
    (
        details_hospital_id,
        treatments_count,
        treatments_label,
        quotient_count,
        quotient_label,
        nurses,
        provider,
        beds,
        semi_residential,
        emergency,
    ) = get_details(soup, hospital_id)

    for target, value in (
        (hospital_ids_details, details_hospital_id),
        (total_treatments_counts, treatments_count),
        (total_treatments_labels, treatments_label),
        (nursing_quotient_counts, quotient_count),
        (nursing_quotient_labels, quotient_label),
        (nursing_counts, nurses),
        (provider_types, provider),
        (bed_counts, beds),
        (semi_residential_counts, semi_residential),
        (emergency_services, emergency),
    ):
        target.append(value)

    # wait 15 sec between requests as requested by robots.txt of the website
    time.sleep(15)
    # replace the previous progress line instead of stacking output
    clear_output(wait=True)

# turn the accumulators into one DataFrame per table
df_departments = pd.DataFrame({
    'hospital_id': hospital_ids_departments,
    'department_name': department_names,
    'department_count': department_counts,
})
df_certificates = pd.DataFrame({
    'hospital_id': hospital_ids_certificates,
    'certificate': certificates,
})
df_details = pd.DataFrame({
    'hospital_id': hospital_ids_details,
    'total_treatments': total_treatments_counts,
    'total_treatments_label': total_treatments_labels,
    'nursing_quotient': nursing_quotient_counts,
    'nursing_quotient_label': nursing_quotient_labels,
    'nursing_count': nursing_counts,
    'provider_type': provider_types,
    'bed_count': bed_counts,
    'semi_residential_count': semi_residential_counts,
    'emergency_service': emergency_services,
})

# write the three tables to the staging folder for further processing
for staging_frame, staging_path in (
    (df_departments, '../data/in/staging/atlas_departments.csv'),
    (df_certificates, '../data/in/staging/atlas_certificates.csv'),
    (df_details, '../data/in/staging/atlas_details.csv'),
):
    staging_frame.to_csv(staging_path, index=False, encoding='utf-8')

773870 - 100.0 %
no certificates found


In [4]:
# Spot check on a single hospital page (id '773686'): read the stripped text of the
# second <strong> element inside the second 'ce-accordion__header__components' div.
soup = load_hospital_site('773686')

accordion_headers = soup.find_all(name='div', class_='ce-accordion__header__components')
strong_tags = accordion_headers[1].find_all(name='strong')
nursing_count = strong_tags[1].text.strip()

In [5]:
# display the value extracted in the previous cell
nursing_count

'1.546'

## Webscrape specific treatments of the hospitals
* Create treatments dictionary for a) webscraping and b) database dict
* Save database dict
* Retrieve treatments data via webscraping

In [10]:
# Create dictionary with treatment names as keys and urls as values.
# Each url is a bundes-klinik-atlas.de hospital page url whose query string carries
# the treatment's search label, treatment code and cHash; those three values are
# extracted below.
url_dict = {'Chirurgischer Herzklappenersatz': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771056/?tx_tverzhospitaldata_show%5Bquantile%5D=114%2C202%2C253%2C343&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Chirurgischer%20Herzklappenersatz&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAHKO0&cHash=6488c3f4b1c6d9c491b41ed9e9c59a11',
            'Minimal-invasiver Herzklappenersatz': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771056/?tx_tverzhospitaldata_show%5Bquantile%5D=114%2C202%2C253%2C343&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Minimal-invasiver%20Herzklappenersatz&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAHM0&cHash=50c6a7f418b8bba0fd937e323b8eac77',
            'Bypassoperation des Herzens': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771056/?tx_tverzhospitaldata_show%5Bquantile%5D=114%2C202%2C253%2C343&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Bypassoperation%20des%20Herzens&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAKIA&cHash=d70d02d9a5fca4c88484fe1f48e71e7f',
            'Herzkatheter mit Stent': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771056/?tx_tverzhospitaldata_show%5Bquantile%5D=114%2C202%2C253%2C343&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Herzkatheter%20mit%20Stent&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAKIB&cHash=7cae1fe1edf884bd4a96e34171e8cdf3',
            'Lungenentzündung': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/772997/?tx_tverzhospitaldata_show%5Bquantile%5D=92%2C184%2C282%2C412&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Lungenentz%C3%BCndung&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KALE0&cHash=bcf4ebff7a7f83de638c9932e30b42dd',
            'Brustkrebs-Operation': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/772063/?tx_tverzhospitaldata_show%5Bquantile%5D=9%2C57%2C141%2C217&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Brustkrebs-Operation&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KABK0&cHash=f3404911efb4741fcfb6a70f4fd95837',
            'Lungenkrebs-Operation': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771217/?tx_tverzhospitaldata_show%5Bquantile%5D=4%2C15%2C38%2C73&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Lungenkrebs-Operation&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KALK0&cHash=e3c44782d684bf077683b1dcff0563e8',
            'Darmkrebs-Operation': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771374/?tx_tverzhospitaldata_show%5Bquantile%5D=17%2C30%2C47%2C72&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Darmkrebs-Operation&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KADK0&cHash=11cb4bad024311c202ffc41fafbc1d2e',
            'Speiseröhren und Magenkrebs-Operation': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/773491/?tx_tverzhospitaldata_show%5Bquantile%5D=2%2C3%2C6%2C15&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Speiser%C3%B6hren%20und%20Magenkrebs-Operation&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAOM0&cHash=364a6f260a5792ef6206c30ae1919d11',
            'Prostatakrebs-Operation': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771299/?tx_tverzhospitaldata_show%5Bquantile%5D=15%2C40%2C75%2C127&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Prostatakrebs-Operation&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAPRK0&cHash=aba8d24f1e9ec2ebcf3ffe761111e997',
            'Pankreaskarzinom-Operation': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771700/?tx_tverzhospitaldata_show%5Bquantile%5D=4%2C7%2C11%2C17&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Pankreaskarzinom-Operation&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAPK0&cHash=173208dc51def8e0ba8cd9debd9ef268',
            'Totalendoprothese der Hüfte': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/772231/?tx_tverzhospitaldata_show%5Bquantile%5D=38%2C77%2C124%2C230&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Totalendoprothese%20der%20H%C3%BCfte&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAEH0&cHash=efbb9b98461a2830519f76249ec85cb7',
            'Totalendoprothesen-Wechsel der Hüfte': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771358/?tx_tverzhospitaldata_show%5Bquantile%5D=4%2C8%2C14%2C26&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Totalendoprothesen-Wechsel%20der%20H%C3%BCfte&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KARH0&cHash=e4898fb053c6ad37868a99bc9752feb8',
            'Totalendoprothesen-Wechsel des Knies': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771358/?tx_tverzhospitaldata_show%5Bquantile%5D=5%2C10%2C16%2C30&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Totalendoprothesen-Wechsel%20des%20Knies&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KARK0&cHash=7bc10476b7236fa9c4edba6e33c76421',
            'Totalendoprothese des Knies': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771358/?tx_tverzhospitaldata_show%5Bquantile%5D=58%2C93%2C151%2C253&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Totalendoprothese%20des%20Knies&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAEK0&cHash=03f08c1f87992324d59206f40787b5b5',
            'Behandlung auf einer Schlaganfalleinheit': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771555/?tx_tverzhospitaldata_show%5Bquantile%5D=141%2C350%2C539%2C745&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Behandlung%20auf%20einer%20Schlaganfalleinheit&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KASA0&cHash=ab40287de25dc2cf59c2e637ab1eb940',
            'Multiple Sklerose': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/772087/?tx_tverzhospitaldata_show%5Bquantile%5D=1%2C3%2C24%2C57&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Multiple%20Sklerose&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAMUS0&cHash=cc9e17629f1e9cff7ffb3b678f9c2442',
            'Parkinson': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771912/?tx_tverzhospitaldata_show%5Bquantile%5D=2%2C5%2C12%2C42&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Parkinson&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAPAR0&cHash=b4c18b572f4841ef85c661036dc1f4c4',
            'Entbindungen': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771984/?tx_tverzhospitaldata_show%5Bquantile%5D=479%2C726%2C1075%2C1695&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Entbindungen&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAEN0&cHash=147f28b60064cbadb23864ca7d7b01ac',
            'Bauchschlagader: Operation und Stent': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/772191/?tx_tverzhospitaldata_show%5Bquantile%5D=4%2C11%2C22%2C34&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Bauchschlagader%3A%20Operation%20und%20Stent%20&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KABAA0&cHash=5dcf8315c289e6dfb45d63e246cc3b56',
            'Durchblutungsstörung der Beine: Operation und Stent': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/773318/?tx_tverzhospitaldata_show%5Bquantile%5D=28%2C118%2C204%2C315&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Durchblutungsst%C3%B6rung%20der%20Beine%3A%20Operation%20und%20Stent&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KASCHK0&cHash=9955d47a7543a02d868c6f3cc0b02806',
            'Halsschlagader: Operation und Stent': 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/772783/?tx_tverzhospitaldata_show%5Bquantile%5D=7%2C19%2C32%2C51&tx_tverzhospitaldata_show%5Bsearchlabel%5D=Halsschlagader%3A%20Operation%20und%20Stent&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAOH0&cHash=bee2feab4b2c02881380558c5799520c'
            }


def extract_query_value(url, marker):
    """Return the raw text that follows `marker` in `url`, up to the next '&'.

    The value is returned exactly as written in the url: percent-encoding is
    kept and nothing is decoded.

    Args:
        url: Full url string to search.
        marker: Literal text that directly precedes the wanted value,
            e.g. 'cHash='.

    Returns:
        The substring between the first occurrence of `marker` and the next
        '&' (or the end of the url when no '&' follows).

    Raises:
        ValueError: If `marker` does not occur in `url`.
    """
    _, found, remainder = url.partition(marker)
    if not found:
        raise ValueError(f'{marker!r} not found in url: {url}')
    # cut at the next parameter separator so the result never includes
    # parameters that follow the wanted one
    return remainder.split('&')[0]


# treatment name -> {'code', 'searchlabel', 'cHash'} (used for webscraping)
treatments_dictionary = {}
# treatment code -> treatment name (used for the database import)
treatments_dict_for_db = {}

# Extract treatment code, searchlabel and cHash from urls and store them in a dictionary.
# All three values go through the same helper, so cHash is also cut at the next '&'
# and does not depend on being the last query parameter.
for treatment_name, treatment_url in url_dict.items():
    treatment_code = extract_query_value(treatment_url, 'treatmentcode%5D=')
    treatment_searchlabel = extract_query_value(treatment_url, 'searchlabel%5D=')
    treatment_cHash = extract_query_value(treatment_url, 'cHash=')
    treatments_dictionary[treatment_name] = {
        'code': treatment_code,
        'searchlabel': treatment_searchlabel,
        'cHash': treatment_cHash}
    treatments_dict_for_db[treatment_code] = treatment_name

In [14]:
# Write the code -> name mapping as a two-column csv for the database import
treatments_dict_df = pd.DataFrame(
    list(treatments_dict_for_db.items()),
    columns=['treatment_code', 'treatment_name'],
)
treatments_dict_df.to_csv('../data/in/staging/treatments_dict.csv', index=False)

In [None]:
# Get treatments for all hospitals and save to csv in chunks.
# One file is written per chunk of m hospitals. Chunks whose file already exists
# are skipped, so an interrupted run resumes where it stopped without a
# hand-edited start offset. Delete a chunk file to force it to be scraped again.
m = 50  # number of hospitals per chunk / output file
chunks_dir = '../data/in/staging/treatments_chunks'
os.makedirs(chunks_dir, exist_ok=True)

# Iterate over the start index of every chunk; the last chunk may hold fewer
# than m hospitals and is included.
for chunk_start in range(0, len(hospital_id_list), m):
    # files are labelled with the nominal index range of the chunk
    chunk_label = f'{chunk_start}-{chunk_start+m-1}'
    chunk_path = f'{chunks_dir}/atlas_treatments_sample_{chunk_label}.csv'

    if os.path.exists(chunk_path):
        print(f'skipping {chunk_label}, file already exists')
        continue
    print(chunk_label)

    # NOTE(review): the original sliced `hospital_ids`, a name this notebook never
    # defines; `hospital_id_list` (built from the locations data) is used instead —
    # confirm that this is the intended list.
    list_for_df_hospital_id, list_for_df_treatment_code, list_for_df_count_number, list_for_df_count_label = get_treatments(hospital_id_list[chunk_start:chunk_start+m], treatments_dictionary)
    df_treatments = pd.DataFrame({'hospital_id': list_for_df_hospital_id, 'treatment_code': list_for_df_treatment_code, 'count_number': list_for_df_count_number, 'count_label': list_for_df_count_label})
    df_treatments.to_csv(chunk_path, index=False, encoding='utf-8')
    print(f'saved file {chunk_label}')