In [1]:
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
# notebook display setting: do not truncate the columns when a DataFrame is rendered
pd.set_option('display.max_columns', None)

### Load hospital base directory

In [2]:
# read the hospital base directory; 'zip' is read as str (presumably to keep leading zeros — TODO confirm)
df_hospitals = pd.read_json('../data/in/raw/atlas/locations.json', dtype={'zip': str})

### Webscrape departments and other details of the hospitals

In [8]:
# Two separate result frames, one for the departments and one for the general details
# (nursing staff, total treatments, provider type). Both start with a single
# 'hospital_id' column: the second-to-last '/'-separated part of each hospital link.
df_departments = pd.DataFrame(
    {'hospital_id': df_hospitals['link'].apply(lambda link: link.split('/')[-2])}
)
df_details = df_departments.copy()

In [9]:
# load website content for specific hospital
def load_hospital_site(hospital_id, timeout=30):
    """
    Summary:
        Download and parse the page of a defined hospital from bundes-klinik-atlas.de.

    Arguments:
        hospital_id (str): hospital id as used at bundes-klinik-atlas.de
        timeout (float): seconds to wait for the server before giving up (default: 30)

    Returns:
        soup (BeautifulSoup): parsed website content

    Raises:
        requests.exceptions.RequestException: if the request fails, times out or
            the server answers with an HTTP error status
    """

    request_url = f'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/{hospital_id}'

    # without a timeout a stalled connection would block the whole crawl indefinitely
    response = requests.get(request_url, timeout=timeout)

    # fail here with a clear HTTP error instead of parsing an error page and
    # failing later with an unrelated error in the extraction functions
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")

    return soup

In [10]:
def _parse_department_entry(entry_text):
    """Split the text of one list item into a cleaned department name and its number of treatments."""
    parts = entry_text.split(':')

    # normalise the name in every case so the same department always maps to the same column
    department = parts[0].replace('\n', ' ').strip()

    # an entry without ':' carries no count at all
    count_text = parts[1] if len(parts) > 1 else ''

    try:
        # dots are removed before converting (presumably thousands separators, e.g. '1.234')
        n_treatments = int(count_text.strip().replace('.', ''))
    except ValueError:
        # a missing or non-numeric count is recorded as 0
        n_treatments = 0

    return department, n_treatments


def get_departments(df, soup, hospital_id):
    """
    Summary:
        Extract the department designations and their respective numbers of treatments from a
        hospital page and write them to the row of the hospital (one column per department).

    Arguments:
        df (pd.DataFrame): DataFrame to store the data in; needs a 'hospital_id' column and is modified in place
        soup (BeautifulSoup): parsed website content of the hospital page
        hospital_id (str): hospital id as used at bundes-klinik-atlas.de

    Returns:
        df (pd.DataFrame): the DataFrame passed in, extended by the department columns
    """
    # extract data from soup
    result_list = soup.find(name='ul', class_='rte_ul')
    if result_list is None:
        # page without a department list: leave the hospital's row untouched
        return df

    # map department designation -> number of treatments
    departments = dict(
        _parse_department_entry(li.text) for li in result_list.find_all('li')
    )

    # the row selection does not depend on the department, so compute it only once
    row_mask = df['hospital_id'] == hospital_id
    for department, n_treatments in departments.items():
        df.loc[row_mask, department] = n_treatments

    return df

def get_details(df, soup, hospital_id):
    """
    Summary:
        Extract the total treatment count, the nursing staff quotient (each as number plus
        textual rating) and the provider type from a hospital page and write them to the
        row of the hospital.

    Arguments:
        df (pd.DataFrame): DataFrame to store the data in; needs a 'hospital_id' column and is modified in place
        soup (BeautifulSoup): parsed website content of the hospital page
        hospital_id (str): hospital id as used at bundes-klinik-atlas.de

    Returns:
        df (pd.DataFrame): the DataFrame passed in, extended by the detail columns
    """

    def _rating_text(node):
        # drop line breaks, surrounding whitespace and the enclosing parentheses
        return node.text.replace('\n', '').strip().replace('(', '').replace(')', '')

    tacho_blocks = soup.find_all(name='div', class_='c-tacho-text__text')

    # first block: total treatment count, second block: nursing staff quotient;
    # within a block, contents[1] holds the number and contents[2] the rating
    details = {
        'total_treatment_count_number': int(tacho_blocks[0].contents[1].text.replace('.', '')),
        'total_treatment_count_description': _rating_text(tacho_blocks[0].contents[2]),
        'nursing_staff_quotient_number': float(tacho_blocks[1].contents[1].text.replace(',', '.')),
        'nursing_staff_quotient_description': _rating_text(tacho_blocks[1].contents[2]),
        'provider_type': soup.find(name='li', class_='col-2 row-1').contents[2].text.replace('\n', '').strip(),
    }

    # write every extracted value into the row of the requested hospital
    for column, value in details.items():
        df.loc[df['hospital_id'] == hospital_id, column] = value

    return df

In [11]:
# get department and detail info for each hospital and store it to the dataframes
# NOTE: `.head(1)` limits the run to the first hospital; remove it to crawl all hospitals
REQUEST_DELAY_SECONDS = 15  # wait 15 sec between requests as requested by robots.txt of the website

hospital_ids_to_scrape = list(df_departments.head(1)['hospital_id'])
for position, hospital_id in enumerate(hospital_ids_to_scrape):

    # pause between consecutive requests only, so a single-hospital run does not wait at all
    if position > 0:
        time.sleep(REQUEST_DELAY_SECONDS)

    soup = load_hospital_site(hospital_id)
    df_departments = get_departments(df_departments, soup, hospital_id)
    df_details = get_details(df_details, soup, hospital_id)

In [65]:
# fetch and parse the page of one fixed hospital (id 771003) for manual inspection
# (plain string: the URL contains no placeholder, so no f-string is needed)
request_url = 'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771003'
# a timeout keeps the cell from hanging forever if the server does not answer
response = requests.get(request_url, timeout=30)
soup = BeautifulSoup(response.content, "html.parser")

In [53]:
# select the list item with the classes 'col-2 row-1' (the same element get_details reads the provider type from)
provider_type = soup.find(name='li', class_='col-2 row-1')
# display the text of its third child node without line breaks and surrounding whitespace
provider_type.contents[2].text.replace('\n','').strip()

'öffentlich'

In [12]:
# display the details frame to check the values written by get_details
df_details

Unnamed: 0,hospital_id,total_treatment_count_number,total_treatment_count_description,nursing_staff_quotient_number,nursing_staff_quotient_description,provider_type
0,771003,23479.0,sehr viele,60.11,weit unterdurchschnittlich,öffentlich
1,771011,,,,,
2,771012,,,,,
3,771015,,,,,
4,771016,,,,,
...,...,...,...,...,...,...
1653,773831,,,,,
1654,773836,,,,,
1655,773855,,,,,
1656,773862,,,,,


In [12]:
# store retrieved departments info to csv-file
# the slice [0:50] covers rows 0-49, so the file is named accordingly; this is
# also the file name the following cell reads back with pd.read_csv
df_departments[0:50].to_csv('../data/in/staging/atlas_0-49.csv', index=False, encoding='utf-8')

In [None]:
# read a staged departments csv back in; its 'hospital_id' column drives the treatment requests further down
# NOTE(review): no dtype is given, so hospital_id is presumably parsed as int here (str in df_departments) — confirm
df_test = pd.read_csv('../data/in/staging/atlas_0-49.csv')

### Webscrape treatments of the hospitals

In [5]:
# Result frame for the treatment infos of each hospital. It starts with a single
# 'hospital_id' column: the second-to-last '/'-separated part of each hospital link.
df_treatments = pd.DataFrame(
    {'hospital_id': df_hospitals['link'].apply(lambda link: link.split('/')[-2])}
)

In [20]:
# treatments to look up, mapped to their URL-encoded search label
treatments_list = ['Chirurgischer Herzklappenersatz']

# quote() percent-encodes spaces as '%20' like before, but also every other character
# that is not allowed verbatim in a query value (e.g. '&', '/', umlauts)
treatments_dict = {treatment: quote(treatment, safe='') for treatment in treatments_list}

In [21]:
# request the hospital page once per treatment search label and collect the list items found on it
# NOTE: `.head(1)` limits the run to the first hospital of df_test
for hospital_id in df_test['hospital_id'].head(1):
    for treatment, treatment_searchlabel in treatments_dict.items():

        # GET method
        request_url = f'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/{hospital_id}/?tx_tverzhospitaldata_show%5Bsearchlabel%5D={treatment_searchlabel}'
        print(request_url)
        response = requests.get(request_url)

        # parse HTML
        soup = BeautifulSoup(response.content, "html.parser")

        # extracting data
        # find_all returns a list-like ResultSet that has no find_all of its own, so the
        # <li> elements are gathered from each matching <div> individually
        # NOTE(review): assumes the <li> elements of all matching <div>s are wanted — confirm
        result_list = soup.find_all(name='div', class_='c-tacho-text__text')
        result_list_elements = [li for div in result_list for li in div.find_all('li')]

        



https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/771003/?tx_tverzhospitaldata_show%5Bsearchlabel%5D=Chirurgischer%20Herzklappenersatz


In [6]:
# request the plain hospital page (no treatment search label in the URL)
# NOTE(review): `hospital_id` is not assigned in this cell; it relies on the value left over from a loop in another cell
request_url = f'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/{hospital_id}'
print(request_url)
response = requests.get(request_url)

# parse HTML
soup = BeautifulSoup(response.content, "html.parser")

# extracting data
# NOTE(review): soup.find returns None when no matching <div> exists; the find_all call below then raises AttributeError
result_list = soup.find(name='div', class_='c-tacho-text__text')
result_list_elements = result_list.find_all('li')

Unnamed: 0,hospital_id
0,771003
1,771011
2,771012
3,771015
4,771016
...,...
1653,773831
1654,773836
1655,773855
1656,773862
