In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import sys
import os

# Add the parent of the current working directory to sys.path so that the local `lib`
# package imported below can be found.
# NOTE(review): os.getcwd() is the kernel's working directory — this assumes the notebook is
# started from its own folder, one level below the directory that contains `lib`; confirm.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

# NOTE(review): the star import hides where names come from. load_hospital_site,
# get_departments and get_details are used further down without being defined in this
# notebook, so they presumably come from this module — consider importing them explicitly.
from lib.functions_webscrape_atlas import *

# show every column when a DataFrame is displayed (no truncation of wide frames)
pd.set_option('display.max_columns', None)

## Load hospital locations

In [2]:
# Read the raw hospital locations; 'zip' is kept as a string column
# (presumably to preserve leading zeros — confirm against the raw file).
locations_path = '../data/in/raw/atlas/locations.json'
df_hospitals = pd.read_json(locations_path, dtype={'zip': str})

## Webscrape departments and other details of the hospitals

In [3]:
# Seed two result tables (departments; nursing staff / total treatment details) with one row
# per hospital. The hospital id is the second-to-last '/'-separated segment of the link.
hospital_id_column = (
    df_hospitals['link']
    .apply(lambda link: link.split('/')[-2])
    .rename('hospital_id')
)
# independent copies, so filling one table can never touch the other
df_departments = hospital_id_column.to_frame().copy()
df_details = hospital_id_column.to_frame().copy()

In [4]:
# For the first 50 hospitals: fetch the hospital page once and fill both result tables from it.
batch_hospital_ids = df_departments['hospital_id'].head(50)

for hospital_id in batch_hospital_ids:
    # website content for this hospital (helper from lib)
    soup = load_hospital_site(hospital_id)

    # each helper receives the table, the page content and the id, and returns the updated table
    df_departments = get_departments(df_departments, soup, hospital_id)
    df_details = get_details(df_details, soup, hospital_id)

    # pause 15 seconds after every request, as requested by the website's robots.txt
    time.sleep(15)

In [7]:
# Write the first 50 rows of each result table to its staging CSV (UTF-8, without the index).
staging_exports = {
    '../data/in/staging/atlas_departments_0-49.csv': df_departments,
    '../data/in/staging/atlas_details_0-49.csv': df_details,
}
for staging_path, result_frame in staging_exports.items():
    result_frame[0:50].to_csv(staging_path, index=False, encoding='utf-8')

In [16]:
# Reload the staged batch from disk.
# hospital_id is an identifier and is a string in the in-memory tables (it is cut out of the
# hospital link); without an explicit dtype read_csv would infer it as an integer, so the
# reloaded frames would no longer match the scraped ones on that key.
df_departments_0_49 = pd.read_csv(
    '../data/in/staging/atlas_departments_0-49.csv',
    dtype={'hospital_id': str},
)
df_details_0_49 = pd.read_csv(
    '../data/in/staging/atlas_details_0-49.csv',
    dtype={'hospital_id': str},
)

In [18]:
# preview the first three rows of the reloaded details batch
df_details_0_49.head(3)

Unnamed: 0,hospital_id,total_treatment_count_number,total_treatment_count_description,nursing_staff_quotient_number,nursing_staff_quotient_description,provider_type
0,771003,23479.0,sehr viele,60.11,weit unterdurchschnittlich,öffentlich
1,771011,13563.0,viele,55.68,unterdurchschnittlich,öffentlich
2,771012,15347.0,viele,48.69,überdurchschnittlich,öffentlich


## Webscrape specific treatments of the hospitals

In [49]:
# Seed the treatments table with one row per hospital; the hospital id is the
# second-to-last '/'-separated segment of the hospital's link.
df_treatments = (
    df_hospitals['link']
    .apply(lambda link: link.split('/')[-2])
    .rename('hospital_id')
    .to_frame()
)

In [50]:
# Treatments to look up, and for each one the label used in the search URL
# (the treatment name with every space replaced by '%20').
treatments_list = ['Chirurgischer Herzklappenersatz', 'Totalendoprothese der Hüfte']
treatments_dict = {name: name.replace(' ', '%20') for name in treatments_list}

In [51]:
def get_treatments(df, limit=50, treatments=None, delay=15, timeout=30):
    """Scrape per-treatment case counts from bundes-klinik-atlas.de into ``df``.

    For every selected hospital id and every treatment, the hospital page is requested,
    the first ``div`` with class ``c-tacho-text__text`` is read, and two columns are written
    to the rows of ``df`` whose ``hospital_id`` matches:
    ``treatment_count_number_(<treatment>)`` and ``treatment_count_description_(<treatment>)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``hospital_id`` column. It is modified in place.
    limit : int or None, default 50
        Only the first ``limit`` hospital ids of ``df`` are scraped; ``None`` scrapes all.
    treatments : dict or None, default None
        Mapping of treatment name -> label inserted into the search URL. ``None`` uses the
        notebook-level ``treatments_dict``.
    delay : float, default 15
        Seconds to sleep after every request.
    timeout : float, default 30
        Seconds after which a request that receives no response is aborted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out, fails, or returns an HTTP error status.
    """
    import warnings  # local import: only needed for the missing-element notice below

    if treatments is None:
        treatments = treatments_dict

    # iterate the ids of the frame that was passed in, not of a notebook global
    hospital_ids = df['hospital_id'] if limit is None else df['hospital_id'].head(limit)

    for hospital_id in hospital_ids:
        for treatment, treatment_searchlabel in treatments.items():

            # GET method
            # NOTE(review): treatmentcode (KAHKO0) and cHash are the same for every treatment;
            # only the search label varies. Verify that the site really returns
            # treatment-specific numbers for each label with these fixed values.
            request_url = f'https://bundes-klinik-atlas.de/krankenhaussuche/krankenhaus/{hospital_id}/?tx_tverzhospitaldata_show%5Bquantile%5D=114%2C202%2C253%2C343&tx_tverzhospitaldata_show%5Bsearchlabel%5D={treatment_searchlabel}&tx_tverzhospitaldata_show%5Bsimplesearch%5D=1&tx_tverzhospitaldata_show%5Btreatmentcode%5D=KAHKO0&cHash=6488c3f4b1c6d9c491b41ed9e9c59a11'
            # a timeout keeps one stalled connection from blocking the whole run
            response = requests.get(request_url, timeout=timeout)
            # fail with a clear HTTP error instead of a confusing parse error on an error page
            response.raise_for_status()

            # parse HTML
            soup = BeautifulSoup(response.content, "html.parser")

            # extract data
            result_list = soup.find_all(name='div', class_='c-tacho-text__text')

            if not result_list:
                # Page loaded but carries no tachometer text: leave the cells unset rather
                # than inventing a count, and keep the polite delay before the next request.
                warnings.warn(
                    f'no treatment count element found for hospital {hospital_id} '
                    f'and treatment {treatment!r}; values left empty'
                )
                time.sleep(delay)
                continue

            # extract treatment count
            try:
                # the count is written with '.' as thousands separator
                treatment_count_number = int(result_list[0].contents[1].text.replace('.',''))
                treatment_count_description = result_list[0].contents[2].text.replace('(','').replace(')','').strip()
            except ValueError:
                # count text is not a plain integer: recorded as zero / 'sehr wenige'
                treatment_count_number = 0
                treatment_count_description = 'sehr wenige'

            is_current_hospital = df['hospital_id'] == hospital_id
            df.loc[is_current_hospital, f'treatment_count_number_({treatment})'] = treatment_count_number
            df.loc[is_current_hospital, f'treatment_count_description_({treatment})'] = treatment_count_description

            time.sleep(delay)
    return df

In [None]:
# Run the treatment scrape; the function writes into the frame it is given and returns it.
# Slow cell: one request per hospital and treatment, each followed by a 15 second pause.
df_treatments = get_treatments(df_treatments)

In [55]:
# Write the first 50 rows of the treatments table to the staging sample CSV (UTF-8, no index).
treatments_sample_path = '../data/in/staging/atlas_treatments_sample_0-49.csv'
treatments_sample = df_treatments[0:50]
treatments_sample.to_csv(treatments_sample_path, index=False, encoding='utf-8')