In [1]:
from bs4 import BeautifulSoup
import urllib
import requests
import pandas as pd
import json

# Show every column of the wide score tables and allow long hospital names
# (up to 100 characters per cell) to be displayed.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

In [2]:
# Site root, prepended to the relative scorecard links found in the listing.
pre = 'https://health.usnews.com/'
# Ranking listing that is scraped.
url = 'https://health.usnews.com/best-hospitals/pediatric-rankings/cardiology-and-heart-surgery'
# Browser-like header passed to requests.get by the scraping functions.
user_agent = {
    'User-agent': 'Mozilla/5.0',
}

#### What does not work: a request without the `User-agent` header

In [3]:
# Deliberately sent without the User-agent header: the site answers with the
# "Access Denied" page shown in the output below.
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print soup` statement is a SyntaxError on Python 3.
print(soup)

<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
 
You don't have permission to access "http://health.usnews.com/best-hospitals/pediatric-rankings/cardiology-and-heart-surgery" on this server.<p>
Reference #18.70ad717.1524543448.501c55e
</p></body>
</html>



#### Overall Scores

In [4]:
# Look up the signature of requests.get: besides `url` and `params` it accepts
# extra keyword arguments (**kwargs) that are handed on to the request.
help(requests.get)

Help on function get in module requests.api:

get(url, params=None, **kwargs)
    Sends a GET request.
    
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response



In [5]:
def get_overall_data(params):
    """Fetch one page of the ranking listing and return its embedded JSON.

    The listing at the module-level ``url`` is requested with the
    ``user_agent`` header, and the content of the first
    ``<script id="search-init-data">`` tag is decoded as JSON.

    :param params: dict sent as the query string, e.g. ``{'page': 1}``.
    :return: the decoded JSON value.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises ValueError: if the page has no ``search-init-data`` script (for
        example when an error page was served) or its content is not JSON.
    """
    page = requests.get(url, headers=user_agent, params=params)
    # Fail here with the HTTP status instead of later with an opaque IndexError.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    candidates = [
        d for d in soup.find_all('script')
        if d.has_attr('id') and d.attrs['id'] == 'search-init-data'
    ]
    if not candidates:
        raise ValueError("no <script id='search-init-data'> found at %s" % url)
    return json.loads(candidates[0].text)

In [6]:
cols = ['Rank', 'Hospital', 'Overall', 'Reputation', 'Volume']


def scrape_one(data, rank_number):
    """Build a one-row DataFrame for a single hospital of a listing page.

    :param data: decoded listing JSON; ``data['matches']`` is indexed by
        ``rank_number``.
    :param rank_number: position of the hospital inside ``data['matches']``.
    :return: DataFrame with the columns in ``cols`` and exactly one row.
    """
    match = data['matches'][rank_number]
    rank = match['rank']
    name = match['name']
    # Exactly three score entries are expected; they are mapped positionally
    # onto the Overall, Reputation and Volume columns.
    score, repu, vol = (entry['score'] for entry in match['scores'])
    return pd.DataFrame([[rank, name, score, repu, vol]], columns=cols)

def scrape_overall_data(data):
    """Stack the one-row frames of the first ten matches of a listing page."""
    return pd.concat([scrape_one(data, n) for n in range(10)])


# Listing pages are numbered from 1; the first two are fetched and stacked.
df = pd.concat([scrape_overall_data(get_overall_data({'page': p + 1})) for p in range(2)])
df = df.reset_index(drop=True)
display(df)

Unnamed: 0,Rank,Hospital,Overall,Reputation,Volume
0,1,Texas Children's Hospital,100.0/100,55.6%,10/12
1,2,Boston Children's Hospital,97.3/100,80.2%,12/12
2,3,Ann and Robert H. Lurie Children's Hospital of Chicago,92.6/100,14.9%,6/12
3,3,University of Michigan C.S. Mott Children's Hospital,92.6/100,46.1%,11/12
4,5,Children's Hospital of Wisconsin,91.2/100,16.5%,9/12
5,6,Cincinnati Children's Hospital Medical Center,90.7/100,31.4%,8/12
6,7,Children's Hospital Colorado,88.4/100,11.4%,9/12
7,8,Children's Hospital Los Angeles,88.1/100,18.9%,10/12
8,9,Children's Hospital of Philadelphia,86.5/100,73.5%,11/12
9,10,New York-Presbyterian Morgan Stanley-Komansky Children's Hospital,86.4/100,19.9%,10/12


#### Detail Scores

In [7]:
# Prefix of the inline <script> that carries a hospital's profile data.
texts = '\nUSN.ready().then(function() {\n    USN.EventDispatcher.instance.dispatcher.trigger(\'summon:hospitals:profile'


def _parse_init_payload(script_text):
    """Return the JSON value passed as first argument of ``init(`` in a script."""
    _, found, tail = script_text.partition('init(')
    if not found:
        raise ValueError("no 'init(' call found in the profile script")
    # raw_decode stops at the end of the first JSON value, so the JavaScript
    # that follows the payload does not have to be trimmed by hand, and an
    # 'init(' occurring inside the payload cannot cut it short.
    payload, _ = json.JSONDecoder().raw_decode(tail.lstrip())
    return payload


def get_detail_data(url, texts):
    """Fetch a hospital page and return the JSON embedded in its profile script.

    :param url: address of the hospital page.
    :param texts: prefix identifying the inline ``<script>`` to read.
    :return: the decoded JSON value passed to ``init(`` in that script.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises ValueError: if no script starts with ``texts``, the script has no
        ``init(`` call, or the payload is not valid JSON.
    """
    page = requests.get(url, headers=user_agent)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    script_data = [d for d in soup.find_all('script') if d.text.startswith(texts)]
    if not script_data:
        raise ValueError('no <script> starting with the expected marker at %s' % url)
    return _parse_init_payload(script_data[0].text)

In [8]:
cols = ['Rank', 'Hospital', 'Measure', 'Score', 'Rate']

# Rows are collected as plain lists and turned into a DataFrame once at the
# end: appending frame by frame copies the whole table for every row, and
# DataFrame.append is no longer available in pandas >= 2.0.
rows = []
# Measure titles in first-seen order across all hospitals. Later cells use this
# list to order the pivoted columns, so it must not be reset per hospital
# (otherwise only the last hospital's titles survive).
all_titles = []
seen_titles = set()

for page_no in range(2):
    # Listing pages are numbered from 1.
    url_param = {'page': page_no + 1}
    data = get_overall_data(url_param)

    # At most ten hospitals per page; slicing tolerates a shorter last page.
    for match in data['matches'][:10]:
        rank, name = match['rank'], match['name']
        child_suff = match['full_scorecard_url']
        # Join with exactly one slash, whether or not the suffix starts with one.
        url_child = '%s/%s' % (pre.rstrip('/'), child_suff.lstrip('/'))
        data_child = get_detail_data(url_child, texts)
        for group in data_child['scorecard']['measure_groups']:
            for measure in group['measures']:
                title = measure['title']
                score = measure['score']['value']
                label = measure['score']['label']
                rows.append([rank, name, title, score, label])
                if title not in seen_titles:
                    seen_titles.add(title)
                    all_titles.append(title)

df = pd.DataFrame(rows, columns=cols)
display(df.head())

Unnamed: 0,Rank,Hospital,Measure,Score,Rate
0,1,Texas Children's Hospital,Survival after congenital heart surgery,17/21,High
1,1,Texas Children's Hospital,Survival after certain complex heart procedures,20/24,Very high
2,1,Texas Children's Hospital,Survival after Norwood/hybrid surgery,9/12,High
3,1,Texas Children's Hospital,Survival after heart transplant,5/6,High
4,1,Texas Children's Hospital,Ability to prevent infections throughout hospital,37/38,Excellent


In [9]:
# One (Rank, Hospital) row per hospital, used to re-attach the rank after pivoting.
df_rank = df.loc[:, ['Rank', 'Hospital']].drop_duplicates()

In [10]:
# Wide table of scores: one row per hospital, one column per measure,
# columns ordered as in all_titles and rows ordered by rank.
df_score = (
    df.pivot(index='Hospital', columns='Measure', values='Score')
    .reset_index()
    .merge(df_rank, on='Hospital', how='left')[['Rank', 'Hospital'] + all_titles]
    .sort_values(by='Rank')
    .reset_index(drop=True)
)
display(df_score)

Unnamed: 0,Rank,Hospital,Survival after congenital heart surgery,Survival after certain complex heart procedures,Survival after Norwood/hybrid surgery,Survival after heart transplant,Ability to prevent infections throughout hospital,Ability to prevent infections in intensive-care units,Ability to prevent pressure injuries,Number of surgeries,Number of catheter procedures,Number of Norwood or hybrid surgeries,Nurse staffing,Congenital heart program,Adult congenital heart program,Heart transplant program,Advanced clinical services offered,Clinical support services offered,Advanced technologies available,Specialized clinics and programs available,Has fulltime subspecialists available,Recognized as Nurse Magnet hospital,Reputation with physicians in specialty,Commitment to best practices,Commitment to quality improvement,Adoption of health information technology,Active fellowship program,Commitment to clinical research,Help for families,Enlists families in structuring care
0,1,Texas Children's Hospital,17/21,20/24,9/12,5/6,37/38,3/5,5/5,10/12,33/33,12/12,3.7,23/23,10/10,11/11,16/16,9/9,8/8,13/13,19/20,True,55.6%,48/50,15/15,15/16,7/7,12/12,8/8,7/7
1,2,Boston Children's Hospital,14/21,22/24,9/12,5/6,37/38,1/5,5/5,12/12,33/33,12/12,4.5,23/23,10/10,11/11,16/16,9/9,8/8,13/13,20/20,True,80.2%,50/50,14/15,16/16,7/7,12/12,8/8,7/7
2,3,Ann and Robert H. Lurie Children's Hospital of Chicago,18/21,23/24,11/12,5/6,38/38,2/5,5/5,6/12,18/33,5/12,3.2,23/23,10/10,11/11,16/16,9/9,7/8,13/13,18/20,True,14.9%,50/50,14/15,14/16,7/7,12/12,8/8,7/7
3,3,University of Michigan C.S. Mott Children's Hospital,15/21,19/24,10/12,6/6,32/38,4/5,3/5,11/12,28/33,12/12,3.7,23/23,10/10,10/11,15/16,9/9,7/8,13/13,19/20,True,46.1%,46/50,15/15,14/16,7/7,12/12,8/8,7/7
4,5,Children's Hospital of Wisconsin,18/21,16/24,12/12,3/6,30/38,5/5,5/5,9/12,19/33,11/12,4.5,23/23,10/10,11/11,16/16,9/9,8/8,13/13,20/20,True,16.5%,50/50,12/15,16/16,6/7,12/12,8/8,7/7
5,6,Cincinnati Children's Hospital Medical Center,15/21,21/24,10/12,5/6,36/38,1/5,4/5,8/12,28/33,11/12,4.4,23/23,10/10,10/11,16/16,9/9,8/8,13/13,19/20,True,31.4%,49/50,15/15,16/16,7/7,12/12,8/8,7/7
6,7,Children's Hospital Colorado,15/21,22/24,9/12,4/6,36/38,4/5,3/5,9/12,32/33,11/12,3.3,23/23,10/10,11/11,16/16,9/9,7/8,13/13,20/20,True,11.4%,50/50,14/15,16/16,7/7,12/12,8/8,7/7
7,8,Children's Hospital Los Angeles,14/21,16/24,12/12,6/6,36/38,3/5,4/5,10/12,29/33,12/12,3.3,23/23,10/10,10/11,16/16,9/9,7/8,13/13,19/20,True,18.9%,49/50,14/15,16/16,7/7,12/12,8/8,7/7
8,9,Children's Hospital of Philadelphia,11/21,18/24,9/12,3/6,36/38,1/5,5/5,11/12,33/33,12/12,3.5,23/23,10/10,11/11,16/16,9/9,8/8,13/13,18/20,True,73.5%,50/50,15/15,16/16,7/7,12/12,8/8,7/7
9,10,New York-Presbyterian Morgan Stanley-Komansky Children's Hospital,13/21,19/24,11/12,6/6,38/38,4/5,5/5,10/12,32/33,12/12,2.8,23/23,10/10,11/11,16/16,9/9,8/8,13/13,18/20,False,19.9%,50/50,13/15,15/16,7/7,12/12,8/8,7/7


In [12]:
# Wide table of rating labels: one row per hospital, one column per measure,
# columns ordered as in all_titles and rows ordered by rank.
df_rate = (
    df.pivot(index='Hospital', columns='Measure', values='Rate')
    .reset_index()
    .merge(df_rank, on='Hospital', how='left')[['Rank', 'Hospital'] + all_titles]
    .sort_values(by='Rank')
    .reset_index(drop=True)
)
display(df_rate)

Unnamed: 0,Rank,Hospital,Survival after congenital heart surgery,Survival after certain complex heart procedures,Survival after Norwood/hybrid surgery,Survival after heart transplant,Ability to prevent infections throughout hospital,Ability to prevent infections in intensive-care units,Ability to prevent pressure injuries,Number of surgeries,Number of catheter procedures,Number of Norwood or hybrid surgeries,Nurse staffing,Congenital heart program,Adult congenital heart program,Heart transplant program,Advanced clinical services offered,Clinical support services offered,Advanced technologies available,Specialized clinics and programs available,Has fulltime subspecialists available,Recognized as Nurse Magnet hospital,Reputation with physicians in specialty,Commitment to best practices,Commitment to quality improvement,Adoption of health information technology,Active fellowship program,Commitment to clinical research,Help for families,Enlists families in structuring care
0,1,Texas Children's Hospital,High,Very high,High,High,Excellent,Average,Excellent,High,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent
1,2,Boston Children's Hospital,High,Very high,High,High,Excellent,Below average,Excellent,Very high,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Excellent,Excellent,Above average,Excellent,Excellent,Excellent,Excellent,Excellent
2,3,Ann and Robert H. Lurie Children's Hospital of Chicago,Very high,Very high,Very high,High,Excellent,Below average,Excellent,Average,Average,Average,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Above average,,Good,Excellent,Above average,Above average,Excellent,Excellent,Excellent,Excellent
3,3,University of Michigan C.S. Mott Children's Hospital,High,Very high,High,Very high,Above average,Above average,Above average,Very high,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Very good,Excellent,Excellent,Above average,Excellent,Excellent,Excellent,Excellent
4,5,Children's Hospital of Wisconsin,Very high,High,Very high,Average,Above average,Excellent,Excellent,High,High,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Good,Excellent,Above average,Excellent,Above average,Excellent,Excellent,Excellent
5,6,Cincinnati Children's Hospital Medical Center,High,Very high,High,High,Excellent,Below average,Above average,Average,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Very good,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent
6,7,Children's Hospital Colorado,High,Very high,High,Average,Excellent,Above average,Above average,High,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Good,Excellent,Above average,Excellent,Excellent,Excellent,Excellent,Excellent
7,8,Children's Hospital Los Angeles,High,High,Very high,Very high,Excellent,Average,Above average,High,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Very good,Excellent,Above average,Excellent,Excellent,Excellent,Excellent,Excellent
8,9,Children's Hospital of Philadelphia,Average,High,High,Average,Excellent,Below average,Excellent,Very high,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Above average,,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent
9,10,New York-Presbyterian Morgan Stanley-Komansky Children's Hospital,High,Very high,Very high,Very high,Excellent,Above average,Excellent,High,Very high,Very high,Above average,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Above average,,Very good,Excellent,Above average,Excellent,Excellent,Excellent,Excellent,Excellent


#### Putting Everything Together

In [34]:
# Site root, prepended to the relative scorecard links found in the listing.
pre = 'https://health.usnews.com/'
# Ranking listing that is scraped.
url = 'https://health.usnews.com/best-hospitals/pediatric-rankings/cardiology-and-heart-surgery'
# Alternative specialty listing (swap with the line above to scrape it instead):
# url = 'https://health.usnews.com/best-hospitals/pediatric-rankings/pulmonology'
# Browser-like header passed to requests.get by the scraping functions.
user_agent = {
    'User-agent': 'Mozilla/5.0',
}

In [35]:
# Columns of the overall-ranking row built for each hospital.
cols = [
    'Rank',
    'Hospital',
    'Overall',
    'Reputation',
    'Volume',
]
# Columns of the per-measure rows built from a hospital's scorecard.
child_cols = [
    'Measure',
    'Score',
    'Rate',
]
# Prefix of the inline <script> that carries a hospital's profile data.
texts = '\nUSN.ready().then(function() {\n    USN.EventDispatcher.instance.dispatcher.trigger(\'summon:hospitals:profile'

In [36]:
def get_overall_data(params):
    """Fetch one page of the ranking listing and return its embedded JSON.

    The listing at the module-level ``url`` is requested with the
    ``user_agent`` header, and the content of the first
    ``<script id="search-init-data">`` tag is decoded as JSON.

    :param params: dict sent as the query string, e.g. ``{'page': 1}``.
    :return: the decoded JSON value.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises ValueError: if the page has no ``search-init-data`` script (for
        example when an error page was served) or its content is not JSON.
    """
    page = requests.get(url, headers=user_agent, params=params)
    # Fail here with the HTTP status instead of later with an opaque IndexError.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    candidates = [
        d for d in soup.find_all('script')
        if d.has_attr('id') and d.attrs['id'] == 'search-init-data'
    ]
    if not candidates:
        raise ValueError("no <script id='search-init-data'> found at %s" % url)
    return json.loads(candidates[0].text)

def scrape_one(data, rank_number, values='Score'):
    """Build one wide row for a hospital: overall scores plus one column per measure.

    The hospital's overall figures come from the listing JSON; its scorecard
    page is then fetched and every measure becomes a column, ordered as the
    measures appear in the scorecard.

    :param data: decoded listing JSON; ``data['matches']`` is indexed by
        ``rank_number``.
    :param rank_number: position of the hospital inside ``data['matches']``.
    :param values: which per-measure column fills the measure cells, ``'Score'``
        (default, the previous behaviour) or ``'Rate'``.
    :return: one-row DataFrame with the columns in ``cols`` followed by one
        column per measure title.
    """
    match = data['matches'][rank_number]
    rank, name = match['rank'], match['name']
    # Exactly three score entries are expected; they are mapped positionally
    # onto the Overall, Reputation and Volume columns.
    score, repu, vol = [entry['score'] for entry in match['scores']]
    df = pd.DataFrame(columns=cols, data=[[rank, name, score, repu, vol]])

    # Join with exactly one slash, whether or not the suffix starts with one.
    url_child = '%s/%s' % (pre.rstrip('/'), match['full_scorecard_url'].lstrip('/'))
    data_child = get_detail_data(url_child, texts)

    # A real list is required: the (titles, frame) pairs are walked twice and a
    # Python 3 map() iterator would already be exhausted on the second pass.
    child_group = [scrape_child_group(group)
                   for group in data_child['scorecard']['measure_groups']]
    # Flatten the per-group title lists without reduce(), which is not a
    # builtin on Python 3 and fails on an empty sequence.
    all_titles = [title for titles, _ in child_group for title in titles]
    df_child = pd.concat([frame for _, frame in child_group])
    df_child['Rank'] = rank
    df_child['Hospital'] = name

    # pivot() sorts the measure columns, so restore the scorecard order.
    df_wide = df_child.pivot(index='Hospital', columns='Measure', values=values)
    df_wide = df_wide.reset_index()[['Hospital'] + all_titles]
    return df.merge(df_wide, on=['Hospital'], how='left')

def scrape_child_measure(measure):
    """Turn one scorecard measure into ``(title, one-row DataFrame)``.

    :param measure: dict with a ``'title'`` and a ``'score'`` dict holding
        ``'value'`` and ``'label'``.
    :return: tuple of the measure title and a DataFrame with the columns in
        ``child_cols`` holding title, score value and score label.
    """
    title = measure['title']
    score_info = measure['score']
    row = [title, score_info['value'], score_info['label']]
    return title, pd.DataFrame([row], columns=child_cols)

def scrape_child_group(group):
    """Scrape every measure of one scorecard group.

    :param group: dict whose ``'measures'`` entry lists the group's measures.
    :return: tuple ``(titles, frame)``: the measure titles in order, and the
        per-measure rows from ``scrape_child_measure`` concatenated into one
        DataFrame.
    """
    # Built as a list, not a lazy map(): the pairs are walked twice below and a
    # Python 3 map iterator would be empty on the second pass, leaving
    # pd.concat with nothing to concatenate.
    pairs = [scrape_child_measure(measure) for measure in group['measures']]
    titles = [title for title, _ in pairs]
    frames = [frame for _, frame in pairs]
    return titles, pd.concat(frames)

def _parse_init_payload(script_text):
    """Return the JSON value passed as first argument of ``init(`` in a script."""
    _, found, tail = script_text.partition('init(')
    if not found:
        raise ValueError("no 'init(' call found in the profile script")
    # raw_decode stops at the end of the first JSON value, so the JavaScript
    # that follows the payload does not have to be trimmed by hand, and an
    # 'init(' occurring inside the payload cannot cut it short.
    payload, _ = json.JSONDecoder().raw_decode(tail.lstrip())
    return payload


def get_detail_data(url, texts):
    """Fetch a hospital page and return the JSON embedded in its profile script.

    :param url: address of the hospital page.
    :param texts: prefix identifying the inline ``<script>`` to read.
    :return: the decoded JSON value passed to ``init(`` in that script.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises ValueError: if no script starts with ``texts``, the script has no
        ``init(`` call, or the payload is not valid JSON.
    """
    page = requests.get(url, headers=user_agent)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    script_data = [d for d in soup.find_all('script') if d.text.startswith(texts)]
    if not script_data:
        raise ValueError('no <script> starting with the expected marker at %s' % url)
    return _parse_init_payload(script_data[0].text)

def scrape_data(data):
    """Stack the wide rows of the first ten matches of one listing page."""
    return pd.concat([scrape_one(data, n) for n in range(10)])


# Listing pages are numbered from 1; the first two are scraped, stacked and
# ordered by rank.
df_score = (
    pd.concat([scrape_data(get_overall_data({'page': p + 1})) for p in range(2)])
    .sort_values(by='Rank')
    .reset_index(drop=True)
)

In [37]:
# Show the combined table: overall scores followed by one column per measure.
# NOTE(review): the saved output below lists pulmonology measures although
# `url` is set to the cardiology listing; confirm which URL produced it.
display(df_score)

Unnamed: 0,Rank,Hospital,Overall,Reputation,Volume,Success with asthma inpatients,Success in helping patients manage their asthma,Success in managing cystic fibrosis patients,Success in managing neuromuscular weakness disorder,Survival of patients on ventilators,Ability to prevent infections throughout hospital,Ability to prevent infections in intensive-care units,Ability to prevent pressure injuries,Survival after lung transplant,Number of patients,Number of tests and noninvasive procedures,Nurse staffing,Lung transplant program,Advanced clinical services offered,Clinical support services offered,Advanced technologies available,Has fulltime subspecialists available,Recognized as Nurse Magnet hospital,Reputation with physicians in specialty,Commitment to best practices,Commitment to quality improvement,Adoption of health information technology,Active fellowship program,Commitment to clinical research,Help for families,Enlists families in structuring care
0,1,Children's Hospital of Philadelphia,100.0/100,62.6%,18/19,4/5,10/13,12/16,6/6,6/6,42/45,1/5,5/5,4/6,18/19,12/12,3.5,5/5,27/27,9/9,3/3,10/11,True,62.6%,39/41,15/15,16/16,5/5,4/6,8/8,7/7
1,2,Texas Children's Hospital,99.5/100,45.0%,18/19,3/5,10/13,13/16,6/6,6/6,44/45,3/5,5/5,4/6,18/19,12/12,3.7,5/5,27/27,9/9,3/3,11/11,True,45.0%,38/41,15/15,15/16,5/5,4/6,8/8,7/7
2,3,Boston Children's Hospital,99.1/100,55.9%,18/19,5/5,12/13,8/16,6/6,6/6,44/45,1/5,5/5,3/6,18/19,11/12,4.5,5/5,27/27,9/9,3/3,11/11,True,55.9%,41/41,14/15,16/16,5/5,5/6,8/8,7/7
3,4,Cincinnati Children's Hospital Medical Center,98.3/100,61.7%,16/19,4/5,13/13,15/16,6/6,5/6,41/45,1/5,4/5,2/6,16/19,12/12,4.4,3/5,27/27,9/9,3/3,11/11,True,61.7%,40/41,15/15,16/16,5/5,4/6,8/8,7/7
4,5,Children's Hospital of Pittsburgh of UPMC,96.7/100,27.7%,18/19,5/5,11/13,13/16,6/6,5/6,43/45,1/5,5/5,4/6,18/19,11/12,3.4,5/5,27/27,9/9,3/3,11/11,True,27.7%,40/41,14/15,16/16,5/5,5/6,8/8,7/7
5,6,Nationwide Children's Hospital,95.2/100,20.2%,19/19,4/5,13/13,13/16,6/6,6/6,44/45,3/5,4/5,4/6,19/19,12/12,3.2,5/5,27/27,9/9,2/3,11/11,True,20.2%,41/41,15/15,16/16,5/5,3/6,8/8,7/7
6,7,Children's Hospital Colorado,92.8/100,48.0%,14/19,5/5,10/13,11/16,4/6,6/6,42/45,4/5,3/5,,14/19,12/12,3.3,,27/27,9/9,2/3,11/11,True,48.0%,41/41,14/15,16/16,5/5,5/6,8/8,7/7
7,8,St. Louis Children's Hospital-Washington University,91.8/100,21.7%,15/19,5/5,10/13,13/16,6/6,5/6,43/45,3/5,4/5,2/6,15/19,10/12,3.4,5/5,25/27,8/9,2/3,10/11,True,21.7%,39/41,13/15,16/16,4/5,6/6,8/8,7/7
8,9,North Carolina Children's Hospital at UNC,88.1/100,18.9%,11/19,4/5,13/13,11/16,6/6,6/6,41/45,3/5,5/5,3/6,11/19,7/12,3.9,2/5,27/27,9/9,3/3,9/11,True,18.9%,39/41,15/15,15/16,4/5,4/6,8/8,7/7
9,10,Johns Hopkins Children's Center,87.8/100,24.5%,14/19,4/5,13/13,12/16,6/6,6/6,45/45,3/5,4/5,,14/19,9/12,3.4,,27/27,9/9,2/3,11/11,True,24.5%,41/41,13/15,16/16,5/5,5/6,8/8,7/7
