In [1]:
import json
import urllib

try:  # Python 3
    from urllib.parse import urlencode
except ImportError:  # Python 2
    from urllib import urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Show every column and allow up to 100 characters per cell when frames
# are rendered.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', None)

# Request headers sent by the scraping helpers.
user_agent = {
    'User-agent': 'Mozilla/5.0',
}

In [2]:
# Site root; prefixed to the scorecard paths found in the search data.
pre = 'https://health.usnews.com/'
# Rankings listing that is paged through with a ``page`` query parameter.
url_base = (
    'https://health.usnews.com/best-hospitals/pediatric-rankings/cardiology-and-heart-surgery'
)

#### What does not work: a request without a User-Agent header

In [3]:
# Baseline attempt: a plain GET with no custom headers. The server replies
# with an "Access Denied" page, which is why the helper functions further
# down pass `user_agent` as request headers.
page = requests.get(url_base)
soup = BeautifulSoup(page.text, 'lxml')
# Parenthesised form prints the same on Python 2 and is valid on Python 3.
print(soup)

<html><head>
<title>Access Denied</title>
</head><body>
<h1>Access Denied</h1>
 
You don't have permission to access "http://health.usnews.com/best-hospitals/pediatric-rankings/cardiology-and-heart-surgery" on this server.<p>
Reference #18.2418ae8c.1523939419.158149d
</p></body>
</html>



#### Overall Scores

In [12]:
# Exploratory lookup of the requests.get signature: extra keyword arguments
# are accepted through **kwargs (the scraping helpers pass headers= that way).
# NOTE(review): this cell's execution count (12) is out of sequence with its
# neighbours, suggesting it was run after the rest of the notebook; consider
# removing it before sharing.
help(requests.get)

Help on function get in module requests.api:

get(url, params=None, **kwargs)
    Sends a GET request.
    
    :param url: URL for the new :class:`Request` object.
    :param params: (optional) Dictionary or bytes to be sent in the query string for the :class:`Request`.
    :param \*\*kwargs: Optional arguments that ``request`` takes.
    :return: :class:`Response <Response>` object
    :rtype: requests.Response



In [4]:
def get_overall_data(url):
    """Fetch a rankings page and return its embedded search data.

    The page is requested with the module-level ``user_agent`` headers and
    parsed with lxml. The JSON payload is read from the first ``<script>``
    tag whose ``id`` is ``search-init-data``.

    Parameters
    ----------
    url : str
        Address of the rankings page to download.

    Returns
    -------
    The object produced by ``json.loads`` on the script contents.

    Raises
    ------
    ValueError
        If the response has no ``search-init-data`` script tag (for example
        when the server answers with an error page), or if the tag's
        contents are not valid JSON.
    """
    page = requests.get(url, headers=user_agent)
    soup = BeautifulSoup(page.text, 'lxml')
    # Look the tag up by id directly instead of scanning every <script> twice.
    script_tag = soup.find('script', id='search-init-data')
    if script_tag is None:
        # Fail with context rather than an opaque IndexError on an empty list.
        raise ValueError(
            'No <script id="search-init-data"> found at %s (HTTP status %s)'
            % (url, page.status_code)
        )
    return json.loads(script_tag.text)

In [5]:
cols = ['Rank', 'Hospital', 'Overall', 'Reputation', 'Volume']

# Collect plain rows and build the frame once at the end: growing a
# DataFrame with append() inside a loop copies it on every iteration, and
# DataFrame.append no longer exists in recent pandas releases.
rows = []
for page_no in range(2):
    url_param = {'page': page_no + 1}
    url = '%s/?%s' % (url_base, urlencode(url_param))

    data = get_overall_data(url)

    # Take at most the first ten entries; slicing (unlike indexing 0..9)
    # does not raise when a page lists fewer hospitals.
    for match in data['matches'][:10]:
        # NOTE(review): assumes each entry has exactly three scores, in the
        # order overall / reputation / volume -- confirm against the payload.
        score, repu, vol = [s['score'] for s in match['scores']]
        rows.append([match['rank'], match['name'], score, repu, vol])

df = pd.DataFrame(rows, columns=cols)
display(df)

Unnamed: 0,Rank,Hospital,Overall,Reputation,Volume
0,1,Texas Children's Hospital,100.0/100,55.6%,10/12
1,2,Boston Children's Hospital,97.3/100,80.2%,12/12
2,3,Ann and Robert H. Lurie Children's Hospital of Chicago,92.6/100,14.9%,6/12
3,3,University of Michigan C.S. Mott Children's Hospital,92.6/100,46.1%,11/12
4,5,Children's Hospital of Wisconsin,91.2/100,16.5%,9/12
5,6,Cincinnati Children's Hospital Medical Center,90.7/100,31.4%,8/12
6,7,Children's Hospital Colorado,88.4/100,11.4%,9/12
7,8,Children's Hospital Los Angeles,88.1/100,18.9%,10/12
8,9,Children's Hospital of Philadelphia,86.5/100,73.5%,11/12
9,10,New York-Presbyterian Morgan Stanley-Komansky Children's Hospital,86.4/100,19.9%,10/12


#### Detail Scores

In [6]:
# Prefix identifying the inline <script> that carries the profile data.
texts = '\nUSN.ready().then(function() {\n    USN.EventDispatcher.instance.dispatcher.trigger(\'summon:hospitals:profile'


def _parse_init_payload(script_text):
    """Return the first JSON argument passed to ``init(`` in ``script_text``."""
    payload = script_text.split('init(', 1)[1].lstrip()
    # raw_decode parses a single JSON value from the start of the string and
    # ignores whatever follows it (further arguments, closing brackets), so
    # no trailing characters have to be trimmed by hand.
    return json.JSONDecoder().raw_decode(payload)[0]


def get_detail_data(url, texts):
    """Fetch a hospital scorecard page and return its embedded profile data.

    The page is requested with the module-level ``user_agent`` headers. The
    data is taken from the first ``<script>`` whose text starts with
    ``texts``; inside that script, the JSON value following ``init(`` is
    decoded.

    Parameters
    ----------
    url : str
        Address of the scorecard page.
    texts : str
        Prefix the wanted script's text must start with.

    Returns
    -------
    The decoded JSON value passed as the first argument of ``init(``.

    Raises
    ------
    ValueError
        If no script starts with ``texts`` or the payload is not valid JSON.
    IndexError
        If the matching script contains no ``init(`` call.
    """
    page = requests.get(url, headers=user_agent)
    soup = BeautifulSoup(page.text, 'lxml')
    script_tag = next(
        (tag for tag in soup.find_all('script') if tag.text.startswith(texts)),
        None,
    )
    if script_tag is None:
        raise ValueError(
            'No profile <script> found at %s (HTTP status %s)'
            % (url, page.status_code)
        )
    return _parse_init_payload(script_tag.text)

In [7]:
cols = ['Rank', 'Hospital', 'Measure', 'Score', 'Label']

# Rows are gathered in a list and turned into a frame once at the end;
# appending to a DataFrame inside the loop copies it on every iteration and
# DataFrame.append no longer exists in recent pandas releases.
rows = []
# Measure titles in first-seen order across all hospitals. Later cells use
# this list to order the pivoted columns, so it is built once here instead
# of being reset for every hospital (which kept only the last hospital's
# titles and left the name undefined when no hospital was processed).
all_titles = []
seen_titles = set()

for page_no in range(2):
    url_param = {'page': page_no + 1}
    url = '%s/?%s' % (url_base, urlencode(url_param))

    data = get_overall_data(url)

    # Take at most the first ten entries; slicing does not raise when a
    # page lists fewer hospitals.
    for match in data['matches'][:10]:
        rank, name = match['rank'], match['name']
        url2 = '%s%s' % (pre, match['full_scorecard_url'])
        d = get_detail_data(url2, texts)
        for group in d['scorecard']['measure_groups']:
            for measure in group['measures']:
                title = measure['title']
                score = measure['score']['value']
                label = measure['score']['label']
                rows.append([rank, name, title, score, label])
                if title not in seen_titles:
                    seen_titles.add(title)
                    all_titles.append(title)

df = pd.DataFrame(rows, columns=cols)
display(df.head())

Unnamed: 0,Rank,Hospital,Measure,Score,Label
0,1,Texas Children's Hospital,Survival after congenital heart surgery,17/21,High
1,1,Texas Children's Hospital,Survival after certain complex heart procedures,20/24,Very high
2,1,Texas Children's Hospital,Survival after Norwood/hybrid surgery,9/12,High
3,1,Texas Children's Hospital,Survival after heart transplant,5/6,High
4,1,Texas Children's Hospital,Ability to prevent infections throughout hospital,37/38,Excellent


In [8]:
# Number of measure rows collected per hospital.
display(df['Hospital'].value_counts())

Children's Healthcare of Atlanta                                     28
University of Michigan C.S. Mott Children's Hospital                 28
Lucile Packard Children's Hospital Stanford                          28
Children's Mercy Kansas City                                         28
Children's Hospital Colorado                                         28
Primary Children's Hospital                                          28
Ann and Robert H. Lurie Children's Hospital of Chicago               28
Children's Medical Center Dallas                                     28
MUSC Children's Heart Network of South Carolina                      28
Children's Hospital Los Angeles                                      28
Cincinnati Children's Hospital Medical Center                        28
Children's Hospital of Wisconsin                                     28
Seattle Children's Hospital                                          28
New York-Presbyterian Morgan Stanley-Komansky Children's Hospita

In [9]:
# Distinct (Rank, Hospital) pairs, merged back onto the pivoted tables below.
df_rank = df.loc[:, ['Rank', 'Hospital']].drop_duplicates()

In [10]:
# Wide table of raw scores: one row per hospital, one column per measure,
# with Rank re-attached and rows ordered by it.
df2 = (
    df.pivot(index='Hospital', columns='Measure', values='Score')
    .reset_index()
    .merge(df_rank, on='Hospital', how='left')
)
df2 = df2[['Rank', 'Hospital'] + all_titles]
df2 = df2.sort_values(by='Rank').reset_index(drop=True)
display(df2)

Unnamed: 0,Rank,Hospital,Survival after congenital heart surgery,Survival after certain complex heart procedures,Survival after Norwood/hybrid surgery,Survival after heart transplant,Ability to prevent infections throughout hospital,Ability to prevent infections in intensive-care units,Ability to prevent pressure injuries,Number of surgeries,Number of catheter procedures,Number of Norwood or hybrid surgeries,Nurse staffing,Congenital heart program,Adult congenital heart program,Heart transplant program,Advanced clinical services offered,Clinical support services offered,Advanced technologies available,Specialized clinics and programs available,Has fulltime subspecialists available,Recognized as Nurse Magnet hospital,Reputation with physicians in specialty,Commitment to best practices,Commitment to quality improvement,Adoption of health information technology,Active fellowship program,Commitment to clinical research,Help for families,Enlists families in structuring care
0,1,Texas Children's Hospital,17/21,20/24,9/12,5/6,37/38,3/5,5/5,10/12,33/33,12/12,3.7,23/23,10/10,11/11,16/16,9/9,8/8,13/13,19/20,True,55.6%,48/50,15/15,15/16,7/7,12/12,8/8,7/7
1,2,Boston Children's Hospital,14/21,22/24,9/12,5/6,37/38,1/5,5/5,12/12,33/33,12/12,4.5,23/23,10/10,11/11,16/16,9/9,8/8,13/13,20/20,True,80.2%,50/50,14/15,16/16,7/7,12/12,8/8,7/7
2,3,Ann and Robert H. Lurie Children's Hospital of Chicago,18/21,23/24,11/12,5/6,38/38,2/5,5/5,6/12,18/33,5/12,3.2,23/23,10/10,11/11,16/16,9/9,7/8,13/13,18/20,True,14.9%,50/50,14/15,14/16,7/7,12/12,8/8,7/7
3,3,University of Michigan C.S. Mott Children's Hospital,15/21,19/24,10/12,6/6,32/38,4/5,3/5,11/12,28/33,12/12,3.7,23/23,10/10,10/11,15/16,9/9,7/8,13/13,19/20,True,46.1%,46/50,15/15,14/16,7/7,12/12,8/8,7/7
4,5,Children's Hospital of Wisconsin,18/21,16/24,12/12,3/6,30/38,5/5,5/5,9/12,19/33,11/12,4.5,23/23,10/10,11/11,16/16,9/9,8/8,13/13,20/20,True,16.5%,50/50,12/15,16/16,6/7,12/12,8/8,7/7
5,6,Cincinnati Children's Hospital Medical Center,15/21,21/24,10/12,5/6,36/38,1/5,4/5,8/12,28/33,11/12,4.4,23/23,10/10,10/11,16/16,9/9,8/8,13/13,19/20,True,31.4%,49/50,15/15,16/16,7/7,12/12,8/8,7/7
6,7,Children's Hospital Colorado,15/21,22/24,9/12,4/6,36/38,4/5,3/5,9/12,32/33,11/12,3.3,23/23,10/10,11/11,16/16,9/9,7/8,13/13,20/20,True,11.4%,50/50,14/15,16/16,7/7,12/12,8/8,7/7
7,8,Children's Hospital Los Angeles,14/21,16/24,12/12,6/6,36/38,3/5,4/5,10/12,29/33,12/12,3.3,23/23,10/10,10/11,16/16,9/9,7/8,13/13,19/20,True,18.9%,49/50,14/15,16/16,7/7,12/12,8/8,7/7
8,9,Children's Hospital of Philadelphia,11/21,18/24,9/12,3/6,36/38,1/5,5/5,11/12,33/33,12/12,3.5,23/23,10/10,11/11,16/16,9/9,8/8,13/13,18/20,True,73.5%,50/50,15/15,16/16,7/7,12/12,8/8,7/7
9,10,New York-Presbyterian Morgan Stanley-Komansky Children's Hospital,13/21,19/24,11/12,6/6,38/38,4/5,5/5,10/12,32/33,12/12,2.8,23/23,10/10,11/11,16/16,9/9,8/8,13/13,18/20,False,19.9%,50/50,13/15,15/16,7/7,12/12,8/8,7/7


In [11]:
# Wide table of score labels: same layout as the score table, but each cell
# holds the measure's label instead of its value.
df3 = (
    df.pivot(index='Hospital', columns='Measure', values='Label')
    .reset_index()
    .merge(df_rank, on='Hospital', how='left')
)
df3 = df3[['Rank', 'Hospital'] + all_titles]
df3 = df3.sort_values(by='Rank').reset_index(drop=True)
display(df3)

Unnamed: 0,Rank,Hospital,Survival after congenital heart surgery,Survival after certain complex heart procedures,Survival after Norwood/hybrid surgery,Survival after heart transplant,Ability to prevent infections throughout hospital,Ability to prevent infections in intensive-care units,Ability to prevent pressure injuries,Number of surgeries,Number of catheter procedures,Number of Norwood or hybrid surgeries,Nurse staffing,Congenital heart program,Adult congenital heart program,Heart transplant program,Advanced clinical services offered,Clinical support services offered,Advanced technologies available,Specialized clinics and programs available,Has fulltime subspecialists available,Recognized as Nurse Magnet hospital,Reputation with physicians in specialty,Commitment to best practices,Commitment to quality improvement,Adoption of health information technology,Active fellowship program,Commitment to clinical research,Help for families,Enlists families in structuring care
0,1,Texas Children's Hospital,High,Very high,High,High,Excellent,Average,Excellent,High,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent
1,2,Boston Children's Hospital,High,Very high,High,High,Excellent,Below average,Excellent,Very high,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Excellent,Excellent,Above average,Excellent,Excellent,Excellent,Excellent,Excellent
2,3,Ann and Robert H. Lurie Children's Hospital of Chicago,Very high,Very high,Very high,High,Excellent,Below average,Excellent,Average,Average,Average,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Above average,,Good,Excellent,Above average,Above average,Excellent,Excellent,Excellent,Excellent
3,3,University of Michigan C.S. Mott Children's Hospital,High,Very high,High,Very high,Above average,Above average,Above average,Very high,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Very good,Excellent,Excellent,Above average,Excellent,Excellent,Excellent,Excellent
4,5,Children's Hospital of Wisconsin,Very high,High,Very high,Average,Above average,Excellent,Excellent,High,High,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Good,Excellent,Above average,Excellent,Above average,Excellent,Excellent,Excellent
5,6,Cincinnati Children's Hospital Medical Center,High,Very high,High,High,Excellent,Below average,Above average,Average,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Very good,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent
6,7,Children's Hospital Colorado,High,Very high,High,Average,Excellent,Above average,Above average,High,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Good,Excellent,Above average,Excellent,Excellent,Excellent,Excellent,Excellent
7,8,Children's Hospital Los Angeles,High,High,Very high,Very high,Excellent,Average,Above average,High,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,,Very good,Excellent,Above average,Excellent,Excellent,Excellent,Excellent,Excellent
8,9,Children's Hospital of Philadelphia,Average,High,High,Average,Excellent,Below average,Excellent,Very high,Very high,Very high,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Above average,,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent
9,10,New York-Presbyterian Morgan Stanley-Komansky Children's Hospital,High,Very high,Very high,Very high,Excellent,Above average,Excellent,High,Very high,Very high,Above average,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Excellent,Above average,,Very good,Excellent,Above average,Excellent,Excellent,Excellent,Excellent,Excellent
