In [None]:
import requests as r
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd

## Scraping  `topuniversities.com`

In [None]:
# Base address of the site; relative paths are appended to it when issuing requests.
URL = 'https://www.topuniversities.com'

In [None]:
# Download the ranking table; the rows live under the 'data' key of the JSON payload.
# A timeout keeps the cell from hanging forever if the server stops answering.
response = r.get(URL + '/sites/default/files/qs-rankings-data/357051.txt', timeout=30)
# Fail with a clear HTTP error rather than an obscure JSON decoding error
# when the server answers with an error page.
response.raise_for_status()
data = response.json()['data']

We want to filter by rank and extract the above properties. Some ranks are of the form `X-Y` to indicate a range, and some start with a `=` to indicate that several universities share the same rank. As these formats complicate parsing, we first check whether the universities we are interested in (the top 200) have their rank expressed in one of these two formats:

In [None]:
# Ranks written as a range ("X-Y"), kept as the raw display strings.
dash = {entry['rank_display'] for entry in data if '-' in entry['rank_display']}

# Ranks written with a leading '=', stored as integers with the '=' stripped.
equals = {
    int(entry['rank_display'].lstrip('='))
    for entry in data
    if '=' in entry['rank_display']
}

In [None]:
dash

None of these intervals is relevant for our analysis: they all lie outside the top 200.

In [None]:
# Does at least one '='-prefixed rank fall inside the top 200?
any(value < 201 for value in equals)

Some of the ranks starting with `=` fall within the top 200, so we have to handle this format when parsing.

In [None]:
# Keys copied as-is from each entry of the ranking table.
fields = ('title', 'country', 'region', 'url')

# Obtained by inspecting html source: class of the <div> that wraps each
# figure on a university page.
to_scrape = (
    'total student',
    'total inter',
    'total faculty',
    'inter faculty',
)

In [None]:
# Build one record per top-200 university: the selected ranking-table fields,
# the numeric rank, and the figures scraped from the university's own page.
cleaned = []

# Highest rank (inclusive) that we keep.
MAX_RANK = 200

for uni in data:
    rank_display = uni['rank_display']

    # As explained above, we are not interested in ranks with '-'
    if '-' in rank_display:
        continue

    # Parse the rank as a plain Python int. np.int8 only covers -128..127, so
    # it cannot represent ranks above 127, and a comparison against 201 could
    # never exclude anything.
    rank = int(rank_display.lstrip('='))

    # Only keep universities in top 200
    if rank > MAX_RANK:
        continue

    # Retain important fields from ranking table
    clean_uni = {variable: uni[variable] for variable in fields}
    clean_uni['rank'] = rank

    # The record is registered before scraping, so a university is kept even
    # when some of its figures cannot be found on its page.
    cleaned.append(clean_uni)

    # Retrieve data from university page (the timeout prevents one stalled
    # request from blocking the whole crawl indefinitely).
    req = r.get(URL + uni['url'], timeout=30)
    soup = BeautifulSoup(req.text, 'html.parser')
    for field in to_scrape:
        div = soup.find('div', class_=field)
        # The value sits in a nested <div class="number">; treat a missing
        # inner div like a missing outer one instead of failing on None.
        number = div.find('div', class_='number') if div is not None else None
        if number is not None:
            # Figures use ',' as thousands separator, e.g. "12,345".
            clean_uni[field] = np.int32(number.text.strip().replace(',', ''))
        else:
            print('Could not find', field, 'for', uni['title'])

In [None]:
import pickle

In [None]:
# Cache the scraped records on disk so that later sessions can reuse them
# without pulling everything down again.
with open('bcp.pickle', 'wb') as out:
    out.write(pickle.dumps(cleaned))

In [None]:
# Restore the records from the local cache file.
# NOTE: unpickling executes arbitrary code; only load a file you created yourself.
with open('bcp.pickle', 'rb') as data_source:
    cleaned = pickle.loads(data_source.read())