In [24]:
import requests as r
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd

## Scraping  `topuniversities.com`

In [2]:
URL = 'https://www.topuniversities.com'

In [3]:
data = r.get(URL + '/sites/default/files/qs-rankings-data/357051.txt').json()['data']

We want to filter by rank and extract the above properties. Some ranks are of the form: `X-Y` to indicate a range and some start with a `=` to indicate that two universities reached the same rank. As these formats complicate parsing, we first want to check whether the universities we interested in (the top 200) have their rank expressed in one of the two formats:

In [4]:
dash = set()
equals = set()
for uni in data:
    rank = uni['rank_display']
    if '-' in rank:
        dash.add(rank)
    if '=' in rank:
        equals.add(int(rank.lstrip('=')))

In [5]:
dash

{'401-410',
 '411-420',
 '421-430',
 '431-440',
 '441-450',
 '451-460',
 '461-470',
 '471-480',
 '481-490',
 '491-500',
 '501-550',
 '551-600',
 '601-650',
 '651-700',
 '701-750',
 '751-800',
 '801-1000'}

None of the intervals is relevant for our analysis

In [6]:
any(map(lambda v: v < 201, equals))

True

Some of the values starting with `=` are interesting for us.

In [7]:
fields = (
    'title',
    'country',
    'region',
    'url'
)

# Obtained by inspecting html source
to_scrape = (
    'total student',
    'total inter',
    'total faculty',
    'inter faculty'
)

In [8]:
# ============================= DO NOT EXECUTE IF ALREADY HAVE PICKLE FILE =======================================

cleaned = []

# As explained above, we are not interested in ranks with '-'
for uni in filter(lambda u: '-' not in u['rank_display'], data):
    # parse rank
    rank = uni['rank_display']
    rank = np.uint32(rank.lstrip('='))
    
    # Only keep universities in top 200
    if rank < 201:
        # Retain important fields from ranking table
        clean_uni = {variable: uni[variable] for variable in fields}
        clean_uni['rank'] = rank
        
        cleaned.append(clean_uni)
        
        # Retrieve data from university page
        req = r.get(URL + uni['url'])
        soup = BeautifulSoup(req.text, 'html.parser')
        for field in to_scrape:
            div = soup.find('div', class_=field)
            if div:
                clean_uni[field] = np.uint32(div.find('div', class_='number')\
                                               .text.strip().replace(',', ''))
            else:
                print('Could not find', field, 'for', uni['title'])

Could not find total student for New York University (NYU)
Could not find total inter for New York University (NYU)
Could not find total faculty for New York University (NYU)
Could not find inter faculty for New York University (NYU)
Could not find inter faculty for Indian Institute of Science (IISc) Bangalore


In [9]:
import pickle

In [10]:
# To work without having to pull everything down.
with open('bcp.pickle', 'wb') as out:
    pickle.dump(cleaned, out)

In [11]:
with open('bcp.pickle', 'rb') as data_source:
    cleaned = pickle.load(data_source)

In [12]:
topuniversities = pd.DataFrame(cleaned)

In [26]:
topuniversities.head(10)

Unnamed: 0,country,inter faculty,rank,region,title,total faculty,total inter,total student,url
0,United States,1679.0,1,North America,Massachusetts Institute of Technology (MIT),2982.0,3717.0,11067.0,/universities/massachusetts-institute-technolo...
1,United States,2042.0,2,North America,Stanford University,4285.0,3611.0,15878.0,/universities/stanford-university
2,United States,1311.0,3,North America,Harvard University,4350.0,5266.0,22429.0,/universities/harvard-university
3,United States,350.0,4,North America,California Institute of Technology (Caltech),953.0,647.0,2255.0,/universities/california-institute-technology-...
4,United Kingdom,2278.0,5,Europe,University of Cambridge,5490.0,6699.0,18770.0,/universities/university-cambridge
5,United Kingdom,2964.0,6,Europe,University of Oxford,6750.0,7353.0,19720.0,/universities/university-oxford
6,United Kingdom,2554.0,7,Europe,UCL (University College London),6345.0,14854.0,31080.0,/universities/ucl-university-college-london
7,United Kingdom,2071.0,8,Europe,Imperial College London,3930.0,8746.0,16090.0,/universities/imperial-college-london
8,United States,635.0,9,North America,University of Chicago,2449.0,3379.0,13557.0,/universities/university-chicago
9,Switzerland,1886.0,10,Europe,ETH Zurich - Swiss Federal Institute of Techno...,2477.0,7563.0,19815.0,/universities/eth-zurich-swiss-federal-institu...


## Scraping timeshighereducation.com

In [14]:
URL = 'https://www.timeshighereducation.com'

In [15]:
data = r.get(URL + '/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json').json()['data']

In [16]:
dash = set()
equals = set()
plus = set()
for uni in data:
    rank = uni['rank']
    if '–' in rank:
        dash.add(rank)
    if '=' in rank:
        equals.add(int(rank.lstrip('=')))
    if '+' in rank:
        plus.add(rank)

In [17]:
dash

{'201–250',
 '251–300',
 '301–350',
 '351–400',
 '401–500',
 '501–600',
 '601–800',
 '801–1000'}

In [18]:
any(map(lambda v: v < 201, equals))

True

In [19]:
plus

{'1001+'}

In [20]:
def highereducation_university_builder(uni):
    intern_ratio = np.float32(uni['stats_pc_intl_students'].rstrip('%'))
    staff_ratio = np.float32(uni['stats_student_staff_ratio'])
    res = {
        'title': uni['name'],
        'country': uni['location'],
        'url': uni['url'],
        'total student': np.int32(uni['stats_number_students'].replace(',', ''))
    }
    res['total inter'] = intern_ratio * res['total student']
    res['total faculty'] = res['total student'] / staff_ratio,
    return res

In [21]:
cleaned = []
for uni in filter(lambda u: all(map(lambda sym: sym not in u['rank'], ('–', '+'))), data):
    rank = uni['rank']
    rank = np.int32(rank.lstrip('='))
    if rank < 201:
        clean_uni = highereducation_university_builder(uni)
        clean_uni['rank'] = rank
        cleaned.append(clean_uni)

In [22]:
highereducation = pd.DataFrame(cleaned)

In [25]:
highereducation.head(10)

Unnamed: 0,country,rank,title,total faculty,total inter,total student,url
0,United Kingdom,1,University of Oxford,"(1822.23217389,)",775542.0,20409,/world-university-rankings/university-oxford
1,United Kingdom,2,University of Cambridge,"(1687.06427923,)",643615.0,18389,/world-university-rankings/university-cambridge
2,United States,3,California Institute of Technology,"(339.846153846,)",59643.0,2209,/world-university-rankings/california-institut...
3,United States,3,Stanford University,"(2112.66666667,)",348590.0,15845,/world-university-rankings/stanford-university
4,United States,5,Massachusetts Institute of Technology,"(1284.71267184,)",380018.0,11177,/world-university-rankings/massachusetts-insti...
5,United States,6,Harvard University,"(2283.82032261,)",528476.0,20326,/world-university-rankings/harvard-university
6,United States,7,Princeton University,"(958.433712915,)",190920.0,7955,/world-university-rankings/princeton-university
7,United Kingdom,8,Imperial College London,"(1390.96495883,)",872135.0,15857,/world-university-rankings/imperial-college-lo...
8,United States,9,University of Chicago,"(2181.45168001,)",338125.0,13525,/world-university-rankings/university-chicago
9,Switzerland,10,ETH Zurich – Swiss Federal Institute of Techno...,"(1317.3287327,)",730854.0,19233,/world-university-rankings/eth-zurich-swiss-fe...


## Analysis of the university rankings

In this section, we will answer the following questions for both of the university rankings:

- Which are the best universities in term of: (a) ratio between faculty members and students, (b) ratio of international students?
- Answer the previous question aggregating the data by (c) country and (d) region.


In [117]:
# Spice up our print display
# Taken from https://stackoverflow.com/questions/8924173/how-do-i-print-bold-text-in-python
class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

# Example:
print(color.BOLD + 'Hello World !' + color.END)

END_STR = color.END
BU_STR  = color.BOLD+color.UNDERLINE


[1mHello World ![0m


In [32]:
# Duplicate the dataframes to avoid destroying their content while playing around
topuniversity_df = topuniversities
highereducation_df = highereducation

In [67]:
# Obtain the working dataframe from the one just saved
topuniversities = topuniversity_df
highereducation = highereducation_df

In [78]:
# Lets define a useful order of columns that we find meaningful:
TOP_UNI_BASE_COL=["rank", "country", "region", "title"]
FAC_STUD_RATIO_COL=["total faculty", "total student", "faculty student ratio"]
INTER_STUD_RATIO_COL=["total inter", "total student", "inter student ratio"]

In [124]:
topuniversities["faculty student ratio"] = topuniversities["total faculty"]/topuniversities["total student"]
topuniversities["inter student ratio"]   = topuniversities["total inter"]/topuniversities["total student"]

# In the display, only output meaningful information regarding to the filter at hand
print(BU_STR+color.BLUE+"Analysis of faculty to student ratio and international student ratio per school\n"+END_STR)
print(BU_STR+"Dataframe with ratios"+END_STR)
display(topuniversities.head(10))

print(BU_STR + "Schools sorted by their faculty to student ratio" + END_STR)
display(topuniversities.sort_values("faculty student ratio", ascending=False)[TOP_UNI_BASE_COL+FAC_STUD_RATIO_COL].head(10))

print(BU_STR + "Schools sorted by their international student ratio" + END_STR)
display(topuniversities.sort_values("inter student ratio", ascending=False)[TOP_UNI_BASE_COL+INTER_STUD_RATIO_COL].head(10))

[1m[4m[94mAnalysis of faculty to student ratio and international student ratio per school
[0m
[1m[4mDataframe with ratios[0m


Unnamed: 0,country,inter faculty,rank,region,title,total faculty,total inter,total student,url,faculty student ratio,inter student ratio
0,United States,1679.0,1,North America,Massachusetts Institute of Technology (MIT),2982.0,3717.0,11067.0,/universities/massachusetts-institute-technolo...,0.26945,0.335863
1,United States,2042.0,2,North America,Stanford University,4285.0,3611.0,15878.0,/universities/stanford-university,0.26987,0.227422
2,United States,1311.0,3,North America,Harvard University,4350.0,5266.0,22429.0,/universities/harvard-university,0.193945,0.234785
3,United States,350.0,4,North America,California Institute of Technology (Caltech),953.0,647.0,2255.0,/universities/california-institute-technology-...,0.422616,0.286918
4,United Kingdom,2278.0,5,Europe,University of Cambridge,5490.0,6699.0,18770.0,/universities/university-cambridge,0.292488,0.356899
5,United Kingdom,2964.0,6,Europe,University of Oxford,6750.0,7353.0,19720.0,/universities/university-oxford,0.342292,0.37287
6,United Kingdom,2554.0,7,Europe,UCL (University College London),6345.0,14854.0,31080.0,/universities/ucl-university-college-london,0.204151,0.477928
7,United Kingdom,2071.0,8,Europe,Imperial College London,3930.0,8746.0,16090.0,/universities/imperial-college-london,0.244251,0.543567
8,United States,635.0,9,North America,University of Chicago,2449.0,3379.0,13557.0,/universities/university-chicago,0.180645,0.249244
9,Switzerland,1886.0,10,Europe,ETH Zurich - Swiss Federal Institute of Techno...,2477.0,7563.0,19815.0,/universities/eth-zurich-swiss-federal-institu...,0.125006,0.381681


[1m[4mSchools sorted by their faculty to student ratio[0m


Unnamed: 0,rank,country,region,title,total faculty,total student,faculty student ratio
3,4,United States,North America,California Institute of Technology (Caltech),953.0,2255.0,0.422616
15,16,United States,North America,Yale University,4940.0,12402.0,0.398323
5,6,United Kingdom,Europe,University of Oxford,6750.0,19720.0,0.342292
4,5,United Kingdom,Europe,University of Cambridge,5490.0,18770.0,0.292488
16,17,United States,North America,Johns Hopkins University,4462.0,16146.0,0.276353
1,2,United States,North America,Stanford University,4285.0,15878.0,0.26987
0,1,United States,North America,Massachusetts Institute of Technology (MIT),2982.0,11067.0,0.26945
185,186,United States,North America,University of Rochester,2569.0,9636.0,0.266604
18,19,United States,North America,University of Pennsylvania,5499.0,20639.0,0.266437
17,18,United States,North America,Columbia University,6189.0,25045.0,0.247115


[1m[4mSchools sorted by their international student ratio[0m


Unnamed: 0,rank,country,region,title,total inter,total student,inter student ratio
34,35,United Kingdom,Europe,London School of Economics and Political Scien...,6748.0,9760.0,0.691393
11,12,Switzerland,Europe,Ecole Polytechnique Fédérale de Lausanne (EPFL),5896.0,10343.0,0.570047
7,8,United Kingdom,Europe,Imperial College London,8746.0,16090.0,0.543567
198,200,Netherlands,Europe,Maastricht University,8234.0,16385.0,0.502533
47,47,United States,North America,Carnegie Mellon University,6385.0,13356.0,0.478062
6,7,United Kingdom,Europe,UCL (University College London),14854.0,31080.0,0.477928
91,92,United Kingdom,Europe,University of St Andrews,4030.0,8800.0,0.457955
41,41,Australia,Oceania,The University of Melbourne,18030.0,42182.0,0.427434
126,127,United Kingdom,Europe,Queen Mary University of London,6806.0,16135.0,0.421816
25,26,Hong Kong,Asia,The University of Hong Kong,8230.0,20214.0,0.407144


In [125]:
# The metodology used in this answer comes from: https://stackoverflow.com/questions/35307732/how-to-sum-in-pandas-by-unique-index-in-several-columns
topuniversities_country =topuniversities.groupby("country")["total inter", "total student", "total faculty"].sum().reset_index()
topuniversities_country["faculty student ratio"] = topuniversities_country["total faculty"]/topuniversities_country["total student"]
topuniversities_country["inter student ratio"]   = topuniversities_country["total inter"]/topuniversities_country["total student"]

print(BU_STR+color.BLUE+"Analysis of faculty to student ratio and international student ratio per country\n"+END_STR)

print(BU_STR+"Resulting dataframe sorted by country"+END_STR)
display(topuniversities_country.head(10))

print(BU_STR+"Countries sorted by their faculty to student ratio"+END_STR)
display(topuniversities_country.sort_values("faculty student ratio", ascending=False)[["country"]+FAC_STUD_RATIO_COL].head(10))

print(BU_STR+"Countries sorted by their international student ratio"+END_STR)
display(topuniversities_country.sort_values("inter student ratio", ascending=False)[["country"]+INTER_STUD_RATIO_COL].head(10))

[1m[4m[94mAnalysis of faculty to student ratio and international student ratio per country
[0m
[1m[4mResulting dataframe sorted by country[0m


Unnamed: 0,country,total inter,total student,total faculty,faculty student ratio,inter student ratio
0,Argentina,27109.0,122301.0,16421.0,0.134267,0.221658
1,Australia,106359.0,301994.0,22034.0,0.072962,0.352189
2,Austria,19667.0,63446.0,4117.0,0.06489,0.30998
3,Belgium,17013.0,115067.0,8046.0,0.069924,0.147853
4,Brazil,3052.0,92283.0,7550.0,0.081814,0.033072
5,Canada,73239.0,281514.0,29317.0,0.10414,0.260161
6,Chile,991.0,27003.0,2260.0,0.083694,0.0367
7,China,26833.0,235898.0,27220.0,0.115389,0.113748
8,Denmark,9543.0,67223.0,11916.0,0.177261,0.14196
9,Finland,3065.0,34566.0,3902.0,0.112885,0.088671


[1m[4mCountries sorted by their faculty to student ratio[0m


Unnamed: 0,country,total faculty,total student,faculty student ratio
23,Russia,6709.0,30233.0,0.22191
8,Denmark,11916.0,67223.0,0.177261
24,Saudi Arabia,1062.0,6040.0,0.175828
25,Singapore,9444.0,58466.0,0.16153
18,Malaysia,2755.0,17902.0,0.153893
17,Japan,28395.0,186222.0,0.152479
27,South Korea,19851.0,140071.0,0.141721
30,Switzerland,15323.0,109112.0,0.140434
32,United Kingdom,79934.0,583621.0,0.136962
15,Israel,2249.0,16531.0,0.136047


[1m[4mCountries sorted by their international student ratio[0m


Unnamed: 0,country,total inter,total student,inter student ratio
1,Australia,106359.0,301994.0,0.352189
32,United Kingdom,199426.0,583621.0,0.341705
12,Hong Kong,24499.0,78838.0,0.310751
2,Austria,19667.0,63446.0,0.30998
30,Switzerland,32995.0,109112.0,0.302396
25,Singapore,16168.0,58466.0,0.276537
5,Canada,73239.0,281514.0,0.260161
21,New Zealand,12439.0,48173.0,0.258215
14,Ireland,8187.0,34794.0,0.235299
20,Netherlands,46044.0,197631.0,0.23298


In [126]:
# The metodology used in this answer comes from: https://stackoverflow.com/questions/35307732/how-to-sum-in-pandas-by-unique-index-in-several-columns
topuniversities_region =topuniversities.groupby("region")["total inter", "total student", "total faculty"].sum().reset_index()
topuniversities_region["faculty student ratio"] = topuniversities_region["total faculty"]/topuniversities_region["total student"]
topuniversities_region["inter student ratio"]   = topuniversities_region["total inter"]/topuniversities_region["total student"]

print(BU_STR+color.BLUE+"Analysis of faculty to student ratio and international student ratio per region\n"+END_STR)
print(BU_STR+"Resulting dataframe sorted by region"+END_STR)
display(topuniversities_region.head(10))

print(BU_STR+"Regions sorted by their faculty to student ratio"+END_STR)
display(topuniversities_region.sort_values("faculty student ratio", ascending=False)[["region"]+FAC_STUD_RATIO_COL].head(10))

print(BU_STR+"Regions sorted by their international student ratio"+END_STR)
display(topuniversities_region.sort_values("inter student ratio", ascending=False)[["region"]+INTER_STUD_RATIO_COL].head(10))

[1m[4m[94mAnalysis of faculty to student ratio and international student ratio per region
[0m
[1m[4mResulting dataframe sorted by region[0m


Unnamed: 0,region,total inter,total student,total faculty,faculty student ratio,inter student ratio
0,Africa,3325.0,19593.0,1733.0,0.08845,0.169703
1,Asia,110100.0,807003.0,106734.0,0.13226,0.136431
2,Europe,449364.0,1957251.0,218358.0,0.111564,0.229589
3,Latin America,34737.0,396902.0,43126.0,0.108657,0.08752
4,North America,292116.0,1546353.0,182123.0,0.117776,0.188906
5,Oceania,118798.0,350167.0,25347.0,0.072385,0.339261


[1m[4mRegions sorted by their faculty to student ratio[0m


Unnamed: 0,region,total faculty,total student,faculty student ratio
1,Asia,106734.0,807003.0,0.13226
4,North America,182123.0,1546353.0,0.117776
2,Europe,218358.0,1957251.0,0.111564
3,Latin America,43126.0,396902.0,0.108657
0,Africa,1733.0,19593.0,0.08845
5,Oceania,25347.0,350167.0,0.072385


[1m[4mRegions sorted by their international student ratio[0m


Unnamed: 0,region,total inter,total student,inter student ratio
5,Oceania,118798.0,350167.0,0.339261
2,Europe,449364.0,1957251.0,0.229589
4,North America,292116.0,1546353.0,0.188906
0,Africa,3325.0,19593.0,0.169703
1,Asia,110100.0,807003.0,0.136431
3,Latin America,34737.0,396902.0,0.08752


## Merging the two resulting DataFrames

We now want to merge the to dataframes we obtained into a single one, by using the name of the university as a reference point.  
We also want to keep the rank of each university in both website as a separate column in the resulting dataframe.


In [30]:
topuniversities.head(1)


Unnamed: 0,country,inter faculty,rank,region,title,total faculty,total inter,total student,url
0,United States,1679.0,1,North America,Massachusetts Institute of Technology (MIT),2982.0,3717.0,11067.0,/universities/massachusetts-institute-technolo...


In [29]:

highereducation.head(1)

Unnamed: 0,country,rank,title,total faculty,total inter,total student,url
0,United Kingdom,1,University of Oxford,"(1822.23217389,)",775542.0,20409,/world-university-rankings/university-oxford
