# Assignment 1

Obtain the 200 top-ranking universities on www.topuniversities.com (ranking 2018). In particular, extract the following fields for each university: name, rank, country and region, number of faculty members (international and total) and number of students (international and total). Some of this information is not available in the main list, so you have to retrieve it from each university's details page. Store the resulting dataset in a pandas DataFrame.

By aggregating the data in the following ways:
* by country
* by region

answer which are the best universities in terms of: 

* ratio between faculty members and students
* ratio of international students

In [70]:
# Imports
import requests
import pandas as pd
import seaborn
%matplotlib inline
from bs4 import BeautifulSoup as bs
from IPython.core import display as ICD # Used to display multiple dataframes from same cell

In [2]:
# Constants
# Keys under which the scraped staff/student counts are stored in each
# university record (and therefore the resulting data frame column names).
TOTAL_STAFF_COUNT = 'total_staff_count'
INTERNATIONAL_STAFF_COUNT = 'international_staff_count'
TOTAL_STUDENT_COUNT = 'total_student_count'
INTERNATIONAL_STUDENT_COUNT = 'international_student_count'

# URLs
# Top Universities
# The base URL is prefixed to each university's relative 'url' field to reach
# its details page; the ranking URL returns the ranking list as JSON.
TOP_UNIVERSITIES_BASE_URL = 'https://www.topuniversities.com'
# NOTE: the name is misspelled ('RANKNING') but is kept as-is because the
# crawling cell below refers to it by this name.
TOP_UNIVERSITIES_RANKNING_URL = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1507982846110'
# Times Higher Education
# Same layout as above: base URL for details pages, JSON URL for the ranking.
TIMES_HIGHER_EDUCATION_BASE_URL = 'https://www.timeshighereducation.com'
TIMES_HIGHER_EDUCATION_RANKING_URL = 'https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'

In [3]:
def form_region_dictionary(items, country_key, region_key):
    """Group the countries found in ``items`` by region.

    Each item is indexed with ``region_key`` and ``country_key``. The result
    maps every region to the list of distinct countries seen with it, in
    order of first appearance.
    """
    regions = {}
    for entry in items:
        # Fetch the region's country list, creating an empty one on first sight
        members = regions.setdefault(entry[region_key], [])
        country = entry[country_key]
        if country not in members:
            members.append(country)
    return regions

def add_region(item, country_key, region_dictionary, default_value='Unknown'):
    """Return a copy of ``item`` with a ``'region'`` entry added.

    The region is the first key of ``region_dictionary`` whose country
    collection contains ``item[country_key]``; ``default_value`` is used when
    no region lists that country. ``item`` itself is not modified.
    """
    enriched = item.copy()
    enriched['region'] = default_value
    enriched['region'] = next(
        (
            region
            for region, countries in region_dictionary.items()
            if enriched[country_key] in countries
        ),
        default_value,
    )
    return enriched

def percentage_from_string(word):
    """Convert a percentage string such as ``' 38% '`` into a fraction (``0.38``).

    Surrounding whitespace and one trailing ``%`` sign are ignored.
    """
    text = word.strip()
    number = text[:-1] if text.endswith('%') else text
    return float(number) / 100

def int_from_string(word):
    """Parse an integer from ``word``, keeping only its digit characters.

    For example ``' 11,067 '`` becomes ``11067``.
    """
    digits = [char for char in word.strip() if char.isdigit()]
    return int("".join(digits))

def float_from_string(word):
    """Parse a float from ``word``, keeping only digits and ``'.'`` characters.

    For example ``'1,234.5'`` becomes ``1234.5``.
    """
    kept = []
    for char in word.strip():
        if char == '.' or char.isdigit():
            kept.append(char)
    return float("".join(kept))
    
# Helper methods for 'Top Universities'
def tu_parse_ranking(json):
    """Reduce each raw 'Top Universities' ranking entry to the fields we use.

    ``json`` is an iterable of entries; the result is a list of dicts with
    the keys ``country``, ``name``, ``rank``, ``region`` and ``url``.
    """
    return [
        {
            'country': entry['country'],
            'name': entry['title'],
            'rank': entry['rank_display'],
            'region': entry['region'],
            'url': entry['url'],
        }
        for entry in json
    ]

def _tu_count(soup, css_class):
    """Return the number shown in the ``div`` with class ``css_class``, or 0.

    0 is returned when either the wrapper ``div`` or its inner
    ``div.number`` is not present on the page.
    """
    wrapper = soup.find('div', class_=css_class)
    if wrapper is None:
        return 0
    number = wrapper.find('div', class_='number')
    if number is None:
        return 0
    return int_from_string(number.text)


def tu_staff_and_students(university):
    """Return a copy of ``university`` extended with staff and student counts.

    Downloads the university's 'Top Universities' details page (its relative
    ``'url'`` appended to ``TOP_UNIVERSITIES_BASE_URL``) and reads the total
    and international faculty and student counts from it. A count that is
    not present on the page is stored as 0. ``university`` is not modified.
    """
    uni_url = TOP_UNIVERSITIES_BASE_URL + university['url']
    response = requests.get(uni_url)
    soup = bs(response.text, 'html.parser')

    # Result key -> CSS class of the block holding that figure on the page
    count_sources = (
        (TOTAL_STAFF_COUNT, 'total faculty'),
        (INTERNATIONAL_STAFF_COUNT, 'inter faculty'),
        (TOTAL_STUDENT_COUNT, 'total student'),
        (INTERNATIONAL_STUDENT_COUNT, 'total inter'),
    )

    result = university.copy()
    for key, css_class in count_sources:
        result[key] = _tu_count(soup, css_class)

    return result

# Helper methods for 'Times Higher Education'
def the_parse_ranking(json):
    """Reduce each raw 'Times Higher Education' entry to the fields we use.

    ``json`` is an iterable of entries; the result is a list of dicts with
    the keys ``country`` (taken from ``location``), ``name``, ``rank`` and
    ``url``.
    """
    return [
        {
            'country': entry['location'],
            'name': entry['name'],
            'rank': entry['rank'],
            'url': entry['url'],
        }
        for entry in json
    ]

def the_staff_and_students(university):
    """Return a copy of ``university`` extended with staff and student counts.

    Downloads the university's 'Times Higher Education' details page (its
    relative ``'url'`` appended to ``TIMES_HIGHER_EDUCATION_BASE_URL``) and
    reads three key statistics from it: the international student
    percentage, the student/staff ratio and the total number of students.
    The staff and international student counts are derived from those.

    Any count that cannot be computed because a statistic is missing from
    the page (or the ratio is zero) is stored as 0. The international staff
    count is always 0 because the page does not provide it.
    ``university`` is not modified.
    """
    uni_url = TIMES_HIGHER_EDUCATION_BASE_URL + university['url']
    response = requests.get(uni_url)

    soup = bs(response.text, 'html.parser')
    result = university.copy()

    # (CSS class marking the statistic, local name, parser for its text);
    # checked in this order, first match wins for each list element
    stat_parsers = (
        ('keystats pc_intl_students', 'international_share', percentage_from_string),
        ('keystats student_staff_ratio', 'students_per_staff', float_from_string),
        ('keystats number_students', 'total_students', int_from_string),
    )

    stats = {}
    pane = soup.find('div', class_='panel-pane pane-data-stats')
    elements = pane.find_all('li') if pane is not None else []

    for element in elements:
        for css_class, stat_name, parse in stat_parsers:
            if element.find('div', class_=css_class) is not None:
                value = element.find('div', class_='value')
                if value is not None:
                    stats[stat_name] = parse(value.text)
                break

    total_students = stats.get('total_students')
    students_per_staff = stats.get('students_per_staff')
    international_share = stats.get('international_share')

    # The ratio is presumably students per staff member, so dividing the
    # student total by it gives the staff total; skip when it is missing or 0
    if total_students is not None and students_per_staff:
        result[TOTAL_STAFF_COUNT] = int(total_students / students_per_staff)
    else:
        result[TOTAL_STAFF_COUNT] = 0

    # There is no data available on international staff count
    result[INTERNATIONAL_STAFF_COUNT] = 0

    result[TOTAL_STUDENT_COUNT] = total_students if total_students is not None else 0

    if total_students is not None and international_share is not None:
        result[INTERNATIONAL_STUDENT_COUNT] = int(total_students * international_share)
    else:
        result[INTERNATIONAL_STUDENT_COUNT] = 0

    return result

#### Retrieving data from 'TopUniversities' ranking site



In [6]:
# Download the ranking list; the entries live under the 'data' key
r = requests.get(TOP_UNIVERSITIES_RANKNING_URL)
response_json = r.json()['data']

# Keep only the fields we need, then visit each details page for the counts
universities = tu_parse_ranking(response_json)
universities = [tu_staff_and_students(university) for university in universities]

# Forming 'Top Universities' data frame
tu_df = pd.DataFrame.from_dict(universities)

#### Forming region dictionary

In the next part of the exercise we will retrieve data from the 'Times Higher Education' university ranking website, which does not provide region information alongside the information about the universities. We can solve this problem by forming a 'region dictionary' whose keys are the regions found in the 'Top Universities' data and whose values are the countries associated with those regions in the same data.

In [7]:
# Map each region to its countries, using the raw 'Top Universities' entries
region_dictionary = form_region_dictionary(
    response_json,
    country_key='country',
    region_key='region',
)

#### Retrieving data from 'Times Higher Education' ranking site



In [8]:
# Download the ranking list; the entries live under the 'data' key
r = requests.get(TIMES_HIGHER_EDUCATION_RANKING_URL)
response_json = r.json()['data']

# Keep only the fields we need, attach a region to every entry via the
# region dictionary, then visit each details page for the counts
universities = the_parse_ranking(response_json)
universities = [add_region(u, 'country', region_dictionary) for u in universities]
universities = [the_staff_and_students(u) for u in universities]

# Forming 'Times Higher Education' data frame
the_df = pd.DataFrame.from_dict(universities)

#### Adjusting ranks in data frames

The data frames obtained from both ranking sites contain ranks which are not unique to every university: there are cases where several universities share the same rank. We will adjust this and make the rank unique to each university in each data frame by using the original order in which the universities appeared in the rankings.

In [9]:
# Replace the site-provided ranks with unique positions 1..N that follow
# the row order of each data frame
tu_df['rank'] = range(1, len(tu_df) + 1)
the_df['rank'] = range(1, len(the_df) + 1)

# Persist both data frames so later cells can reload them without crawling
tu_df.to_pickle("tu_dataset")
the_df.to_pickle("the_dataset")


tu_df.head()

Unnamed: 0,country,international_staff_count,international_student_count,name,rank,region,total_staff_count,total_student_count,url
0,United States,1679,3717,Massachusetts Institute of Technology (MIT),1,North America,2982,11067,/universities/massachusetts-institute-technolo...
1,United States,2042,3611,Stanford University,2,North America,4285,15878,/universities/stanford-university
2,United States,1311,5266,Harvard University,3,North America,4350,22429,/universities/harvard-university
3,United States,350,647,California Institute of Technology (Caltech),4,North America,953,2255,/universities/california-institute-technology-...
4,United Kingdom,2278,6699,University of Cambridge,5,Europe,5490,18770,/universities/university-cambridge


In [10]:
# Preview the first rows of the 'Times Higher Education' data frame
the_df.head()

Unnamed: 0,country,international_staff_count,international_student_count,name,rank,region,total_staff_count,total_student_count,url
0,United Kingdom,0,7755,University of Oxford,1,Europe,1822,20409,/world-university-rankings/university-oxford
1,United Kingdom,0,6436,University of Cambridge,2,Europe,1687,18389,/world-university-rankings/university-cambridge
2,United States,0,596,California Institute of Technology,3,North America,339,2209,/world-university-rankings/california-institut...
3,United States,0,3485,Stanford University,4,North America,2112,15845,/world-university-rankings/stanford-university
4,United States,0,3800,Massachusetts Institute of Technology,5,North America,1284,11177,/world-university-rankings/massachusetts-insti...


In [6]:
# Read saved dataset to avoid recrawling both entire sites
# NOTE: these are the files written with to_pickle() in an earlier cell;
# unpickling can execute arbitrary code, so only load files you created.
the_df = pd.read_pickle("the_dataset")
tu_df = pd.read_pickle("tu_dataset")

#### Matching entries by name
Let us look at the universities that are present in both data sets. Some are present in both rankings, and some are unique to one of them. We should make sure that each university occurs only once in each data set to avoid problems when matching the two sets.

In [85]:
def duplicate_list(universities):
    """
    Return every value that occurs more than once in ``universities``,
    ordered by first appearance.
    """
    occurrences = {}
    for entry in universities:
        occurrences[entry] = occurrences.get(entry, 0) + 1
    return [entry for entry, seen in occurrences.items() if seen > 1]

# Show duplicate universities by name
# duplicate_list() returns the names occurring more than once; isin() then
# selects every row carrying one of those names.
the_duplicates = the_df[the_df['name'].isin(duplicate_list(the_df['name']))]
if the_duplicates.empty:
    print("No duplicates for the_df")
else:
    ICD.display(the_duplicates)

tu_duplicates = tu_df[tu_df['name'].isin(duplicate_list(tu_df['name']))]
if tu_duplicates.empty:
    print("No duplicates for tu_df")
else:
    ICD.display(tu_duplicates)



Unnamed: 0,country,international_staff_count,international_student_count,name,rank,region,total_staff_count,total_student_count,url
192,United States,0,6009,Northeastern University,193,North America,1341,18780,/world-university-rankings/northeastern-univer...
923,China,0,1333,Northeastern University,924,Asia,2208,33348,/world-university-rankings/northeastern-univer...


No duplicates for tu_df


The names of the universities are unique, with one exception. To make sure we can match the universities one to one, we need to include another variable: we use the region in addition to the university name to uniquely identify each university.

In [89]:
# Identify every university by its (name, region) pair in both rankings and
# check that this pair is unique within each data frame
the_names = set(zip(the_df['name'], the_df['region']))
assert len(the_names) == len(the_df), \
    "# of university names ({}) does not match # of entries ({})".format(len(the_names), len(the_df))
tu_names = set(zip(tu_df['name'], tu_df['region']))
assert len(tu_names) == len(tu_df), \
    "# of university names ({}) does not match # of entries ({})".format(len(tu_names), len(tu_df))

# Universities present in both data sets
in_both = the_names & tu_names
print("Number of universities in both rankings: {}".format(len(in_both)))

# Universities only present in the_names
only_in_the = the_names - tu_names
print("Number of universities only in the_names: {}".format(len(only_in_the)))

# Universities only present in tu_names
only_in_tu = tu_names - the_names
print("Number of universities only in tu_names: {}".format(len(only_in_tu)))

Number of universities in both rankings: 476
Number of universities only in the_names: 626
Number of universities only in tu_names: 483


In [124]:
import difflib

def find_matches(word, words, n=3, cutoff=0.6):
    """Return up to ``n`` entries of ``words`` that closely resemble ``word``.

    Thin wrapper around ``difflib.get_close_matches``. ``cutoff`` is the
    minimum similarity score (between 0 and 1) an entry needs to be
    returned; the best matches come first. The defaults reproduce the
    previously hard-coded values.
    """
    return difflib.get_close_matches(word, words, n=n, cutoff=cutoff)

# only_in_the / only_in_tu hold (name, region) tuples. Passing those tuples to
# difflib compares them element by element, so two entries with different
# names can never reach the 0.6 cutoff and nothing would ever be printed.
# Compare the name strings instead; sorting keeps the output reproducible.
candidate_names = sorted({entry[0] for entry in only_in_tu})
for word in sorted(entry[0] for entry in only_in_the):
    matches = find_matches(word, candidate_names)
    if matches:
        print(word, '->', matches)

In [125]:
# Keep only universities found in both rankings (same name and region);
# the remaining, overlapping columns are told apart by the suffixes
merged_df = the_df.merge(
    tu_df,
    how='inner',
    on=['name', 'region'],
    suffixes=('_the', '_tu'),
)
merged_df.head()

Unnamed: 0,country_the,international_staff_count_the,international_student_count_the,name,rank_the,region,total_staff_count_the,total_student_count_the,url_the,country_tu,international_staff_count_tu,international_student_count_tu,rank_tu,total_staff_count_tu,total_student_count_tu,url_tu
0,United Kingdom,0,7755,University of Oxford,1,Europe,1822,20409,/world-university-rankings/university-oxford,United Kingdom,2964,7353,6,6750,19720,/universities/university-oxford
1,United Kingdom,0,6436,University of Cambridge,2,Europe,1687,18389,/world-university-rankings/university-cambridge,United Kingdom,2278,6699,5,5490,18770,/universities/university-cambridge
2,United States,0,3485,Stanford University,4,North America,2112,15845,/world-university-rankings/stanford-university,United States,2042,3611,2,4285,15878,/universities/stanford-university
3,United States,0,5284,Harvard University,6,North America,2283,20326,/world-university-rankings/harvard-university,United States,1311,5266,3,4350,22429,/universities/harvard-university
4,United States,0,1909,Princeton University,7,North America,958,7955,/world-university-rankings/princeton-university,United States,246,1793,13,1007,8069,/universities/princeton-university
