# 02 - Data from the Web

In [None]:
# Needed imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import difflib
%matplotlib inline

## Some functions we need

In [None]:
def parseInt(numStr):
    """Extract the decimal digits of ``numStr`` and return them as one integer.

    Every character that is not a decimal digit (thousands separators, a
    leading ``=``, whitespace, ...) is discarded, so ``"1,234"`` gives ``1234``.

    Args:
        numStr: text containing at least one decimal digit.

    Returns:
        The integer formed by the digits of ``numStr`` in their original order.

    Raises:
        ValueError: if ``numStr`` contains no decimal digit.
    """
    # str.isdecimal() instead of str.isdigit(): isdigit() also accepts
    # characters such as superscripts ('²') that int() cannot parse.
    digits = "".join(ch for ch in numStr if ch.isdecimal())
    if not digits:
        raise ValueError("no digits found in {!r}".format(numStr))
    return int(digits)

def parseDecimal(numStr):
    """Extract a decimal number from ``numStr`` and return it as a float.

    Only decimal digits and ``.`` are kept; everything else (``%`` signs,
    thousands separators, whitespace, ...) is discarded, so ``"38%"`` gives
    ``38.0``.

    Args:
        numStr: text containing a number written with ``.`` as decimal point.

    Returns:
        The parsed value as a float.

    Raises:
        ValueError: if the kept characters do not form a valid number
            (no digit at all, or more than one ``.``).
    """
    # str.isdecimal() instead of str.isdigit(): isdigit() also accepts
    # characters such as superscripts ('²') that float() cannot parse.
    kept = "".join(ch for ch in numStr if ch.isdecimal() or ch == '.')
    return float(kept)

## Task 1
- Obtain the 200 top-ranking universities in www.topuniversities.com ([ranking 2018](https://www.topuniversities.com/university-rankings/world-university-rankings/2018)) 

In [None]:
# Download the QS ranking data feed.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into an immediate, explicit failure
# instead of a confusing JSON decoding error on the next line.
r = requests.get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt', timeout=30)
r.raise_for_status()
qs_dataset = r.json()['data']

In [None]:
# Show the first record to see which fields the feed provides.
qs_dataset[0]

- Extract for each university: name, rank, country and region, number of faculty members (international and total) and number of students (international and total). Information that is not available in the ranking data is gathered from each university's own page. 

In [None]:
def parseAttribute(className, page=None):
    """Read one headline count from a parsed university detail page.

    Args:
        className: CSS class(es) of the ``div`` wrapping the figure,
            e.g. ``'total faculty'``.
        page: parsed page to search. Defaults to the module-level ``soup``
            (the page parsed most recently by the loop below), which keeps
            the original one-argument call form working.

    Returns:
        The count as an int, or NaN when the page does not show the figure.
        NaN (rather than 0) keeps missing figures out of the ratios computed
        later: a 0 total would produce infinite ratios that sort to the top.
    """
    if page is None:
        page = soup
    attr = page.find('div', class_=className)
    number = attr.find('div', class_='number') if attr is not None else None
    if number is None:
        return float('nan')  # Missing value
    return parseInt(number.text)


results = []
# One session for all detail pages, so the underlying connection can be reused.
with requests.Session() as session:
    for university in qs_dataset[:200]:
        url = 'https://www.topuniversities.com' + university['url']
        # The timeout prevents a single stalled page from blocking the whole crawl.
        details_html = session.get(url, timeout=30)
        soup = BeautifulSoup(details_html.text, 'html.parser')

        results.append({
            'name': university['title'],
            'rank': parseInt(university['rank_display']),
            'country': university['country'],
            'region': university['region'],
            'faculty_international': parseAttribute('inter faculty', soup),
            'faculty_total': parseAttribute('total faculty', soup),
            'students_international': parseAttribute('total inter', soup),
            'students_total': parseAttribute('total student', soup)
        })

- Store the resulting dataset in a DataFrame and store it to file (for later reuse without recomputing it)

In [None]:
# One row per scraped university, keyed by its name.
qs_df = pd.DataFrame(results)
qs_df = qs_df.set_index('name')

# Persist the table so later runs can reload it.
qs_df.to_pickle('qs_dataset')

In [None]:
# Reload the table from the pickle written in the previous cell and preview it.
qs_df = pd.read_pickle('qs_dataset')
qs_df.head()

- We now add two new columns, 'faculty_students_ratio' and 'international_ratio', so that by sorting on those columns we can find the best universities in terms of (a) the ratio between faculty members and students and (b) the ratio of international students.

In [None]:
# A student total of 0 is not a real figure; treat it as missing so that the
# ratios below become NaN instead of inf (inf would sort to the very top when
# ranking in descending order).
students_total = qs_df['students_total'].where(qs_df['students_total'] != 0)

qs_df['faculty_students_ratio'] = qs_df['faculty_total'] / students_total
qs_df['international_ratio'] = qs_df['students_international'] / students_total
qs_df.head()

In [None]:
# First rows when ordering by faculty-to-student ratio, highest first.
qs_df.sort_values('faculty_students_ratio', ascending=False).head()

In [None]:
# First rows when ordering by share of international students, highest first.
qs_df.sort_values('international_ratio', ascending=False).head()

- We answer the same questions by aggregating the data by (c) country and (d) region.

In [None]:
def aggregate_analyze(df, grouping_col, value_col):
    """Show, for every group, the university with the highest ``value_col``.

    The result is drawn as a bar chart and displayed as a table; nothing is
    returned.

    Args:
        df: table with one row per university. It must contain a 'name'
            column in addition to ``grouping_col`` and ``value_col``.
        grouping_col: column to group by. We pass 'country' or 'region'.
        value_col: column whose maximum is selected within each group. We pass
            'faculty_students_ratio' or 'international_ratio'.
    """
    # Rows whose value is missing or infinite cannot be ranked meaningfully:
    # a group made only of NaN has no maximum, and an infinite ratio (division
    # by a zero total) would always win its group.
    values = df[value_col]
    rankable = df[values.notna() & (values.abs() != float('inf'))]

    # Index label of the row holding the highest value in each group.
    best_idx = rankable.groupby(grouping_col)[value_col].idxmax()

    # Look those rows up again, keep only the columns of interest and order
    # the groups from the highest to the lowest value.
    res = (rankable.loc[best_idx, [grouping_col, 'name', value_col]]
           .set_index(grouping_col)
           .sort_values(value_col, ascending=False))

    # Bar chart of the per-group maxima. The former `sort_columns=True`
    # argument is gone: pandas deprecated it in 1.5 and removed it in 2.0, so
    # passing it makes the call fail on current versions.
    ax = res.plot.bar()
    ax.set_ylabel(value_col)
    ax.set_title('Highest {} per {}'.format(value_col, grouping_col))
    display(res)

In [None]:
# aggregate_analyze needs 'name' as a regular column. The guard keeps this
# cell re-runnable: resetting the index a second time would add a stray
# 'index' column.
if 'name' not in qs_df.columns:
    qs_df = qs_df.reset_index()
aggregate_analyze(qs_df, 'country', 'faculty_students_ratio')

In [None]:
# QS: per country, the university with the highest share of international students.
aggregate_analyze(qs_df, 'country', 'international_ratio')

In [None]:
# QS: per region, the university with the highest faculty-to-student ratio.
aggregate_analyze(qs_df, 'region', 'faculty_students_ratio')

In [None]:
# QS: per region, the university with the highest share of international students.
aggregate_analyze(qs_df, 'region', 'international_ratio')

## Task 2
We now obtain the 200 top-ranking universities from www.timeshighereducation.com ([ranking 2018](http://timeshighereducation.com/world-university-rankings/2018/world-ranking)) and repeat the previous analysis.

In [None]:
# Download the Times Higher Education ranking data feed.
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into an immediate, explicit failure
# instead of a confusing JSON decoding error on the next line.
r = requests.get('https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json', timeout=30)
r.raise_for_status()
the_dataset = r.json()['data']

In [None]:
# Show the first record to see which fields the feed provides.
the_dataset[0]

In [None]:
results = []
# For each university we gather the needed information and derive the ratios we are interested in.
for university in the_dataset[:200]:
    results.append({
            'name': university['name'],
            'rank': parseInt(university['rank']),
            'country': university['location'],
            # The feed gives students per staff member; we want the inverse.
            'faculty_students_ratio': 1 / parseDecimal(university['stats_student_staff_ratio']),
            # The feed gives a percentage; store it as a fraction.
            'international_ratio': parseDecimal(university['stats_pc_intl_students']) / 100,
            'students_total': parseInt(university['stats_number_students'])
        })

the_df = pd.DataFrame.from_dict(results).set_index('name')

# Compute derived attributes (where applicable).
# Round before casting: astype('int') alone truncates, so a floating-point
# product that lands just below a whole number would lose a full unit.
the_df['students_international'] = (the_df['students_total'] * the_df['international_ratio']).round().astype('int')
the_df['faculty_total'] = (the_df['students_total'] * the_df['faculty_students_ratio']).round().astype('int')

In [None]:
# Build a country -> region lookup from the QS table
mapping = dict(zip(qs_df['country'], qs_df['region']))

# Countries missing from that lookup are added by hand
mapping.update({
    'Luxembourg': 'Europe',
    'Russian Federation': 'Europe',
})

# Translate every country into its region; values without an entry in the
# lookup are left unchanged by replace()
the_df['region'] = the_df['country'].replace(mapping)
the_df.head()

In [None]:
# aggregate_analyze needs 'name' as a regular column. The guard keeps this
# cell re-runnable: resetting the index a second time would add a stray
# 'index' column.
if 'name' not in the_df.columns:
    the_df = the_df.reset_index()
aggregate_analyze(the_df, 'country', 'faculty_students_ratio')

In [None]:
# THE: per country, the university with the highest international ratio.
aggregate_analyze(the_df, 'country', 'international_ratio')

In [None]:
# THE: per region, the university with the highest faculty-to-student ratio.
aggregate_analyze(the_df, 'region', 'faculty_students_ratio')

In [None]:
# THE: per region, the university with the highest international ratio.
aggregate_analyze(the_df, 'region', 'international_ratio')

In [None]:
# Index both tables by university name. The guards keep the cell re-runnable:
# once 'name' is the index it is no longer a column, and a second
# set_index('name') would raise a KeyError (the original cell did exactly
# that with a redundant extra call).
if 'name' in qs_df.columns:
    qs_df = qs_df.set_index('name')
if 'name' in the_df.columns:
    the_df = the_df.set_index('name')

# Candidate spellings, materialised once instead of once per lookup.
qs_names = list(qs_df.index)


def closest_qs_name(the_name):
    """Return the QS name closest to ``the_name``, or ``the_name`` itself.

    difflib.get_close_matches returns an empty list when no candidate is
    similar enough; in that case the original name is kept rather than
    failing with an IndexError.
    """
    matches = difflib.get_close_matches(the_name, qs_names, n=1)
    return matches[0] if matches else the_name


# Rename the THE universities to the QS spelling of their names.
# NOTE(review): two different THE names could resolve to the same QS name,
# which would leave duplicate index labels - verify before joining on the index.
the_df.index = the_df.index.map(closest_qs_name)
qs_df.head()