In [None]:
import difflib

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from IPython.display import display
%matplotlib inline

In [None]:
# Download the QS ranking table. The JSON payload keeps the per-university
# records under its 'data' key.
QS_RANKING_URL = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt'
# A timeout keeps the notebook from hanging forever on a stalled connection.
r = requests.get(QS_RANKING_URL, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
r.raise_for_status()
qs_dataset = r.json()['data']

In [None]:
# Inspect the first record to see which fields the QS payload provides.
qs_dataset[0]

In [None]:
def parseInt(numStr):
    """Return the integer spelled by the digit characters of ``numStr``.

    Every non-digit character (thousands separators, a leading '=',
    whitespace, ...) is discarded and the remaining digits are concatenated.
    '1,234' therefore gives 1234, and a range such as '201-250' gives 201250.

    Raises:
        ValueError: if ``numStr`` contains no digit at all.
    """
    digits = "".join(filter(str.isdigit, numStr))
    return int(digits)

def parseDecimal(numStr):
    """Return the float spelled by the digits and dots of ``numStr``.

    All characters other than decimal digits and '.' are discarded (for
    example a trailing '%' or thousands separators) before the remainder is
    handed to ``float``.

    Raises:
        ValueError: if what is left is not a valid float literal, e.g. for
            an input without digits or with more than one dot.
    """
    kept = "".join(ch for ch in numStr if ch == '.' or ch.isdigit())
    return float(kept)
    
def parseAttribute(soup, className):
    """Read one headline figure from a parsed QS university detail page.

    Args:
        soup: the parsed detail page (a BeautifulSoup document).
        className: value of the ``class`` attribute of the wrapping ``div``,
            e.g. 'total faculty'.

    Returns:
        The figure as an int, or NaN when the page does not contain it.

    Raises:
        ValueError: if the figure is present but holds no digits.
    """
    attr = soup.find('div', class_=className)
    number = attr.find('div', class_='number') if attr is not None else None
    if number is None:
        # NaN rather than 0: a zero would be treated as a real count and turn
        # the ratios derived from it into 0 or inf.
        return float('nan')
    return parseInt(number.text)


# Scrape the detail page of each of the first 200 ranked universities.
results = []
# One session reuses the HTTP connection across all detail-page requests.
with requests.Session() as session:
    for university in qs_dataset[:200]:
        url = 'https://www.topuniversities.com' + university['url']
        details_html = session.get(url, timeout=30)
        # Stop on an HTTP error; parsing an error page would silently record
        # every figure of this university as missing.
        details_html.raise_for_status()
        soup = BeautifulSoup(details_html.text, 'html.parser')

        results.append({
            'name': university['title'],
            'rank': parseInt(university['rank_display']),
            'country': university['country'],
            'region': university['region'],
            'faculty_international': parseAttribute(soup, 'inter faculty'),
            'faculty_total': parseAttribute(soup, 'total faculty'),
            'students_international': parseAttribute(soup, 'total inter'),
            'students_total': parseAttribute(soup, 'total student')
        })

In [None]:
# Turn the scraped records into a table keyed by university name and cache
# it on disk so the scrape does not have to be repeated.
qs_df = pd.DataFrame(results).set_index('name')
qs_df.to_pickle('qs_dataset')

In [None]:
# Reload the QS table from the 'qs_dataset' pickle file and preview it.
qs_df = pd.read_pickle('qs_dataset')
qs_df.head()

In [None]:
# Derived QS ratios. A zero or missing student count is not a usable
# denominator: dividing by it would produce inf, which then wins every sort
# and every per-group maximum. Mask such counts to NaN first.
qs_students_total = qs_df['students_total'].where(qs_df['students_total'] > 0)
qs_df['faculty_students_ratio'] = qs_df['faculty_total'] / qs_students_total
qs_df['international_ratio'] = qs_df['students_international'] / qs_students_total
qs_df.head()

In [None]:
# QS: universities with the highest faculty-to-student ratio.
qs_df.sort_values('faculty_students_ratio', ascending=False).head()

In [None]:
# QS: universities with the highest share of international students.
qs_df.sort_values('international_ratio', ascending=False).head()

In [None]:
def aggregate_analyze(df, grouping_col, value_col):
    """Show, for every group, the university with the highest ``value_col``.

    Draws a bar chart of the per-group maxima (highest first) and displays
    the winning rows as a table indexed by ``grouping_col``.

    Args:
        df: frame holding ``grouping_col``, ``value_col`` and a 'name' column.
        grouping_col: column to group by, e.g. 'country' or 'region'.
        value_col: numeric column whose per-group maximum is reported.
    """
    # idxmax has no valid answer for a group whose values are all missing;
    # dropping those rows first keeps every returned label usable with .loc.
    valid = df.dropna(subset=[value_col])
    maxIdx = valid.groupby(grouping_col)[value_col].idxmax()
    res = valid.loc[maxIdx, [grouping_col, 'name', value_col]].set_index(grouping_col)
    # The former `sort_columns=True` keyword is gone: it was removed from
    # DataFrame.plot in pandas 2.0 and made this call fail there.
    ax = res.sort_values(value_col, ascending=False).plot.bar(
        y=value_col,
        title='Highest {} per {}'.format(value_col, grouping_col))
    ax.set_ylabel(value_col)
    display(res)

In [None]:
# aggregate_analyze needs 'name' as a regular column. The guard makes this
# cell safe to re-run: an unconditional reset_index() would add a spurious
# numeric 'index' column on every further execution.
if 'name' not in qs_df.columns:
    qs_df = qs_df.reset_index()
aggregate_analyze(qs_df, 'country', 'faculty_students_ratio')

In [None]:
# QS: per country, the university with the highest international-student share.
aggregate_analyze(qs_df, 'country', 'international_ratio')

In [None]:
# QS: per region, the university with the highest faculty-to-student ratio.
aggregate_analyze(qs_df, 'region', 'faculty_students_ratio')

In [None]:
# QS: per region, the university with the highest international-student share.
aggregate_analyze(qs_df, 'region', 'international_ratio')

In [None]:
# Download the Times Higher Education (THE) ranking table. The JSON payload
# keeps the per-university records under its 'data' key.
THE_RANKING_URL = 'https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'
# A timeout keeps the notebook from hanging forever on a stalled connection.
r = requests.get(THE_RANKING_URL, timeout=30)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
r.raise_for_status()
the_dataset = r.json()['data']

In [None]:
# Inspect the first record to see which fields the THE payload provides.
the_dataset[0]

In [None]:
# Convert the first 200 THE records to the column layout used for the QS
# table. A dedicated list name is used so the QS `results` list is not
# overwritten (re-running the QS DataFrame cell would otherwise pick up THE
# records).
the_results = []
for university in the_dataset[:200]:
    the_results.append({
            'name': university['name'],
            'rank': parseInt(university['rank']),
            'country': university['location'],
            # THE reports students per staff member; invert it to get the
            # faculty-per-student ratio used for the QS table.
            'faculty_students_ratio': 1 / parseDecimal(university['stats_student_staff_ratio']),
            # Percentage -> fraction.
            'international_ratio': parseDecimal(university['stats_pc_intl_students']) / 100,
            'students_total': parseInt(university['stats_number_students'])
        })

the_df = pd.DataFrame(the_results).set_index('name')

# Compute derived attributes (where applicable).
# Round before casting: astype('int') alone truncates, so a product that
# floating point represents as 28.999999999999996 would become 28, not 29.
the_df['students_international'] = (the_df['students_total'] * the_df['international_ratio']).round().astype('int')
the_df['faculty_total'] = (the_df['students_total'] * the_df['faculty_students_ratio']).round().astype('int')

In [None]:
# Map countries to regions using the QS dataset
# (for a country listed several times the last row wins).
mapping = dict(zip(qs_df['country'], qs_df['region']))

# Manually add missing mappings
mapping['Luxembourg'] = 'Europe'
mapping['Russian Federation'] = 'Europe'

# Countries without an entry keep their country name in the 'region' column.
# Report them so the gap is visible instead of showing up later as a bogus
# region in the per-region analysis.
unmapped_countries = sorted(set(the_df['country']) - set(mapping))
if unmapped_countries:
    print('No region known for:', ', '.join(unmapped_countries))

the_df['region'] = the_df['country'].replace(mapping)
the_df.head()

In [None]:
# THE: universities with the highest faculty-to-student ratio.
the_df.sort_values('faculty_students_ratio', ascending=False).head()

In [None]:
# THE: universities with the highest share of international students.
the_df.sort_values('international_ratio', ascending=False).head()

In [None]:
# aggregate_analyze needs 'name' as a regular column. The guard makes this
# cell safe to re-run: an unconditional reset_index() would add a spurious
# numeric 'index' column on every further execution.
if 'name' not in the_df.columns:
    the_df = the_df.reset_index()
aggregate_analyze(the_df, 'country', 'faculty_students_ratio')

In [None]:
# THE: per country, the university with the highest international-student share.
aggregate_analyze(the_df, 'country', 'international_ratio')

In [None]:
# THE: per region, the university with the highest faculty-to-student ratio.
aggregate_analyze(the_df, 'region', 'faculty_students_ratio')

In [None]:
# THE: per region, the university with the highest international-student share.
aggregate_analyze(the_df, 'region', 'international_ratio')

In [None]:
# Reconcile university names between the two rankings, then join the tables.
qs_df_ = qs_df.copy()
the_df_ = the_df.copy()

# Snapshot the names as plain lists: difflib only iterates over the
# candidates, so there is no need to walk a pandas Series on every call.
qs_names = qs_df_['name'].tolist()
the_names = the_df_['name'].tolist()


def findMatch(name, tolerance=0.99):
    """Return the THE name that mutually best-matches the QS ``name``.

    The closest THE name is looked up with difflib. It is accepted only if
    looking that THE name up among the QS names leads back to ``name``.

    Args:
        name: a university name taken from the QS table.
        tolerance: difflib similarity cutoff in [0, 1]; higher is stricter.

    Returns:
        The matching THE name, or None when there is no mutual match.
    """
    res = difflib.get_close_matches(name, the_names, 1, tolerance)
    if res:
        back_res = difflib.get_close_matches(res[0], qs_names, 1, tolerance)
        if back_res and back_res[0] == name:
            return res[0]
    return None


def mapName(name):
    """Return the THE spelling of ``name`` if one was matched, else ``name``."""
    match = findMatch(name)
    return name if match is None else match


# Unmatched names come back as missing values, which makes them countable
# without a global counter mutated inside the mapping function.
matched_names = qs_df_['name'].map(findMatch)
not_matched_counter = int(matched_names.isna().sum())
qs_df_['name'] = matched_names.fillna(qs_df_['name'])
print(not_matched_counter, 'universities were not matched.')
# Inner join: only universities found in both rankings are kept.
total = qs_df_.merge(the_df_, on='name', how='inner', suffixes=('_qs', '_the'))
total

In [None]:
# Correlation between the numeric attributes of the merged QS/THE table.
# Restrict to numeric columns first: the merged frame also carries text
# columns (name, country, region), and DataFrame.corr() raises on those in
# pandas >= 2.0 instead of silently skipping them.
corr = total.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, annot=True,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            ax=ax)
ax.set_title('Correlation between QS and THE attributes')
plt.show()