# 02 - Data from the Web

In [None]:
# Needed imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import difflib
import seaborn as sns
import numpy as np
%matplotlib inline

## Some helper functions we need

These functions will be used throughout the homework.

In [None]:
# parses a string to an integer, ignoring every character that is not a digit
def parseInt(numStr):
    """Return the integer spelled by the digit characters of ``numStr``.

    All other characters (separators, signs, letters, whitespace) are
    discarded before the conversion. ``int`` raises ``ValueError`` when no
    digit is left.
    """
    digits = "".join(filter(str.isdigit, numStr))
    return int(digits)

# parses a string to a float, keeping only digits and decimal points
def parseDecimal(numStr):
    """Return the float spelled by the digits and '.' characters of ``numStr``.

    Every other character (percent signs, thousands separators, ...) is
    discarded before the conversion. ``float`` raises ``ValueError`` when the
    remaining text is not a valid number.
    """
    kept = "".join(ch for ch in numStr if ch == '.' or ch.isdigit())
    return float(kept)

## Task 1


### Remarks @todo

- Some values of student and faculty may be missing. They are set to 0.
- There is no university with rank #198. Therefore the interval of the rank is [1,201]
- Not every field was available in the main webpage, the missing information was retrieved from the description page of each university.


### a. Load data
- Obtain the 200 top-ranking universities in www.topuniversities.com ([ranking 2018](https://www.topuniversities.com/university-rankings/world-university-rankings/2018)) 

In [None]:
# Download the JSON document behind the QS ranking page; its 'data' entry
# holds the list of universities.
r = requests.get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt')
r.raise_for_status()  # report an HTTP failure directly instead of a confusing JSON decoding error
qs_dataset = r.json()['data']

In [None]:
# Show the first record to inspect which fields are available
qs_dataset[0]

- For each university, we extract: name, rank, country and region, number of faculty members (international and total) and number of students (international and total). The detailed information (faculty and students) is gathered from the respective page of each university. 

In [None]:
# Build one record per university: the ranking fields come from the QS
# dataset, the faculty/student counts are scraped from each detail page.
results = []
for university in qs_dataset[:200]:
    url = 'https://www.topuniversities.com' + university['url']
    details_html = requests.get(url)
    soup = BeautifulSoup(details_html.text, 'html.parser')

    def parseAttribute(className):
        """Return the count shown in the div of class `className` on the current page.

        The value is read from the nested div of class 'number'. Returns 0
        when either the outer div or its 'number' div is absent, so that a
        partially filled page is treated as a missing value instead of
        raising AttributeError.
        """
        attr = soup.find('div', class_=className)
        number = attr.find('div', class_='number') if attr is not None else None
        if number is None:
            return 0  # Missing value
        return parseInt(number.text)

    results.append({
        'name': university['title'],
        'rank': parseInt(university['rank_display']),
        'country': university['country'],
        'region': university['region'],
        'faculty_international': parseAttribute('inter faculty'),
        'faculty_total': parseAttribute('total faculty'),
        'students_international': parseAttribute('total inter'),
        'students_total': parseAttribute('total student')
    })

- Convert the resulting dataset into a DataFrame and store it as a file (for reusing it later without parsing the content again).

In [None]:
# Index the scraped records by university name and cache the table on disk
qs_df = pd.DataFrame(results)
qs_df = qs_df.set_index('name')
qs_df.to_pickle('qs_dataset')

In [None]:
# Reload the cached table from the 'qs_dataset' pickle file.
# NOTE: read_pickle can execute arbitrary code; only load files you created yourself.
qs_df = pd.read_pickle('qs_dataset')
qs_df.head()

### b. Add the columns 'faculty_students_ratio' and 'international_ratio'

**'faculty_students_ratio'** represents the faculty-to-students ratio.

**'international_ratio'** represents the fraction of international students. 

In [None]:
# Student totals that could not be scraped are stored as 0. Treat them as
# missing so the ratios become NaN (sorted last, skipped by averages)
# instead of inf, which would otherwise top every descending ranking.
students_total = qs_df['students_total'].replace(0, np.nan)
qs_df['faculty_students_ratio'] = qs_df['faculty_total'] / students_total
qs_df['international_ratio'] = qs_df['students_international'] / students_total
# for this analysis, keep only the columns we are interested in
qs_df_analysis = qs_df.loc[:, ['region', 'country', 'rank', 'faculty_students_ratio', 'international_ratio']]
qs_df_analysis.head()

- By sorting along these columns we can respectively find the best universities in terms of (a) ratio between faculty members and students and (b) ratio of international students.

In [None]:
# Rank the universities by faculty-to-students ratio, highest first
sorted_ = qs_df_analysis.sort_values(by='faculty_students_ratio', ascending=False)
sorted_ = sorted_.reset_index()
# number the rows from 1 so the index reads as a position in the ranking
sorted_.index = pd.RangeIndex(start=1, stop=len(sorted_) + 1)
sorted_.head()

In [None]:
# Rank the universities by share of international students, highest first
sorted_ = qs_df_analysis.sort_values(by='international_ratio', ascending=False)
sorted_ = sorted_.reset_index()
# number the rows from 1 so the index reads as a position in the ranking
sorted_.index = pd.RangeIndex(start=1, stop=len(sorted_) + 1)
sorted_.head()

- We answer the same question aggregating the data by (c) **country** and (d) **region**. For each group we compute the average value over its universities and show the groups ranked by that average, along with a bar chart.

In [None]:
# We define a helper function for aggregating the data and drawing the graph.
# This function will be used for the analysis of both QS and THE datasets.
# In our case, 'grouping_col' will be either 'country' or 'region',
# and 'value_col' will be either 'faculty_students_ratio' or 'international_ratio'.
def aggregate_analyze(df, grouping_col, value_col, title=""):
    '''
    Average `value_col` within each group of `grouping_col`, draw the averages
    as a bar chart and return them ranked from highest to lowest.

    Arguments:
    -- df: the DataFrame to process
    -- grouping_col: the column by which the data will be grouped
    -- value_col: the variable to average within each group
    -- title: the title of the bar chart

    Returns:
    a DataFrame with the columns [grouping_col, 'average_' + value_col],
    sorted by decreasing average and indexed from 1.
    '''

    # Select the value column *before* averaging: the frame may also hold text
    # columns, which recent pandas versions refuse to average.
    grouped = df.groupby(grouping_col)[[value_col]].mean()\
        .sort_values(value_col, ascending=False)
    grouped = grouped.rename(columns={value_col: 'average_' + value_col})

    grouped_with_rank = grouped.reset_index()
    grouped_with_rank.index = range(1, len(grouped) + 1)  # start index from 1

    # plot (a single series is drawn, so the legend carries no information)
    ax = grouped.plot.bar(title=title, legend=False)
    ax.set_ylabel(value_col)

    return grouped_with_rank

#     # We group by 'grouping_col' and compute, for each group, the index corresponding to the entry with higher 
#     # 'value_col'. These indices are stored in maxIdx.
#     maxIdx = df.groupby([grouping_col], agg=p)[value_col].idxmax()

#     # We use those indices to access the original table and retrieve the needed list of universities. 
#     # Moreover, we keep the columns we are interested in, i.e. [grouping_col, 'name', value_col].
#     res = df.loc[maxIdx][[grouping_col, 'name', value_col]].set_index(grouping_col)
#     res = res.sort_values(value_col, ascending=False)
    
#     # We finally plot the data in a bar chart
#     display(res)
    
    

In [None]:
# Top 20 countries by average faculty-to-students ratio (QS data)
aggregate_analyze(
    qs_df_analysis,
    grouping_col='country',
    value_col='faculty_students_ratio',
    title="Average of faculty/student ratio by country",
).head(20)

In [None]:
# Countries ranked by average share of international students (QS data)
aggregate_analyze(qs_df_analysis, 'country', 'international_ratio', title="Average of international ratio by country")

In [None]:
# Regions ranked by average faculty-to-students ratio (QS data)
aggregate_analyze(
    qs_df_analysis,
    grouping_col='region',
    value_col='faculty_students_ratio',
    title="Average of faculty/student ratio by region",
)

In [None]:
# Regions ranked by average share of international students (QS data).
# The title names the region, matching the grouping column.
aggregate_analyze(qs_df_analysis, 'region', 'international_ratio', title="Average of international ratio by region")

## Task 2
We now obtain the 200 top-ranking universities from www.timeshighereducation.com ([ranking 2018](http://timeshighereducation.com/world-university-rankings/2018/world-ranking)) and repeat the previous analysis.

In [None]:
# Download the JSON document behind the THE ranking page; its 'data' entry
# holds the list of universities.
r = requests.get('https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json')
r.raise_for_status()  # report an HTTP failure directly instead of a confusing JSON decoding error
the_dataset = r.json()['data']

In [None]:
# Show the first record to inspect which fields are available
the_dataset[0]

In [None]:
# Build one record per university from the THE ranking data.
results = []
for university in the_dataset[:200]:
    results.append({
        'name': university['name'],
        'rank': parseInt(university['rank']),
        'country': university['location'],
        # the source field is a student/staff ratio, so it is inverted here
        'faculty_students_ratio': 1 / parseDecimal(university['stats_student_staff_ratio']),
        # the source field is a percentage; store it as a fraction
        'international_ratio': parseDecimal(university['stats_pc_intl_students']) / 100,
        'students_total': parseInt(university['stats_number_students'])
    })

the_df = pd.DataFrame.from_dict(results).set_index('name')

# Compute derived attributes (where applicable).
# Round before converting to int: the products are floats and a plain integer
# cast truncates, so a result such as 56.99999999999999 would become 56.
the_df['students_international'] = (the_df['students_total'] * the_df['international_ratio']).round().astype('int')
the_df['faculty_total'] = (the_df['students_total'] * the_df['faculty_students_ratio']).round().astype('int')

In [None]:
# Derive a country -> region lookup from the QS dataset
mapping = dict(zip(qs_df['country'], qs_df['region']))

# Countries that do not appear in the QS dataset are added by hand
mapping.update({
    'Luxembourg': 'Europe',
    'Russian Federation': 'Europe',
})

the_df['region'] = the_df['country'].replace(mapping)

# restrict the table to the columns used by the analysis
the_df_analysis = the_df.loc[
    :,
    ['region', 'rank', 'country', 'faculty_students_ratio', 'international_ratio'],
]
the_df_analysis.head()

In [None]:
# Rank the universities by faculty-to-students ratio, highest first
sorted_ = the_df_analysis.sort_values(by='faculty_students_ratio', ascending=False)
sorted_ = sorted_.reset_index()
# number the rows from 1 so the index reads as a position in the ranking
sorted_.index = pd.RangeIndex(start=1, stop=len(sorted_) + 1)
sorted_.head()

In [None]:
# Rank the universities by share of international students, highest first
sorted_ = the_df_analysis.sort_values(by='international_ratio', ascending=False)
sorted_ = sorted_.reset_index()
# number the rows from 1 so the index reads as a position in the ranking
sorted_.index = pd.RangeIndex(start=1, stop=len(sorted_) + 1)
sorted_.head()

In [None]:
# Countries ranked by average faculty-to-students ratio (THE data); titled like the QS plots
aggregate_analyze(the_df_analysis, 'country', 'faculty_students_ratio', title="Average of faculty/student ratio by country (THE)")

In [None]:
# Countries ranked by average share of international students (THE data); titled like the QS plots
aggregate_analyze(the_df_analysis, 'country', 'international_ratio', title="Average of international ratio by country (THE)")

In [None]:
# Regions ranked by average faculty-to-students ratio (THE data); titled like the QS plots
aggregate_analyze(the_df_analysis, 'region', 'faculty_students_ratio', title="Average of faculty/student ratio by region (THE)")

In [None]:
# Regions ranked by average share of international students (THE data); titled like the QS plots
aggregate_analyze(the_df_analysis, 'region', 'international_ratio', title="Average of international ratio by region (THE)")

In [None]:
# Number of QS names for which no THE counterpart was accepted
not_matched_counter = 0

qs_df_ = qs_df.copy().reset_index()
the_df_ = the_df.copy().reset_index()


def mapName(name):
    """Return the THE spelling of the QS university `name`, or `name` itself.

    A THE name is accepted only when the match is mutual: it must be the
    closest THE name to `name`, and `name` must in turn be the closest QS name
    to it. Every rejected name increments the global `not_matched_counter`.
    """
    global not_matched_counter
    cutoff = 0.6  # minimum similarity score for difflib to report a match

    forward = difflib.get_close_matches(name, the_df_['name'], n=1, cutoff=cutoff)
    if forward:
        candidate = forward[0]
        backward = difflib.get_close_matches(candidate, qs_df_['name'], n=1, cutoff=cutoff)
        if backward and backward[0] == name:
            return candidate

    not_matched_counter += 1
    return name


qs_df_['name'] = qs_df_['name'].map(mapName)
print(not_matched_counter, 'universities were not matched.')
merged_df = pd.merge(qs_df_, the_df_, on='name', how='inner', suffixes=('_qs', '_the'))

# keep a single region/country column, taken from the QS side
merged_df = merged_df.assign(
    region=merged_df['region_qs'],
    country=merged_df['country_qs'],
).drop(columns=['region_qs', 'region_the', 'country_qs', 'country_the'])

merged_df

In [None]:
# Repeat every entry of col_level2 twice (col_level2 is defined in the following cell).
# The element expression was missing, which made the comprehension a syntax error.
[e for e in col_level2 for _ in (0, 1)]

In [None]:
# Side-by-side view of the two sources: the faculty totals and the ranks of
# each university are grouped under a common top-level column header.
cleaned = merged_df[['name', "faculty_total_qs", "faculty_total_the", 'rank_qs', 'rank_the']].copy()

col_level1 = ['', 'faculty_total', 'rank']
col_level2 = ['name', 'faculty_total_qs', 'faculty_total_the', 'rank_qs', 'rank_the']

# Top-level header of each column, in column order. Building the index from
# label arrays avoids the `labels=` keyword of the MultiIndex constructor,
# which recent pandas versions no longer accept.
top_level = [col_level1[i] for i in (0, 1, 1, 2, 2)]
cleaned.columns = pd.MultiIndex.from_arrays([top_level, col_level2])
cleaned

In [None]:
# Correlate each QS attribute with its THE counterpart.
cols_qs = merged_df[['rank_qs', 'faculty_total_qs', 'students_international_qs',
                     'students_total_qs', 'faculty_students_ratio_qs', 'international_ratio_qs']]
cols_the = merged_df[['rank_the', 'faculty_total_the', 'students_international_the',
                      'students_total_the', 'faculty_students_ratio_the', 'international_ratio_the']]
# Drop the source suffix so both frames share column names and corrwith can pair them.
# The suffix is sliced off rather than removed with rstrip, which strips a *set of
# characters* and could also eat trailing letters of the attribute name itself.
cols_qs = cols_qs.rename(columns=lambda x: x[:-len('_qs')])
cols_the = cols_the.rename(columns=lambda x: x[:-len('_the')])
cols_qs.corrwith(cols_the)

In [None]:
# Correlation matrix of the numeric attributes of the merged dataset.
# The numeric columns are selected explicitly: the frame also holds text columns
# (name, region, country), which recent pandas versions refuse to correlate.
corr = merged_df.select_dtypes(include='number').corr()
plt.figure(figsize=(10,10))
sns.heatmap(corr, annot=True,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)

In [None]:
# List the attribute pairs whose absolute correlation exceeds 0.5, strongest first
corr_pairs = corr.abs().unstack().reset_index()
# a flat list of names: a nested list would build a MultiIndex, making
# corr_pairs['var1'] a DataFrame and breaking the row filters below
corr_pairs.columns = ['var1', 'var2', 'corr']
corr_pairs = corr_pairs[corr_pairs['var1'] > corr_pairs['var2']] # Remove redundant entries
corr_pairs[corr_pairs['corr'] > 0.5].sort_values('corr', ascending=False)

In [None]:
# Combine both rankings: average the QS and THE ranks and show the ten best universities
rankings = merged_df[['name', 'rank_qs', 'rank_the']].assign(
    rank_avg=lambda frame: (frame['rank_qs'] + frame['rank_the']) / 2
)
rankings.sort_values(by='rank_avg', ascending=True).head(10)