In [None]:
%matplotlib inline
import os
import os.path
import pickle

import numpy as np
import pandas as pd
import requests
import scipy.stats as stats
from bs4 import BeautifulSoup

In [None]:
# Cache switch read by get_df_for_semester: when True, each HTTP answer is pickled
# to disk and reloaded on later runs instead of being requested again.
CACHE = True

In [None]:
# Fetch the public IS-Academia report form; its inputs give us the parameter values.
# The timeout keeps the notebook from hanging forever on an unresponsive server, and
# raise_for_status stops here on an HTTP error instead of failing later while parsing.
req = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247', timeout=30)
req.raise_for_status()

In [None]:
# Parse the form page with Python's built-in HTML parser
main = BeautifulSoup(req.text, 'html.parser')

In [None]:
# Print the form markup to look for the names of the inputs we need
print(main.body.prettify())

**We can see that the Unité académique/Academic Unit input has the name `zz_x_UNITE_ACAD` and consists of one option for each section.**

In [None]:
# Find the academic unit ("Unité académique") element of the form by its `name` attribute
unit = main.find(attrs={"name": "zz_x_UNITE_ACAD"})
print(unit.prettify())

**We can easily extract the 'value' (section id) for the Informatique section using BeautifulSoup:**

In [None]:
# We use find instead of find_all because we can see that each unit has a unique name.
# `string=` is the current spelling of the deprecated `text=` argument: it matches the
# text node "Informatique", whose parent is the <option> tag carrying the section id.
unit_IN_option = unit.find(string="Informatique")
unit_IN_id = unit_IN_option.parent['value']
unit_IN_id

In [None]:
# Find the academic period (year) element of the form by its `name` attribute
period = main.find(attrs={"name": "zz_x_PERIODE_ACAD"})
print(period.prettify())

In [None]:
# Collect every <option> tag of the period element
period_options = period.find_all("option")

In [None]:
# Print the `value` attribute (the period id) of every option
for option in period_options:
    print(option['value'])

**We can get rid of the first one which is null**

In [None]:
# Drop the first (null) option. We slice a fresh find_all result instead of
# re-slicing period_options itself, so re-running this cell does not drop
# one more option each time.
period_options = period.find_all("option")[1:]

In [None]:
# Create a dict mapping each academic year label (e.g. "2016-2017") to its period id
year_dict = {option.text : option["value"] for option in period_options}
year_dict

In [None]:
# Find the pedagogic period (semester) element of the form by its `name` attribute
semester = main.find(attrs={"name": "zz_x_PERIODE_PEDAGO"})
print(semester.prettify())

In [None]:
# Collect every <option> tag of the semester element (reused later for the Master options)
options = semester.find_all("option")
options

In [None]:
# Only keep the options whose label contains "Bachelor"
bachelor_options = [option for option in options if "Bachelor" in option.text]
bachelor_options

In [None]:
# Create a dict mapping each semester label (e.g. "Bachelor semestre 1") to its semester id
bachelor_dict = {option.text : option["value"] for option in bachelor_options}
bachelor_dict

Now we have enough parameters to try a request to the report URL. We found it using Postman: it is a simple GET request on `http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html`. We analysed the request and the field names are the same as the `input` names, but with a leading `ww` instead of `zz`; for instance, `zz_x_UNITE_ACAD` becomes `ww_x_UNITE_ACAD`.

In [None]:
# Endpoint that serves the report; unit, year and semester are passed as query parameters
BASE_URL = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html"

In [None]:
# Query-string field name of each parameter we vary
# (the `zz_` input names of the form, with a `ww_` prefix instead)
fields = {"unit": "ww_x_UNITE_ACAD", "year": "ww_x_PERIODE_ACAD", "semester": "ww_x_PERIODE_PEDAGO"}
fields

**Let's try to do a request of the Bachelor semester 1 for 2016-2017**

In [None]:
# First attempt: send only the three fields selected above
params = {
    fields["unit"]: unit_IN_id,
    fields["year"]: year_dict["2016-2017"],
    fields["semester"]: bachelor_dict["Bachelor semestre 1"],
}
params

In [None]:
# Send the query; the last expression displays the HTTP status code
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
# Show the raw body of the answer
answer.text

**We get a 404: we need more fields to have a correct query. We noticed in Postman that we need three other fields; two of them can be read from the form.**

In [None]:
# Map the name of every <input> of the form to its value.
# Inputs without a `name` cannot be sent as a field and are skipped; inputs
# without a `value` map to "" instead of raising a KeyError.
all_inputs = main.body.find_all("input")
all_inputs_values = {
    inp["name"]: inp.get("value", "")
    for inp in all_inputs
    if inp.has_attr("name")
}
print(all_inputs_values)

**We see in Postman that two of the additional fields we need are `ww_i_reportmodel` and `ww_i_reportModelXsl`.** Let's try again.

In [None]:
# Copy the two report-model fields from the form inputs into our query parameters
params["ww_i_reportmodel"] = all_inputs_values["ww_i_reportmodel"]
params["ww_i_reportModelXsl"] = all_inputs_values["ww_i_reportModelXsl"]

In [None]:
# Show the updated query parameters
params

In [None]:
# Retry the query with the extended parameters
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
# Parse and print the answer to check what the server returned
answersoup = BeautifulSoup(answer.text, "html.parser")
print(answersoup.prettify())

**Even though we get a 200, this is still not the output we need. One last parameter is missing, but it is not in the form as an input. By inspecting our postman requests, we noticed that `ww_x_GPS` is included.**

In [None]:
# ww_x_GPS is not an input of the form, so its value cannot be read from there.
# For now we just put it to -1
params["ww_x_GPS"] = -1

In [None]:
# Retry the query, now including ww_x_GPS
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
# Parse and print the answer again to check what the server returned this time
answersoup = BeautifulSoup(answer.text, "html.parser")
print(answersoup.prettify())

**It works!**

In [None]:
import pandas as pd

In [None]:
# Every <tr> of the first table in the answer; print the first five to see the layout
lines = answersoup.body.table.find_all('tr')
separator = "\n=============\n"
print(separator.join(map(str, lines[:5])))

In [None]:
# Show the last table row
lines[-1]

**We can see that all lines except the first three correspond to students. The third line is the header row, so we keep it separately and discard the first three lines from the list of student rows.**

In [None]:
# Split the table rows into the header (third row) and the student rows (the rest).
# The rows are read again from the parsed answer so that re-running this cell
# does not chop three more rows off `lines` each time.
all_rows = answersoup.body.table.find_all('tr')
header = all_rows[2]
lines = all_rows[3:]

In [None]:
# Show the header row to identify the columns
header

**We are interested in "Civilité" (gender), the name, the specialization, the minor and "No Sciper". They are at column indices `0`, `1`, `4`, `6` and `10` respectively.**

In [None]:
# Text of every <td> cell, one list per student row
items = [[td.text for td in line.find_all("td")] for line in lines]

# Build the frame in one go: DataFrame.append was removed in pandas 2.0.
# Keep: gender 0, name 1, specialization 4, minor 6, sciper 10
df = pd.DataFrame(items)[[0, 1, 4, 6, 10]]
df.columns = ['Gender', 'Name', 'Specialization', 'Minor', 'SCIPER']
# Index by SCIPER; a plain list keeps the index unnamed, so "SCIPER" stays
# unambiguous as a column name in later groupby calls.
df.index = [item[10] for item in items]

In [None]:
# Show ten random rows as a sanity check
df.sample(10)

In [None]:
# The index holds the SCIPER numbers: check that no student appears twice
df.index.is_unique

**Sciper is unique, as it should be.**

In [None]:
# Number of students per gender
df.groupby("Gender")["Gender"].count()

**The functions below build the request parameters for a given year and semester, and return a dataframe of the students registered in that semester (optionally caching the HTTP answer on disk).**

In [None]:
def make_params(year_str, semester_str, semester_dict):
    """Build the query parameters for one academic year and one semester.

    year_str is looked up in the notebook-level year_dict and semester_str in
    the given semester_dict; the unit, the two report-model fields and
    ww_x_GPS are the same for every request.
    """
    return {
        # Fields that change from one request to the next
        fields["year"]: year_dict[year_str],
        fields["semester"]: semester_dict[semester_str],
        # Fields that are constant across requests
        fields["unit"]: unit_IN_id,
        "ww_i_reportmodel": all_inputs_values["ww_i_reportmodel"],
        "ww_i_reportModelXsl": all_inputs_values["ww_i_reportModelXsl"],
        "ww_x_GPS": -1,
    }

def get_df_for_semester(year_str, semester_str, semester_dict):
    """Return the students registered in one semester of one academic year.

    The report is requested from BASE_URL with the parameters built by
    make_params. When CACHE is true, the HTTP answer is pickled to
    cache/<year_str><semester_str>.bak and reloaded from there on later calls.

    Parameters
    ----------
    year_str : str
        Key of year_dict, e.g. "2013-2014".
    semester_str : str
        Key of semester_dict, e.g. "Bachelor semestre 1".
    semester_dict : dict
        Maps semester labels to semester ids.

    Returns
    -------
    pandas.DataFrame
        One row per student with the columns Gender, Name, Specialization,
        Minor, SCIPER, Year and Semester, indexed by (SCIPER, year, semester).
        An empty DataFrame when the answer contains no student rows.
    """
    params = make_params(year_str, semester_str, semester_dict)

    cache_dir = "cache"
    backup_path = os.path.join(cache_dir, year_str + semester_str + ".bak")

    answer = None
    if CACHE and os.path.isfile(backup_path):
        # NOTE: pickle.load can execute arbitrary code; only load cache files
        # that were written by this notebook.
        with open(backup_path, 'rb') as handle:
            answer = pickle.load(handle)

    if answer is None:
        answer = requests.get(BASE_URL, params=params)

        # Only successful answers are cached, so that a transient server error
        # is not replayed on every later run. The cache directory is created on
        # first use instead of failing with FileNotFoundError.
        if CACHE and answer.ok:
            os.makedirs(cache_dir, exist_ok=True)
            with open(backup_path, 'wb') as handle:
                pickle.dump(answer, handle)

    answersoup = BeautifulSoup(answer.text, "html.parser")
    table = answersoup.body.table if answersoup.body is not None else None
    if table is None:
        # No table at all in the answer: nothing to extract
        return pd.DataFrame()

    # The student data starts on the fourth line, so skip the first three.
    items = []
    for line in table.find_all('tr')[3:]:
        td_list = line.find_all("td")

        # we break if we encounter an error, for example HES passerelle
        if len(td_list) == 0:
            break
        items.append([td.text for td in td_list])

    if not items:
        # Selecting columns from an empty frame would raise a KeyError
        return pd.DataFrame()

    # Build the frame in one go (DataFrame.append was removed in pandas 2.0).
    # Keep: gender 0, name 1, specialization 4, minor 6, sciper 10
    kept_columns = [0, 1, 4, 6, 10]
    df = pd.DataFrame(
        [[item[i] for i in kept_columns] for item in items],
        columns=['Gender', 'Name', 'Specialization', 'Minor', 'SCIPER'])

    # Three parallel lists give an unnamed three-level index, which keeps
    # "SCIPER" unambiguous as a column name in later groupby calls.
    sciper_index = [item[10] for item in items]
    year_index = [year_str] * len(items)
    semester_index = [semester_str] * len(items)
    df.index = [sciper_index, year_index, semester_index]

    df["Year"] = year_str
    df["Semester"] = semester_str

    return df

# Check that Tim appears in the list!
# (.loc with a single label selects on the first index level, the SCIPER number)
df_ex = get_df_for_semester("2013-2014", "Bachelor semestre 3", bachelor_dict)
df_ex.loc['223744']

In [None]:
# Sanity check: the frames of two different semesters can be concatenated
df1 = get_df_for_semester("2013-2014", "Bachelor semestre 1", bachelor_dict)
df2 = get_df_for_semester("2014-2015", "Bachelor semestre 2", bachelor_dict)
df3 = pd.concat([df1, df2])
df3.sample(10)

In [None]:
# Fetch every (year, bachelor semester) combination and stack the results
dfs = [
    get_df_for_semester(year, bachelor, bachelor_dict)
    for year in year_dict
    for bachelor in bachelor_dict
]

df_bachelors = pd.concat(dfs)
df_bachelors.sample(5)

In [None]:
# Example, let's see Tim: all his rows, sorted by year and semester
df_bachelors[df_bachelors['SCIPER']=="223744"].sort_values(["Year", "Semester"])

In [None]:
# With the (SCIPER, year, semester) index, uniqueness means that no student is
# listed twice for the same semester of the same year
df_bachelors.index.is_unique
## TODO check null values

In [None]:
# TODO COMMENT

def did_graduate(group):
    """Tell whether a student's rows contain both the first and the sixth bachelor semester.

    group holds all the rows of one student; only its 'Semester' column is read.
    """
    semesters = group['Semester']
    started = (semesters == "Bachelor semestre 1").any()
    reached_last = (semesters == 'Bachelor semestre 6').any()
    return started and reached_last
 
def grad_time_gender(group):
    """Compute the time to graduation (in years) and the gender of one student.

    group holds all the rows of one student. The duration is the end year of
    the latest academic year minus the start year of the earliest one, e.g.
    "2013-2014" .. "2015-2016" gives 2016 - 2013 = 3.

    Returns a Series with the entries "Grad time" and "Gender".
    """
    last_year = group['Year'].max()
    first_year = group['Year'].min()
    delta = int(last_year.split('-')[-1]) - int(first_year.split('-')[0])

    # .iloc[0] takes the first row by position. Plain [0] is a label lookup
    # that only falls back to positions on non-integer indexes (deprecated)
    # and raises a KeyError on an integer index without the label 0.
    gender = group['Gender'].iloc[0]
    return pd.Series([delta, gender], index=["Grad time", "Gender"])

# Keep the students who did both semester 1 and semester 6, then compute one
# (graduation time, gender) row per student. Filtering first avoids returning
# None from apply and relying on dropna to remove the resulting empty rows.
graduation_time = (
    df_bachelors
    .groupby(by="SCIPER").filter(did_graduate)
    .groupby(by="SCIPER").apply(grad_time_gender)
)
# Each row comes from a Series that mixes an int and a string, so the column
# is of object dtype; make it numeric for the histogram, mean and t-test below.
graduation_time["Grad time"] = graduation_time["Grad time"].astype(int)
graduation_time.sample(10)

In [None]:
# Distribution of the time to graduation (in years)
graduation_time["Grad time"].hist()

**This seems right, knowing that EPFL is hard, and also that people starting after 2013 didn't even have a chance of graduating yet.**

**Now let's look at the gender bias**

In [None]:
# Mean time to graduation for each gender
gender_mean_grad = graduation_time.groupby("Gender").mean()
gender_mean_grad

**It appears that the female students graduate in less time. But is the difference significant? We use the "two sample T-Test" to see if the difference in means is statistically significant, with the null hypothesis that the two means are identical. Because we know that the sample size is different for the two genders, and we don't know if the variance is the same, we use `equal_var=False` to apply "Welch's t-test"**

In [None]:
# Welch's t-test (equal_var=False) between the graduation times of the two genders
is_female = graduation_time["Gender"] == "Madame"
is_male = graduation_time["Gender"] == "Monsieur"
stats.ttest_ind(
    a=graduation_time.loc[is_female, "Grad time"],
    b=graduation_time.loc[is_male, "Grad time"],
    equal_var=False,
)

**The statistical test gives a p-value of 0.26, which is too large to reject the null hypothesis: the difference between the two means is not statistically significant.**

In [None]:
# Only keep the semester options whose label contains "Master"
master_options = [option for option in options if "Master" in option.text]
master_options

**We can ignore the other options: we do not take into account internships because they are usually away from EPFL**

In [None]:
# Map each master semester label to its semester id
master_dict = {option.text : option["value"] for option in master_options}
master_dict

In [None]:
# Try the same function on one master semester
get_df_for_semester("2015-2016", "Master semestre 1", master_dict)

In [None]:
# Fetch every (year, master semester) combination and stack the results
dfs = [
    get_df_for_semester(year, master, master_dict)
    for year in year_dict
    for master in master_dict
]

df_masters = pd.concat(dfs)
df_masters.sample(5)

**To calculate the average stay, we need to keep the records that have a Master Project, meaning that the student finished their studies.**

In [None]:
# Master projects: the rows whose semester label contains "Projet"
df_masters[df_masters["Semester"].str.contains("Projet")]

**There is definitely missing data, as only 128 Master Projects are registered, and most of them appear in 2016-2017. Indeed, looking at IS-Academia, some years contain no data for Master Projects. We conclude that we cannot simply use the Master Project to determine the end of the studies.**

**Thus, to calculate the average stay, we do:**

* discard record if no "Master semestre 2"
* discard record if a "Spécialisation" or a Minor exists but no "Master semestre 3"
* count the number of registered semesters
* add one semester (6 months) if no record for "Master project"

In [None]:
# Number of master rows (one per student and semester) before any filtering
len(df_masters)

In [None]:
def finished_master(group):
    """Tell whether the rows of one student look like a completed master.

    Returns False when "Master semestre 2" is missing, or when the student has
    a specialization or a minor but no "Master semestre 3"; True otherwise.
    """
    semesters = set(group['Semester'])

    # Without a second semester the student cannot have finished
    if "Master semestre 2" not in semesters:
        return False

    # A specialization or a minor requires a third semester
    needs_third = group['Specialization'].any() or group['Minor'].any()
    if needs_third and "Master semestre 3" not in semesters:
        return False

    return True
        
    
# Keep only the rows of the students whose group passes finished_master
df_masters_finished = df_masters.groupby(by="SCIPER").filter(finished_master)

In [None]:
# Number of rows before and after the filter
print(len(df_masters), len(df_masters_finished))

In [None]:
def master_time(group):
    """Return the number of master semesters of one student.

    Counts the rows of the group and adds one semester when none of them is a
    "Projet Master" entry.
    """
    project_labels = ["Projet Master automne", "Projet Master printemps"]
    has_project = group['Semester'].isin(project_labels).any()
    return len(group['Semester']) + (0 if has_project else 1)

# One value per student: the number of semesters computed by master_time
master_stay_semesters = df_masters_finished.groupby(by="SCIPER").apply(master_time)
master_stay_semesters.head(10)

In [None]:
# We check the person who did 7 semesters (all the rows of that SCIPER)
df_masters.loc["146742"]

In [None]:
# Distribution of the master stay (in semesters)
master_stay_semesters.hist()

**So the average stay at EPFL for master students is:**

In [None]:
# Mean number of semesters, printed with two decimals
print("{0:.2f} semesters".format(master_stay_semesters.mean()))

In [None]:
def has_spec(group):
    """Return True when at least one row of the group has a non-empty specialization."""
    specializations = group['Specialization']
    return True if specializations.any() else False

def master_time_spec(group):
    """Compute the master stay (in semesters) and the specialization of one student.

    Returns a Series with the entries "Master stay" (the result of master_time)
    and "Specialization" (the first non-empty specialization of the group, or
    NaN when the group has none).
    """
    delta = master_time(group)

    # Empty cells are "" ; replace them with NaN so that dropna removes them
    specs = group["Specialization"].replace("", np.nan).dropna()

    # .iloc[0] takes the first remaining value by position; plain [0] is a label
    # lookup that only falls back to positions on non-integer indexes (deprecated).
    spec = specs.iloc[0] if not specs.empty else np.nan
    return pd.Series([delta, spec], index=["Master stay", "Specialization"])

# Keep the students with a specialization, then compute one
# (master stay, specialization) row per student
df_masters_spec = (
    df_masters_finished
    .groupby(by="SCIPER").filter(has_spec)
    .groupby(by="SCIPER").apply(master_time_spec)
)

In [None]:
# df_masters_spec has one row per student, so its length is the number of students
print("Number of students with specializations: %d" % len(df_masters_spec))

In [None]:
# Mean master stay for each specialization
spec_avg = df_masters_spec.groupby(by="Specialization").mean()

In [None]:
# Bar chart of the mean stay per specialization, from shortest to longest
spec_avg.sort_values(by="Master stay").plot(kind="bar")

**Now let's compare each specialization with the general dataset to see if there is a statistical difference between the means**

In [None]:
def compare_spe_general(spe):
    """Welch's t-test between the stays of one specialization and those of all master students.

    spe holds the rows of one specialization; its "Master stay" column is
    compared with the notebook-level master_stay_semesters.
    """
    spec_stays = spe["Master stay"]
    return stats.ttest_ind(a=spec_stays, b=master_stay_semesters, equal_var=False)

# Welch's t-test is undefined for a single observation (its variance cannot be
# estimated), so only compare the specializations with at least two students.
MIN_STUDENTS_PER_SPEC = 2
(
    df_masters_spec
    .groupby(by="Specialization")
    .filter(lambda spec_group: len(spec_group) >= MIN_STUDENTS_PER_SPEC)
    .groupby(by="Specialization")
    .apply(compare_spe_general)
)

In [None]:
# Some specializations contain only one person, which is a problem for the t-test:
df_masters_spec.groupby(by="Specialization").count()
# Maybe we should restrict the comparison to specializations with at least N students