In [None]:
%matplotlib inline
from bs4 import BeautifulSoup
import requests
import pickle
import os.path
import scipy.stats as stats
import numpy as np
import pandas as pd

In [None]:
# When True, HTTP responses are pickled into a local cache directory and
# reused on later runs instead of querying the server again.
CACHE = True

# 1st Part : Bachelor students  

**First, we examine the details of IS-Academia web form to find out the various form parameters
that we need for a successful request!**

In [None]:
# We use the form URL of public is-academia.
# The HTML of this form exposes the parameter names and option ids that the
# requests further down need.
req = requests.get('http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247')

In [None]:
# Parse the form page with Python's built-in HTML parser
main = BeautifulSoup(req.text, 'html.parser')

In [None]:
# Dump the indented <body> so the form fields can be inspected by eye
print(main.body.prettify())

**We can see that the Unité académique/Academic Unit input has the name `zz_x_UNITE_ACAD` and consists of one option for each section.**

In [None]:
# Find the "Unité académique" (academic unit) element of the form by its name attribute
unit = main.find(attrs={"name": "zz_x_UNITE_ACAD"})
print(unit.prettify())

**We can easily extract the 'value' (section id) for the Informatique section using BeautifulSoup:**

In [None]:
# A single find() is enough (no find_all needed): as seen in the output above,
# each section label appears only once among the options.
# `string=` is the current name of the keyword formerly called `text=`
# (Beautiful Soup >= 4.4); the old spelling is deprecated.
unit_IN_option = unit.find(string="Informatique")
# find(string=...) returns the text node; its parent is the tag carrying the id
unit_IN_id = unit_IN_option.parent['value']
unit_IN_id

**Now we look at the options for the Year field**

In [None]:
# The academic-year field of the form
period = main.find(attrs={"name": "zz_x_PERIODE_ACAD"})
print(period.prettify())

In [None]:
# Every <option> of the academic-year field
period_options = period.find_all("option")

In [None]:
# Show the id ("value" attribute) carried by each academic-year option
for option in period_options:
    print(option['value'])

**Great! We have an ID for each academic year. We can get rid of the first option, which is null**

In [None]:
# Drop the first (null) option. The list is re-derived from `period` rather
# than sliced in place, so re-running this cell does not remove one more
# option each time.
period_options = period.find_all("option")[1:]

In [None]:
# Create a dict mapping the academic-year label (option text) to its period id
year_dict = {option.text : option["value"] for option in period_options}
year_dict

**Now let's find out how to choose the semester type in the form**

In [None]:
# The semester-type field of the form
semester = main.find(attrs={"name": "zz_x_PERIODE_PEDAGO"})
print(semester.prettify())

In [None]:
# Every <option> of the semester-type field (reused later for the Master options)
options = semester.find_all("option")
options

In [None]:
# Only keep options whose label contains "Bachelor"
bachelor_options = [option for option in options if "Bachelor" in option.text]
bachelor_options

In [None]:
# Create a dict mapping the semester label (e.g. "Bachelor semestre 1") to its semester id
bachelor_dict = {option.text : option["value"] for option in bachelor_options}
bachelor_dict

**Now we have enough parameters to try a request to the report URL. We found it using Postman: it's a simple GET request on `http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html`. We analysed the request and the field names are the same as the `input` names, but with a leading `ww` instead of `zz`; for instance, `zz_x_UNITE_ACAD` becomes `ww_x_UNITE_ACAD`.**

In [None]:
# Endpoint that serves the report; every data request below is a GET on this URL
BASE_URL = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html"

In [None]:
# Request parameter names: the form input names with the "zz" prefix replaced by "ww"
fields = {"unit": "ww_x_UNITE_ACAD", "year": "ww_x_PERIODE_ACAD", "semester": "ww_x_PERIODE_PEDAGO"}
fields

**Let's try to do a request of the Bachelor semester 1 for 2016-2017**

In [None]:
# Query for: Informatique section, academic year 2016-2017, Bachelor semester 1
params = {
    fields["unit"]: unit_IN_id,
    fields["year"]: year_dict["2016-2017"],
    fields["semester"]: bachelor_dict["Bachelor semestre 1"],
}
params

In [None]:
# First attempt, with only the three form fields; display the HTTP status
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
# Raw body of the response
answer.text

**We get a 404: we need more fields to have a correct query. We noticed on postman that we needed to fill two or three other fields, which we're going to get from the form**

In [None]:
all_inputs = main.body.find_all("input")
# Map input name -> value. Inputs without a `name` attribute are skipped and a
# missing `value` attribute is read as an empty string, so an unexpected
# <input> in the form cannot abort the cell with a KeyError.
all_inputs_values = {
    inp["name"]: inp.get("value", "")
    for inp in all_inputs
    if inp.has_attr("name")
}
print(all_inputs_values)

**We see on Postman that the additional fields we need are `ww_i_reportmodel` and `ww_i_reportModelXsl`. Let's try again!**

In [None]:
# Copy the two report-model values from the form inputs into the query
params["ww_i_reportmodel"] = all_inputs_values["ww_i_reportmodel"]
params["ww_i_reportModelXsl"] = all_inputs_values["ww_i_reportModelXsl"]

In [None]:
# Show the query with the two added fields
params

In [None]:
# Second attempt, now including the report-model fields
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
# Parse and pretty-print the response to inspect what the server returned
answersoup = BeautifulSoup(answer.text, "html.parser")
print(answersoup.prettify())

**Even though we get a 200, this is still not the output we need. One last parameter is missing, but it is not in the form as an input. By inspecting our postman requests, we noticed that `ww_x_GPS` is included.**

In [None]:
# We found out using Postman Interceptor that setting ww_x_GPS to -1 requests
# all of the tables for a query (the "Tous" choice)
params["ww_x_GPS"] = -1

In [None]:
# Third attempt, with ww_x_GPS included
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
# Parse and pretty-print the response of the complete query
answersoup = BeautifulSoup(answer.text, "html.parser")
print(answersoup.prettify())

**It works!**

In [None]:
# All rows of the first table in the response; preview the first five
lines = answersoup.body.table.find_all('tr')
row_separator = "\n=============\n"
print(row_separator.join(map(str, lines[:5])))

In [None]:
# Look at the last row as well
lines[-1]

**We can see that all lines except the first three correspond to students. We discard the first two, keep the third as the header, and treat the remaining lines as student records.**

In [None]:
# The third row holds the column titles; student rows start at the fourth
header = lines[2]
lines = lines[3:]

In [None]:
# Show the column titles
header

**We are interested in "Civilité" (gender), "No Sciper", "Spécialisation", and "Mineur".**

In [None]:
# One list of cell texts per table row (one row = one student)
items = [[td.text for td in line.find_all("td")] for line in lines]

# Build the frame in one call: DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
df = pd.DataFrame(items)

# Keep: gender 0, name 1, specialization 4, minor 6, sciper 10
df = df[[0, 1, 4, 6, 10]]
df.columns = ['Gender', 'Name', 'Specialization', 'Minor', 'SCIPER']
# We set the index to be the SCIPER number
df.index = [item[10] for item in items]

In [None]:
# Ten random rows as a sanity check
df.sample(10)

In [None]:
# The index is the SCIPER number: True means no student appears twice in this table
df.index.is_unique

**Sciper is unique, as it should be.**

In [None]:
# Number of rows per value of the Gender column
df.groupby("Gender")["Gender"].count()

**Here we create two functions to create a dataframe for each semester**
* `make_params` is a utility function that creates the parameters for the HTTP request
* `get_df_for_semester` makes a request (or takes from the cache) to return a nice dataframe containing one semester of course records.


In [None]:
def make_params(year_str, semester_str, semester_dict):
    """Build the query parameters for one (academic year, semester) report.

    year_str is a key of the module-level `year_dict`; semester_str is a key
    of `semester_dict`. The unit, the two report-model values and
    ww_x_GPS = -1 are the same for every request.
    """
    query = {
        fields["year"]: year_dict[year_str],
        fields["semester"]: semester_dict[semester_str],
    }
    # Constant part of the query
    query[fields["unit"]] = unit_IN_id
    query["ww_i_reportmodel"] = all_inputs_values["ww_i_reportmodel"]
    query["ww_i_reportModelXsl"] = all_inputs_values["ww_i_reportModelXsl"]
    query["ww_x_GPS"] = -1
    return query

def get_df_for_semester(year_str, semester_str, semester_dict):
    """Return the student records of one (academic year, semester) report.

    The report is requested from BASE_URL with the parameters built by
    make_params. When the module-level CACHE flag is true, the HTTP response
    is pickled under the "cache" directory and reused on later calls.

    Parameters
    ----------
    year_str : str
        Key of `year_dict`, e.g. "2016-2017".
    semester_str : str
        Key of `semester_dict`, e.g. "Bachelor semestre 1".
    semester_dict : dict
        Maps semester labels to the ids expected by the server.

    Returns
    -------
    pandas.DataFrame
        Columns Gender, Name, Specialization, Minor, SCIPER, Year, Semester,
        indexed by (SCIPER, year, semester). An empty frame without columns
        is returned when the response holds no student rows.
    """
    params = make_params(year_str, semester_str, semester_dict)

    # If we want to cache our server requests (used for testing
    # to avoid repeat requests to the server)
    if CACHE:
        CACHE_DIR = "cache"
        os.makedirs(CACHE_DIR, exist_ok=True)
        backup_path = os.path.join(CACHE_DIR, year_str + semester_str + ".bak")

        if os.path.isfile(backup_path):
            # NOTE(security): pickle.load can execute arbitrary code. This is
            # only acceptable because the file was written by this notebook;
            # never point the cache at files from an untrusted source.
            with open(backup_path, 'rb') as handle:
                answer = pickle.load(handle)
        else:
            answer = requests.get(BASE_URL, params=params)

            # Only cache successful responses, otherwise a transient server
            # error would be replayed from the cache on every later run.
            if answer.ok:
                with open(backup_path, 'wb') as handle:
                    pickle.dump(answer, handle)
    # If we do not want to use cache
    else:
        answer = requests.get(BASE_URL, params=params)

    answersoup = BeautifulSoup(answer.text, "html.parser")
    table = answersoup.body.table if answersoup.body is not None else None
    if table is None:
        # No table in the page: treat it as "no data for this semester"
        return pd.DataFrame()
    lines = table.find_all('tr')

    # The first three rows are titles and the column header; student data
    # starts on the fourth row.
    items = []
    for line in lines[3:]:
        td_list = line.find_all("td")

        # We stop at the first row without cells, for example the HES
        # passerelle section. We do not consider students coming from the HES
        # passerelle as they arrive at EPFL only at the third semester.
        if len(td_list) == 0:
            break
        items.append([td.text for td in td_list])

    if not items:
        # Nothing to select columns from; avoids a KeyError below
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0: build the frame in one call
    df = pd.DataFrame(items)

    # Keep: gender 0, name 1, specialization 4, minor 6, sciper 10
    df = df[[0, 1, 4, 6, 10]]
    df.columns = ['Gender', 'Name', 'Specialization', 'Minor', 'SCIPER']

    # Three-level index (SCIPER, year, semester) so that frames of different
    # semesters can be concatenated without index collisions
    sciper_index = [item[10] for item in items]
    year_index = [year_str] * len(items)
    semester_index = [semester_str] * len(items)
    df.index = [sciper_index, year_index, semester_index]

    df["Year"] = year_str
    df["Semester"] = semester_str

    return df

# Check that Tim appears in the list!
# The first index level is the SCIPER number, so .loc with a SCIPER selects that student's rows.
df_ex = get_df_for_semester("2013-2014", "Bachelor semestre 3", bachelor_dict)
df_ex.loc['223744']

**Tim indeed shows up in our dataframe! Now let's request all of the Informatique bachelor data and concatenate it all into a single dataframe**

In [None]:
# Fetch every (year, Bachelor semester) combination and stack the results
dfs = [
    get_df_for_semester(year, bachelor, bachelor_dict)
    for year in year_dict
    for bachelor in bachelor_dict
]

df_bachelors = pd.concat(dfs)
df_bachelors.sample(5)

In [None]:
# Example: all Bachelor records of Tim, ordered by year then semester label
df_bachelors[df_bachelors['SCIPER']=="223744"].sort_values(["Year", "Semester"])

In [None]:
# We check that the (SCIPER, year, semester) index is unique, i.e. no record is duplicated
df_bachelors.index.is_unique

In [None]:
# We check that we don't have any null values (count of nulls per column)
df_bachelors.isnull().sum()

**Here we create two functions that allow us to select only the students who have obtained their bachelor, and to compute how long (in semesters) it took them.**

In [None]:
def did_graduate(group):
    """Tell whether a student's records contain both Bachelor semester 1 and 6.

    `group` holds all rows of one student; having both the first and the last
    Bachelor semester on record is what we take as "graduated".
    """
    semesters = group['Semester']
    started = semesters.eq("Bachelor semestre 1").any()
    return started and semesters.eq('Bachelor semestre 6').any()


def grad_time_gender(group):
    """Summarise one student's Bachelor records.

    Parameters
    ----------
    group : pandas.DataFrame
        All rows of one student; needs the "Semester" and "Gender" columns.

    Returns
    -------
    pandas.Series
        "Grad time": number of semester records of the student;
        "Gender": gender read from the student's first record.
    """
    delta = group["Semester"].count()
    # Use .iloc for positional access: a plain [0] is a label lookup that only
    # falls back to position on non-integer indexes, and that fallback is
    # deprecated in recent pandas.
    gender = group['Gender'].iloc[0]
    return pd.Series([delta, gender], index=["Grad time", "Gender"])

# Get graduation time for all bachelor students who graduated.
# Filter first, then summarise (the same pattern as the Master part below):
# this avoids returning None from apply() and cleaning up with dropna().
graduation_time = (
    df_bachelors.groupby(by="SCIPER")
    .filter(did_graduate)
    .groupby(by="SCIPER")
    .apply(grad_time_gender)
)
graduation_time.sample(10)

In [None]:
# Histogram of the graduation times
graduation_time.hist()

**In this histogram we see that someone did their bachelor in 4 semesters! That should not be possible, so we check. We also note that the x axis of the histogram looks a bit odd: there seems to be a gap between semesters 7 and 8, although there is none in the data.**

In [None]:
# Students whose computed graduation time is 4 semesters.
# NOTE(review): only the last expression of a cell is rendered, so this
# result is not displayed as written.
graduation_time[graduation_time["Grad time"] == 4]
# All Bachelor records of the student discussed below
df_bachelors.loc["204222"]

**This means that there is missing data, or maybe Monsieur Séguy changed sections and came back. He is the only outlier.**  

**Now let's look at the gender bias**

In [None]:
# Mean graduation time for each gender
gender_mean_grad = graduation_time.groupby("Gender").mean()
gender_mean_grad

**It appears that the female students graduate in slightly less time. But is the difference significant? We use the "two sample T-Test" to see if the difference in means is statistically significant, the null hypothesis being that the two means are identical. Because we know that the sample size is different for the two genders, and we don't know if the variance is the same, we use `equal_var=False` to apply "Welch's t-test," which does not assume equal sample size or variance.**

In [None]:
# Welch's t-test (equal_var=False) on the graduation times of the two genders
grad_time_women = graduation_time.loc[graduation_time["Gender"] == "Madame", "Grad time"]
grad_time_men = graduation_time.loc[graduation_time["Gender"] == "Monsieur", "Grad time"]
stats.ttest_ind(a=grad_time_women, b=grad_time_men, equal_var=False)

**The statistical test reveals a pvalue of 0.24, which is too large to reject the null hypothesis. We have not found a statistically significant difference between genders.**

# 2nd Part : Master students  

**We now have to fetch the registration data for Master's students and put it in a new dataframe.**

In [None]:
# Only keep semester options whose label contains "Master"
master_options = [option for option in options if "Master" in option.text]
master_options

**We can ignore the other semester types: we do not take into account internships because they are usually away from EPFL**

In [None]:
# Map each Master semester label to its semester id
master_dict = {option.text : option["value"] for option in master_options}
master_dict

In [None]:
# Try the helper on one Master semester
get_df_for_semester("2015-2016", "Master semestre 1", master_dict)

**As before, we fetch the data for each semester and create a single dataframe**

In [None]:
# Fetch every (year, Master semester) combination and stack the results
dfs = [
    get_df_for_semester(year, master, master_dict)
    for year in year_dict
    for master in master_dict
]

df_masters = pd.concat(dfs)
df_masters.sample(5)

**To calculate the average stay, we need to keep the records who have a Master Project, meaning that they finished their studies.**

In [None]:
# Master projects: records whose semester label contains "Projet"
df_masters[df_masters["Semester"].str.contains("Projet")]

**There is definitely missing data, as only 128 Master Projects are registered, and most of them appear in 2016-2017. Indeed, looking at is-academia, some years contain no data for Master Projects. We conclude that we cannot simply use the Master Project to determine the end of the studies.**

**Thus, to calculate the average stay, we use the following algorithm:**

* discard record if no "Master semestre 1" or no "Master semestre 2"
* discard record if exists "Spécialisation" or "Mineur" but no "Master semestre 3"
* calculate the delta between last and first semester
* add 6 months if student has no record of a "Master project"

In [None]:
# Total number of Master records (one row per student and semester)
len(df_masters)

In [None]:
def finished_master(group):
    """Tell whether we consider that a student has finished their Master.

    `group` holds all rows of one student. The student must have both
    "Master semestre 1" and "Master semestre 2" on record and, if any record
    carries a Specialization or a Minor, "Master semestre 3" as well.
    """
    semesters = set(group['Semester'])

    if "Master semestre 1" not in semesters or "Master semestre 2" not in semesters:
        return False

    # A specialization or a minor implies a third semester
    takes_extra = group['Specialization'].any() or group['Minor'].any()
    if takes_extra and "Master semestre 3" not in semesters:
        return False

    return True
         
# Keep every row of the students for whom finished_master returns True
df_masters_finished = df_masters.groupby(by="SCIPER").filter(finished_master)

In [None]:
# Number of records before and after keeping only the finished students
print(len(df_masters), len(df_masters_finished))

In [None]:
def master_time(group):
    """Return the length of a student's Master in semesters.

    The length is the number of records of the student, plus one when none of
    them is a "Projet Master" semester (the project is then assumed missing
    from the data).
    """
    semesters = group['Semester']
    project_labels = ["Projet Master automne", "Projet Master printemps"]
    has_project = semesters.isin(project_labels).any()
    return len(semesters) if has_project else len(semesters) + 1

# One value per student (indexed by SCIPER): length of the Master in semesters
master_stay_semesters = df_masters_finished.groupby(by="SCIPER").apply(master_time)
master_stay_semesters.head(10)

**Let's see if our algorithm gives reasonable results! The describe() function is always useful to see if anything crazy is happening**

In [None]:
# Summary statistics of the stay lengths
master_stay_semesters.describe()

In [None]:
# We check for students whose computed stay is 8 semesters
master_stay_semesters[master_stay_semesters == 8]

**We check one of the 8-semester students to see what they have been up to.**

In [None]:
# All Master records of that student (first index level is the SCIPER number)
df_masters.loc["181244"]

**It's time to create a histogram of the average stay**

In [None]:
# Histogram of the stay lengths, in semesters
master_stay_semesters.hist()

**So the average stay at EPFL for master students is:**

In [None]:
# Mean stay over all students considered finished, with two decimals
print("{0:.2f} semesters".format(master_stay_semesters.mean()))

**Now we calculate the average stay for each specialization offered by the department.**

In [None]:
def has_spec(group):
    """Return True when at least one record of the student names a specialization."""
    any_spec = group['Specialization'].any()
    return bool(any_spec)

def master_time_spec(group):
    """Return a student's Master length together with their specialization.

    `group` holds all rows of one student and must contain at least one
    non-empty "Specialization" value (callers filter with has_spec first).

    Returns a Series with "Master stay" (see master_time) and
    "Specialization" (the first non-empty specialization on record).
    """
    delta = master_time(group)

    # Replace empty cells with NaN so we can drop them
    specs = group["Specialization"].replace("", np.nan).dropna()
    # Use .iloc for positional access: a plain [0] is a label lookup that only
    # falls back to position on non-integer indexes, and that fallback is
    # deprecated in recent pandas.
    return pd.Series([delta, specs.iloc[0]], index=["Master stay", "Specialization"])

# Keep the students with a specialization, then summarise each of them as
# (Master stay, Specialization)
students_with_spec = df_masters_finished.groupby(by="SCIPER").filter(has_spec)
df_masters_spec = students_with_spec.groupby(by="SCIPER").apply(master_time_spec)

In [None]:
# df_masters_spec has one row per student
print("Number of students with specializations: %d" % len(df_masters_spec))

In [None]:
# Mean stay per specialization as a bar chart, sorted by stay;
# the red line marks the mean stay over all finished Master students
spec_avg = df_masters_spec.groupby(by="Specialization").mean()
spec_avg.sort_values(by="Master stay").plot(kind="bar").axhline(y=master_stay_semesters.mean(), color='red')

**Here we see that people who take a specialization usually take more time than the general average (in red), which is expected.** 

In [None]:
# Before doing any statistical tests, we check how many students each specialization has
df_masters_spec.groupby(by="Specialization").count()

**Now let's compare each specialization with the general dataset to see if there is a statistical difference between the means. We use Welch's t-test again because we don't want to assume anything about the sample size or variance of each specialization. Some of the specializations have too few students to use statistical tests so we do not consider them.**

In [None]:
def compare_spe_general(spe):
    """Return the Welch t-test p-value of one specialization against all finished students.

    `spe` holds the rows of one specialization and needs a "Master stay"
    column. Returns None when the group has 5 rows or fewer.
    """
    # The condition keeps only specializations with strictly more than 5
    # students: a group of exactly 5 is skipped as well.
    if len(spe) <= 5:
        return None
    # NOTE(review): master_stay_semesters also contains the students of this
    # specialization, so the two samples overlap; confirm this is intended.
    return stats.ttest_ind(a=spe["Master stay"], b=master_stay_semesters, equal_var=False)[1] # we output the p-value 

# Run the test on each specialization, drop the skipped ones (None/NaN)
# and list the p-values in increasing order
df_masters_spec.groupby(by="Specialization").apply(compare_spe_general).dropna().sort_values()

**We see that some specializations have a statistically significant difference with the overall mean. For example, at a confidence level of at least 95%, "Foundations of Software", "Internet Computing", "Signals, Images and Interfaces", and "Computer Engineering - SP" would stand out. We must be careful though, because some of these specializations have a small sample size.**

## BONUS: We perform the same gender-based study from Part 1 but with Master's students this time. ##

In [None]:
# Turn the per-student stay lengths into a one-column frame indexed by SCIPER
df_stay = master_stay_semesters.to_frame(name='Master stay')
df_stay.head()

**We extract each student's starting year as an integer to use for the plot. Then we create a dataframe with the year, the gender, and the mean stay length for students of that gender who started in that year.**

In [None]:
def get_start_year(group):
    """Return the year in which a student started, as an integer.

    `group` holds the rows of one student; "Year" values look like
    "2013-2014". The earliest academic year is taken and its first calendar
    year returned (2013 in that example).

    The previous body called Series.first() without its required offset
    argument, which raises TypeError.
    """
    earliest_year = group['Year'].min()
    return int(earliest_year.split('-')[0])
    
# Work with one row per finished student, so that every student counts exactly
# once in the means below. df_masters_finished has one row per student *and*
# semester; averaging over it would weight each student by their number of
# semester records.
finished_by_student = df_masters_finished.groupby('SCIPER')

# Start year = first calendar year of the student's earliest academic year,
# e.g. "2013-2014" -> 2013
finished_start_year = (
    finished_by_student['Year'].min()
    .str.split('-').str[0]
    .astype(int)
    .to_frame(name='Start year')
)

# Gender is read from each student's first record (assumed constant per student)
student_gender = finished_by_student['Gender'].first().to_frame(name='Gender')

# Join then groupby to get the mean stay length for a given (startYear,gender)
df_stay_by_year_gender = student_gender.join(finished_start_year).join(df_stay)
# Select the numeric column explicitly: averaging text columns raises in pandas >= 2.0
mean_year_gender = df_stay_by_year_gender.groupby(['Start year', 'Gender'])[['Master stay']].mean()
mean_year_gender

In [None]:
# Unstack the genders to get one column per gender, indexed by start year
myg = mean_year_gender.unstack()
myg
# TODO scatter plot!

**We didn't have time to make a scatter plot but can compare the trends over time between the two genders with this table!**