In [None]:
from bs4 import BeautifulSoup
import requests
import pickle
import os.path

In [None]:
# Try to cache the data with pickle if set to True
CACHE = True

In [None]:
# Download the public IS-Academia report form; its inputs are inspected below
req = requests.get(
    'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247'
)

In [None]:
main = BeautifulSoup(req.text, 'html.parser')

In [None]:
print(main.body.prettify())

**We can see that the Unité académique/Academic Unit input has the name `zz_x_UNITE_ACAD` and consists of one option for each section.**

In [None]:
# Find the Unité académique element in the form
unit = main.find(attrs={"name": "zz_x_UNITE_ACAD"})
print(unit.prettify())

**We can easily extract the 'value' (section id) for the Informatique section using BeautifulSoup:**

In [None]:
# We use find rather than find_all because each unit name appears only once
# in the select. `string=` is the current spelling of the deprecated `text=`.
unit_IN_option = unit.find(string="Informatique")
if unit_IN_option is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError('No "Informatique" option found in the academic unit select')
# The match is the option's text node; its parent is the <option> tag carrying the id
unit_IN_id = unit_IN_option.parent['value']
unit_IN_id

In [None]:
period = main.find(attrs={"name": "zz_x_PERIODE_ACAD"})
print(period.prettify())

In [None]:
period_options = period.find_all("option")

In [None]:
for option in period_options:
    print(option['value'])

**We can get rid of the first one which is null**

In [None]:
# Drop the leading placeholder option. Re-reading the options from `period`
# (instead of slicing `period_options` in place) keeps this cell idempotent:
# re-running it no longer discards one more real period each time.
period_options = period.find_all("option")[1:]

In [None]:
# Map each period label (the option text) to its period id (the option value)
year_dict = dict((opt.text, opt["value"]) for opt in period_options)
year_dict

In [None]:
semester = main.find(attrs={"name": "zz_x_PERIODE_PEDAGO"})
print(semester.prettify())

In [None]:
options = semester.find_all("option")
options

In [None]:
# Restrict the semester options to those whose label mentions "Bachelor"
bachelor_options = list(filter(lambda opt: "Bachelor" in opt.text, options))
bachelor_options

In [None]:
# Map each Bachelor semester label (the option text) to its semester id (the option value)
bachelor_dict = dict((opt.text, opt["value"]) for opt in bachelor_options)
bachelor_dict

Now we have enough parameters to try a request to the report URL. We found it using Postman: it's a simple GET request on `http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html`. We analysed the request and the field names are the same as the `input` names, but with a leading `ww` instead of `zz`; for instance, `zz_x_UNITE_ACAD` becomes `ww_x_UNITE_ACAD`.

In [None]:
BASE_URL = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html"

In [None]:
fields = {"unit": "ww_x_UNITE_ACAD", "year": "ww_x_PERIODE_ACAD", "semester": "ww_x_PERIODE_PEDAGO"}
fields

**Let's try to do a request of the Bachelor semester 1 for 2016-2017**

In [None]:
# First attempt: section, academic year and semester only
params = dict([
    (fields["unit"], unit_IN_id),
    (fields["year"], year_dict["2016-2017"]),
    (fields["semester"], bachelor_dict["Bachelor semestre 1"]),
])
params

In [None]:
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
answer.text

**We get a 404: we need more fields to have a correct query. We noticed on Postman that we needed three other fields; two of them can be read from the form.**

In [None]:
# Collect the name -> value pairs of the form's <input> elements.
# Inputs without a `name` attribute are skipped and a missing `value` becomes "",
# so an unnamed or valueless input no longer aborts the cell with a KeyError.
all_inputs = main.body.find_all("input")
all_inputs_values = {
    inp["name"]: inp.get("value", "")
    for inp in all_inputs
    if inp.has_attr("name")
}
print(all_inputs_values)

**We see on Postman that the additional fields we need are `ww_i_reportmodel` and `ww_i_reportModelXsl`.** Let's try again.

In [None]:
params["ww_i_reportmodel"] = all_inputs_values["ww_i_reportmodel"]
params["ww_i_reportModelXsl"] = all_inputs_values["ww_i_reportModelXsl"]

In [None]:
params

In [None]:
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
answersoup = BeautifulSoup(answer.text, "html.parser")
print(answersoup.prettify())

**Even though we get a 200, this is still not the output we need. One last parameter is missing, but it is not in the form as an input. By inspecting our postman requests, we noticed that `ww_x_GPS` is included.**

In [None]:
# For now we just put it to -1
params["ww_x_GPS"] = -1

In [None]:
answer = requests.get(BASE_URL, params=params)
answer.status_code

In [None]:
answersoup = BeautifulSoup(answer.text, "html.parser")
print(answersoup.prettify())

**It works!**

In [None]:
import pandas as pd

In [None]:
# Grab every row of the result table and preview the first five, with a rule between them
lines = answersoup.body.table.find_all('tr')
print("\n=============\n".join(map(str, lines[:5])))

In [None]:
lines[-1]

**We can see that all lines except the first three correspond to students. The third line is the header: let's keep it aside and discard the first two.**

In [None]:
# The third row is kept as the header; student rows start at the fourth.
# Re-read the rows from the parsed table instead of slicing `lines` in place,
# so re-running this cell does not drop three more rows (and pick a student
# row as header) each time.
all_rows = answersoup.body.table.find_all('tr')
header = all_rows[2]
lines = all_rows[3:]

In [None]:
header

**We are interested in "Civilité" (gender, column `0`) and "No Sciper" (column `10`). We also keep the name (column `1`), the specialization (column `4`) and the minor (column `6`).**

In [None]:
# One list of cell texts per student row
items = [[td.text for td in line.find_all("td")] for line in lines]

# Build the frame in one go: DataFrame.append was removed in pandas 2.0
df = pd.DataFrame(items)

# Keep: gender 0, name 1, specialization 4, minor 6, sciper 10
df = df[[0,1,4,6,10]]
df.columns = ['Gender', 'Name', 'Specialization', 'Minor', 'SCIPER']
# Index the rows by SCIPER so a student can be looked up with df.loc[sciper]
df.index = [item[10] for item in items]

In [None]:
df.sample(10)

In [None]:
df.index.is_unique

**Sciper is unique, as it should be.**

In [None]:
df.groupby("Gender")["Gender"].count()

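**The functions below generalise the steps above: `make_params` builds the query parameters for a given year and semester, and `get_df_for_semester` downloads the corresponding student list (optionally caching the response on disk) and parses it into a dataframe.**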

In [None]:
def make_params(year_str, semester_str):
    """Build the query parameters for one academic year and one Bachelor semester.

    year_str and semester_str are looked up in year_dict and bachelor_dict, so
    a label missing from either dict raises KeyError. The section id, the two
    report model values and ww_x_GPS = -1 are the same for every query.
    """
    return {
        fields["year"]: year_dict[year_str],
        fields["semester"]: bachelor_dict[semester_str],
        fields["unit"]: unit_IN_id,
        "ww_i_reportmodel": all_inputs_values["ww_i_reportmodel"],
        "ww_i_reportModelXsl": all_inputs_values["ww_i_reportModelXsl"],
        "ww_x_GPS": -1,
    }

def _fetch_answer(year_str, semester_str):
    """Return the HTTP response for one semester, going through the pickle cache when CACHE is set."""
    params = make_params(year_str, semester_str)

    if not CACHE:
        return requests.get(BASE_URL, params=params)

    backup_path = os.path.join("cache", year_str + semester_str + ".bak")

    if os.path.isfile(backup_path):
        # NOTE: pickle.load can execute arbitrary code from the file; only keep
        # cache files that this notebook wrote itself.
        with open(backup_path, 'rb') as handle:
            return pickle.load(handle)

    answer = requests.get(BASE_URL, params=params)

    # Only persist successful responses, so a transient server error is not
    # replayed from the cache on every later run.
    if answer.ok:
        # The cache directory may not exist yet on a fresh checkout
        os.makedirs(os.path.dirname(backup_path), exist_ok=True)
        with open(backup_path, 'wb') as handle:
            pickle.dump(answer, handle)
    return answer


def get_df_for_semester(year_str, semester_str):
    """Download and parse the student list of one semester.

    Parameters
    ----------
    year_str : str
        Academic period label, a key of year_dict.
    semester_str : str
        Semester label, a key of bachelor_dict.

    Returns
    -------
    pandas.DataFrame
        One row per student, indexed by SCIPER, with the columns Gender, Name,
        Specialization, Minor, SCIPER, Year and Semester. An empty DataFrame
        is returned when the page holds no student rows.

    Raises
    ------
    KeyError
        If year_str or semester_str is not a known label.
    """
    answer = _fetch_answer(year_str, semester_str)
    answersoup = BeautifulSoup(answer.text, "html.parser")

    # A page without a table carries no student data
    table = answersoup.body.table if answersoup.body is not None else None
    if table is None:
        return pd.DataFrame()

    lines = table.find_all('tr')

    # We assume we get a positive result if there are more than three lines
    # because the student data starts on the fourth line
    if len(lines) <= 3:
        return pd.DataFrame()

    items = [[td.text for td in line.find_all("td")] for line in lines[3:]]

    # Build the frame directly: DataFrame.append was removed in pandas 2.0
    df = pd.DataFrame(items)

    # Keep: gender 0, name 1, specialization 4, minor 6, sciper 10
    df = df[[0,1,4,6,10]]
    df.index = [item[10] for item in items]
    df.columns = ['Gender', 'Name', 'Specialization', 'Minor', 'SCIPER']
    df["Year"] = year_str
    df["Semester"] = semester_str

    return df

# Check that Tim appears in the list!
df_ex = get_df_for_semester("2013-2014", "Bachelor semestre 3")
df_ex.loc['223744']

In [None]:
df1 = get_df_for_semester("2013-2014", "Bachelor semestre 1")
df2 = get_df_for_semester("2014-2015", "Bachelor semestre 2")
df3 = pd.concat([df1, df2])
df3.sample(10)

In [None]:
dfs = []
for year in year_dict:
    for bachelor in bachelor_dict:
        # Handle failures per semester: one bad download should not abort the
        # remaining ones. `except Exception` (rather than a bare except) still
        # lets a kernel interrupt stop the loop, and the error is reported
        # next to the year/semester that caused it.
        try:
            dfs.append(get_df_for_semester(year, bachelor))
        except Exception as exc:
            print(year, bachelor, repr(exc))

df_all = pd.concat(dfs)
df_all.sample(10)

In [None]:
get_df_for_semester("2008-2009", "Bachelor semestre 4")