In [1]:
import re
from datetime import date
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

from polimi_scraper.config import logger, DataPath
from polimi_scraper.utils import occupancy_soup

In [2]:
URL = "https://www7.ceda.polimi.it/spazi/spazi/controller/OccupazioniGiornoEsatto.do"

In [3]:
sede_df = pd.read_parquet("sede.parquet")
sede_df

Unnamed: 0,codice_patrimonio,tipo_patrimonio,infoWindow,radius,center.lat,center.lng
0,COE,GSede,<strong>Como</strong>,500,45.80511,9.092083
1,CRG,GSede,<strong>Cremona</strong>,500,45.14639,10.001993
2,LCF,GSede,<strong>Lecco</strong>,500,45.849515,9.396736
3,MIB,GSede,<strong>Milano Bovisa</strong>,500,45.503867,9.160379
4,PCL,GSede,<strong>Piacenza</strong>,500,45.046598,9.702332
5,MIA,GSede,<strong>Milano Città Studi</strong>,500,45.48112,9.232041
6,MNI,GSede,<strong>Mantova</strong>,500,45.160258,10.788716
7,MID,GSede,<strong>Sesto Ulteriano</strong>,500,45.393487,9.257306
8,MIC,GSede,<strong>Servizi</strong>,500,45.461827,9.180636
9,MIF,GSede,<strong>Milano Tortona</strong>,500,45.450113,9.158053


## Experiment

In [6]:
parameters = {
    "csic": "MIB",
    "categoria": "tutte",
    "tipologia": "tutte",
    "giorno_day": 1,
    "giorno_month": 2,
    "giorno_year": 2025,
    "evn_visualizza": "",
}

In [7]:
soup = BeautifulSoup(requests.get(URL, params=parameters).text, "html.parser")
soup

<!DOCTYPE html>

<html class="no-js">
<head>
<title>Occupazioni per Data</title>
<link href="https://webcommons.polimi.it/webcommons/assets/ateneo2014.css.jsp?v=5&amp;lang=it&amp;dt_version=1.10" rel="stylesheet" type="text/css"/>
<link href="https://webcommons.polimi.it/webcommons/ajax/libs/jqueryui/1.12.1/themes/polij.css.jsp?v=5&amp;lang=it" rel="stylesheet" type="text/css"/>
<link href="https://webcommons.polimi.it/webcommons/assets/desktop.css.jsp?v=5&amp;lang=it" rel="stylesheet" type="text/css"/>
<link href="/spazi/assets/css/base.css?__load_ts=1737140432539" rel="stylesheet" type="text/css"/>
<script src="https://webcommons.polimi.it/webcommons/ajax/libs/jqueryui/1.12.1/ui.js.jsp?v=5&amp;lang=it&amp;dt_version=1.10" type="text/javascript"></script>
<script src="https://webcommons.polimi.it/webcommons/ajax/libs/desktop.js.jsp?v=5&amp;lang=it" type="text/javascript"></script>
<script type="text/javascript">
	$(document).ready(function(){
		$('#loading').remove();
	});
</script>
<

In [8]:
classrooms = soup.find("td", {"class": "MatriceOccupazioni"}).find_all(
    "td", {"class": "dove"}
)
classrooms

[<td class="dove" rowspan="1">
 <a href="Aula.do?evn_init=espandi&amp;idaula=4517" title="Milano Bovisa - Via Durando - Edificio B1 - Piano Terzo"> F.LLI CASTIGLIONI  </a>
 </td>,
 <td class="dove" rowspan="1">
 <a href="Aula.do?evn_init=espandi&amp;idaula=2092" title="Milano Bovisa - Via Durando - Edificio B2 - Piano Terra"> B2.0.1 - D.I.  </a>
 </td>,
 <td class="dove" rowspan="1">
 <a href="Aula.do?evn_init=espandi&amp;idaula=1301" title="Milano Bovisa - Via Durando - Edificio B2 - Piano Primo"> B2.1.10  </a>
 </td>,
 <td class="dove" rowspan="1">
 <a href="Aula.do?evn_init=espandi&amp;idaula=2108" title="Milano Bovisa - Via Durando - Edificio B2 - Piano Primo"> B2.1.11  </a>
 </td>,
 <td class="dove" rowspan="1">
 <a href="Aula.do?evn_init=espandi&amp;idaula=1906" title="Milano Bovisa - Via Durando - Edificio B2 - Piano Primo"> B2.1.12  </a>
 </td>,
 <td class="dove" rowspan="1">
 <a href="Aula.do?evn_init=espandi&amp;idaula=1905" title="Milano Bovisa - Via Durando - Edificio B2 - 

In [9]:
classroom_url = urljoin(URL, classrooms[0].find("a")["href"])
classroom_url

'https://www7.ceda.polimi.it/spazi/spazi/controller/Aula.do?evn_init=espandi&idaula=4517'

In [10]:
classroom_soup = BeautifulSoup(requests.get(classroom_url).text, "html.parser")
classroom_soup

<!DOCTYPE html>

<html class="no-js">
<head>
<title>Dettagli Aula</title>
<link href="https://webcommons.polimi.it/webcommons/assets/ateneo2014.css.jsp?v=5&amp;lang=it&amp;dt_version=1.10" rel="stylesheet" type="text/css"/>
<link href="https://webcommons.polimi.it/webcommons/ajax/libs/jqueryui/1.12.1/themes/polij.css.jsp?v=5&amp;lang=it" rel="stylesheet" type="text/css"/>
<link href="https://webcommons.polimi.it/webcommons/assets/desktop.css.jsp?v=5&amp;lang=it" rel="stylesheet" type="text/css"/>
<link href="/spazi/assets/css/base.css?__load_ts=1737140432539" rel="stylesheet" type="text/css"/>
<script src="https://webcommons.polimi.it/webcommons/ajax/libs/jqueryui/1.12.1/ui.js.jsp?v=5&amp;lang=it&amp;dt_version=1.10" type="text/javascript"></script>
<script src="https://webcommons.polimi.it/webcommons/ajax/libs/desktop.js.jsp?v=5&amp;lang=it" type="text/javascript"></script>
<script type="text/javascript">
	$(document).ready(function(){
		$('#loading').remove();
	});
</script>
</head>


In [11]:
list(classroom_soup.find("em", string="Codice vano").parent.stripped_strings)[1]

'MIB0212003026'

In [12]:
classroom_soup.find("h4", string="Allestimenti").find_next("tbody").find_all("td")

[<td class="Dati1">
 	
 		ATTREZZATURE
     </td>,
 <td class="Dati1">
    
 		Radio microfono   	
 	</td>,
 <td class="Dati1">
 
 		SI
 	</td>,
 <td class="Dati1">
 	
 		ATTREZZATURE
     </td>,
 <td class="Dati1">
    
 		Oscurabile   	
 	</td>,
 <td class="Dati1">
 
 		SI
 	</td>,
 <td class="Dati1">
 	
 		ATTREZZATURE
     </td>,
 <td class="Dati1">
    
 		Cattedra cablata   	
 	</td>,
 <td class="Dati1">
 
 		SI
 	</td>,
 <td class="Dati1">
 	
 		ATTREZZATURE
     </td>,
 <td class="Dati1">
    
 		Video proiettore con PC   	
 	</td>,
 <td class="Dati1">
 
 		SI
 	</td>,
 <td class="Dati1">
 	
 		ATTREZZATURE
     </td>,
 <td class="Dati1">
    
 		Microfono   	
 	</td>,
 <td class="Dati1">
 
 		SI
 	</td>]

In [13]:
classroom_soup.find(
    "td", string=re.compile(r"\s+Postazioni dotate di presa elettrica\s+")
).find_next("td").string.strip()

AttributeError: 'NoneType' object has no attribute 'find_next'

## Execute

In [4]:
classroom_data_list = []

for _, sede_series in tqdm(sede_df.iterrows(), total=sede_df.shape[0], desc="Sede"):
    logger.info("Processing sede {}", sede_series["codice_patrimonio"])
    sede_soup = occupancy_soup(sede_series["codice_patrimonio"], date(2025, 2, 3))
    occupancy_table = sede_soup.find("td", {"class": "MatriceOccupazioni"})
    if occupancy_table is None:
        logger.warning(
            "No classrooms found for sede {}", sede_series["codice_patrimonio"]
        )
        continue
    classroom_urls = [
        urljoin(URL, classroom.find("a")["href"])
        for classroom in occupancy_table.find_all("td", {"class": "dove"})
    ]
    for classroom_url in tqdm(classroom_urls, desc="Aula"):
        classroom_soup = BeautifulSoup(
            requests.get(classroom_url, timeout=5).text,
            "html.parser",
        )
        classroom_data = {}
        for key in ("Sigla", "Codice vano", "Indirizzo"):
            classroom_data[key] = list(
                classroom_soup.find("em", string=key).parent.stripped_strings
            )[1]
        electrical_socket_field = classroom_soup.find(
            "td", string=re.compile(r"\s+Postazioni dotate di presa elettrica\s+")
        )
        if electrical_socket_field is not None:
            logger.debug("Found electrical socket field")
            classroom_data["plugs"] = (
                electrical_socket_field.find_next("td").string.strip() == "SI"
            )
        classroom_data_list.append(classroom_data)
    logger.success("Processed sede {}", sede_series["codice_patrimonio"])

Sede:   0%|          | 0/11 [00:00<?, ?it/s]

[32m2025-02-01 19:35:07.680[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing sede COE[0m
[32m2025-02-01 19:35:08.028[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing sede CRG[0m


Aula:   0%|          | 0/17 [00:00<?, ?it/s]

[32m2025-02-01 19:35:13.976[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [32m[1mProcessed sede CRG[0m
[32m2025-02-01 19:35:13.977[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing sede LCF[0m


Aula:   0%|          | 0/29 [00:00<?, ?it/s]

[32m2025-02-01 19:35:14.792[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:15.098[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:15.404[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:15.712[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:16.026[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:16.766[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:17.024[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<modul

Aula:   0%|          | 0/110 [00:00<?, ?it/s]

[32m2025-02-01 19:35:27.586[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:30.864[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:31.793[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:33.734[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:35.066[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:40.290[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:35:41.118[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<modul

Aula:   0%|          | 0/33 [00:00<?, ?it/s]

[32m2025-02-01 19:36:25.033[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [32m[1mProcessed sede PCL[0m
[32m2025-02-01 19:36:25.034[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing sede MIA[0m


Aula:   0%|          | 0/154 [00:00<?, ?it/s]

[32m2025-02-01 19:36:26.680[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:36:27.677[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:36:28.383[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:36:28.684[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:36:31.489[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:36:32.203[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m30[0m - [34m[1mFound electrical socket field[0m
[32m2025-02-01 19:36:32.926[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<modul

Aula:   0%|          | 0/10 [00:00<?, ?it/s]

[32m2025-02-01 19:37:36.360[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m35[0m - [32m[1mProcessed sede MNI[0m
[32m2025-02-01 19:37:36.361[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing sede MID[0m
[32m2025-02-01 19:37:36.545[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing sede MIC[0m
[32m2025-02-01 19:37:36.693[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing sede MIF[0m
[32m2025-02-01 19:37:36.855[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing sede GEM[0m


In [5]:
classroom_data_df = (
    pd.DataFrame(classroom_data_list)
    .rename(
        columns={
            "Sigla": "name",
            "Codice vano": "codice_patrimonio",
            "Indirizzo": "address",
        }
    )
    .drop_duplicates(subset="codice_patrimonio")
    .convert_dtypes()
)
classroom_data_df["plugs"] = classroom_data_df["plugs"].fillna(False)
classroom_data_df

Unnamed: 0,name,codice_patrimonio,address,plugs
0,A.1.1-CR,CRG0102001001,"Via Sesto, 39 - 26100 - Cremona (CR)",False
1,A.2.1-CR,CRG0102002001,"Via Sesto, 39 - 26100 - Cremona (CR)",False
2,A.2.2-CR,CRG0102002002,"Via Sesto, 39 - 26100 - Cremona (CR)",False
3,AULA STUDIO GIALLA 1,CRG0101000038b,"Via Sesto, 41 - 26100 - Cremona (CR)",False
4,AULA STUDIO GIALLA 2,CRG0101000038a,"Via Sesto, 41 - 26100 - Cremona (CR)",False
...,...,...,...,...
348,A 1.2,MNI0101001025,"Via Scarsellini, 15 - 46100 - Mantova (MN)",False
349,A.1.3,MNI0101001001,"Via Scarsellini, 15 - 46100 - Mantova (MN)",False
350,A.1.4,MNI0101001073,"Via Scarsellini, 15 - 46100 - Mantova (MN)",False
351,A.1.5,MNI0101001061,"Via Scarsellini, 15 - 46100 - Mantova (MN)",False


In [7]:
classroom_data_df.to_parquet(DataPath.RAW_CLASSROOMS)