In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
from time import time

import pandas as pd
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement

from polimi_scraper.config import logger, DataPath

## Experiment

In [13]:
# Open a browser session on the map application for manual exploration.
# NOTE(review): nothing in the Experiment section quits this session, and the
# Execute section rebinds `driver` to a new browser - consider calling
# `driver.quit()` once the exploration is done.
driver = Firefox()
driver.get("https://maps.polimi.it/maps/")
# Implicit wait of 5 s for subsequent element lookups.
driver.implicitly_wait(5)

In [14]:
# List the iframes of the top-level page (the recorded output shows one).
driver.find_elements(By.TAG_NAME, "iframe")

[<selenium.webdriver.remote.webelement.WebElement (session="eb5fa08b-030d-48ab-833c-0786c93f676f", element="ba81f3cc-b8d4-495f-b60e-cd142969e077")>]

In [15]:
driver.switch_to.frame(0)  # Entire application is inside an iframe
# The sidebar is a nested iframe of its own; focus it so that the
# navigation links can be queried.
sidebar_frame = driver.find_element(By.CSS_SELECTOR, "iframe#sidebarFrame")
driver.switch_to.frame(sidebar_frame)

In [16]:
# Follow the first link five levels down. The link is looked up again on
# every pass because the sidebar DOM changes after each click.
for _level in range(5):
    driver.find_elements(By.CSS_SELECTOR, "a.Link")[0].click()

In [17]:
# Step out of the sidebar frame into its parent frame, where the SVG is.
driver.switch_to.parent_frame()

In [18]:
# Collect every <polygon> element currently present in the focused frame.
polygons = driver.find_elements(By.TAG_NAME, "polygon")

In [19]:
# Inspect the `id` attribute of the first polygon.
polygons[0].get_attribute("id")

'COE040100S028'

In [20]:
# Inspect the raw `points` attribute: whitespace-separated "x,y" pairs
# (see the recorded output).
polygons[0].get_attribute("points")

'39.3100,12.4958 38.0074,12.4958 38.0074,12.3458 37.9474,12.3458 37.9474,11.6458 38.0074,11.6458 38.0074,11.4958 39.3100,11.4958 '

In [21]:
# Re-enter the sidebar frame before navigating back.
sidebar_frame = driver.find_element(By.CSS_SELECTOR, "iframe#sidebarFrame")
driver.switch_to.frame(sidebar_frame)

In [9]:
# Go one step back in the browser history.
driver.back()

In [10]:
# Look up polygons again after going back (the recorded output is empty).
# NOTE(review): the execution counts of these cells are out of order, so the
# frame that was focused when this ran cannot be told from the notebook.
driver.find_elements(By.TAG_NAME, "polygon")

[]

## Execute

In [3]:
def parse_polygons(polygons: list[WebElement]) -> list[tuple[str, float, float]]:
    """Reduce SVG ``<polygon>`` elements to ``(id, center_x, center_y)`` tuples.

    The center is the midpoint of the polygon's axis-aligned bounding box
    (halfway between the smallest and the largest coordinate on each axis),
    not the area centroid.

    Args:
        polygons: Elements exposing ``get_attribute``. The ``id`` attribute
            supplies the identifier and ``points`` supplies the vertices as
            whitespace-separated ``"x,y"`` pairs.

    Returns:
        One tuple per polygon that has at least one point, in input order.
        Polygons whose ``points`` attribute is missing or blank are skipped,
        because no center can be computed for them.

    Raises:
        ValueError: If a point is not exactly two comma-separated numbers.
    """
    parsed_polygons = []
    for polygon in polygons:
        id_ = polygon.get_attribute("id")
        # A missing attribute comes back as None; treat it like an empty
        # point list instead of crashing on `.split()`.
        points = polygon.get_attribute("points") or ""
        pairs = [point.split(",") for point in points.split()]
        if not pairs:
            # min()/max() raise on empty sequences, and a single malformed
            # polygon should not abort a crawl over the whole site.
            continue
        x = [float(point_x) for point_x, _ in pairs]
        y = [float(point_y) for _, point_y in pairs]
        center_x = (min(x) + max(x)) / 2
        center_y = (min(y) + max(y)) / 2
        parsed_polygons.append((id_, center_x, center_y))
    return parsed_polygons

In [3]:
def get_polygons(
    driver: Firefox,
    depth: int = 0,
    floor_depth: int = 5,
    min_load_seconds: float = 0.1,
) -> list[tuple[str, float, float]]:
    """Recursively follow the sidebar links and collect the polygons of every floor.

    The driver must be focused on the sidebar frame when this is called and
    is focused on the sidebar frame again when the call returns.

    Args:
        driver: Browser session, already switched into the sidebar frame.
        depth: Number of links followed so far (0 for the initial call).
        floor_depth: Depth at which a floor has been reached and the
            polygons are read instead of following further links.
        min_load_seconds: Lookups that complete faster than this are taken
            to mean that no new SVG was loaded, and their result is
            discarded.

    Returns:
        The ``(id, center_x, center_y)`` tuples produced by
        ``parse_polygons`` for every floor reachable from the current page.
    """
    polygons = []
    # driver is in the sidebar frame
    if depth == floor_depth:  # We have reached the floor depth in the recursion tree
        driver.switch_to.parent_frame()  # SVG is in the parent frame
        # driver is in the parent frame
        # NOTE(review): the timed span also covers `parse_polygons`, which
        # reads two attributes per polygon. On floors with many polygons that
        # may lift a stale (not reloaded) SVG above `min_load_seconds` -
        # confirm whether only `find_elements` should be timed.
        before = time()
        polygons = parse_polygons(driver.find_elements(By.TAG_NAME, "polygon"))
        after = time()
        time_to_find = after - before

        breadcrumb = " > ".join(
            element.text
            for element in driver.find_elements(By.CSS_SELECTOR, "#breadcrumb a")
        )

        logger.info("Looking for SVG in path {}", breadcrumb)
        logger.debug("Took {} s to find polygons", time_to_find)

        # Loading the SVG takes about 3 s, so we can only
        # find polygons this fast if no new data was loaded
        # which represents an error
        number_polygons = len(polygons)
        if time_to_find < min_load_seconds or number_polygons == 0:
            logger.warning("No polygons found")
            polygons = []
        else:
            logger.success("Found {} polygons", number_polygons)
        # `driver.back()` only works in the sidebar frame
        # (for whatever reason)
        sidebar_frame = driver.find_element(By.CSS_SELECTOR, "iframe#sidebarFrame")
        driver.switch_to.frame(sidebar_frame)
        # driver is back to the sidebar frame
    else:
        # We can't simply iterate over the links because the DOM is dynamic
        # and references to old links become stale when we go back
        # so we find the links every time and index them manually
        number_links = len(driver.find_elements(By.CSS_SELECTOR, "a.Link"))
        logger.debug("Found {} links at depth {}", number_links, depth)
        for link_index in range(number_links):
            link = driver.find_elements(By.CSS_SELECTOR, "a.Link")[link_index]
            link.click()
            polygons.extend(
                get_polygons(driver, depth + 1, floor_depth, min_load_seconds)
            )
            driver.back()
    return polygons

In [4]:
# Register `maps.log` as an additional log destination at INFO level.
logger.add("maps.log", level="INFO")

driver = Firefox()
# Everything after the browser launch sits inside the `try`, so a failure
# while loading the page or locating the frames still shuts the browser down.
try:
    driver.implicitly_wait(10)

    driver.get("https://maps.polimi.it/maps/")
    # The application lives in `appFrame`; the navigation links are in the
    # nested `sidebarFrame`, which is where `get_polygons` expects to start.
    app_frame = driver.find_element(By.CSS_SELECTOR, "iframe#appFrame")
    driver.switch_to.frame(app_frame)
    sidebar_frame = driver.find_element(By.CSS_SELECTOR, "iframe#sidebarFrame")
    driver.switch_to.frame(sidebar_frame)

    polygons = get_polygons(driver)
finally:
    # `quit()` closes every window and ends the session. A preceding
    # `close()` is redundant and, if it raised, would skip `quit()`.
    driver.quit()

[32m2025-02-01 10:50:17.423[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_polygons[0m:[36m38[0m - [34m[1mFound 7 links at depth 0[0m
[32m2025-02-01 10:50:17.588[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_polygons[0m:[36m38[0m - [34m[1mFound 1 links at depth 1[0m
[32m2025-02-01 10:50:17.735[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_polygons[0m:[36m38[0m - [34m[1mFound 3 links at depth 2[0m
[32m2025-02-01 10:50:17.922[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_polygons[0m:[36m38[0m - [34m[1mFound 1 links at depth 3[0m
[32m2025-02-01 10:50:18.110[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_polygons[0m:[36m38[0m - [34m[1mFound 5 links at depth 4[0m
[32m2025-02-01 10:50:20.635[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_polygons[0m:[36m17[0m - [1mLooking for SVG in path Home > Como > Como > Via Anzani > Edificio 4 > Seminterrato[0m
[32m2025-02-01 10:50:20.636[0m | [34m[1mD

## Store

In [14]:
# Tabulate the scraped centers, keeping a single row per `codice_patrimonio`.
polygons_df = (
    pd.DataFrame(polygons, columns=["codice_patrimonio", "x", "y"])
    .drop_duplicates(subset="codice_patrimonio")
)
polygons_df

Unnamed: 0,codice_patrimonio,x,y
0,COE040100S028,38.62870,11.99580
1,COE040100S029,38.62870,10.85415
2,COE040100S019,37.28940,19.47040
3,COE040100S006,6.67450,22.98930
4,COE040100S007,9.38015,27.24425
...,...,...,...
25557,PCL010100V015,50.90710,18.59805
25558,PCL010100V012,37.27550,15.70600
25559,PCL010100V013,48.97860,18.84365
25560,PCL010100V014,49.04015,17.67600


In [15]:
# Write the deduplicated table as Parquet to the path configured as
# `DataPath.RAW_POLYGONS`.
polygons_df.to_parquet(DataPath.RAW_POLYGONS)