In [None]:
! pip install unidecode
! pip install shortuuid

In [None]:
from bs4 import BeautifulSoup
from unidecode import unidecode
import math
import re
import os
import shortuuid
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
def extract_polygons_from_html(html_file):
    """Parse an HTML file and return every ``<polygon>`` element it contains.

    Args:
        html_file: Path of the HTML document to read.

    Returns:
        The result of ``soup.find_all('polygon')`` for the parsed document.
    """
    # Hand raw bytes to BeautifulSoup so it detects the document encoding
    # itself instead of depending on the platform's default text encoding,
    # which can mis-decode accented characters.
    with open(html_file, 'rb') as f:
        soup = BeautifulSoup(f, 'html.parser')
    return soup.find_all('polygon')

# https://stackoverflow.com/a/69079951
def to_snake_case(string):
    """Convert *string* to lower-case snake_case.

    A separator is inserted at every lower-to-upper case boundary, and every
    character that is not an ASCII letter is itself turned into a separator.
    Leading and trailing separators are dropped; consecutive separators in
    the middle are kept, so ``"a--b"`` becomes ``"a__b"``.
    """
    spaced = re.sub(r'(?<=[a-z])(?=[A-Z])|[^a-zA-Z]', ' ', string)
    words = spaced.strip().split(' ')
    return '_'.join(words).lower()


def calculate_view_box(points):
    """Compute an integer bounding box for an SVG ``points`` string.

    Args:
        points: Whitespace-separated ``x,y`` coordinate pairs. Tokens that
            contain no comma are ignored.

    Returns:
        A ``(min_x, min_y, width, height)`` tuple. The minimums are rounded
        down and the maximums rounded up before the size is computed, so the
        box always encloses every point. ``(0, 0, 0, 0)`` is returned (and
        the problem printed) when ``points`` is not a string, holds no
        coordinate pair, or holds a malformed pair.
    """
    x_values = []
    y_values = []
    # Bound up front so the error report below cannot itself fail with a
    # NameError when the failure happens before the first token is read.
    point = None

    try:
        # split() without an argument also copes with tabs, newlines and
        # runs of spaces between the coordinate pairs.
        for point in points.split():
            # Only tokens with the comma separator are coordinate pairs.
            if ',' not in point:
                continue
            x, y = point.split(',')
            x_values.append(float(x))
            y_values.append(float(y))

        min_x = math.floor(min(x_values))
        min_y = math.floor(min(y_values))
        max_x = math.ceil(max(x_values))
        max_y = math.ceil(max(y_values))

        return min_x, min_y, max_x - min_x, max_y - min_y
    except (AttributeError, TypeError, ValueError, OverflowError) as e:
        # AttributeError/TypeError: points is not a str.
        # ValueError: malformed pair, non-numeric value, NaN, or no pairs.
        # OverflowError: an infinite coordinate cannot be rounded to an int.
        print(f'Error point: {point}')
        print(f'Error: {e}')
        return 0, 0, 0, 0
    
def extract_data_info(data_info):
    """Extract ``(comarca, capital, pais)`` from a polygon's data-info markup.

    Args:
        data_info: HTML fragment whose ``<div>`` elements hold the values.

    Returns:
        A 3-tuple of strings transliterated to ASCII with ``unidecode``:

        * When ``data_info`` contains ``'Comarca'``, the first two divs are
          read as ``"label: value"`` and only the value is kept; the third
          div is returned unchanged as ``pais``.
        * Otherwise the first two divs are returned unchanged and ``pais``
          is ``''``.

        ``('', '', '')`` is returned (and the error printed) when the
        fragment cannot be read this way, e.g. too few divs or a missing
        ``': '`` separator.
    """
    try:
        soup = BeautifulSoup(data_info, 'html.parser')
        # Keep only the text of each <div>, without the HTML tags.
        elements = [unidecode(element.text) for element in soup.find_all('div')]

        if 'Comarca' in data_info:
            comarca = elements[0].split(': ')[1]
            capital = elements[1].split(': ')[1]
            pais = elements[2]
            return comarca, capital, pais

        # Every other entry (e.g. Alger, Andorra) carries bare values with no
        # "label: " prefix and no country div. This is deliberately an
        # unconditional fallback: a guard written as
        # `'Alger' or 'Andorra' in data_info` is always true, because the
        # non-empty literal 'Alger' is truthy on its own.
        return elements[0], elements[1], ''
    except Exception as e:
        # Best effort: one unreadable entry must not abort the whole export.
        print(f'Error: {e}')
        return '', '', ''

def generate_svg(polygon, path):
    """Write one ``<polygon>`` element to its own standalone SVG file.

    Polygons whose first class is ``'catno'`` are written to
    ``<path>/catno/catno_<uuid>.svg``; the random suffix keeps them from
    overwriting each other. Every other polygon is written to
    ``<path>/<comarca>.svg``, where ``comarca`` is the snake_case name taken
    from the polygon's ``data-info`` attribute.

    Args:
        polygon: Parsed polygon tag. It must provide a ``points`` attribute
            and a ``class`` attribute; ``data-info`` is optional.
        path: Root output directory.
    """
    # Imported locally because only this function needs it.
    from xml.sax.saxutils import escape

    def attr(value):
        # data-info holds HTML markup; unescaped '<', '&' or '"' inside an
        # attribute value would make the generated SVG malformed XML.
        return escape(value, {'"': '&quot;'})

    # Collapse every run of whitespace to a single space. Simply deleting
    # newlines/tabs would fuse coordinate pairs that are separated only by a
    # line break ("1,2\n3,4" -> "1,23,4").
    points = ' '.join(polygon['points'].split())
    min_x, min_y, width, height = calculate_view_box(points)
    data_info = polygon['data-info'] if 'data-info' in polygon.attrs else ''
    polygon_class = polygon['class'][0]

    if polygon_class != 'catno':
        comarca, _, _ = extract_data_info(data_info)
        comarca = to_snake_case(comarca)
        output_file = os.path.join(path, f'{comarca}.svg')
    else:
        # Random id to avoid overwriting other catno polygons.
        comarca = f'catno_{shortuuid.uuid()}'
        output_file = os.path.join(path, 'catno', f'{comarca}.svg')

    # Do not depend on the caller having created the output folders.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    svg = '\n'.join([
        f'<svg xmlns="http://www.w3.org/2000/svg" viewBox="{min_x} {min_y} {width} {height}">',
        '  <polygon',
        f'    id="{comarca}"',
        f'    class="{attr(polygon_class)}"',
        f'    points="{points}"',
        f'    data-info="{attr(data_info)}"',
        '    style="fill:#cccccc;stroke:#000000;stroke-width:0.5;"',
        '  />',
        '</svg>',
    ])

    # Explicit encoding: data-info may contain non-ASCII characters and the
    # platform default encoding is not guaranteed to be able to store them.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(svg)


In [None]:
# Load index.html, extract each polygon and save it as an SVG file.
html_file = 'index.html'

# Output goes to an "svg" folder located beside the current working directory.
cwd = os.getcwd()
path = os.path.join(os.path.dirname(cwd), 'svg')
catno_path = os.path.join(path, 'catno')

# One call creates both folders (catno lives inside path) and is a no-op when
# they already exist, so no separate existence check is needed.
os.makedirs(catno_path, exist_ok=True)

# Empty the catno folder: its files get a fresh random name on every run, so
# stale ones would otherwise pile up.
for file in os.listdir(catno_path):
    os.remove(os.path.join(catno_path, file))

polygons = extract_polygons_from_html(html_file)
for polygon in polygons:
    generate_svg(polygon, path)