In [None]:
import pandas as pd
import numpy as np
import zipfile
import xml.etree.ElementTree as ET
import os
import re

import os, zipfile, xml.etree.ElementTree as ET
import geopandas as gpd
from shapely.geometry import Point, LineString, Polygon
from shapely.geometry.base import BaseGeometry

In [2]:
# ------------------ CONFIG ------------------
KMZ_PATH = "../data/KSE_2019.kmz"   # <- your KMZ file
OUT_DIR  = "../output/kse_extract"    # outputs folder
os.makedirs(OUT_DIR, exist_ok=True)
# --------------------------------------------

In [3]:

# ---- 1) Read KML inside the KMZ (typically 'doc.kml') ----
with zipfile.ZipFile(KMZ_PATH, "r") as z:
    kml_bytes = z.read("doc.kml")

ns = {"kml": "http://www.opengis.net/kml/2.2"}
root = ET.fromstring(kml_bytes)

In [4]:
# ---- 2) Walk folders & collect Placemarks ----
def iter_container(elem, path=None):
    if path is None: path = []
    for e in list(elem):
        tag = e.tag.split("}")[-1]
        if tag in ("Folder", "Document"):
            nm = e.findtext("kml:name", default="", namespaces=ns)
            yield from iter_container(e, path + [nm])
        elif tag == "Placemark":
            yield path, e

def coords_text_to_xylist(text):
    # KML coords: lon,lat[,alt] lon,lat[,alt] ...
    pts = []
    for t in text.strip().split():
        p = t.split(",")
        if len(p) >= 2:
            pts.append((float(p[0]), float(p[1])))
    return pts

def placemark_to_record(path, pm):
    name = pm.findtext("kml:name", default=None, namespaces=ns)
    desc = pm.findtext("kml:description", default=None, namespaces=ns)
    folder = "/".join([p for p in path if p])

    geom = None
    # Point
    el = pm.find(".//kml:Point/kml:coordinates", ns)
    if el is not None and el.text:
        pts = coords_text_to_xylist(el.text)
        if pts: geom = Point(pts[0])
    # LineString
    if geom is None:
        el = pm.find(".//kml:LineString/kml:coordinates", ns)
        if el is not None and el.text:
            pts = coords_text_to_xylist(el.text)
            if len(pts) >= 2: geom = LineString(pts)
    # Polygon (outer boundary)
    if geom is None:
        el = pm.find(".//kml:Polygon/kml:outerBoundaryIs//kml:coordinates", ns)
        if el is not None and el.text:
            ring = coords_text_to_xylist(el.text)
            if len(ring) >= 3: geom = Polygon(ring)

    if not isinstance(geom, BaseGeometry):
        return None

    rec = {"folder": folder, "name": name, "description": desc, "geometry": geom}

    # ExtendedData (optional)
    ext = pm.find("kml:ExtendedData", ns)
    if ext is not None:
        for d in ext.findall("kml:Data", ns):
            n = d.attrib.get("name")
            v = d.findtext("kml:value", default=None, namespaces=ns)
            if n: rec[n] = v
        for sd in ext.findall(".//kml:SimpleData", ns):
            n = sd.attrib.get("name")
            if n: rec[n] = sd.text

    return rec

records = []
for path, pm in iter_container(root, []):
    rec = placemark_to_record(path, pm)
    if rec: records.append(rec)

In [5]:

# ---- 3) Build GeoDataFrame (WGS84) ----
gdf = gpd.GeoDataFrame(records, geometry=[r["geometry"] for r in records], crs=4326)

In [6]:
# ---- 4) Filter Tauron / Stacje ----
mask_all = (
    gdf["folder"].str.contains(r"/OSD/Tauron/", case=False, na=False) &
    gdf["folder"].str.contains(r"/Stacje",      case=False, na=False)
)
tauron_stacje = gdf[mask_all].copy()

In [7]:
# ---- 5a) Points only: exact lon/lat ----
pts = tauron_stacje[tauron_stacje.geometry.geom_type == "Point"].copy()
pts["lon"] = pts.geometry.x
pts["lat"] = pts.geometry.y

# ---- 5b) All geometries as points: use centroids where not point ----
allg = tauron_stacje.copy()
is_point = allg.geometry.geom_type == "Point"
# For more accurate centroids you could reproject to a metric CRS first.
allg.loc[~is_point, "geometry"] = allg.loc[~is_point, "geometry"].centroid
allg["lon"] = allg.geometry.x
allg["lat"] = allg.geometry.y

print(f"Total Tauron/Stacje features: {len(tauron_stacje)} "
      f"(points: {len(pts)}, with-centroids: {len(allg)})")

# ---- 5c) Extract city from folder path ------------------------------------


def extract_city(folder: str) -> str | None:
    if not isinstance(folder, str):
        return None
    # Typical patterns seen:
    # KSE/OSD/Tauron/O/Wrocław/stacje
    # KSE/OSD/Tauron/Wrocław/stacje
    # Be robust to case and optional extra segment after Tauron (e.g., "O")
    m = re.search(r"/Tauron/(?:[^/]+/)?([^/]+)/stacje", folder, flags=re.IGNORECASE)
    if m:
        return m.group(1)
    return None

# add 'city' to both datasets
pts["city"]  = pts["folder"].apply(extract_city)
allg["city"] = allg["folder"].apply(extract_city)

# optional: also a lowercase/ASCII-friendly variant for filenames
try:
    from unidecode import unidecode  # pip install unidecode
    pts["city_ascii"]  = pts["city"].apply(lambda x: unidecode(x) if isinstance(x, str) else x)
    allg["city_ascii"] = allg["city"].apply(lambda x: unidecode(x) if isinstance(x, str) else x)
except Exception:
    # fallback without unidecode
    pts["city_ascii"]  = pts["city"].str.normalize("NFKD").str.encode("ascii", "ignore").str.decode("ascii")
    allg["city_ascii"] = allg["city"].str.normalize("NFKD").str.encode("ascii", "ignore").str.decode("ascii")

# quick sanity check
print(pts[["folder","city"]].head(8))

Total Tauron/Stacje features: 609 (points: 609, with-centroids: 609)
                                     folder           city
2621  KSE/OSD/Tauron/O/Bielsko Biała/stacje  Bielsko Biała
2622  KSE/OSD/Tauron/O/Bielsko Biała/stacje  Bielsko Biała
2623  KSE/OSD/Tauron/O/Bielsko Biała/stacje  Bielsko Biała
2624  KSE/OSD/Tauron/O/Bielsko Biała/stacje  Bielsko Biała
2625  KSE/OSD/Tauron/O/Bielsko Biała/stacje  Bielsko Biała
2626  KSE/OSD/Tauron/O/Bielsko Biała/stacje  Bielsko Biała
2627  KSE/OSD/Tauron/O/Bielsko Biała/stacje  Bielsko Biała
2628  KSE/OSD/Tauron/O/Bielsko Biała/stacje  Bielsko Biała



  allg.loc[~is_point, "geometry"] = allg.loc[~is_point, "geometry"].centroid


In [8]:
allg = allg.drop(columns=['city_ascii','description'])
allg.reset_index(inplace=True)

In [9]:
allg

Unnamed: 0,index,folder,name,geometry,lon,lat,city
0,2621,KSE/OSD/Tauron/O/Bielsko Biała/stacje,Aleksandrowice,POINT (19.01869 49.80938),19.018688,49.809380,Bielsko Biała
1,2622,KSE/OSD/Tauron/O/Bielsko Biała/stacje,Andrychów,POINT (19.32829 49.85808),19.328290,49.858082,Bielsko Biała
2,2623,KSE/OSD/Tauron/O/Bielsko Biała/stacje,Białka,POINT (19.6805 49.71096),19.680497,49.710961,Bielsko Biała
3,2624,KSE/OSD/Tauron/O/Bielsko Biała/stacje,Bielsko,POINT (19.03695 49.83413),19.036954,49.834133,Bielsko Biała
4,2625,KSE/OSD/Tauron/O/Bielsko Biała/stacje,Brzeszcze,POINT (19.15752 49.98941),19.157522,49.989413,Bielsko Biała
...,...,...,...,...,...,...,...
604,4197,KSE/OSD/Tauron/O/Wrocław/stacje,Wrocław Żmigrodzka,POINT (17.03331 51.14241),17.033310,51.142415,Wrocław
605,4198,KSE/OSD/Tauron/O/Wrocław/stacje,Zacharzyce,POINT (17.09663 51.03389),17.096634,51.033888,Wrocław
606,4199,KSE/OSD/Tauron/O/Wrocław/stacje,Środa Śląska,POINT (16.59934 51.18615),16.599336,51.186150,Wrocław
607,4200,KSE/OSD/Tauron/O/Wrocław/stacje,Żmigród,POINT (16.92322 51.46862),16.923224,51.468622,Wrocław


In [10]:

# ---- 6) Save outputs ----
# CSV with lon/lat
# pts[["folder","name","lon","lat"]].to_csv(os.path.join(OUT_DIR, "tauron_stacje_points_lonlat.csv"), index=False)
# allg[["folder","name","lon","lat"]].to_csv(os.path.join(OUT_DIR, "tauron_stacje_all_lonlat.csv"), index=False)

# GeoJSON
pts.to_file(os.path.join(OUT_DIR, "tauron_stacje_points.geojson"), driver="GeoJSON")
allg.to_file(os.path.join(OUT_DIR, "tauron_stacje_all.geojson"), driver="GeoJSON")

# GeoPackage (both versions as layers)
gpkg = os.path.join(OUT_DIR, "tauron_stacje.gpkg")
pts.to_file(gpkg, layer="points", driver="GPKG")
allg.to_file(gpkg, layer="all_as_points", driver="GPKG")

print("Saved to:", OUT_DIR)

Saved to: ../output/kse_extract
