# Preprocess Spain and Basque Country buildings

In [None]:
import geopandas as gpd
import pandas as pd
from pathlib import Path
import os

## Spain

In [None]:
# Directory holding the Spain GML files
folder = Path("/data/uscuni-ulce/extension/spain/")

# Every .gml file directly inside that directory
gml_files = list(folder.glob("*.gml"))

# Read only the needed attributes from each file and reproject to EPSG:3035;
# the per-file frames are concatenated in the next cell
spain_columns = ["geometry", "beginning", "end", "value", "currentUse"]
gdfs = []
for gml_path in gml_files:
    frame = gpd.read_file(gml_path, columns=spain_columns, use_arrow=True)
    gdfs.append(frame.to_crs("EPSG:3035"))

In [None]:
# Stack the per-file frames, then split multi-part geometries into one row per part.
# ignore_index=True renumbers the exploded rows 0..n-1 so every part has a unique
# index label; otherwise parts of the same multi-part geometry can share a label,
# and label-based operations such as DataFrame.drop would hit all of them at once.
spain_raw = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True)).explode(ignore_index=True)
spain_raw

In [None]:
# Footprints larger than 50,000 square metres (the data was reprojected to the
# metric CRS EPSG:3035 when it was read)
area_mask = spain_raw.area > 50000

# CurrentUse is public services or NaN
use_mask = (spain_raw["currentUse"] == "4_3_publicServices") | (spain_raw["currentUse"].isna())

# Remove rows matching both conditions. A boolean mask is used instead of
# drop(<index labels>) because exploded parts of one multi-part geometry can
# share an index label, and dropping by label would also remove sibling parts
# that do not match the conditions.
selected = spain_raw[~(area_mask & use_mask)]

In [None]:
# Store the source "value" attribute under the name "floor_area"
spain_renames = {"value": "floor_area"}
selected = selected.rename(columns=spain_renames)

In [None]:
# Display the filtered Spain table as the cell output
selected

In [None]:
# Save the filtered Spain buildings as GeoParquet
spain_output = "/data/uscuni-ulce/extension/spain/clean_3035.parquet"
selected.to_parquet(spain_output)

## Araba/Alava

In [None]:
# List the contents of the Basque Country data folder (IPython shell command)
!ls "/data/uscuni-ulce/extension/basque/"

In [None]:
# Directory holding the Araba/Alava GML files
folder = Path("/data/uscuni-ulce/extension/basque/araba_alava")

# Every .gml file directly inside that directory
gml_files = list(folder.glob("*.gml"))

# Read only the needed attributes from each file and reproject to EPSG:3035;
# the per-file frames are concatenated in the next cell
araba_columns = ["geometry", "anyPoint", "value"]
gdfs = []
for gml_path in gml_files:
    frame = gpd.read_file(gml_path, columns=araba_columns, use_arrow=True)
    gdfs.append(frame.to_crs("EPSG:3035"))

In [None]:
# Stack the per-file frames into one table with a fresh 0..n-1 index
araba_stacked = pd.concat(gdfs, ignore_index=True)
araba_alava = gpd.GeoDataFrame(araba_stacked)


In [None]:
# Rename the source attributes: "value" becomes "height", "anyPoint" becomes "beginning"
araba_renames = {"value": "height", "anyPoint": "beginning"}
araba_alava = araba_alava.rename(columns=araba_renames)


In [None]:
# Keep only the rows whose height is not zero, renumbering them 0..n-1
nonzero_height = araba_alava["height"] != 0
araba_alava_cleaned = araba_alava.loc[nonzero_height].reset_index(drop=True)

# Show the result on an interactive map
araba_alava_cleaned.explore()

In [None]:
# Save the cleaned Araba/Alava buildings as GeoParquet
araba_output = "/data/uscuni-ulce/extension/basque/araba_alava/clean_3035.parquet"
araba_alava_cleaned.to_parquet(araba_output)

## Bizkaia

In [None]:
# Directory holding the Bizkaia GML files
folder = Path("/data/uscuni-ulce/extension/basque/bizkaia")

# Every .gml file directly inside that directory
gml_files = list(folder.glob("*.gml"))

# Read each file with all of its attributes and reproject to EPSG:3035;
# the per-file frames are concatenated in the next cell
gdfs = []
for gml_path in gml_files:
    frame = gpd.read_file(gml_path, use_arrow=True)
    gdfs.append(frame.to_crs("EPSG:3035"))

In [None]:
# Stack the per-file frames, then split multi-part geometries into one row per part.
# ignore_index=True renumbers the exploded rows 0..n-1 so every part has a unique
# index label; otherwise parts of the same multi-part geometry can share a label,
# and label-based operations such as DataFrame.drop would hit all of them at once.
bizkaia = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True)).explode(ignore_index=True)

# Footprint area of each part, in square metres (EPSG:3035 is a metric CRS)
bizkaia["area"] = bizkaia.area

# Show the attributes of interest on an interactive map
bizkaia[["geometry","end","beginLifespanVersion", "numberOfFloorsAboveGround","area"]].explore()

In [None]:
# Simple size filter: keep only footprints with an area below 50,000
small_enough = bizkaia.area < 50000
bizkaia_filtered = bizkaia[small_enough]

# NOTE: a later cell writes bizkaia_cleaned to this same path, replacing this file
bizkaia_export_columns = ["geometry", "end", "beginLifespanVersion", "numberOfFloorsAboveGround"]
bizkaia_filtered[bizkaia_export_columns].to_parquet("/data/uscuni-ulce/extension/basque/bizkaia/clean_3035.parquet")

In [None]:
# Remove large single-floor (or unknown-floor) footprints from Bizkaia:
#   * area > 50,000 with 1 floor or NaN floors: always removed
#   * area > 10,000 with 1 floor or NaN floors: removed only when no building
#     with more than 1 floor lies within `buffer_distance` of it
# All selections are positional (boolean flags) rather than index-label based,
# so they stay correct even if the exploded frame has repeated index labels.

buffer_distance = 5  # meters: distance to consider a neighbor as nearby

floors = bizkaia["numberOfFloorsAboveGround"]
# True where the footprint has exactly 1 floor or no recorded floor count
single_or_unknown = ((floors == 1) | floors.isna()).to_numpy()
footprint_area = bizkaia["area"].to_numpy()

# Buildings to always remove (area > 50,000 & 1 floor or NaN)
always_flags = (footprint_area > 50000) & single_or_unknown
always_remove = bizkaia[always_flags]

# Buildings to conditionally remove (area > 10,000 & 1 floor or NaN)
conditional_flags = (footprint_area > 10000) & single_or_unknown
conditional = bizkaia[conditional_flags]

# Buildings with more than 1 floor
tall_buildings = bizkaia[floors > 1]

# Single geometry covering every tall building plus its buffer.
# GeoSeries.union_all() is used because `unary_union` is not imported in this notebook.
tall_union = tall_buildings.geometry.buffer(buffer_distance).union_all()

# Conditional removal: a candidate is removed only if it does NOT intersect
# any tall building (with buffer)
mask_no_tall_neighbor = ~conditional.geometry.intersects(tall_union)
conditional_remove = conditional[mask_no_tall_neighbor]

# Combine all removals. Flags are merged by position so a row that belongs to
# both removal sets is counted once.
remove_flags = always_flags.copy()
remove_flags[conditional_flags] |= mask_no_tall_neighbor.to_numpy()
all_remove = bizkaia[remove_flags]

# Keep remaining buildings
bizkaia_cleaned = bizkaia[~remove_flags]

# Keep only selected columns
cols_to_keep = ["geometry", "end", "beginLifespanVersion", "numberOfFloorsAboveGround", "area"]
bizkaia_cleaned = bizkaia_cleaned[cols_to_keep]

# Stats
print(f"Original buildings: {len(bizkaia)}")
print(f"Buildings removed: {len(all_remove)}")
print(f"Buildings remaining: {len(bizkaia_cleaned)}")

bizkaia_cleaned.explore()

In [None]:
# Save the cleaned Bizkaia buildings as GeoParquet
bizkaia_output = "/data/uscuni-ulce/extension/basque/bizkaia/clean_3035.parquet"
bizkaia_cleaned.to_parquet(bizkaia_output)

## Gipuzkoa

In [None]:
# Read the Gipuzkoa buildings and reproject them to EPSG:3035, matching the
# other regions in this notebook and the "clean_3035" name of the output file.
gipuzkoa = gpd.read_file("/data/uscuni-ulce/extension/basque/ES.GFA.BU.gml").to_crs("EPSG:3035")

# List the available attribute columns
gipuzkoa.columns

In [None]:
# Drop rows that report zero floors above ground together with at least one
# floor below ground; every other row is kept
no_floors_above = gipuzkoa["numberOfFloorsAboveGround"] == 0
has_floors_below = gipuzkoa["numberOfFloorsBelowGround"] > 0
below_ground_only = no_floors_above & has_floors_below
gipuzkoa_filtered = gipuzkoa[~below_ground_only]

# Show the result on an interactive map
gipuzkoa_filtered.explore()

In [None]:
# Save the filtered Gipuzkoa buildings as GeoParquet
gipuzkoa_output = "/data/uscuni-ulce/extension/basque/gipuzkoa/clean_3035.parquet"
gipuzkoa_filtered.to_parquet(gipuzkoa_output)