# Pre-process Czech housing estates

The data is extremely heterogeneous. We use additional data to pre-process it prior to morphological assessment.

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np

### Generate a GeoDataFrame that covers the socialist housing developments

In [2]:
# List the files available in the housing-estate locations folder.
!ls /data/uscuni-ulce/additional_data/housing_estates_locations

HEs_delimitation.xlsx  ZSJ_2013.sbn	 ZSJ_2016.cpg  ZSJ_2016.shp
prague		       ZSJ_2013.sbx	 ZSJ_2016.dbf  ZSJ_2016.shp.xml
ZSJ_2013.cpg	       ZSJ_2013.shp	 ZSJ_2016.prj  ZSJ_2016.shx
ZSJ_2013.dbf	       ZSJ_2013.shp.xml  ZSJ_2016.sbn
ZSJ_2013.prj	       ZSJ_2013.shx	 ZSJ_2016.sbx


In [3]:
# Housing-estate delimitation spreadsheet; its columns are listed as the cell
# output so the join key and the yes/no flag columns can be picked below.
he_delimitation_path = (
    "/data/uscuni-ulce/additional_data/housing_estates_locations/HEs_delimitation.xlsx"
)
xcl = pd.read_excel(he_delimitation_path)
xcl.columns

Index(['MUNICIPALITY CODE', 'MUNICIPALITY NAME', 'BSU CODE_2014',
       'BSU CODE6_2014', 'BSU NAME_CURRENT', 'poznámky', 'SOC HE',
       'SOC&POSTSOC HE', 'Unnamed: 8', 'BSU CODE_1974', 'BSU NAME_1974',
       'DELIMITATION OK?', 'BSU CODE_1980', 'ZSJ1980', 'DELIMITATION OK?.1',
       'BSU CODE_1992', 'BSU NAME_1992', 'DELIMITATION OK?.2', 'BSU CODE_2001',
       'BSU NAME_2001', 'DELIMITATION OK?.3', 'BSU CODE_2014.1',
       'BSU NAME_2014', 'DELIMITATION OK?.4'],
      dtype='object')

In [5]:
# Load the 2016 ZSJ polygon layer.
zsj_2016_path = (
    "/data/uscuni-ulce/additional_data/housing_estates_locations/ZSJ_2016.shp"
)
estates = gpd.read_file(zsj_2016_path)

# Preview only the identifier ("KOD_*") columns of the layer.
code_columns = [
    "KOD_ZSJ",
    "KOD_UTJ",
    "KOD_KU",
    "KOD_MOaMC",
    "KOD_OBEC",
    "KOD_ZUJ",
    "KOD_OKRES",
    "KOD_LAU1",
    "KOD_KRAJ",
    "KOD_CZNUTS",
]
estates[code_columns]

Unnamed: 0,KOD_ZSJ,KOD_UTJ,KOD_KU,KOD_MOaMC,KOD_OBEC,KOD_ZUJ,KOD_OKRES,KOD_LAU1,KOD_KRAJ,KOD_CZNUTS
0,000019,600016,600016,,554979,554979,40436,CZ0412,3051,CZ041
1,000027,600024,600024,,554979,554979,40436,CZ0412,3051,CZ041
2,000035,600032,600032,,535826,535826,40282,CZ0311,3034,CZ031
3,000043,600041,600041,,581291,581291,40703,CZ0641,3115,CZ064
4,000051,600059,600059,,547786,547786,40584,CZ0523,3085,CZ052
...,...,...,...,...,...,...,...,...,...,...
22500,391883,991881,991881,,592935,592935,40754,CZ0646,3115,CZ064
22501,391891,991899,991899,,592935,592935,40754,CZ0646,3115,CZ064
22502,391905,991902,991902,,592935,592935,40754,CZ0646,3115,CZ064
22503,392316,990833,990833,,555177,555177,40436,CZ0412,3051,CZ041


In [6]:
# Alternative input kept for reference: the 2013 ZSJ layer (the 2016 layer is the one used).
# estates = gpd.read_file('/data/uscuni-ulce/additional_data/housing_estates_locations/ZSJ_2013.shp')
# estates[['KOD_ZSJ', 'KOD_UTJ', 'KOD_KU']]

In [7]:
# Integer copy of the ZSJ code, used as the join key against the spreadsheet
# (the shapefile stores the code as zero-padded text, e.g. "000019").
estates["merge"] = estates["KOD_ZSJ"].astype("int64")

In [10]:
# Only the join key and the two housing-estate flag columns are needed from
# the spreadsheet.
he_flags = xcl[["BSU CODE6_2014", "SOC HE", "SOC&POSTSOC HE"]]

# Default (inner) join: polygons without a matching spreadsheet row are dropped.
merged = pd.merge(
    left=estates,
    right=he_flags,
    left_on="merge",
    right_on="BSU CODE6_2014",
)

In [11]:
# Size of the subset flagged "yes" in the "SOC HE" column.
merged.loc[merged["SOC HE"].eq("yes")].shape

(428, 28)

In [12]:
# Keep only the units flagged "yes" in the "SOC&POSTSOC HE" column.
merged = merged.loc[merged["SOC&POSTSOC HE"].eq("yes")]

In [14]:
# Reproject the selected housing-estate polygons to EPSG:3035.
cz_soc_housing = merged.to_crs("EPSG:3035")

### Process all buildings, in all regions that have socialist housing

In [None]:
from libpysal.graph import Graph

# Root data folder; the region hulls are resolved relative to it.
regions_datadir = "/data/uscuni-ulce/"
# Folder the pre-processed per-region buildings are written to.
buildings_dir = "/data/uscuni-ulce/processed_data/buildings/"
# Folder the per-region buildings are read from.
# NOTE: this is the same folder as `buildings_dir`, so the processing loop
# overwrites its own inputs in place.
simplfied_buildings_dir = "/data/uscuni-ulce/processed_data/buildings/"

In [16]:
# Region outlines (presumably one hull polygon per region -- confirm against
# the file), indexed by region id.
hulls_path = regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
region_hulls = gpd.read_parquet(hulls_path)

# `sindex.query` returns (input positions, tree positions); the first array
# therefore holds positions into `region_hulls` of every hull that intersects
# at least one housing-estate polygon.
inp, _ = cz_soc_housing.sindex.query(region_hulls.geometry, predicate="intersects")

# A hull can hit several estates, so de-duplicate before selecting.
czech_regions = region_hulls.iloc[np.unique(inp)]

In [18]:
%%time
# Size thresholds a footprint must exceed to count as a potential
# housing-estate building. Units follow the buildings' CRS (presumably metres
# and square metres, as the estates are reprojected to EPSG:3035 -- TODO confirm).
min_area = 180
min_perimeter = 55
# Distance used both for the fuzzy contiguity test and for growing the
# footprints before they are dissolved, so that near-touching parts merge.
touch_buffer = 1e-3

# For every region that intersects a housing estate: merge adjoining large
# footprints inside the estates into single polygons and write the region's
# buildings back to disk.
for region_id in czech_regions.index:
    # read region building polygons
    buildings = gpd.read_parquet(
        simplfied_buildings_dir + f"buildings_{region_id}.parquet"
    )
    # helper columns for the size filter; dropped again before saving
    buildings["area"] = buildings.area
    buildings["perimeter"] = buildings.length

    # mark buildings that intersect a housing-estate polygon; the second
    # array returned by `sindex.query` holds positions into `buildings`
    _, blg_ix = buildings.sindex.query(cz_soc_housing.geometry, predicate="intersects")
    relevant = pd.Series(False, index=buildings.index, dtype=bool)
    relevant.iloc[np.unique(blg_ix)] = True

    # create a contiguity graph for the buildings in housing-estate polygons
    contig = Graph.build_fuzzy_contiguity(buildings[relevant], buffer=touch_buffer)

    # get potential housing-estate buildings: large footprints inside estates
    mask = (
        (buildings["area"] > min_area)
        & (buildings["perimeter"] > min_perimeter)
        & relevant
    )
    masked = buildings[mask].copy()

    # merge potential housing-estate buildings, based on buffered graph
    # contiguity: each connected component becomes one dissolved geometry,
    # and `explode` splits any multi-part result back into single polygons
    masked.geometry = masked.buffer(touch_buffer)
    subgraph = contig.subgraph(masked.index)
    # Deliberately NOT named `merged`: that name holds the estates/spreadsheet
    # join created earlier in the notebook, and rebinding it here would break
    # re-running those cells after this loop.
    merged_buildings = masked.dissolve(subgraph.component_labels).explode()

    # add the rest of the data to the merged subset and save
    fixed = pd.concat(
        [buildings[~relevant], merged_buildings, buildings[(relevant) & (~mask)]],
        ignore_index=True,
    )
    # sanity check on the freshly generated index
    assert not fixed.index.duplicated().any()
    fixed.drop(columns=["area", "perimeter"]).to_parquet(
        buildings_dir + f"buildings_{region_id}.parquet"
    )

CPU times: user 14.1 s, sys: 2.13 s, total: 16.2 s
Wall time: 16 s
