In [6]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Root of the dataset. Every other location below is derived from it, so
# relocating the data only requires changing this one line.
regions_datadir = "/data/uscuni-ulce/"
data_dir = regions_datadir + "processed_data/"
# Everything currently found under eubucco_raw/ (glob returns paths in
# arbitrary, filesystem-dependent order; an empty list if nothing matches).
eubucco_files = glob.glob(regions_datadir + "eubucco_raw/*")
graph_dir = data_dir + "neigh_graphs/"
chars_dir = data_dir + "chars/"

In [63]:
# Focus areas used by ``generate_enc_groups``:
#   key   -> index label of an enclosure (looked up with ``enclosures.loc``)
#   value -> label given to the tessellation cells around that enclosure
# Labels are matched to the buffered enclosures by position in this dict.
# NOTE(review): "mala strani" is probably meant to be "mala strana"; it is left
# unchanged because it is a runtime label that other cells may rely on — confirm.
focus_areas = {
    3103: "karlin",
    13295: "vinohrady",
    909: "mala strani",
    4429: "holyne",
    4406: "housing estate",
    2265: "stare mesto",
    1544: "nusle",
    18215: "malesice",
}


def generate_enc_groups(
    tessellation, enclosures, include_random_sample=False, random_sample_size=1000
):
    """Label the tessellation cells that lie around the focus-area enclosures.

    Every enclosure listed in the module-level ``focus_areas`` mapping is
    buffered by 500 units of the layer's CRS (presumably metres — confirm) and
    each tessellation cell intersecting a buffer receives that focus area's
    name as its label.

    Parameters
    ----------
    tessellation : geopandas.GeoDataFrame
        Tessellation cells; its spatial index is queried with the buffers.
    enclosures : geopandas.GeoDataFrame
        Enclosures; every key of ``focus_areas`` must be one of its index
        labels.
    include_random_sample : bool, default False
        If True, additionally draw ``random_sample_size`` cells that are not
        part of any focus area and label them ``"random"``.
    random_sample_size : int, default 1000
        Size of the random sample. The draw uses ``random_state=1`` so it is
        reproducible; pandas raises ``ValueError`` if fewer cells are available.

    Returns
    -------
    pandas.Series
        Group label per cell, indexed by tessellation index label, with each
        cell appearing exactly once.
    """
    buffers = enclosures.loc[list(focus_areas)].buffer(500)
    # ``sindex.query`` reports positions, not labels: ``areas`` are positions
    # within ``buffers`` (same order as ``focus_areas``) and ``tids`` are row
    # positions within ``tessellation``.
    group_names = dict(enumerate(focus_areas.values()))
    areas, tids = tessellation.sindex.query(buffers, predicate="intersects")
    tess_groups = pd.Series(areas, index=tessellation.index[tids]).map(group_names)

    if include_random_sample:
        # Exclude the focus-area cells by *label*. This must only depend on
        # values computed inside this function, otherwise the notebook cannot
        # run top to bottom on a fresh kernel.
        outside_focus = ~tessellation.index.isin(tess_groups.index)
        random_sample_index = (
            tessellation[outside_focus]
            .sample(random_sample_size, random_state=1)
            .index
        )
        random_sample = pd.Series("random", index=random_sample_index)
        tess_groups = pd.concat((tess_groups, random_sample))

    # A cell intersecting several buffers shows up once per buffer; keep only
    # its first label. (Selecting by the unique labels instead would return
    # every matching row again and leave the duplicates in place.)
    return tess_groups[~tess_groups.index.duplicated(keep="first")]


def get_tess_groups_original_ilocs(tessellation, tess_groups):
    """Return the row positions in ``tessellation`` of the labelled cells.

    Parameters
    ----------
    tessellation : pandas.DataFrame
        Frame whose index labels are translated into positions.
    tess_groups : pandas.Series
        Its index holds the labels to look up, in the order wanted.

    Returns
    -------
    numpy.ndarray
        One position (usable with ``.iloc``) per entry of ``tess_groups``.
    """
    row_numbers = np.arange(0, len(tessellation))
    # label -> position lookup table
    position_by_label = pd.Series(row_numbers, index=tessellation.index)
    return position_by_label.loc[tess_groups.index].values

In [64]:
def print_dendro(tess_groups, X_train):
    """Draw a single-linkage dendrogram of the per-group mean feature vectors.

    Parameters
    ----------
    tess_groups : pandas.Series
        Group label per observation; its index selects rows of ``X_train``.
    X_train : pandas.DataFrame
        Feature table addressed with the index labels of ``tess_groups``.

    Returns
    -------
    Whatever ``dendrogram`` returns for the plotted tree.

    Notes
    -----
    ``AgglomerativeClustering``, ``get_linkage_matrix`` and ``dendrogram`` are
    not imported in the visible part of this notebook; they must already be in
    scope when this runs (presumably scikit-learn, a local helper and SciPy
    respectively — confirm).
    """

    def _mean_features(members):
        # column-wise mean of the feature rows belonging to one group
        return X_train.loc[members.index.values].mean()

    group_means = tess_groups.groupby(tess_groups).apply(_mean_features)
    # Move index level 1 into the columns so that each group becomes one row.
    group_means = group_means.to_frame().unstack(level=1)

    fitted = AgglomerativeClustering(
        linkage="single", compute_full_tree=True, compute_distances=True
    ).fit(group_means)
    linkage_matrix = get_linkage_matrix(fitted)

    _, ax = plt.subplots(figsize=(10, 10))
    # Plot the corresponding dendrogram, one leaf per group.
    return dendrogram(linkage_matrix, ax=ax, labels=group_means.index.values)

In [65]:
# Region to analyse; selects the chars_{region_id}.parquet files read from chars_dir.
region_id = 69300

In [66]:
# Load the tessellation and enclosure layers of the selected region.
tessellation_path = chars_dir + f"tessellations/chars_{region_id}.parquet"
enclosures_path = chars_dir + f"enclosures/chars_{region_id}.parquet"
tessellation = gpd.read_parquet(tessellation_path)
enclosures = gpd.read_parquet(enclosures_path)

In [67]:
# Label the cells around every focus area, plus a random sample of other cells.
tess_groups = generate_enc_groups(
    tessellation=tessellation,
    enclosures=enclosures,
    include_random_sample=True,
)
# Row positions of those labelled cells inside ``tessellation``.
tess_groups_ilocs = get_tess_groups_original_ilocs(
    tessellation=tessellation,
    tess_groups=tess_groups,
)

In [69]:
# Pair every group label with the geometry of its cell and show the result on
# an interactive map, coloured by group.
neighbourhood_labels = tess_groups.rename("neighbourhood").to_frame()
group_geometry = tessellation.iloc[tess_groups_ilocs].geometry
plotting = gpd.GeoDataFrame(neighbourhood_labels, geometry=group_geometry)
plotting.explore(column="neighbourhood", categorical=True)