In [1]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from core.utils import used_keys

In [2]:
from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score

In [3]:
# Region processed in this notebook; the id is interpolated into the per-region file names.
region_id = 4182

# Tessellation, graph and enclosure files are all read from the same directory.
tessellations_dir = '../data/ms_buildings/'
graph_dir = '../data/ms_buildings/'
enclosures_dir = '../data/ms_buildings/'
# Character tables are stored in a sub-directory of that directory.
chars_dir = '../data/ms_buildings/chars/'

In [4]:
# Primary character table of the selected region.
primary = pd.read_parquet(f'{chars_dir}primary_chars_{region_id}.parquet')

In [5]:
# Tessellation geometries of the selected region.
tessellation = gpd.read_parquet(f"{tessellations_dir}tessellation_{region_id}.parquet")

In [6]:
# Feature table used for the classifier. This reads the same parquet file that
# `primary` was loaded from.
X_train = pd.read_parquet(f'{chars_dir}primary_chars_{region_id}.parquet')

### assign tessellations to existing clusters

In [8]:
# Geometries of the existing clusters that the tessellation cells are assigned to.
clusters = gpd.read_parquet(
    '../data/clusters_umap_69333_100_3_gaussian_euclidean_ward_euclidean_50.pq'
)

In [11]:
# Enclosure geometries of the selected region.
enclosures = gpd.read_parquet(f'{enclosures_dir}enclosure_{region_id}.parquet')

In [12]:
# Select the tessellation rows whose index labels appear in the feature table,
# in the feature table's row order.
tessellation_subset = tessellation.loc[X_train.index.to_numpy()]

In [13]:
# Query the spatial index built on one representative point per tessellation cell
# with the cluster geometries. `inp` holds positions into `clusters.geometry`,
# `res` the matching positions into the representative points (and therefore
# into `tessellation_subset`).
inp, res = (
    tessellation_subset.representative_point()
    .geometry.sindex
    .query(clusters.geometry, predicate='intersects')
)

In [14]:
# Number of matches returned by the spatial query above.
inp.shape

(190396,)

In [15]:
# `res` holds row positions, so the feature table and the tessellation (which was
# built from the feature table's index) are reduced to the matched cells in the
# same order.
# NOTE: `tessellation_subset` is rebound here, so this cell is not safe to re-run
# without first re-running the cell that creates it.
X_train_subset = X_train.iloc[res]
tessellation_subset = tessellation_subset.iloc[res]

In [16]:
# Target labels: the matched cluster position of every cell, stored as strings.
# NOTE(review): these are row positions in `clusters`, not values of a label
# column - confirm that each row of `clusters` is one cluster.
y = inp.astype(str)

### predictive model

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Random 85/15 hold-out split of the matched cells.
# NOTE: this rebinds `X_train`, which until this point held the full feature
# table, so the earlier cells that read `X_train` cannot be re-run afterwards.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_subset,
    y,
    test_size=0.15,
    random_state=42,
)

In [19]:
# Random forest evaluated on the random split; uses all cores and logs progress.
clf = RandomForestClassifier(
    n_jobs=-1,
    random_state=0,
    verbose=True,
)

In [20]:
%%time
# Fit the forest on the training part of the random split and time the cell.
clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.5s


CPU times: user 2min 24s, sys: 345 ms, total: 2min 24s
Wall time: 8.15 s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.0s finished


In [21]:
# Mean accuracy on the random 15% hold-out; compare with the score of the
# enclosure-grouped split evaluated further down.
clf.score(X_test, y_test)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.1s finished


0.9484593837535014

In [22]:
from sklearn import model_selection

# Five stratified folds in which all cells sharing an `enclosure_index` value
# are kept together in the same fold.
gkf = model_selection.StratifiedGroupKFold(n_splits=5)
splits = gkf.split(X_train_subset, y, groups=tessellation_subset["enclosure_index"])

# For every row position, record the number of the fold in which that row is
# part of the held-out set.
split_label = np.empty(X_train_subset.shape[0], dtype=float)
for i, (train, test) in enumerate(splits):
    split_label[test] = i

In [23]:
# Fold 0 is held out for evaluation; every other fold is used for training.
test = split_label == 0
train = ~test

# NOTE: rebinds the X_train/X_test/y_train/y_test names of the random split.
X_train, y_train = X_train_subset.loc[train], y[train]
X_test, y_test = X_train_subset.loc[test], y[test]

In [24]:
# Same forest configuration as `clf` (without progress logging), trained on the
# enclosure-grouped training folds.
rf_spatial_cv = RandomForestClassifier(
    n_jobs=-1,
    random_state=0,
)
rf_spatial_cv.fit(X=X_train, y=y_train)

In [25]:
# Mean accuracy on the held-out fold (fold 0) of the enclosure-grouped split.
rf_spatial_cv.score(X_test, y_test)

0.6201246008062405

In [26]:
# Predictions of the random-split model for every matched cell. They are bound
# to their own name because the following cell assigns `new_labels` from the
# enclosure-grouped model; sharing the name would silently discard this result.
new_labels_random_split = clf.predict(X_train_subset)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.6s finished


In [27]:
# Labels predicted by the enclosure-grouped model for every matched cell,
# including the cells it was trained on.
new_labels = rf_spatial_cv.predict(X=X_train_subset)

In [28]:
from lonboard import SolidPolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12
from core.cluster_validation import get_color

In [29]:
# Build the plotting frame as a new object. A plain `plotting = tessellation_subset`
# only aliases the frame, so adding the column afterwards would modify
# `tessellation_subset` in place (and can raise SettingWithCopyWarning because that
# frame is itself an `.iloc` selection). `assign` returns a new frame with the
# predicted labels appended as the "label" column.
plotting = tessellation_subset.assign(label=new_labels)

In [30]:
# Polygon layer of the labelled cells; the fill colour is derived from the label
# converted back to an integer.
layer = SolidPolygonLayer.from_geopandas(
    gdf=plotting[["geometry", "label"]],
    get_fill_color=get_color(plotting["label"].values.astype(int)),
    opacity=0.15,
)



In [32]:
# Interactive map of the labelled cells on the Positron basemap.
m = Map(
    layer,
    basemap_style=CartoBasemap.Positron,
)
m

Map(basemap_style=<CartoBasemap.Positron: 'https://basemaps.cartocdn.com/gl/positron-gl-style/style.json'>, la…