# Assign morphotopes to the tree

Assign morphotopes (and noise groups) from the extension countries to level 7 of the existing tree, so that the taxonomy stays consistent.

In [2]:
import geopandas as gpd
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from libpysal.graph import read_parquet
import numpy as np

In [7]:
# Version tag of the existing tree: used in the name of the cdata file that is
# read and in the names of the cluster files written by the final loop.
v = "v10"
# Version tag of the extension cluster files that are read as input.
v_ext = "v10_ext1"

# NOTE(review): not referenced in any visible cell - confirm it is still needed.
model_params = "_post_processing_v1"

# Directories the cells below read from (clusters_dir is also written to).
clusters_dir = "/data/uscuni-ulce/processed_data/clusters/"
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
regions_datadir = "/data/uscuni-ulce/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"

In [46]:
# Tag of the extension batch; it selects which regions-hull file is loaded.
country = "fr_sp_nl_be"

# The index of this frame is what the final loop iterates over as region ids.
hulls_path = regions_datadir + "regions/" + f"{country}_regions_hull.parquet"
region_hulls = gpd.read_parquet(hulls_path)

In [19]:
# Characters of the existing tree, without the "limLPS" column. These rows are
# what the KNN classifier is fitted on, with the index labels as targets.
cdata = pd.read_parquet(
    f"/data/uscuni-ulce/processed_data/clusters/cdata_{v}.pq"
).drop("limLPS", axis=1)
# Index labels are cast to str before being stacked with the query rows.
cdata.index = cdata.index.astype(str)

In [8]:
# Single region used for the step-by-step cells below; the final loop
# rebinds region_id for every region.
region_id = 370160 # bcn

In [9]:
# Cluster labels of the walkthrough region: "final" (-1 marks noise) and "morph".
clusters = pd.read_parquet(
    f"{clusters_dir}clusters_{region_id}_{v_ext}.pq", columns=["final", "morph"]
)

# The following cells index and group `chars`, but no top-level cell loads it,
# so a fresh top-to-bottom run stops with a NameError. Load it here exactly the
# way assign_to_tree does: drop the negative index labels, then make sure both
# tables describe the same rows in the same order.
chars = pd.read_parquet(f"{chars_dir}primary_chars_{region_id}.parquet")
chars = chars[chars.index >= 0]
if len(clusters) != len(chars) or not (clusters.index == chars.index).all():
    raise ValueError(
        f"clusters and primary characters of region {region_id} are not aligned"
    )

In [12]:
# [noise] cells labelled -1 are noise. Restrict the tessellation graph to them;
# every connected component of that subgraph becomes one noise group, described
# by the per-column median of its members' characters.
is_noise = clusters["final"].values == -1
tq1 = read_parquet(graph_dir + f"tessellation_graph_{region_id}.parquet")
noise_ids = clusters.index[is_noise]
noise_graph = tq1.subgraph(noise_ids)
components = noise_graph.component_labels
noise_groups = chars.loc[is_noise].groupby(components).median()

In [26]:
# [noise] approximate limAre: per noise group, sort sdbAre ascending and sum the
# last (at most) 30 entries.
noise_sdb_are = chars[is_noise].groupby(components)["sdbAre"]
noise_groups["limAre"] = noise_sdb_are.apply(
    lambda areas: areas.sort_values(ascending=True).tail(30).sum()
)
# keep the same columns, in the same order, as the tree data
noise_groups = noise_groups.loc[:, cdata.columns]
# prefix the component ids so noise rows are recognisable once stacked
noise_groups.index = "-1_" + noise_groups.index.astype(str)

In [14]:
# per-morphotope median of the characters of all non-noise cells
in_morphotope = ~is_noise
morphotope_groups = (
    chars.loc[in_morphotope].groupby(clusters.loc[in_morphotope, "morph"]).median()
)

In [16]:
# morphotope-level characters of this region; only "limAre" is used below
morphotope_chars = pd.read_parquet(
    f"/data/uscuni-ulce/processed_data/morphotopes/morph_chars_{region_id}.pq"
)

In [21]:
# attach limAre from the morphotope-level table; values align on the index
morphotope_groups = morphotope_groups.assign(limAre=morphotope_chars["limAre"])

In [23]:
# keep the same columns, in the same order, as the tree data
morphotope_groups = morphotope_groups.loc[:, cdata.columns]

In [28]:
# Stack tree rows and query rows, then standardise every column over the whole
# stack so all three sets share one scale; remaining NaNs become 0.
all_data = pd.concat([cdata, morphotope_groups, noise_groups])
scalar = StandardScaler()
vals = scalar.fit_transform(all_data)
all_data = pd.DataFrame(vals, index=all_data.index, columns=all_data.columns)
all_data = all_data.fillna(0)

In [29]:
# split the scaled stack back into the training rows and the two query sets
tree_data = all_data.loc[cdata.index, :]
morphotope_query_data = all_data.loc[morphotope_groups.index, :]
noise_query_data = all_data.loc[noise_groups.index, :]

In [30]:
## fit a KNN classifier on the tree rows and predict a tree label per query row
# NOTE(review): the targets are the cdata index labels. If those labels are
# unique, each of the 5 neighbours votes for a different class and the vote is
# always a tie - confirm whether n_neighbors=1 was intended.
tree = KNeighborsClassifier(n_neighbors=5, n_jobs=-1).fit(
    X=tree_data.values, y=cdata.index.values
)
noise_predictions = tree.predict(noise_query_data.values)
morphotope_predictions = tree.predict(morphotope_query_data.values)

In [40]:
# non-noise cells take the tree label predicted for their morphotope
morph_to_label = dict(
    zip(morphotope_groups.index, morphotope_predictions.astype(int))
)
clusters.loc[~is_noise, "final"] = clusters.loc[~is_noise, "morph"].map(
    morph_to_label
)

In [43]:
# noise cells take the tree label predicted for their noise group:
# cell id -> component id -> predicted label
bid_to_ngroup = noise_graph.component_labels
# recover the integer component id from the "-1_<id>" row labels
ngroup_ids = noise_query_data.index.str.split("_").str[-1].astype(int)
ngroup_to_prediction = pd.Series(noise_predictions, ngroup_ids).to_dict()
bid_to_prediction = bid_to_ngroup.map(ngroup_to_prediction)
noise_labels = bid_to_prediction.values.astype(int)
clusters.loc[bid_to_prediction.index.values, "final"] = noise_labels

In [47]:
def assign_to_tree(region_id, cdata):
    """Assign a region's morphotopes and noise groups to labels of the existing tree.

    Per-morphotope and per-noise-group medians of the region's primary
    characters are standardised together with ``cdata`` and classified with a
    5-nearest-neighbour classifier fitted on the ``cdata`` rows, using the
    ``cdata`` index labels as targets.

    Parameters
    ----------
    region_id
        Region identifier; interpolated into the cluster, character, graph and
        morphotope file names.
    cdata : pandas.DataFrame
        Characters of the existing tree. Its columns select the features used
        for the query rows, and its index labels must be convertible to ``int``
        because predictions are cast with ``astype(int)``.

    Returns
    -------
    with_noise : pandas.Series
        ``final`` labels with morphotope cells reassigned and noise cells still
        at ``-1``.
    final : pandas.Series
        ``final`` labels with the noise cells reassigned as well, one label per
        connected noise group.

    Raises
    ------
    ValueError
        If the cluster table and the character table are not indexed
        identically.
    """
    # read cluster data and characters
    clusters = pd.read_parquet(
        f"{clusters_dir}clusters_{region_id}_{v_ext}.pq", columns=["final", "morph"]
    )
    chars = pd.read_parquet(f"{chars_dir}primary_chars_{region_id}.parquet")
    chars = chars[chars.index >= 0]
    # Explicit check instead of `assert`: asserts vanish under `python -O`, and
    # the element-wise comparison alone fails opaquely when the lengths differ.
    if len(clusters) != len(chars) or not (clusters.index == chars.index).all():
        raise ValueError(
            f"clusters and primary characters of region {region_id} are not aligned"
        )

    is_noise = clusters["final"].values == -1
    # A region may lack noise cells (or, in principle, morphotope cells). The
    # classifier rejects a zero-row query, so each branch is skipped when empty.
    has_noise = bool(is_noise.any())
    has_morphotopes = bool((~is_noise).any())

    frames = [cdata]

    # get morphotope data: per-morphotope medians plus the precomputed limAre
    morphotope_groups = None
    if has_morphotopes:
        morphotope_groups = (
            chars[~is_noise].groupby(clusters[~is_noise].morph).median()
        )
        morphotope_chars = pd.read_parquet(
            f"/data/uscuni-ulce/processed_data/morphotopes/morph_chars_{region_id}.pq"
        )
        morphotope_groups["limAre"] = morphotope_chars["limAre"]
        morphotope_groups = morphotope_groups[cdata.columns]
        frames.append(morphotope_groups)

    # [noise] connected components of the noise-only tessellation subgraph
    # become noise groups, described by the median of their members
    noise_groups = None
    components = None
    if has_noise:
        tq1 = read_parquet(graph_dir + f"tessellation_graph_{region_id}.parquet")
        noise_graph = tq1.subgraph(clusters[is_noise].index)
        components = noise_graph.component_labels
        noise_chars = chars[is_noise]
        noise_groups = noise_chars.groupby(components).median()

        # [noise] approximate limAre: sort sdbAre ascending and sum the last
        # (at most) 30 entries of each group
        noise_groups["limAre"] = noise_chars.groupby(components)["sdbAre"].apply(
            lambda x: x.sort_values(ascending=True).tail(30).sum()
        )
        noise_groups = noise_groups[cdata.columns]
        noise_groups.index = "-1_" + noise_groups.index.astype(str)
        frames.append(noise_groups)

    # scale all data together so tree rows and query rows share one scale
    all_data = pd.concat(frames)
    scalar = StandardScaler()
    vals = scalar.fit_transform(all_data)
    all_data = pd.DataFrame(
        vals, index=all_data.index, columns=all_data.columns
    ).fillna(0)

    ## fit a knn classifier on the tree rows
    # NOTE(review): if the cdata index labels are unique, each of the 5
    # neighbours votes for a different class and the vote is always a tie -
    # confirm whether n_neighbors=1 was intended.
    tree_data = all_data.loc[cdata.index]
    tree = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
    tree = tree.fit(X=tree_data.values, y=cdata.index.values)

    # reassign morphotope labels
    if has_morphotopes:
        morphotope_query_data = all_data.loc[morphotope_groups.index]
        morphotope_predictions = tree.predict(morphotope_query_data.values)
        clusters.loc[~is_noise, "final"] = clusters.loc[~is_noise, "morph"].map(
            dict(zip(morphotope_groups.index, morphotope_predictions.astype(int)))
        )

    # snapshot before the noise cells are overwritten: noise is still -1 here
    with_noise = clusters.final.copy()

    # map noise group predictions to building ids
    if has_noise:
        noise_query_data = all_data.loc[noise_groups.index]
        noise_predictions = tree.predict(noise_query_data.values)
        # recover the integer component id from the "-1_<id>" row labels
        group_ids = noise_query_data.index.str.split("_").str[-1].astype(int)
        bid_to_prediction = components.map(
            pd.Series(noise_predictions, group_ids).to_dict()
        )
        clusters.loc[bid_to_prediction.index.values, "final"] = (
            bid_to_prediction.values.astype(int)
        )
    return with_noise, clusters.final

In [48]:
%%time

# Run the assignment for every region in the hull table and store both label
# variants next to the original columns of the extension cluster file.
for region_id in region_hulls.index:
    print(region_id)
    assigned, final_without_noise = assign_to_tree(region_id, cdata)
    source_path = f"{clusters_dir}clusters_{region_id}_{v_ext}.pq"
    clusters = gpd.read_parquet(source_path)
    # "final" keeps noise at -1; "final_without_noise" has noise reassigned
    clusters["final"] = assigned
    clusters["final_without_noise"] = final_without_noise
    clusters.to_parquet(f'{clusters_dir}clusters_{region_id}_{v}.pq')

153563
153635
153755
153834
153898
153980
154160
154261
154301
154668
154781
154825
154847
155055
155692
155767
156194
156735
159559
159963
160012
160108
161416
161879
163553
164369
165181
165522
165773
166316
166620
166701
168449
168799
170327
170919
171713
172066
173379
173453
174264
175198
175843
176215
177591
180371
182608
182694
182811
183183
184133
185549
187044
187088
187345
188184
188199
189070
189311
193570
193662
194114
194162
195237
195609
196156
196995
197335
198390
198667
199158
199198
199613
199734
199742
201437
201939
202313
202944
203557
203563
204078
204563
204814
205080
205193
205495
206251
206297
209659
211241
211470
211894
212400
212608
212645
213303
213830
213859
214149
214451
214589
215767
216439
216614
216811
218036
219213
219744
220257
220628
221871
221989
222742
223627
224494
224803
226135
226480
227396
228332
228384
228624
228738
229277
230340
230485
230737
231998
232001
232082
232177
233264
234118
234448
234540
234630
235191
235439
235505
235900
235968
236010