In [9]:
import geopandas as gpd
import pandas as pd
from scipy.spatial import KDTree
from sklearn.preprocessing import QuantileTransformer
from sklearn.neighbors import KNeighborsClassifier
from libpysal.graph import read_parquet

In [10]:
# Root folders of the processed inputs/outputs used by every cell below.
regions_datadir = "/data/uscuni-ulce/"
clusters_dir = '/data/uscuni-ulce/processed_data/clusters/'
chars_dir = '/data/uscuni-ulce/processed_data/chars/'
graph_dir = '/data/uscuni-ulce/processed_data/neigh_graphs/'

# Region hulls; the index of this frame supplies the region ids that the
# batch loop iterates over.
region_hulls = gpd.read_parquet(f"{regions_datadir}regions/cadastre_regions_hull.parquet")

In [78]:
# Region id for running cells on a single region (presumably for interactive testing).
# NOTE: the batch loop further down rebinds `region_id` on every iteration.
region_id = 69333

In [89]:
def assign_noise(region_id, n_neighbors=15):
    """Reassign the noise cells of one region to existing final clusters.

    Rows whose ``final`` label is ``-1`` are treated as noise. Noise rows are
    grouped into the connected components of the region's tessellation graph
    restricted to noise rows, each group is summarised by the median of its
    characters, and a k-nearest-neighbours classifier fitted on the
    per-morphotope medians (labelled with the morphotope's final cluster)
    predicts a final cluster for every noise group. Group medians are used
    instead of individual tessellation cells because classifying every cell
    takes too much time.

    Parameters
    ----------
    region_id : int
        Region identifier interpolated into the cluster, character and graph
        file names under ``clusters_dir``, ``chars_dir`` and ``graph_dir``.
    n_neighbors : int, default 15
        Number of neighbours used by the classifier. It is capped at the
        number of morphotopes in the region, because scikit-learn refuses to
        query more neighbours than there are fitted samples.

    Returns
    -------
    pandas.Series
        The ``final`` column in which every row labelled by the noise
        subgraph's components carries its predicted cluster. All other rows
        are unchanged. If the region has no noise the column is returned
        as read.

    Raises
    ------
    ValueError
        If every row of the region is noise, so there is nothing to learn from.
    """

    # read cluster data and characters
    clusters = pd.read_parquet(f'{clusters_dir}{region_id}_clusters.pq', columns=['final', 'morph'])
    chars = pd.read_parquet(f'{chars_dir}primary_chars_{region_id}.parquet')
    chars = chars[chars.index >= 0]
    # check the length first: comparing indices of different length raises an
    # unrelated "Lengths must match" error instead of a clear assertion failure
    assert len(clusters) == len(chars) and (clusters.index == chars.index).all(), (
        f'cluster and character indices differ for region {region_id}'
    )

    is_noise = clusters['final'].values == -1
    if not is_noise.any():
        # nothing to reassign; the classifier cannot predict on an empty query
        return clusters.final
    if is_noise.all():
        raise ValueError(f'region {region_id} has only noise; no clusters to assign noise to')

    # get morphotope and noise group values
    morph_medians = chars[~is_noise].groupby(clusters[~is_noise].morph.values).median()
    tq1 = read_parquet(graph_dir + f"tessellation_graph_{region_id}.parquet")
    noise_graph = tq1.subgraph(clusters[is_noise].index)
    bid_to_ngroup = noise_graph.component_labels
    noise_groups = chars[is_noise].groupby(bid_to_ngroup).median()

    ## aggregate morphotope medians and noise groups in the same dataframe and normalise
    ## cannot use individual tess cells as it takes too much time
    all_data = pd.concat((morph_medians, noise_groups))
    scalar = QuantileTransformer(subsample=None, output_distribution='uniform')
    vals = scalar.fit_transform(all_data)
    all_data = pd.DataFrame(vals, index=all_data.index, columns=all_data.columns).fillna(0)

    # morph -> final dictionary for the final cluster assignment
    morph_to_final = pd.Series(clusters.loc[~is_noise, 'final'].values,
                               clusters.loc[~is_noise, 'morph'].values)
    morph_to_final = morph_to_final[~morph_to_final.index.duplicated()]

    # setup tree data; split by position so that a noise-component label that
    # happens to equal a morphotope label cannot pull in rows from the other part
    n_morph = len(morph_medians)
    tree_data = all_data.iloc[:n_morph]
    query_data = all_data.iloc[n_morph:]

    ## query a knn classifier and assign predictions
    tree = KNeighborsClassifier(n_neighbors=min(n_neighbors, n_morph), n_jobs=-1)
    tree = tree.fit(X=tree_data.values, y=morph_to_final.loc[tree_data.index].values)
    predictions = tree.predict(query_data.values)

    # map noise group predictions back to the individual row ids
    bid_to_prediction = bid_to_ngroup.map(pd.Series(predictions, query_data.index).to_dict())
    clusters.loc[bid_to_prediction.index.values, 'final'] = bid_to_prediction.values
    return clusters.final

In [90]:
%%time
for region_id, _ in region_hulls.iterrows():
    print(region_id)
    final_without_noise = assign_noise(region_id)
    clusters = gpd.read_parquet(f'{clusters_dir}{region_id}_clusters.pq')
    # assignment hasnt changed existing clusters
    pd.testing.assert_series_equal(final_without_noise[clusters.final != -1], clusters.final[clusters.final != -1])
    clusters['final_without_noise'] = final_without_noise
    clusters.to_parquet(f'{clusters_dir}{region_id}_clusters.pq')

4
10
132




134
286




313




400
523




765
801
832
913




960




1124




1154
1387




1478




1515
1605
1718




1736
1782




1970
1981




2096
2322
2350




2478
2514




2625




2728




2975




3039




3109
3150
3221




3250
3526




3610




3612




3701




3705
3752
3759
3981




4070




4214
4215




4235
4284
4356




4382




4723




4805
5096




5191
5246




5310
5408




5427
5662
5671




5766
5883




6254
6529




6560
6576
6741




6749




6811




6873
6996




7068




7094
7280
7485
7528




7534
7681




7688
7712
7727
7805




7914
7937




7963
8046
8216




8238




8256
8265




8345
8374




8396




8592




8707
8731




8757




8759




8813
9016




9064
9074




9150




9169




9194
9284
9824




9924




9954




9972
10019
10086




10095
10124




10179
10222




10263




10277




10455




10510




10511
10563




10579
10602




10666
10794




10847
10908




10926




10970
11002




11019
11057
11141




11210
11256




11261




11305
11309




11311
11318




11367
11444




11455
11471




11667




11678
11735




11757




11799
11877
11905
12027




12084
12100
12115
12154




12191
12381




12440




12483
12552




12667




12707
12755




12756




12844
12919




12965




13076
13137




13172
13191




13196
13229




13301




13395




13442
13482




13506
13553




13555
13614




13616
13655
13677




13779
13810




13881
13897




13965
14084




14245




14319
14321




14327




14364
14383




14550
14552




14623
14679




14702
14735




14789
14933




15019




15139
15265




15331
15347




15484
15509




15545
15555
15613




15626




15653




15690




15716
15794




15802
15933
15970




16217
16242
16256




16291




16318
16429
16509




16510
16566
16632




16685
16688




16745




16921
17171
17268




17389
17393




17458
17677




17690
17959




18008




18164
18454




18489
18516




18557
18893




19124
19151
19244




19254




19376




19386
19393
19542




19878
19940
20123




20149




20243




20496




20554
20783
20968




20970
21174




21304




21484
21523




21569
21591
21894
21904
22040
22105




22345




22602
22704




22884
22912




22976
23258




23337




23401
23631
23642




23661




23812




23984
24116




24232
24257
24276
24389




24402
24596




24735
24743
25202




25361
25491




25532
25656




25697




25765
25934
26072
26265




26300
26315
26429




26435




26468
26780




26888




27053
27166
27178
27297
27411




27578




27587




27717
27773
27926
28040
28235




28307




28566




28835




29096
29215
29249




29278
29344




29374
29694
29729




29787




29800
29984




30124
30213




30523
30585
30615
30640
30729




30754




30775
30841
31212
31237
31287




31291
31298




31736
32023




32045




32108
32261
32386




32427
32503




32593
32685




32793




32970
33122
33287
33427
33492
33528




33692
33722
34053




34152
34266




34436




34522




34767




34839




34900
34978




35055
35141
35180




35211




35246




35368
35540




35679




35755




35812




36012
36122




36330




36580




36689




36704




36752
36842




36953
37105




37163




37236
37434




37496




37628




37635




37698
37811
38018




38243
38248




38375




38512
38615




38679
38844




38924
38935




39030




39078




39161
39171




39293
39396




39490
39680
39911




39947
40064




40130




40294
40447




40766




40885
40890
40941




41002
41095
41185




41289




41303
41428




41630




42084
42154
42226




42366
42418




42499
42528




42595
43070
43097
43376




43422
43455
43495




43508
43619




43651
43662




43706




43715
43852
44065




44194
44323




44390
44478
44913
44933
44960
45014
45181




45303




45441




45512
45801




45865




45888
46115
46185




46281




46352
46355




46461
46545




47004




47090
47149
47443




47446
47465




47505
47539
47554
47774




47813




47919
48152
48227
48254
48357




48405
48597




48965




49045




49393
49434




49571




49643




49714
50280
50285




50528
50880




50896
50966




51019




51277
51289




51570




51677




52381
52492




53232
53380




53410
53525




53605




53616
53744




54871




55392
55763
55811
55835
56160
56178




56184
56404




56937




57109




57313




57652




57717
57876




57908




58077




58523
58571
58713




58795
59194




59226




59718
59768




60009
60045




60202
60273




60613




60914




61183
61187
61711
61875




62193
62222




62393
62844




62954
63485
63655




63726
63739




63756




64097




64220
64463




65007




65441
65626




66007




66047
66540
66593




66793




67156




67279
67459




67803
68059
68265
68363




69333
69394




70073
70108
70368




70973




71306




71477




71506
71843




72032
72110




72396




72507
73611




73625




73735
74378




74568
75472
75642




76038
76305
76512
77205
77846
78792
78820
78938
79988




80313
80404
82064
82514
82773




83538
83542




84297




84420
84554




84962
85104
85156
85262
86502
86568
86863
86873
87615
87947




88930
88950
89122
89586
89921




89971
90075
90196
90770
91011
91479
91702
92178




93167
93504
93793
95256




96465
96727




96895
97745
97978
98107
98628
98660
98716
98958




99661
99865
99886
100210
100342
100348
101313




101429




101574




101992




102031




102127




102262
102474




102679




102801




102814
102939




103039
103283




103305




103561
103713
103852
103882
103928
104018
104066
104568
105038




105090




105385
105388
105426
105457
105589




106034




106124




106363




106370
106384




106559
107131
107442
108050
108101




108127
108129




108263
108748
108755




108884
109404
109488
109491
109636




109727




109756
110016




110179




110237




110657
110691




110906
110908




111555
111689
111911
112253




112437
112696
112843
112949
113068
113301




114014
114180
114311




114591




114680




114822
115151
115595




115950
116316
116523




116801




116866
117182




117638
117720
118096




118747
119562




119609
120057
120193
120545
120961
120982




121071
121143




123228
123690
123904
123941




124160




124904
125016
125141
125667
126588




127101
127183




127626
128286
129032




129104
129395
129730
130341




130658
131130
131395




131786




131824
132225




133057
133100




133948




134023
134189




134454
134755
134905
135687
136169
136533




137469




137537
137636
137863
138804
139096
139621
139674
139764
140420
141272
141366
142100
143701
145906
146285
147112
147634
148018
148085
149997
150044
151676
152081
152511
CPU times: user 10min 21s, sys: 1min 40s, total: 12min 1s
Wall time: 9min 6s


In [70]:
# %%time
# NOTE(review): disabled alternative to the KNN classifier used in `assign_noise` --
# a single nearest-neighbour lookup with scipy's KDTree. It refers to `tree_data`,
# `query_data`, `morph_to_final` and `is_noise`, which in the notebook as saved exist
# only as locals of `assign_noise`, so it cannot run as a standalone cell as written.
# tree = KDTree(tree_data.values)
# _, idxs = tree.query(query_data.values, k=1, workers=-1)

# clusters.loc[is_noise, 'final'] =  morph_to_final.loc[tree_data.index[idxs]].values

### Plotting

In [91]:
from core.cluster_validation import get_color

# Cluster labels read from column 0 of the shared clusters table.
final_clusters = pd.read_parquet(f'{clusters_dir}clusters_v2.pq')[0]

# One colour row per cluster label, indexed by the label.
# De-duplicate on the label (index) and not on the colour values: dropping
# duplicated colour rows would remove a label whenever two clusters share a
# colour, and the later `final_colors.loc[...]` lookup would raise a KeyError.
final_colors = pd.DataFrame(get_color(final_clusters.values), final_clusters.values)
final_colors = final_colors[~final_colors.index.duplicated()]

# the noise label (-1) gets [255, 255, 255]
final_colors.loc[-1] = [255,255,255]

In [92]:
# Region to plot. This overrides whatever value the batch loop left in `region_id`.
# region_id = 69333 
region_id = 151676

In [93]:
# Load the selected region's cluster table; the colouring cell at the end of the
# notebook reads the `final_without_noise` column that the batch loop writes to it.
etcs = gpd.read_parquet(f'{clusters_dir}{region_id}_clusters.pq')

In [94]:
# Simplify with tolerance 1 (in the units of the source CRS), reproject to
# EPSG:4326 and repair any geometry invalidated along the way.
# NOTE: this cell rewrites `etcs` in place, so re-running it without reloading
# `etcs` simplifies the already reprojected geometries again.
etcs['geometry'] = (
    etcs.simplify(1)
    .to_crs(epsg=4326)
    .make_valid()
)
# keep only rows whose geometry is a single Polygon
etcs = etcs.loc[etcs['geometry'].geom_type == 'Polygon']


In [95]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
layer = lonboard.SolidPolygonLayer.from_geopandas(etcs, opacity=.7)

CPU times: user 1.24 s, sys: 156 ms, total: 1.4 s
Wall time: 1.4 s


In [96]:
from sidecar import Sidecar

# Render the map inside a Sidecar output instead of the cell's own output area.
sc = Sidecar(title='Final Clusters with noise')
m = lonboard.Map(
    layer,
    basemap_style=lonboard.basemap.CartoBasemap.DarkMatter,
)
with sc:
    display(m)

In [97]:
from core.cluster_validation import get_color

# Colour every polygon by its noise-free final cluster: look up one colour row
# per cell and hand the result to the layer as a uint8 array.
cell_colors = final_colors.loc[etcs.final_without_noise]
layer.get_fill_color = cell_colors.values.astype('uint8')