In [1]:
%%time

import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler

from sklearn.neighbors import KDTree

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import davies_bouldin_score
from core.cluster_validation import get_linkage_matrix

from sklearn.neighbors import KDTree
import shapely
from core.cluster_validation import generate_detailed_clusters
from core.generate_context import spatially_weighted_partial_lag
from core.generate_clusters import cluster_data

CPU times: user 12 s, sys: 625 ms, total: 12.7 s
Wall time: 10.4 s


In [21]:


# Folders holding the processed artefacts, one per artefact type.
# Every path ends with a slash because file names are appended to it by plain
# string concatenation elsewhere in the notebook.
cluster_dir = "/data/uscuni-ulce/processed_data/clusters/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
tessellations_dir = "/data/uscuni-ulce/processed_data/tessellations/"


In [2]:
# Parquet file with the validation morphotopes; it is read into `morphotopes`
# and also handed to `generate_detailed_clusters` as `path`.
val_path = "../data/prague_validation/morphotopes.pq"
# Identifier interpolated into the per-region input file names.
region_id = 69333

In [3]:
# region_id = 69333
# val_path = '../data/prague_validation/morphotopes.pq'

In [5]:
# morphotopes = gpd.read_parquet(val_path)
# morphotopes.explore()

In [39]:
# Characters table of the selected region (`primary_chars_<region_id>.parquet`).
data = pd.read_parquet(f"{chars_dir}primary_chars_{region_id}.parquet")
# Tessellation graph of the same region, loaded with libpysal's graph reader.
graph = read_parquet(f"{graph_dir}tessellation_graph_{region_id}_knn1.parquet")
# Tessellation of the same region as a GeoDataFrame.
tessellation = gpd.read_parquet(f"{tessellations_dir}tessellation_{region_id}.parquet")
# Validation morphotopes stored at `val_path`.
morphotopes = gpd.read_parquet(val_path)

In [109]:
# Columns to discard: those whose name starts with neither "s" nor "m".
to_drop = data.columns[
    ~data.columns.str.startswith('s') & ~data.columns.str.startswith('m')
]


In [110]:
# Keep only the columns that were not selected for removal in `to_drop`.
data = data.drop(columns=to_drop)

In [111]:


# Group labels for tessellation cells, produced by `generate_detailed_clusters`
# from `tessellation` and the validation file at `val_path`
# (presumably one label per labelled cell, indexed by cell id — TODO confirm).
tess_groups = generate_detailed_clusters(
    tessellation, include_random_sample=False, path=val_path
)

# Keep only the labelled cells that also have a row in `data`.
tess_groups = tess_groups[tess_groups.index.isin(data.index)]

# Row positions of the labelled cells inside `data` itself.
# The positions must be taken against the full `data` index because they are
# consumed with `data.iloc[...]`; numbering them against the filtered frame
# `data[data.index >= 0]` shifts every position whenever `data` contains
# negative-index rows that are not all at the end, silently pairing features
# with the wrong labels.
# NOTE: `get_indexer` requires `data.index` to be unique.
tess_groups_ilocs = data.index.get_indexer(tess_groups.index)

In [112]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [113]:
# Split the labelled cells into training and test parts (15% held out, fixed
# seed). Features are passed as a plain array and labels are cast to strings.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[tess_groups_ilocs].values,
    tess_groups.values.astype('str'),
    test_size=0.15,
    random_state=42,
)

In [114]:
# Random forest with 100 trees, all available cores, a fixed seed and progress
# logging; `fit` returns the estimator itself, so construction and training
# can be chained.
clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
    verbose=1,
).fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.0s finished


In [115]:
# Feature importances of the fitted forest, labelled with the column names of
# `data` and ordered from most to least important.
importances = pd.Series(
    data=clf.feature_importances_,
    index=data.columns,
).sort_values(ascending=False)

In [116]:
# Total importance carried by the 50 highest-ranked features.
importances.head(50).sum()

np.float64(1.0000000000000002)

In [117]:
# Names of the ten lowest-ranked features.
importances.tail(10).index

Index(['sdbPer', 'ssbCCD', 'mtbNDi', 'sssLin', 'sdcLAL', 'sdcAre', 'mtbSWR',
       'ssbCor', 'mtdDeg', 'sdbCoA'],
      dtype='object')

In [118]:
# Predicted labels for the held-out test rows.
y_pred = clf.predict(X=X_test)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.0s finished


In [119]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.2289938694554634

In [85]:
from sklearn.metrics import confusion_matrix

# Labels that occur in the test split; they are strings, so the order is
# lexicographic ('1', '10', '11', ...).
class_labels = np.unique(y_test)

# Confusion matrix normalised over the true labels (each row sums to 1 for a
# class that occurs in `y_test`); rows are true labels, columns predictions.
cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=class_labels, normalize='true'),
    index=class_labels,
    columns=class_labels,
)
cm

Unnamed: 0,0,1,10,11,12,13,134,135,136,14,...,60,61,62,63,64,65,66,7,8,9
0,0.241379,0.379310,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.137931,0.000000
1,0.155556,0.377778,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.022222,0.044444,0.000000
10,0.000000,0.000000,0.292683,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.02439,0.024390,0.0,0.0,0.000000,0.024390,0.219512,0.000000
11,0.000000,0.000000,0.000000,0.0,0.0,0.500000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
12,0.000000,0.000000,0.000000,0.0,0.0,0.333333,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,0.000000,0.111111,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.222222,0.0,0.0,0.000000,0.000000,0.111111,0.000000
66,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.235294,0.000000,0.000000,0.000000
7,0.096774,0.096774,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.032258,0.096774,0.000000
8,0.000000,0.003984,0.003984,0.0,0.0,0.003984,0.0,0.003984,0.0,0.003984,...,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000,0.756972,0.011952


In [96]:
# Diagonal of the row-normalised confusion matrix, i.e. the fraction of each
# true class that was predicted correctly, sorted from best to worst.
(
    pd.Series(np.diag(cm.to_numpy()), index=cm.index)
    .sort_values(ascending=False)
)

8     0.756972
30    0.745763
48    0.662921
47    0.641509
16    0.634409
        ...   
57    0.000000
61    0.000000
6     0.000000
65    0.000000
64    0.000000
Length: 70, dtype: float64

Unnamed: 0,sdbAre,sdbPer,sdbCoA,ssbCCo,ssbCor,ssbSqu,ssbERI,ssbElo,ssbCCM,ssbCCD,...,ldkAre,ldkPer,lskCCo,lskERI,lskCWA,ltkOri,ltkWNB,likWBB,sdsAre,likWCe
112922,98.330912,41.958963,0.0,0.597355,6.0,3.236935,0.946786,0.894713,6.207839,1.436191,...,55184.422328,2076.601181,0.136828,0.560951,765.675800,16.183867,0.005297,0.153451,3106.896652,0.000761
215489,50.192584,28.354110,0.0,0.628496,4.0,0.619829,1.000030,0.934484,5.014837,0.027521,...,27554.365195,820.126495,0.306718,0.981497,208.935221,19.978257,0.006097,0.162984,24117.802275,0.001815
112428,335.436251,76.171506,0.0,0.640113,16.0,0.403953,0.963921,0.874906,10.392091,1.642186,...,9747.865783,469.016904,0.308600,0.989350,113.153347,5.148761,0.017057,0.311613,1986.171475,0.001026
111794,240.373348,62.063629,0.0,0.633944,4.0,0.078347,0.999991,0.924966,10.979823,0.007452,...,196250.454847,2320.835983,0.680830,0.765930,418.180240,27.850936,0.000862,0.019754,891671.172173,0.000046
245114,30.886160,23.242124,0.0,0.530133,4.0,0.844593,1.000925,0.544701,4.279799,0.027225,...,43666.045737,999.260550,0.592365,0.838874,175.712331,13.274723,0.006004,0.061003,30909.383342,0.000641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112379,70.767166,35.660715,0.0,0.509823,4.0,0.191616,0.999716,0.503381,6.640520,0.009360,...,53698.798566,1355.197724,0.549791,0.683976,284.027405,25.298632,0.008855,0.191229,10393.949767,0.000968
228267,31.672473,23.435376,0.0,0.586631,6.0,0.594134,0.965843,0.811014,3.521696,0.797428,...,72126.340676,1317.989028,0.415859,0.815277,286.137531,24.918189,0.004552,0.128428,26475.580362,0.000693
111602,120.002611,44.424525,0.0,0.598824,4.0,0.541029,0.999067,0.725590,7.958659,0.031642,...,22489.589498,976.589190,0.220071,0.696423,323.181525,1.452272,0.007168,0.173404,21826.365397,0.001112
112329,108.372635,47.773062,0.0,0.484267,8.0,1.483944,0.909659,0.555106,6.758382,1.817265,...,26653.911990,1002.552841,0.292756,0.705147,289.042938,10.815165,0.010972,0.279320,2347.931723,0.001426


In [105]:
# Predictions for every labelled cell of group 6. This selection is taken from
# the full labelled set, so it includes rows the classifier was trained on.
# `.values` passes a plain array, matching how the classifier was fitted; a
# DataFrame here would make scikit-learn warn about unexpected feature names.
clf.predict(data.iloc[tess_groups_ilocs[tess_groups == 6]].values)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.0s finished


array(['6', '6', '6', '6', '6', '6', '6', '6', '6', '3', '8', '6', '6',
       '6', '6', '9', '3', '6', '6', '6', '6', '6', '6', '6', '6', '6',
       '6', '3', '8', '6', '6', '6', '3', '2', '6', '6', '6', '6', '6',
       '6', '6', '6', '6', '6', '6', '6', '2', '6', '6', '6', '34', '6',
       '6', '6', '6', '6', '6', '6', '53', '6', '6', '3', '6', '6', '6',
       '6', '6', '6', '6', '6', '6', '6', '6', '5', '6', '6', '6', '6',
       '6', '6'], dtype='<U21')

In [106]:
# Predictions for the held-out rows whose true label is '6' only.
clf.predict(X_test[np.flatnonzero(y_test == '6')])

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    0.0s finished


array(['2', '2', '53', '3', '8', '34', '3', '5', '3', '9', '3', '8', '3'],
      dtype='<U21')

In [107]:
# morphotopes.loc[[57, 61, 6, 65, 64]].explore()