In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import dask.dataframe
import matplotlib.pyplot as plt
import urbangrammar_graphics as ugg

from matplotlib.lines import Line2D
from sklearn.ensemble import RandomForestClassifier

In [3]:
%time standardized_form = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/form/standardized/").set_index('hindex')
%time stand_fn = dask.dataframe.read_parquet("../../urbangrammar_samba/spatial_signatures/clustering_data/function/standardized/")
%time data = dask.dataframe.multi.concat([standardized_form, stand_fn], axis=1).replace([np.inf, -np.inf], np.nan).fillna(0)
%time data = data.drop(columns=["keep_q1", "keep_q2", "keep_q3"])
%time data = data.compute()

CPU times: user 19.2 s, sys: 3.26 s, total: 22.5 s
Wall time: 21.4 s
CPU times: user 72.3 ms, sys: 4.09 ms, total: 76.4 ms
Wall time: 116 ms
CPU times: user 39.6 ms, sys: 7.9 ms, total: 47.5 ms
Wall time: 41.4 ms
CPU times: user 18.9 ms, sys: 0 ns, total: 18.9 ms
Wall time: 18.7 ms
CPU times: user 2min 41s, sys: 1min 27s, total: 4min 8s
Wall time: 2min 45s


In [4]:
# Read the first-level cluster labels and the two second-level label tables.
labels_l1 = pd.read_parquet(
    "../../urbangrammar_samba/spatial_signatures/clustering_data/KMeans10GB.pq"
)
labels_l2_9 = pd.read_parquet(
    "../../urbangrammar_samba/spatial_signatures/clustering_data/clustergram_cl9_labels.pq"
)
labels_l2_2 = pd.read_parquet(
    "../../urbangrammar_samba/spatial_signatures/clustering_data/subclustering_cluster2_k3.pq"
)

# Work on a copy so the first-level labels remain available unchanged.
labels = labels_l1.copy()

# Rows labelled 9 are re-coded as 90 + sub-label. The assignment is positional
# (`.values`), so it relies on the sub-label table having one row per member
# of cluster 9, in the same order.
in_cluster_9 = labels['kmeans10gb'] == 9
labels.loc[in_cluster_9, 'kmeans10gb'] = labels_l2_9['9'].values + 90

# Same for rows labelled 2, re-coded as 20 + sub-label. The selector is
# evaluated after the re-coding above, exactly as in the sequential original.
in_cluster_2 = labels['kmeans10gb'] == 2
labels.loc[in_cluster_2, 'kmeans10gb'] = labels_l2_2['subclustering_cluster2_k3'].values + 20

# Labels treated as outliers; `mask` is True for every row that is NOT one.
outliers = [98, 93, 96, 97]
mask = ~labels['kmeans10gb'].isin(outliers)

## Feature importance per cluster

In [15]:
# Distinct cluster labels present in `labels`, in order of first appearance.
labels['kmeans10gb'].unique()

array([ 4,  0,  6,  1, 21,  7,  3,  5, 90, 20,  8, 22, 98, 92, 94, 91, 95,
       96, 93, 97], dtype=int32)

In [19]:
# Start from an empty frame; later cells add columns to it and write it to parquet.
imps = pd.DataFrame()

In [21]:
# For every non-outlier cluster, fit a one-vs-rest random forest (member of the
# cluster = 1, everything else = 0) on all non-outlier rows, then store the 50
# most important feature names and their importance values as two columns of
# `imps`: `cluster_<id>` and `cluster_<id>_vals`.

# Loop-invariant inputs, built once instead of once per cluster: the feature
# matrix restricted to non-outlier rows and the matching cluster labels.
features = data.loc[mask].values
cluster_ids = labels.loc[mask, 'kmeans10gb']

for cluster in labels.kmeans10gb.unique():
    if cluster in outliers:
        continue

    # Vectorised binary target (replaces a row-by-row Python `apply`).
    cluster_bool = (cluster_ids == cluster).astype(int)

    # Fixed `random_state` keeps the importances reproducible between runs.
    clf = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42, verbose=1)
    clf = clf.fit(features, cluster_bool.values)

    importances = pd.Series(clf.feature_importances_, index=data.columns).sort_values(ascending=False)
    top_importances = importances.head(50)

    imps[f'cluster_{cluster}'] = top_importances.index.values
    imps[f'cluster_{cluster}_vals'] = top_importances.values

# Release the large temporary copies once all models are fitted.
del features, cluster_ids

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 13.6min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 10.4min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  5.8min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.6min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  6.0min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.9min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


In [26]:
# Keep only the feature-name columns (those without 'vals' in their name) and
# display them with the column names sorted alphabetically.
chars = list(filter(lambda name: 'vals' not in name, imps.columns))
imps.loc[:, sorted(chars)]

Unnamed: 0,cluster_0,cluster_1,cluster_20,cluster_21,cluster_22,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_90,cluster_91,cluster_92,cluster_94,cluster_95
0,sicCAR_q1,ltcWRE_q3,ssbCCM_q2,stbCeA_q1,sdbPer_q2,ldsAre_q1,sdcLAL_q1,ssbElo_q1,lcnClo_q3,mdcAre_q2,linPDE_q2,sdbPer_q2,sdbAre_q1,ssbCCD_q2,ssbERI_q2,sicCAR_q2
1,sicCAR_q2,ltcRea_q3,ssbCCM_q3,linP4W_q2,ssbCCM_q2,mdsAre_q1,sicCAR_q2,ssbCCM_q3,lcnClo_q2,sicCAR_q2,lcdMes_q3,ssbERI_q1,"G, I. Distribution, hotels and restaurants_q2",ssbCCM_q2,ssbCCM_q2,C. Manufacturing_q2
2,ltbIBD_q2,ltcAre_q2,sdbAre_q3,stbCeA_q2,sdbPer_q3,sddAre_q2,sicCAR_q1,ssbElo_q2,ldePer_q1,mtdMDi_q2,lcdMes_q2,ssbCCM_q2,"K, L, M, N. Financial, real estate, profession...",ssbSqu_q3,ssbCCD_q2,"R, S, T, U. Other_q2"
3,sdcAre_q2,sdcAre_q2,population_q3,ldeAre_q2,sdbAre_q2,sicCAR_q2,sdcAre_q2,ssbCCo_q1,ldeAre_q2,sicCAR_q1,ssbERI_q1,ssbSqu_q3,"R, S, T, U. Other_q2","K, L, M, N. Financial, real estate, profession...",ssbCor_q2,"G, I. Distribution, hotels and restaurants_q2"
4,sddAre_q2,misCel_q1,sdbPer_q2,stcOri_q2,population_q3,lddNDe_q3,mtcWNe_q3,ssbCCD_q2,lcnClo_q1,mdsAre_q1,ssbCCo_q1,sdbAre_q2,"G, I. Distribution, hotels and restaurants_q3",sdbPer_q2,"K, L, M, N. Financial, real estate, profession...",sicCAR_q1
5,mtdMDi_q2,misCel_q2,sdbAre_q2,ssbERI_q1,ssbSqu_q3,mdsAre_q2,mdsAre_q1,sdbPer_q3,ltcWRE_q3,sscCCo_q2,population_q1,ssbCCD_q3,"K, L, M, N. Financial, real estate, profession...",sdbPer_q3,"G, I. Distribution, hotels and restaurants_q2",C. Manufacturing_q3
6,mdsAre_q1,lisCel_q2,ldePer_q1,linP4W_q1,ssbCCD_q3,sicCAR_q1,ltcAre_q1,sdsSPW_q2,linP4W_q2,mdcAre_q1,ssbElo_q2,"K, L, M, N. Financial, real estate, profession...",C. Manufacturing_q2,sdbAre_q2,sdbPer_q2,ssbCCM_q2
7,Code_18_112_q2,mtcWNe_q1,lteOri_q2,ldePer_q1,"K, L, M, N. Financial, real estate, profession...",ldeAre_q2,sdcAre_q3,ssbCCo_q2,sddAre_q1,sisBpM_q2,mdsAre_q2,"G, I. Distribution, hotels and restaurants_q2",sdbPer_q2,"G, I. Distribution, hotels and restaurants_q2",ssbSqu_q3,sdbPer_q2
8,mtcWNe_q2,ltcAre_q1,sdbPer_q3,linPDE_q1,ssbERI_q1,lseCWA_q3,mtbNDi_q2,ltcAre_q1,sddAre_q2,ltcAre_q1,sdsAre_q3,sdbPer_q3,ssbCCM_q2,ssbCor_q3,"K, L, M, N. Financial, real estate, profession...",sdsSPO_q2
9,sdcLAL_q2,lisCel_q1,ldeAre_q1,lteOri_q1,"R, S, T, U. Other_q2",sdcAre_q2,ltbIBD_q2,sdbPer_q2,ltcWRE_q2,sddAre_q3,ssbCCo_q2,sdbAre_q1,Code_18_211_q1,ssbCCD_q3,ssbCCM_q1,mean_q3


In [25]:
# Persist the per-cluster importance table next to the other clustering outputs.
importance_path = "../../urbangrammar_samba/spatial_signatures/clustering_data/per_cluster_importance.pq"
imps.to_parquet(importance_path)