In [None]:
import geopandas as gpd
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load every fitted model (reduction x model type x cluster) and collect
#   * fi[reduction][model_type][cluster] -> model.feature_importances_ ("rf" only)
#   * lc[reduction][model_type][cluster] -> model.local_coef_ ("lr" only)
#   * perf -> one record of global performance metrics per model
# NOTE(review): joblib.load unpickles arbitrary objects; only point this at
# trusted model files.
fi = {}
lc = {}
perf = []

for reduction in ["pca", "fa", "umap_dim20_nb5_euclidean"]:
    fi[reduction] = {}
    lc[reduction] = {}
    for model_type in ["lr", "rf"]:
        fi[reduction][model_type] = {}
        lc[reduction][model_type] = {}
        for cluster in [1, 2, 3, 4, 5, 6, 7, 8]:
            with open(
                f"/data/uscuni-restricted/06_models/{reduction}/label_{cluster}/{model_type}/model.joblib",
                "rb",
            ) as f:
                model = joblib.load(f)

            # The file is only needed for loading; everything below reads
            # attributes of the in-memory model.
            if model_type == "rf":
                fi[reduction][model_type][cluster] = model.feature_importances_
            else:
                lc[reduction][model_type][cluster] = model.local_coef_

            # Plain dicts are enough here: pd.DataFrame(perf) turns a list of
            # records into one row per model.
            perf.append(
                {
                    "reduction": reduction,
                    "model": model_type,
                    "cluster": cluster,
                    "accuracy": model.score_,
                    "balanced_accuracy": model.balanced_accuracy_,
                    "precision": model.precision_,
                    "recall": model.recall_,
                    "f1_macro": model.f1_macro_,
                    # Was "f1_micro_" (stray trailing underscore), unlike
                    # every other metric column.
                    "f1_micro": model.f1_micro_,
                    "f1_weighted": model.f1_weighted_,
                }
            )

In [None]:
# One row per loaded model: identifying columns (reduction, model, cluster)
# followed by the collected performance metrics.
performance = pd.DataFrame(perf)

In [None]:
# Side-by-side comparison of all models per cluster, one panel per metric.
# Colour encodes the model type, marker style the dimensionality reduction.
fig, axes = plt.subplots(1, 2, figsize=(14, 3), sharey=True)

for panel, metric in zip(axes, ["balanced_accuracy", "f1_macro"]):
    sns.scatterplot(
        data=performance,
        x="cluster",
        y=metric,
        hue="model",
        style="reduction",
        ax=panel,
    )
    sns.despine(ax=panel)
    # Put the legend outside the axes so it does not cover the points.
    sns.move_legend(panel, loc="upper left", bbox_to_anchor=(1, 1), frameon=False)
    panel.set_title(metric)

plt.tight_layout()
plt.show()

In [None]:
# Swap the internal identifiers for human-readable labels.
# The mapping is scoped per column: a flat mapping with the integer keys 1-8
# would also be applied to the metric columns, so a score of exactly 1 would
# silently turn into a cluster name.
reduction_labels = {
    "pca": "PCA",
    "fa": "FA",
    "umap_dim20_nb5_euclidean": "no_dr",
}
model_labels = {
    "lr": "LR",
    "rf": "RF",
}
cluster_names = {
    1: "Incoherent Large-Scale Homogenous Fabric",
    2: "Incoherent Large-Scale Heterogenous Fabric",
    3: "Incoherent Small-Scale Linear Fabric",
    4: "Incoherent Small-Scale Sparse Fabric",
    5: "Incoherent Small-Scale Compact Fabric",
    6: "Coherent Interconnected Fabric",
    7: "Coherent Dense Disjoint Fabric",
    8: "Coherent Dense Adjacent Fabric",
}

performance = performance.replace(
    {
        "reduction": reduction_labels,
        "model": model_labels,
        "cluster": cluster_names,
    }
)

In [None]:
# F1-macro of every (reduction, model) pair, with one column per cluster.
f1_by_cluster = performance.set_index(["reduction", "model", "cluster"])[
    "f1_macro"
].unstack()

# vmax lies above the largest possible F1 score, presumably so that the
# darkest shades stay unused and the numbers remain legible.
f1_by_cluster.style.format("{:.4f}").background_gradient(
    cmap="Blues", vmin=0.50, vmax=1.10
)

# choose the best option

In [None]:
# Mean and standard deviation of the random-forest feature importances,
# keyed by cluster id, for the "umap_dim20_nb5_euclidean" models.
rf_importances = fi["umap_dim20_nb5_euclidean"]["rf"]

fi_means = {
    cluster_id: importances.mean()
    for cluster_id, importances in rf_importances.items()
}
fi_stds = {
    cluster_id: importances.std()
    for cluster_id, importances in rf_importances.items()
}

In [None]:
# Mean and standard deviation of the logistic-regression local coefficients,
# keyed by cluster id, for the "pca" models.
lr_coefficients = lc["pca"]["lr"]

lc_means = {
    cluster_id: coefficients.mean()
    for cluster_id, coefficients in lr_coefficients.items()
}
lc_stds = {
    cluster_id: coefficients.std()
    for cluster_id, coefficients in lr_coefficients.items()
}

In [None]:
# Mean feature importances as a table with one row per named cluster.
cluster_names = {
    1: "Incoherent Large-Scale Homogenous Fabric",
    2: "Incoherent Large-Scale Heterogenous Fabric",
    3: "Incoherent Small-Scale Linear Fabric",
    4: "Incoherent Small-Scale Sparse Fabric",
    5: "Incoherent Small-Scale Compact Fabric",
    6: "Coherent Interconnected Fabric",
    7: "Coherent Dense Disjoint Fabric",
    8: "Coherent Dense Adjacent Fabric",
}

fi_means = pd.DataFrame(fi_means)
# This cell overwrites its own input, so an unconditional transpose would flip
# the table back on a re-run. Relabel and transpose only while the rows are
# not cluster names yet.
if not fi_means.index.isin(list(cluster_names.values())).any():
    fi_means = fi_means.rename(columns=cluster_names).T

fi_means.style.format("{:.4f}").background_gradient(cmap="YlGnBu", vmin=0.01, vmax=0.03)

In [None]:
# Standard deviation of the feature importances, one row per named cluster.
cluster_names = {
    1: "Incoherent Large-Scale Homogenous Fabric",
    2: "Incoherent Large-Scale Heterogenous Fabric",
    3: "Incoherent Small-Scale Linear Fabric",
    4: "Incoherent Small-Scale Sparse Fabric",
    5: "Incoherent Small-Scale Compact Fabric",
    6: "Coherent Interconnected Fabric",
    7: "Coherent Dense Disjoint Fabric",
    8: "Coherent Dense Adjacent Fabric",
}

fi_stds = pd.DataFrame(fi_stds)
# This cell overwrites its own input, so an unconditional transpose would flip
# the table back on a re-run. Relabel and transpose only while the rows are
# not cluster names yet.
if not fi_stds.index.isin(list(cluster_names.values())).any():
    fi_stds = fi_stds.rename(columns=cluster_names).T

fi_stds.style.format("{:.4f}").background_gradient(cmap="YlOrRd")

In [None]:
# Mean local coefficients as a table with one row per named cluster.
cluster_names = {
    1: "Incoherent Large-Scale Homogenous Fabric",
    2: "Incoherent Large-Scale Heterogenous Fabric",
    3: "Incoherent Small-Scale Linear Fabric",
    4: "Incoherent Small-Scale Sparse Fabric",
    5: "Incoherent Small-Scale Compact Fabric",
    6: "Coherent Interconnected Fabric",
    7: "Coherent Dense Disjoint Fabric",
    8: "Coherent Dense Adjacent Fabric",
}

lc_means = pd.DataFrame(lc_means)
# This cell overwrites its own input, so an unconditional transpose would flip
# the table back on a re-run. Relabel and transpose only while the rows are
# not cluster names yet.
if not lc_means.index.isin(list(cluster_names.values())).any():
    lc_means = lc_means.rename(columns=cluster_names).T

lc_means.style.background_gradient(cmap="YlGnBu")

In [None]:
# Standard deviation of the local coefficients, one row per named cluster.
cluster_names = {
    1: "Incoherent Large-Scale Homogenous Fabric",
    2: "Incoherent Large-Scale Heterogenous Fabric",
    3: "Incoherent Small-Scale Linear Fabric",
    4: "Incoherent Small-Scale Sparse Fabric",
    5: "Incoherent Small-Scale Compact Fabric",
    6: "Coherent Interconnected Fabric",
    7: "Coherent Dense Disjoint Fabric",
    8: "Coherent Dense Adjacent Fabric",
}

lc_stds = pd.DataFrame(lc_stds)
# This cell overwrites its own input, so an unconditional transpose would flip
# the table back on a re-run. Relabel and transpose only while the rows are
# not cluster names yet.
if not lc_stds.index.isin(list(cluster_names.values())).any():
    lc_stds = lc_stds.rename(columns=cluster_names).T

lc_stds.style.background_gradient(cmap="YlOrRd")

In [None]:
# Geometries with their component scores, used as the base layer for all maps.
pcas = gpd.read_parquet("/data/uscuni-restricted/05_pcs/pcs_29.parquet")

# Cluster assignment per spatial unit; the identifier column is read as text.
assignment_csv = "/data/uscuni-restricted/04_spatial_census/cluster_assignment_v10.csv"
clusters = pd.read_csv(assignment_csv, dtype={"kod_nadzsj_d": str})

cluster_mapping = pd.read_parquet(
    "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v10.pq"
)

# Attach the cluster assignment to the geometries via the unit identifier.
data = pcas.merge(clusters, left_on="nadzsjd", right_on="kod_nadzsj_d")

# Every column except the geometry, the join key and the cluster label.
non_feature_columns = ["geometry", "kod_nadzsj_d", "final_without_noise"]
variables = data.columns.drop(non_feature_columns)

# Translate the assigned label through column 3 of the cluster mapping.
mapping_column = cluster_mapping[3]
mapped = data["final_without_noise"].map(mapping_column)

In [None]:
# Load the single model inspected in the cells below.
# NOTE(review): "no_dr" is not one of the directories read when the
# performance table is built (there it is only a display label) -- confirm
# that this directory exists.
reduction, model_type, cluster = "no_dr", "lr", 4

model_path = f"/data/uscuni-restricted/06_models/{reduction}/label_{cluster}/{model_type}/model.joblib"
with open(model_path, "rb") as f:
    model = joblib.load(f)

In [None]:
# Map of the local intercepts of the currently loaded model.
data.plot(
    column=model.local_intercept_,
    figsize=(20, 8),
    legend=True,
    missing_kwds={"color": "lightgray"},
)

In [None]:
# Map of the local coefficient stored under column "0" of the loaded model.
ax = data.plot(
    column=model.local_coef_["0"],
    figsize=(20, 8),
    legend=True,
    legend_kwds={"shrink": 0.6},
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Load the single model inspected in the cells below: PCA, random forest,
# cluster 1.
reduction, model_type, cluster = "pca", "rf", 1

model_path = f"/data/uscuni-restricted/06_models/{reduction}/label_{cluster}/{model_type}/model.joblib"
with open(model_path, "rb") as f:
    model = joblib.load(f)

In [None]:
# 2x2 overview of the loaded model: local feature importance of the first
# three components plus the local out-of-bag F1-macro score.
fig, axes = plt.subplots(2, 2, figsize=(20, 8))

# (target axes, column of feature_importances_, panel title)
importance_panels = [
    (axes[0, 0], "0", "Feature Importance of PC1"),
    (axes[0, 1], "1", "Feature Importance PC2"),
    (axes[1, 0], "2", "Feature Importance PC3"),
]

for panel, component, title in importance_panels:
    # Identical colour limits keep the three importance maps comparable.
    data.plot(
        column=model.feature_importances_[component],
        ax=panel,
        legend=True,
        cmap="RdYlGn_r",
        missing_kwds={"color": "lightgray"},
        legend_kwds={"shrink": 0.6},
        vmin=0.02,
        vmax=0.2,
    )
    panel.set_title(title)
    panel.axis("off")

# Fourth panel: local out-of-bag score on a fixed colour scale.
score_panel = axes[1, 1]
data.plot(
    column=model.local_oob_f1_macro_,
    ax=score_panel,
    legend=True,
    cmap="YlGnBu",
    missing_kwds={"color": "lightgray"},
    legend_kwds={"shrink": 0.6},
    vmin=0.6,
    vmax=0.95,
)
score_panel.set_title("OOB F1-macro score")
score_panel.axis("off")

plt.tight_layout()
plt.show()

In [None]:
# Load the single model inspected in the cells below: PCA, random forest,
# cluster 7.
reduction, model_type, cluster = "pca", "rf", 7

model_path = f"/data/uscuni-restricted/06_models/{reduction}/label_{cluster}/{model_type}/model.joblib"
with open(model_path, "rb") as f:
    model = joblib.load(f)

In [None]:
# 2x2 overview of the loaded model: local feature importance of the first
# three components plus the local out-of-bag F1-macro score.
fig, axes = plt.subplots(2, 2, figsize=(20, 8))

# (target axes, column of feature_importances_, panel title)
importance_panels = [
    (axes[0, 0], "0", "Feature Importance of PC1"),
    (axes[0, 1], "1", "Feature Importance PC2"),
    (axes[1, 0], "2", "Feature Importance PC3"),
]

for panel, component, title in importance_panels:
    # Identical colour limits keep the three importance maps comparable.
    data.plot(
        column=model.feature_importances_[component],
        ax=panel,
        legend=True,
        cmap="RdYlGn_r",
        missing_kwds={"color": "lightgray"},
        legend_kwds={"shrink": 0.6},
        vmin=0.01,
        vmax=0.2,
    )
    panel.set_title(title)
    panel.axis("off")

# Fourth panel: local out-of-bag score, colour scale taken from the data.
score_panel = axes[1, 1]
data.plot(
    column=model.local_oob_f1_macro_,
    ax=score_panel,
    legend=True,
    cmap="YlGnBu",
    missing_kwds={"color": "lightgray"},
    legend_kwds={"shrink": 0.6},
)
score_panel.set_title("OOB F1-macro score")
score_panel.axis("off")

plt.tight_layout()
plt.show()

In [None]:
# Load the single model inspected in the cells below: PCA, random forest,
# cluster 1.
reduction, model_type, cluster = "pca", "rf", 1

model_path = f"/data/uscuni-restricted/06_models/{reduction}/label_{cluster}/{model_type}/model.joblib"
with open(model_path, "rb") as f:
    model = joblib.load(f)

In [None]:
# 2x2 overview of the loaded model: local feature importance of the first
# three components plus the local out-of-bag F1-macro score.
fig, axes = plt.subplots(2, 2, figsize=(20, 8))

# (target axes, column of feature_importances_, panel title)
importance_panels = [
    (axes[0, 0], "0", "Feature Importance of PC1"),
    (axes[0, 1], "1", "Feature Importance PC2"),
    (axes[1, 0], "2", "Feature Importance PC3"),
]

for panel, component, title in importance_panels:
    # Identical colour limits keep the three importance maps comparable.
    data.plot(
        column=model.feature_importances_[component],
        ax=panel,
        legend=True,
        cmap="RdYlGn_r",
        missing_kwds={"color": "lightgray"},
        legend_kwds={"shrink": 0.6},
        vmin=0.02,
        vmax=0.2,
    )
    panel.set_title(title)
    panel.axis("off")

# Fourth panel: local out-of-bag score, colour scale taken from the data.
score_panel = axes[1, 1]
data.plot(
    column=model.local_oob_f1_macro_,
    ax=score_panel,
    legend=True,
    cmap="YlGnBu",
    missing_kwds={"color": "lightgray"},
    legend_kwds={"shrink": 0.6},
)
score_panel.set_title("OOB F1-macro score")
score_panel.axis("off")

plt.tight_layout()
plt.show()

In [None]:
# Full-size map of the local out-of-bag F1-macro score of the loaded model.
ax = data.plot(
    column=model.local_oob_f1_macro_,
    cmap="YlGnBu",
    figsize=(20, 8),
    legend=True,
    legend_kwds={"shrink": 0.6},
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Colour key (presumably by cluster id): 1 - blue, 3 - green, 4 - red, 5 - purple, 6 - brown, 7 - pink, 8 - yellow