In [None]:
import pathlib

import geopandas as gpd
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shapely
from libpysal import graph
from sklearn import ensemble, metrics, model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
# Total population table: the first CSV column becomes the index and the
# "nadzsjd" column is requested as a string.
total_path = "/data/uscuni-restricted/04_spatial_census/total.csv"
total = pd.read_csv(total_path, index_col=0, dtype={"nadzsjd": str})

# Merged census table (read as a GeoDataFrame), re-indexed by "nadzsjd".
census_path = "/data/uscuni-restricted/04_spatial_census/_merged_census_2021.parquet"
data = gpd.read_parquet(census_path)
data = data.set_index("nadzsjd")

# Attach the population totals to the census rows (index-on-index join).
data_total = data.join(total)
# Columns removed before any further processing. Note that "kod_orp" is kept.
columns_to_remove = [
    "NUTS_2",
    "naz_oblast",
    "NUTS_3",
    "naz_kraj",
    "kod_okres",
    "naz_okres",
    "naz_orp",
    "kod_obec",
    "naz_obec",
    "kod_mco",
    "nazev_mco",
]
data_relative = data_total.drop(columns=columns_to_remove)

# Cast every column except the geometry to float.
# (An earlier step that replaced "d" placeholders with NaN and dropped those
# rows is currently disabled.)
attribute_columns = data_relative.columns.drop("geometry")
data_relative[attribute_columns] = data_relative[attribute_columns].astype(float)

# Divide every remaining attribute column by the total population column
# ("Obyvatelstvo celkem"), row by row.
excluded_columns = ["Obyvatelstvo celkem", "geometry", "kod_orp"]
cols_to_normalize = data_relative.columns.drop(excluded_columns)
population = data_relative["Obyvatelstvo celkem"]
data_relative[cols_to_normalize] = data_relative[cols_to_normalize].div(
    population, axis=0
)

# Keep only the rows that have no missing value in any column.
data_relative = data_relative.dropna(axis=0)

# Standardise the normalised columns with a scaler fitted on all remaining rows.
# np.nan_to_num also turns +/-inf into very large finite numbers.
# NOTE(review): after dropna only inf values can still be affected here --
# confirm that this is the intended handling.
scaler = StandardScaler()
finite_values = np.nan_to_num(data_relative[cols_to_normalize])
data_relative[cols_to_normalize] = scaler.fit_transform(finite_values)

In [None]:
# Write the standardised table to disk without the "Obyvatelstvo celkem"
# and "kod_orp" columns.
data_r = data_relative.drop(columns=["Obyvatelstvo celkem", "kod_orp"])
data_r.to_parquet("/data/uscuni-restricted/05_umap/no_dr.parquet")

In [None]:
# Cluster assignment table; "kod_nadzsj_d" is read as a string.
clusters_path = "/data/uscuni-restricted/04_spatial_census/cluster_assignment_v10.csv"
clusters = pd.read_csv(clusters_path, dtype={"kod_nadzsj_d": str})

# Lookup table; its column labelled 3 is used below to translate cluster labels.
mapping_path = "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v10.pq"
cluster_mapping = pd.read_parquet(mapping_path)

# Join the census table with the cluster assignment (pandas' default inner join).
data = data_relative.merge(clusters, left_on="nadzsjd", right_on="kod_nadzsj_d")

# Explanatory variables: every column except the ones listed here.
# This is computed BEFORE "Cluster" is added, so the target is not among them.
non_feature_columns = [
    "geometry",
    "kod_nadzsj_d",
    "final_without_noise",
    "kod_orp",
    "Obyvatelstvo celkem",
]
variables = data.columns.drop(non_feature_columns)

# Translate each "final_without_noise" label through column 3 of the mapping.
level_3_lookup = cluster_mapping[3]
data["Cluster"] = data["final_without_noise"].map(level_3_lookup)

In [None]:
# Distinct values present in the mapped "Cluster" column.
data["Cluster"].unique()

In [None]:
# Feature matrix (the columns in `variables`) and the class label to predict.
independent, target = data[variables], data["Cluster"]

In [None]:
# Random hold-out split: 30 % of the rows are reserved for testing, and the
# random state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    independent, target, test_size=0.3, random_state=0
)

In [None]:
# Baseline random forest with a fixed seed, trained on the random training
# split; n_jobs=-1 lets scikit-learn use all available cores.
# fit() returns the estimator itself, so the fitted model is displayed.
model = RandomForestClassifier(random_state=0, n_jobs=-1).fit(X_train, y_train)
model

In [None]:
# Predicted class label for every row of the held-out test set.
pred = model.predict(X_test)
pred

In [None]:
# Predicted class probabilities for every row of the held-out test set.
proba = model.predict_proba(X_test)
proba

In [None]:
# Class labels known to the fitted model; used below as the column labels
# of the probability table.
model.classes_

In [None]:
# Probabilities as a labelled table: one row per test sample (indexed like
# X_test) and one column per class.
pd.DataFrame(proba, columns=model.classes_, index=X_test.index)

In [None]:
# Evaluate the baseline model on the random hold-out set.
# accuracy_score is documented as (y_true, y_pred), so the ground truth goes
# first. Both metrics are symmetric, so the values are unaffected, but this
# order is the one that stays correct if an asymmetric metric is added later.
accuracy = metrics.accuracy_score(y_test, pred)
kappa = metrics.cohen_kappa_score(y_test, pred)

# Text report; later cells append further sections to `summary`.
summary = f"""\
Evaluation metrics
==================
Basic model:
  Accuracy: {round(accuracy, 3)}
  Kappa:    {round(kappa, 3)}
"""

print(summary)

In [None]:
# Out-of-fold prediction for every row from a 4-fold cross-validation
# (no spatial grouping is applied here).
predicted = model_selection.cross_val_predict(
    estimator=model,
    X=independent,
    y=target,
    cv=4,
    n_jobs=-1,
)

# Map of the predicted classes.
ax = data.plot(
    predicted,
    categorical=True,
    legend=True,
    markersize=0.1,
    figsize=(9, 9),
)
ax.set_axis_off()

In [None]:
# Map showing where the cross-validated prediction equals the true cluster.
prediction_is_correct = predicted == target
ax = data.plot(
    prediction_is_correct,
    cmap="bwr_r",
    categorical=True,
    legend=True,
    markersize=0.1,
    figsize=(9, 9),
)
ax.set_axis_off()

In [None]:
# Feature importances of the baseline model, sorted in ascending order and
# labelled with the training column names.
importance_values = model.feature_importances_
feat_importances = pd.Series(importance_values, index=X_train.columns).sort_values()

# A tall figure so that every feature label stays readable.
plt.figure(figsize=(5, 20))
feat_importances.plot.barh()

In [None]:
# Stratified 5-fold splitter that uses "kod_orp" as the grouping key.
gkf = model_selection.StratifiedGroupKFold(n_splits=5)

# split() returns a one-shot generator; it is consumed by the next cell.
splits = gkf.split(X=independent, y=target, groups=data["kod_orp"])

In [None]:
# Label every row with the index of the fold in which it is a TEST sample.
# Start from NaN so that a row missed by every fold is visible instead of
# carrying uninitialised memory.
split_label = np.full(len(data), np.nan, dtype=float)

# Each item yielded by the splitter is a (train_indices, test_indices) pair.
# It has to be unpacked: binding the whole pair to one name and indexing
# with it raises IndexError on this 1-D array.
for i, (_, test) in enumerate(splits):
    split_label[test] = i

data["split"] = split_label

In [None]:
# Map of the fold each row belongs to ...
ax = data.plot(
    "split",
    figsize=(9, 9),
    categorical=True,
    legend=True,
    markersize=0.1,
)

# ... overlaid with the convex-hull outline of every "kod_orp" group.
group_outlines = data.dissolve("kod_orp").convex_hull.boundary
group_outlines.plot(ax=ax, color="k", linewidth=0.5, markersize=0)

ax.set_axis_off()

In [None]:
# Fold 0 is held out for testing; all other rows form the training set.
train = data["split"] != 0
test = data["split"] == 0

X_train, y_train = independent.loc[train], data["Cluster"].loc[train]
X_test, y_test = independent.loc[test], data["Cluster"].loc[test]

In [None]:
# Same random-forest configuration as the baseline, trained on the folds
# that were not held out. fit() returns the estimator, which is displayed.
rf_spatial_cv = RandomForestClassifier(random_state=0, n_jobs=-1).fit(
    X_train, y_train
)
rf_spatial_cv

In [None]:
# Evaluate the spatially validated model on the held-out fold.
pred = rf_spatial_cv.predict(X_test)

# accuracy_score is documented as (y_true, y_pred), so the ground truth goes
# first. Both metrics are symmetric, so the values themselves do not change.
accuracy_spatial_cv = metrics.accuracy_score(y_test, pred)
kappa_spatial_cv = metrics.cohen_kappa_score(y_test, pred)

# Append to the running report. Re-running this cell without re-running the
# cell that creates `summary` appends this section a second time.
summary += f"""\
Basic model with spatial cross-validation:
  Accuracy: {round(accuracy_spatial_cv, 3)}
  Kappa:    {round(kappa_spatial_cv, 3)}
"""

print(summary)

In [None]:
# Out-of-fold predictions under SPATIAL cross-validation.
# cross_val_predict clones and refits the estimator, so a plain `cv=4` here
# would only repeat the earlier non-spatial result. Passing the grouped
# splitter together with the "kod_orp" groups keeps every group within a
# single fold, so rows are predicted by a model that never saw their group.
predicted = model_selection.cross_val_predict(
    rf_spatial_cv,
    independent,
    target,
    groups=data["kod_orp"],
    cv=gkf,
    n_jobs=-1,
)

# Map of the classes predicted under spatial cross-validation.
ax = data.plot(predicted, legend=True, figsize=(9, 9), markersize=0.1, categorical=True)
ax.set_axis_off()

In [None]:
# Feature importances of the spatially validated model. Reading them from
# `model` (the baseline) would only redraw the earlier chart.
feat_importances = pd.Series(
    rf_spatial_cv.feature_importances_, index=X_train.columns
).sort_values()

# A tall figure so that every feature label stays readable.
plt.figure(figsize=(5, 20))

feat_importances.plot(kind="barh")