In [None]:
import geopandas as gpd
import pandas as pd
from sklearn import preprocessing

from core.gw import BandwidthSearch
from core.gw.ensemble import GWGradientBoostingClassifier, GWRandomForestClassifier
from core.gw.linear_model import GWLogisticRegression

In [None]:
from glob import glob

# List the files available in the spatial-census folder. glob() returns its
# matches in arbitrary (filesystem-dependent) order, so sort them to keep the
# displayed listing stable across runs and machines.
sorted(glob("/data/uscuni-restricted/04_spatial_census/*"))

In [None]:
# Read the education table (2021, per its file name) into a GeoDataFrame.
education_path = (
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_education_2021.parquet"
)
census = gpd.read_parquet(education_path)

In [None]:
# Read the population totals. The first CSV column becomes the index and the
# "nadzsjd" identifier is parsed as a string; it is used as the merge key in
# the following cell.
total_pop_path = "/data/uscuni-restricted/04_spatial_census/total.csv"
total_pop = pd.read_csv(total_pop_path, dtype={"nadzsjd": str}, index_col=0)

In [None]:
# Attach the population totals to the census table. This is an inner join on
# the shared "nadzsjd" column, so rows without a match on either side are
# dropped.
# NOTE(review): the cell overwrites `census`, so running it twice merges the
# totals in a second time.
census = census.merge(right=total_pop, how="inner", on="nadzsjd")

In [None]:
# Explanatory variables: every column whose name starts with
# "Obyvatelstvo -" (Czech for "Population -").
is_population_column = census.columns.str.startswith("Obyvatelstvo -")
variables = census.columns[is_population_column]

In [None]:
# Express each selected column as a fraction of "Obyvatelstvo celkem"
# ("Total population"), dividing row by row.
# NOTE(review): this overwrites the columns in place, so re-running the cell
# divides the already-normalised values again.
total_population = census["Obyvatelstvo celkem"]
census[variables] = census[variables].div(total_population, axis=0)

In [None]:
# Keep only the rows that have a value for every explanatory variable.
non_na = census.dropna(axis=0, how="any", subset=variables)

In [None]:
# Show the (rows, columns) size of the filtered table.
non_na.shape

In [None]:
# Cluster assignment table; the "kod_nadzsj_d" code is parsed as a string and
# is later used as the join key against the census "nadzsjd" column.
cluster_assignment_path = (
    "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv"
)
clusters = pd.read_csv(cluster_assignment_path, dtype={"kod_nadzsj_d": str})

# Lookup table whose index is used further down to translate the
# "final_without_noise" labels into the values stored in its columns.
cluster_mapping_path = (
    "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"
)
cluster_mapping = pd.read_parquet(cluster_mapping_path)

In [None]:
# Inspect the mapping row with index label 8.
cluster_mapping.loc[8]

In [None]:
# Add the cluster assignment to each census row. This is an inner join of
# "nadzsjd" against "kod_nadzsj_d", so rows without an assignment are dropped.
# NOTE(review): the cell overwrites `non_na`, so running it twice merges the
# cluster columns in a second time.
non_na = non_na.merge(
    right=clusters,
    how="inner",
    left_on="nadzsjd",
    right_on="kod_nadzsj_d",
)

In [None]:
# Binary target used for the bandwidth search: True where the row's
# "final_without_noise" label maps to 1 in column 3 of `cluster_mapping`.
# Labels absent from the mapping become NaN and therefore compare as False.
# NOTE(review): the model fitted later in this notebook (and the `y` used for
# the metrics) targets column 4 == 2 instead - confirm the search is meant to
# run on a different target than the final model.
search_target = non_na["final_without_noise"].map(cluster_mapping[3]) == 1

# Golden-section search over bandwidths up to 1000 (fixed=False), scored by
# the "aicc" criterion, for the GWRandomForestClassifier estimator.
search = BandwidthSearch(
    GWRandomForestClassifier,
    fixed=False,
    search_method="golden_section",
    criterion="aicc",
    max_bandwidth=1000,
    max_iterations=10,
    tolerance=0.05,
    batch_size=500,
    min_proportion=0.05,
    class_weight="balanced",
    n_jobs=-1,
    verbose=True,
)
search.fit(non_na[variables], search_target, non_na.representative_point())

In [None]:
# Plot the scores recorded by the search, ordered by their index.
ordered_scores = search.oob_scores.sort_index()
ordered_scores.plot()

In [None]:
# Show the bandwidth selected by the search.
search.optimal_bandwidth

In [None]:
# Binary target for the model: True where the row's "final_without_noise"
# label maps to 2 in column 4 of `cluster_mapping`. Labels absent from the
# mapping become NaN and therefore compare as False.
rf_target = non_na["final_without_noise"].map(cluster_mapping[4]) == 2

# NOTE(review): the bandwidth is hard-coded to 750 rather than taken from
# `search.optimal_bandwidth`, and no random seed is passed - confirm both are
# intentional.
rf = GWRandomForestClassifier(
    bandwidth=750,
    fixed=False,
    batch_size=1000,
    min_proportion=0.05,
    class_weight="balanced",
    keep_models=False,
    # temp_folder="/tmp",
    n_jobs=-1,
)
rf.fit(non_na[variables], rf_target, non_na.representative_point())

In [None]:
# Show the fitted model's `oob_score_` (presumably the out-of-bag score).
rf.oob_score_

In [None]:
# Show the fitted model's `score_` attribute.
rf.score_

In [None]:
# Map the per-location `local_oob_score_`; missing values are drawn in light
# gray and the axis frame is hidden.
ax = non_na.plot(
    rf.local_oob_score_,
    figsize=(16, 8),
    legend=True,
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Map the focal probability of the True class; missing values are drawn in
# light gray and the axis frame is hidden.
positive_proba = rf.focal_proba_[True]
ax = non_na.plot(
    positive_proba,
    figsize=(16, 8),
    legend=True,
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Map the local feature importance of a single variable. The column name is
# Czech for "Population - age: 15 and over - highest attained education:
# university - men". Missing values are drawn in light gray and the axis
# frame is hidden.
feature = "Obyvatelstvo - věk: 15 a více - nejvyšší dosažené vzdělání:  vysokoškolské - muži"
ax = non_na.plot(
    rf.feature_importances_[feature],
    figsize=(16, 8),
    legend=True,
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Hard class prediction at each focal location: predict the positive class
# when its probability exceeds 0.5.
# The probability column is selected by its class label (True), the same
# column the probability map earlier in this notebook plots, rather than by
# position. The first column is not guaranteed to be the positive class; with
# class labels in sorted order it would be False, and thresholding it would
# invert every prediction.
# NOTE(review): any location with a missing probability (NaN) compares as
# False here and is scored as a negative prediction - confirm that is
# intended before reading the metrics below.
focal_pred = rf.focal_proba_[True] > 0.5

In [None]:
from sklearn import metrics

# Ground-truth labels for the evaluation: True where the row's
# "final_without_noise" label maps to 2 in column 4 of `cluster_mapping`
# (the same expression the model was fitted on).
mapped_labels = non_na["final_without_noise"].map(cluster_mapping[4])
y = mapped_labels == 2

In [None]:
# Share of locations whose focal prediction equals the true label.
metrics.accuracy_score(y_true=y, y_pred=focal_pred)

In [None]:
# F1 of the focal predictions, macro-averaged (unweighted mean over classes).
metrics.f1_score(y_true=y, y_pred=focal_pred, average="macro")

In [None]:
# F1 of the focal predictions, micro-averaged (computed from global counts).
metrics.f1_score(y_true=y, y_pred=focal_pred, average="micro")

In [None]:
# F1 of the focal predictions, averaged over classes weighted by support.
metrics.f1_score(y_true=y, y_pred=focal_pred, average="weighted")

In [None]:
# Show the model's `oob_score_` again, presumably for comparison with the
# focal-prediction metrics above.
rf.oob_score_