In [None]:
import geopandas as gpd
import pandas as pd
from sklearn import preprocessing

from core.gw import BandwidthSearch
from core.gw.ensemble import GWGradientBoostingClassifier, GWRandomForestClassifier
from core.gw.linear_model import GWLogisticRegression

In [None]:
from glob import glob

# List everything in the 05_pcs folder. `glob` yields matches in arbitrary
# (filesystem-dependent) order, so sort them for a stable, readable listing.
sorted(glob("/data/uscuni-restricted/05_pcs/*"))

In [None]:
# Load the census layer, reading only the `nadzsjd` column (the merge key used
# further down) and the geometry.
census_path = "/data/uscuni-restricted/04_spatial_census/nadzsjd_education_2021.parquet"
census = gpd.read_parquet(census_path, columns=["nadzsjd", "geometry"])

In [None]:
# Quick visual check of the loaded census geometries.
# The trailing semicolon keeps the notebook from echoing the Axes repr.
census.plot();

In [None]:
# Read the table stored in pcs_of_pcs.parquet.
# NOTE(review): presumably this supplies the `pca*` feature columns selected later - confirm.
pcas_path = "/data/uscuni-restricted/05_pcs/pcs_of_pcs.parquet"
pcas = pd.read_parquet(pcas_path)

In [None]:
# Attach the `pcas` table to the census geometries (index-aligned join), then
# drop every row that is missing a value in any column.
census_with_pcas = census.join(pcas)
data = census_with_pcas.dropna()

In [None]:
# Quick visual check of the rows that survived the join and `dropna`.
# The trailing semicolon keeps the notebook from echoing the Axes repr.
data.plot();

In [None]:
# Cluster assignment table; the `kod_nadzsj_d` key is forced to text so it is
# not parsed as a number.
clusters_csv = "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv"
clusters = pd.read_csv(clusters_csv, dtype={"kod_nadzsj_d": str})

# Lookup table used later to translate cluster labels (its column 3 is read
# when the target is built).
cluster_mapping_path = "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"
cluster_mapping = pd.read_parquet(cluster_mapping_path)

In [None]:
# Spot check: display the mapping row labelled 102.
cluster_mapping.loc[102]

In [None]:
# Attach the cluster assignment to every census unit by matching `nadzsjd`
# (left) against `kod_nadzsj_d` (right); `merge` defaults to an inner join.
# NOTE: this rebinds `data`, so re-running the cell alone would merge a second
# time - re-run from the join cell instead.
merge_keys = {"left_on": "nadzsjd", "right_on": "kod_nadzsj_d"}
data = data.merge(clusters, **merge_keys)

In [None]:
# Feature set: every column whose name begins with "pca".
is_pca_column = data.columns.str.startswith("pca")
variables = data.columns[is_pca_column]

In [None]:
# Binary target: translate each `final_without_noise` label through column 3 of
# the cluster mapping and flag the rows whose mapped value equals 8.
mapped_labels = data["final_without_noise"].map(cluster_mapping[3])
y = mapped_labels.eq(8)

In [None]:
# Disabled bandwidth search, kept for reference only (nothing here executes).
# NOTE(review): the `fit` call below reads `non_na`, which is not defined in the
# visible notebook; `data` / `y` look like the intended inputs - confirm before
# re-enabling.
# search = BandwidthSearch(
#     GWRandomForestClassifier,
#     fixed=False,
#     n_jobs=-1,
#     search_method="golden_section",
#     criterion="aicc",
#     max_bandwidth=1000,
#     max_iterations=10,
#     tolerance=0.05,
#     verbose=True,
#     batch_size=500,
#     min_proportion=0.05,
#     class_weight="balanced",
# )
# search.fit(
#     non_na[variables],
#     non_na["final_without_noise"].map(cluster_mapping[3]) == 8,
#     non_na.representative_point(),
# )

In [None]:
# Disabled: requires the `search` object, which is only created in commented-out code.
# search.oob_scores.sort_index().plot()

In [None]:
# Disabled: requires the `search` object, which is only created in commented-out code.
# search.optimal_bandwidth

In [None]:
# Preview the feature columns selected in `variables`.
data[variables]

In [None]:
# Geographically weighted random forest on the `pca*` features, located at a
# representative point of each census geometry.
rf_params = {
    "bandwidth": 750,
    "fixed": False,
    "n_jobs": -1,  # use every available core
    "keep_models": False,
    # "temp_folder": "/tmp",  (left disabled)
    "batch_size": 1000,
    "min_proportion": 0.05,
    "class_weight": "balanced",
}
rf = GWRandomForestClassifier(**rf_params)

features = data[variables]
focal_points = data.representative_point()
# Kept as the last expression so the notebook shows whatever `fit` returns.
rf.fit(features, y, focal_points)

In [None]:
# Display the `oob_score_` attribute of the fitted `rf` model.
rf.oob_score_

In [None]:
# Display the `score_` attribute of the fitted `rf` model.
rf.score_

In [None]:
# Display the `oob_balanced_accuracy_` attribute of the fitted `rf` model.
rf.oob_balanced_accuracy_

In [None]:
# Display the `balanced_accuracy_` attribute of the fitted `rf` model.
rf.balanced_accuracy_

In [None]:
# Map `rf.local_oob_score_` over the census units; units with a missing value
# are drawn in light gray and the axes frame is hidden.
ax = data.plot(
    column=rf.local_oob_score_,
    legend=True,
    figsize=(16, 8),
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Map `rf.local_oob_balanced_accuracy_` over the census units; units with a
# missing value are drawn in light gray and the axes frame is hidden.
ax = data.plot(
    column=rf.local_oob_balanced_accuracy_,
    legend=True,
    figsize=(16, 8),
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Map the `True` entry of `rf.focal_proba_` over the census units; units with a
# missing value are drawn in light gray and the axes frame is hidden.
positive_proba = rf.focal_proba_[True]
ax = data.plot(
    column=positive_proba,
    legend=True,
    figsize=(16, 8),
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Map the "pca_ 0" entry of `rf.feature_importances_` over the census units;
# units with a missing value are drawn in light gray and the axes frame is hidden.
# NOTE(review): the key contains a space ("pca_ 0") - confirm it matches an
# actual column name in `variables`.
pca0_importance = rf.feature_importances_["pca_ 0"]
ax = data.plot(
    column=pca0_importance,
    legend=True,
    figsize=(16, 8),
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Geographically weighted logistic regression on the same features, target and
# point locations as the random forest, with the same bandwidth settings.
lr_params = {
    "bandwidth": 750,
    "fixed": False,
    "n_jobs": -1,  # use every available core
    "keep_models": False,
    # "temp_folder": "/tmp",  (left disabled)
    "batch_size": 1000,
    "min_proportion": 0.05,
    "class_weight": "balanced",
    "max_iter": 500,
}
lr = GWLogisticRegression(**lr_params)

features = data[variables]
focal_points = data.representative_point()
# Kept as the last expression so the notebook shows whatever `fit` returns.
lr.fit(features, y, focal_points)

In [None]:
# Display the `score_` attribute of the fitted `lr` model.
lr.score_

In [None]:
# Display the `balanced_accuracy_` attribute of the fitted `lr` model.
lr.balanced_accuracy_

In [None]:
# Map `lr.local_pred_score_` over the census units; units with a missing value
# are drawn in light gray and the axes frame is hidden.
ax = data.plot(
    column=lr.local_pred_score_,
    legend=True,
    figsize=(16, 8),
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()

In [None]:
# Map `lr.local_pred_balanced_accuracy_` over the census units; units with a
# missing value are drawn in light gray and the axes frame is hidden.
ax = data.plot(
    column=lr.local_pred_balanced_accuracy_,
    legend=True,
    figsize=(16, 8),
    missing_kwds={"color": "lightgray"},
)
ax.set_axis_off()