# Bandwidth search on Principal Component Analysis

Find the optimal bandwidth for each class, using the PCA-reduced data as the model input. Later cells repeat the search with the FA and UMAP reductions as input.

In [None]:
import pathlib
from glob import glob

import geopandas as gpd
import joblib
import numpy as np
import pandas as pd

from gwlearn.ensemble import GWRandomForestClassifier
from gwlearn.linear_model import GWLogisticRegression
from gwlearn.search import BandwidthSearch

In [None]:
# Load the PCA table, the cluster assignment and the cluster mapping
pca_path = "/data/uscuni-restricted/05_pcs/pcs_new25.parquet"
assignment_path = "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv"
mapping_path = "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"

pcas = gpd.read_parquet(pca_path)
# The join key is read as a string.
clusters = pd.read_csv(assignment_path, dtype={"kod_nadzsj_d": str})
cluster_mapping = pd.read_parquet(mapping_path)

# Attach the cluster assignment to every PCA row.
data = pcas.merge(clusters, left_on="nadzsjd", right_on="kod_nadzsj_d")

# Everything except the geometry, the join key and the cluster label is used
# as model input by the search cells.
non_predictors = ["geometry", "kod_nadzsj_d", "final_without_noise"]
variables = data.columns.drop(non_predictors)

# Translate the cluster labels through column 3 of the mapping table
# (presumably one level of the cluster hierarchy -- TODO confirm).
label_lookup = cluster_mapping[3]
mapped = data["final_without_noise"].map(label_lookup)

In [None]:
# Adaptive bandwidth search on the PCA input.
# One binary (label vs. rest) search is run per label; candidate bandwidths go
# from 100 to 3500 in steps of 200 and are compared by AIC.

# The predictors and the point geometry are the same for every label, so they
# are derived once here rather than on every loop iteration.
features = data[variables]
points = data.representative_point()

for label in mapped.unique():
    # Binary target: True where the observation carries the current label.
    y = mapped == label

    print(f"Label: {label}")
    search = BandwidthSearch(
        GWLogisticRegression,
        # Search settings.
        fixed=False,
        n_jobs=-1,
        search_method="interval",
        min_bandwidth=100,
        max_bandwidth=3500,
        interval=200,
        criterion="aic",
        verbose=True,
        # Presumably forwarded to GWLogisticRegression -- confirm against the
        # gwlearn documentation.
        max_iter=500,
        batch_size=1000,
        min_proportion=0.1,
        class_weight="balanced",
        undersample=True,
    )
    search.fit(features, y, points)
    # NOTE: "bandwith" is misspelled, but the file name is kept as is so that
    # anything reading the existing outputs keeps working.
    search.scores_.to_frame().to_csv(f"{label}_adaptive_bandwith_scores_pca_new.csv")

In [None]:
# Fixed bandwidth search on the PCA input.
# One binary (label vs. rest) search is run per label; candidate bandwidths go
# from 10,000 to 150,000 in steps of 15,000 (presumably in the units of the
# geometry's CRS -- TODO confirm) and are compared by AIC.

# The predictors and the point geometry are the same for every label, so they
# are derived once here rather than on every loop iteration.
features = data[variables]
points = data.representative_point()

for label in mapped.unique():
    # Binary target: True where the observation carries the current label.
    y = mapped == label

    print(f"Label: {label}")
    search = BandwidthSearch(
        GWLogisticRegression,
        # Search settings.
        fixed=True,
        n_jobs=-1,
        search_method="interval",
        min_bandwidth=10_000,
        max_bandwidth=150_000,
        interval=15_000,
        criterion="aic",
        verbose=True,
        # Presumably forwarded to GWLogisticRegression -- confirm against the
        # gwlearn documentation.
        max_iter=500,
        batch_size=750,
        min_proportion=0.1,
        class_weight="balanced",
        undersample=True,
    )
    search.fit(features, y, points)
    # NOTE: "bandwith" is misspelled, but the file name is kept as is so that
    # anything reading the existing outputs keeps working.
    search.scores_.to_frame().to_csv(f"{label}_fixed_bandwith_scores_new.csv")

In [None]:
# Load the FA table, the cluster assignment and the cluster mapping
fa_path = "/data/uscuni-restricted/05_fa/fa_new.parquet"
assignment_path = "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv"
mapping_path = "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"

fas = gpd.read_parquet(fa_path)
# The join key is read as a string.
clusters = pd.read_csv(assignment_path, dtype={"kod_nadzsj_d": str})
cluster_mapping = pd.read_parquet(mapping_path)

# Attach the cluster assignment to every FA row.
data = fas.merge(clusters, left_on="nadzsjd", right_on="kod_nadzsj_d")

# Everything except the geometry, the join key and the cluster label is used
# as model input by the search cells.
non_predictors = ["geometry", "kod_nadzsj_d", "final_without_noise"]
variables = data.columns.drop(non_predictors)

# Translate the cluster labels through column 3 of the mapping table
# (presumably one level of the cluster hierarchy -- TODO confirm).
label_lookup = cluster_mapping[3]
mapped = data["final_without_noise"].map(label_lookup)

In [None]:
# Adaptive bandwidth search on the FA input, restricted to label 5.
# Candidate bandwidths go from 100 to 3500 in steps of 200, compared by AIC.
search_options = {
    "fixed": False,
    "n_jobs": -1,
    "search_method": "interval",
    "min_bandwidth": 100,
    "max_bandwidth": 3500,
    "interval": 200,
    "criterion": "aic",
    "verbose": True,
    "max_iter": 500,
    "batch_size": 1000,
    "min_proportion": 0.1,
    "class_weight": "balanced",
    "undersample": True,
}

for label in (5,):
    # Binary target: True where the observation carries the current label.
    y = mapped.eq(label)

    print(f"Label: {label}")
    search = BandwidthSearch(GWLogisticRegression, **search_options)

    features = data[variables]
    points = data.representative_point()
    search.fit(features, y, points)

    # Store the score of every tested bandwidth.
    scores = search.scores_.to_frame()
    scores.to_csv(f"{label}_adaptive_bandwith_scores_new_fa.csv")

In [None]:
# Load the UMAP table, the cluster assignment and the cluster mapping
umap_path = "/data/uscuni-restricted/05_umap/umap_new.parquet"
assignment_path = "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv"
mapping_path = "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"

umap = gpd.read_parquet(umap_path)
# The join key is read as a string.
clusters = pd.read_csv(assignment_path, dtype={"kod_nadzsj_d": str})
cluster_mapping = pd.read_parquet(mapping_path)

# Attach the cluster assignment to every UMAP row.
data = umap.merge(clusters, left_on="nadzsjd", right_on="kod_nadzsj_d")

# Everything except the geometry, the join key and the cluster label is used
# as model input by the search cells.
non_predictors = ["geometry", "kod_nadzsj_d", "final_without_noise"]
variables = data.columns.drop(non_predictors)

# Translate the cluster labels through column 3 of the mapping table
# (presumably one level of the cluster hierarchy -- TODO confirm).
label_lookup = cluster_mapping[3]
mapped = data["final_without_noise"].map(label_lookup)

In [None]:
# Adaptive bandwidth search on the UMAP input.
# One binary (label vs. rest) search is run per label; candidate bandwidths go
# from 100 to 3500 in steps of 200 and are compared by AIC.

# The predictors and the point geometry are the same for every label, so they
# are derived once here rather than on every loop iteration.
features = data[variables]
points = data.representative_point()

for label in mapped.unique():
    # Binary target: True where the observation carries the current label.
    y = mapped == label

    print(f"Label: {label}")
    search = BandwidthSearch(
        GWLogisticRegression,
        # Search settings.
        fixed=False,
        n_jobs=-1,
        search_method="interval",
        min_bandwidth=100,
        max_bandwidth=3500,
        interval=200,
        criterion="aic",
        verbose=True,
        # Presumably forwarded to GWLogisticRegression -- confirm against the
        # gwlearn documentation. max_iter is 1000 here, unlike the 500 used
        # by the other search cells.
        max_iter=1000,
        batch_size=1000,
        min_proportion=0.1,
        class_weight="balanced",
        undersample=True,
    )
    search.fit(features, y, points)
    # NOTE: "bandwith" is misspelled, but the file name is kept as is so that
    # anything reading the existing outputs keeps working.
    search.scores_.to_frame().to_csv(f"{label}_adaptive_bandwith_scores_new_umap.csv")

In [None]:
# Fixed bandwidth search on the UMAP input.
# One binary (label vs. rest) search is run per label; candidate bandwidths go
# from 10,000 to 150,000 in steps of 15,000 (presumably in the units of the
# geometry's CRS -- TODO confirm) and are compared by AIC.

# The predictors and the point geometry are the same for every label, so they
# are derived once here rather than on every loop iteration.
features = data[variables]
points = data.representative_point()

for label in mapped.unique():
    # Binary target: True where the observation carries the current label.
    y = mapped == label

    print(f"Label: {label}")
    search = BandwidthSearch(
        GWLogisticRegression,
        # Search settings.
        fixed=True,
        n_jobs=-1,
        search_method="interval",
        min_bandwidth=10_000,
        max_bandwidth=150_000,
        interval=15_000,
        criterion="aic",
        verbose=True,
        # Presumably forwarded to GWLogisticRegression -- confirm against the
        # gwlearn documentation.
        max_iter=500,
        batch_size=750,
        min_proportion=0.1,
        class_weight="balanced",
        undersample=True,
    )
    search.fit(features, y, points)
    # The input here is the UMAP table, so the output carries the "umap" tag.
    # It was previously written with an "_fa" suffix, which labelled UMAP
    # results as FA results. "bandwith" is misspelled but kept to match the
    # other output file names.
    search.scores_.to_frame().to_csv(f"{label}_fixed_bandwith_scores_new_umap.csv")