# Bandwidth search on Principal Component Analysis

Find the optimal bandwidth for each class, using the PCA option as the dimensionality-reduction input.

In [None]:
import os
import pathlib
from glob import glob

import geopandas as gpd
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from gwlearn.ensemble import GWRandomForestClassifier
from gwlearn.linear_model import GWLogisticRegression
from gwlearn.search import BandwidthSearch

In [None]:
# Collect the adaptive-bandwidth score files and keep only those whose
# path contains "pca"
a_files = [
    path
    for path in glob("/data/uscuni-restricted/06_bandwidths/*adaptive*.csv")
    if "pca" in path
]

In [None]:
# Display the list of matched score files
a_files

In [None]:
# Bandwidth value recorded for each of the keys 1-8
# NOTE(review): presumably the selected bandwidth per class for the PCA
# input -- confirm
pca_bw = {
    1: 1500,
    2: 300,
    3: 1900,
    4: 1500,
    5: 2100,
    6: 700,
    7: 2300,
    8: 2500,
}

In [None]:
# Load every score file into one table, with one column per file
a_files_list = []
labels = []

for path in a_files:
    # Label each column with the bare file name (no directory, no extension).
    # `os.path.splitext` returns a (root, ext) tuple, so using its result
    # directly would turn the column label into a tuple instead of a string.
    label = pathlib.Path(path).stem
    labels.append(label)
    df = pd.read_csv(path, index_col=0).rename(columns={"aic": label})
    a_files_list.append(df)
a_data = pd.concat(a_files_list, axis=1)

# Create plot
fig, ax = plt.subplots(figsize=(10, 10))

# Plot one line per score file
a_data.plot(ax=ax)

# Mark the minimum of each curve and print where it occurs
for col in a_data:
    min_idx = a_data[col].idxmin()
    min_val = a_data[col].min()
    print(col)
    print(min_idx, min_val)
    ax.plot(min_idx, min_val, "o")

# Add grid: both major and minor lines
ax.grid(which="major", linestyle="-", linewidth=0.5, color="gray")
ax.minorticks_on()
ax.grid(which="minor", linestyle=":", linewidth=0.3, color="gray")

# Customize minor ticks spacing: every 500 on the x-axis, every 0.5 on the y-axis
ax.xaxis.set_minor_locator(plt.MultipleLocator(500))
ax.yaxis.set_minor_locator(plt.MultipleLocator(0.5))

# Set plot limits on the same axes the data was drawn on
ax.set_ylim(66, 72)
ax.set_xlim(0, 12500)

# Add legend (the minimum markers carry no label, so only the curves are listed)
ax.legend()

plt.show()

In [None]:
# Show the label left over from the last iteration of the loading loop
label

In [None]:
# Collect score files in the working directory that match "*adaptive*.csv"
# and have "fa" in their name
# NOTE(review): the earlier comment on this cell said "fixed bandwidth", but
# the pattern selects "adaptive" files -- confirm which one is intended
f_files = [name for name in glob("*adaptive*.csv") if "fa" in name]

In [None]:
# Bandwidth value recorded for each of the keys 1-8
# NOTE(review): presumably the selected bandwidth per class for the FA
# input -- confirm
fa_bw = {
    1: 1300,
    2: 300,
    3: 2300,
    4: 1500,
    5: 1500,
    6: 500,
    7: 2300,
    8: 2500,
}

In [None]:
# Read each score file and rename its "aic" column to the file's path,
# then put all of them side by side in one table
f_files_list = [
    pd.read_csv(path, index_col=0).rename(columns={"aic": path})
    for path in f_files
]
f_data = pd.concat(f_files_list, axis=1)

# Plot the curves, then print where each one reaches its minimum
f_data.plot()
for col in f_data:
    scores = f_data[col]
    print(col)
    print(scores.idxmin(), np.min(scores))

In [None]:
# Collect score files in the working directory that match "*adaptive*.csv"
# and have "no_dr" in their name
# NOTE(review): the earlier comment on this cell said "fixed bandwidth", but
# the pattern selects "adaptive" files -- confirm which one is intended
u_files = [name for name in glob("*adaptive*.csv") if "no_dr" in name]

In [None]:
# Bandwidth value recorded for each of the keys 1-8
# NOTE(review): presumably the selected bandwidth per class for the UMAP
# input -- confirm
umap_bw = {
    1: 1300,
    2: 300,
    3: 2100,
    4: 1500,
    5: 1700,
    6: 700,
    7: 2300,
    8: 2500,
}

In [None]:
# Read each score file and rename its "aic" column to the file's path,
# then put all of them side by side in one table
u_files_list = [
    pd.read_csv(path, index_col=0).rename(columns={"aic": path})
    for path in u_files
]
u_data = pd.concat(u_files_list, axis=1)

# Draw the curves on a fresh 10x6 figure and move the legend outside the
# axes so it does not cover the lines
plt.figure(figsize=(10, 6))
ax = plt.gca()
u_data.plot(ax=ax)
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left", borderaxespad=0.0)

# Print where each curve reaches its minimum
for col in u_data:
    scores = u_data[col]
    print(col)
    print(scores.idxmin(), np.min(scores))