# Random forest grid search based on Factor Analysis

Tune the random forest hyperparameters using the Factor Analysis (FA) option as the dimensionality-reduction input.

In [None]:
import pathlib

import geopandas as gpd
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

from gwlearn.ensemble import GWRandomForestClassifier
from gwlearn.linear_model import GWLogisticRegression

In [None]:
# Load data
# Input files; their contents are inferred from the file names only
# (factor scores with geometries, cluster assignment, cluster mapping).
fa_path = "/data/uscuni-restricted/05_fa/fa_of_fa_geometries.parquet"
assignment_path = "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv"
mapping_path = "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"

cluster_mapping = pd.read_parquet(mapping_path)
# The join key is read as a string rather than letting pandas infer its dtype.
clusters = pd.read_csv(assignment_path, dtype={"kod_nadzsj_d": str})
fas = gpd.read_parquet(fa_path)

# Attach the cluster assignment to the FA table; pandas' default inner join
# keeps only rows whose key is present on both sides.
data = fas.merge(clusters, left_on="nadzsjd", right_on="kod_nadzsj_d")

# Every remaining column is treated as a predictor.
# NOTE(review): "nadzsjd" is not dropped here — confirm it is not a regular
# column of `fas`, otherwise it ends up among the predictors.
non_predictors = ["geometry", "kod_nadzsj_d", "final_without_noise"]
variables = data.columns.drop(non_predictors)

# Translate each cluster id through the column labelled 3 of the mapping table;
# ids missing from the mapping's index become NaN.
level_lookup = cluster_mapping[3]
mapped = data["final_without_noise"].map(level_lookup)

In [None]:
# Define optimal bandwidth for each class
# Keys are the class labels expected in `mapped`; each value is passed as
# `bandwidth` to the classifier for that label's one-vs-rest model.
# NOTE(review): units presumably follow the CRS of `data` — confirm.
bandwidth_dict = {1: 70000, 3: 85000, 4: 40000, 5: 25000, 6: 25000, 7: 40000, 8: 70000}

# Hyperparameter grid. It is the same for every label, so it is defined once
# instead of being rebuilt on each pass of the label loop.
max_features = [4, 5, 6]
n_estimators = [50, 100, 200]
results_size = (len(max_features), len(n_estimators))

# Inputs that depend neither on the label nor on the grid point. Hoisting them
# avoids recomputing the representative points and re-slicing the predictor
# table for every single fit.
predictors = data[variables]
locations = data.representative_point()

# Fail before any training starts if a label has no bandwidth (this includes
# NaN produced by a cluster id missing from the mapping). Previously the same
# KeyError surfaced only when the loop reached that label, after the models
# for all earlier labels had already been trained.
labels = mapped.unique()
missing_labels = [label for label in labels if label not in bandwidth_dict]
if missing_labels:
    raise KeyError(f"No bandwidth defined for label(s): {missing_labels}")

# Create empty list to store results
gridsearch_results = []

for label in labels:
    # One-vs-rest target: True where the observation belongs to `label`.
    y = mapped == label
    bandwidth = bandwidth_dict[label]

    # Define parameters for gridsearch
    print(f"Label: {label},Bandwidth: {bandwidth}, RF gridsearch.")

    # Score of every grid point for this label (rows: max_features,
    # columns: n_estimators). It is not written to disk; it stays available
    # for inspection and is overwritten on the next label.
    results = np.zeros(results_size, dtype=np.float64)

    best_score = -np.inf
    best_params = None
    # Best fitted estimator for the current label; not persisted below.
    best_clf = None

    # Run gridsearch
    for i, mf in enumerate(max_features):
        for j, ne in enumerate(n_estimators):
            print(f"Training RF with max_features={mf}, n_estimators={ne}")

            rf = GWRandomForestClassifier(
                bandwidth=bandwidth,
                fixed=True,
                n_jobs=-1,
                keep_models=False,
                batch_size=1000,
                min_proportion=0.1,
                class_weight="balanced",
                undersample=True,
                min_samples_split=4,
                min_samples_leaf=2,
                max_features=mf,
                n_estimators=ne,
            )

            rf.fit(predictors, y, locations)

            # Model selection relies on the estimator's out-of-bag score; no
            # separate validation split is made in this cell.
            score = rf.oob_score_
            results[i, j] = score

            print(f"OOB Score: {score:.4f}")

            # Strict comparison: on ties the first grid point visited wins.
            if score > best_score:
                best_score = score
                best_params = {"max_features": mf, "n_estimators": ne}
                best_clf = rf

    print(f"Label: {label}, RF manual gridsearch done.")
    print(f"Best parameters for {label}: {best_params}, Best score: {best_score:.4f}")

    # Save gridsearch results
    gridsearch_results.append(
        {
            "label": label,
            "bandwidth": bandwidth,
            "best_max_features": best_params["max_features"],
            "best_n_estimators": best_params["n_estimators"],
            "best_score": best_score,
        }
    )

# One row per label with its best configuration, written to the working directory.
results_df = pd.DataFrame(gridsearch_results)
results_df.to_csv("rf_fa_gridsearch_results.csv", index=False)