# Exploration of geographically weighted random forest classification modelling

To-do:
- [x] global model
- [x] model evaluation
- [x] bandwidth optimisation
- [ ] feature importances
- [ ] golden section bandwidth selection

In [None]:
import pandas as pd
import numpy as np
import warnings
import geopandas as gpd
from libpysal import graph
from sklearn.ensemble import RandomForestClassifier
from geodatasets import get_path
from joblib import Parallel, delayed
from typing import Hashable, Callable

Get sample data

In [None]:
# Read the "geoda.ncovr" sample dataset (path resolved by geodatasets) with geopandas.
gdf = gpd.read_file(get_path("geoda.ncovr"))

In [None]:
# The data come in geographic coordinates (US), but the models work with distances.
# Re-project to EPSG:5070 and keep one representative point per feature, as the
# graph builder requires point geometries anyway.
gdf = gdf.set_geometry(gdf.representative_point()).to_crs(5070)

Define the kernel functions and a base class for the heavy lifting.

In [None]:
def _triangular(distances, bandwidth):
    """Triangular kernel: weight decreases linearly with the scaled distance.

    The ratio ``distances / bandwidth`` is clipped to ``[0, 1]``, so the weight
    is 1 at zero distance and 0 at or beyond the bandwidth.
    """
    ratio = distances / bandwidth
    return 1 - np.clip(ratio, 0, 1)


def _parabolic(distances, bandwidth):
    """Parabolic kernel: ``0.75 * (1 - u**2)`` of the scaled distance ``u``.

    ``u = distances / bandwidth`` is clipped to ``[0, 1]``, so the weight is
    0.75 at zero distance and 0 at or beyond the bandwidth.
    """
    ratio = distances / bandwidth
    clipped = np.clip(ratio, 0, 1)
    return 0.75 * (1 - clipped**2)


def _gaussian(distances, bandwidth):
    """Gaussian kernel: standard normal density of the scaled distance.

    Computes ``exp(-u**2 / 2) / sqrt(2 * pi)`` with ``u = distances / bandwidth``.
    The kernel is not truncated, so every distance gets a positive weight.

    Parameters
    ----------
    distances : array-like
        Distances to weight.
    bandwidth : float or array-like
        Kernel bandwidth; array-likes are divided element-wise.

    Returns
    -------
    array-like
        Kernel weights with the same shape as ``distances``.
    """
    u = distances / bandwidth
    # the exponent is -u^2/2 and the normaliser sqrt(2*pi); writing them as
    # (u/2)^2 and sqrt(2)*pi silently widens the kernel by a factor of sqrt(2)
    return np.exp(-0.5 * u**2) / np.sqrt(2 * np.pi)


def _bisquare(distances, bandwidth):
    """Bisquare kernel: ``(15 / 16) * (1 - u**2) ** 2`` of the scaled distance ``u``.

    ``u = distances / bandwidth`` is clipped to ``[0, 1]``, so observations at
    or beyond the bandwidth receive a weight of 0.
    """
    scaled = np.clip(distances / bandwidth, 0, 1)
    taper = 1 - scaled**2
    return (15 / 16) * taper**2


def _cosine(distances, bandwidth):
    """Cosine kernel: ``(pi / 4) * cos(pi / 2 * u)`` of the scaled distance ``u``.

    ``u = distances / bandwidth`` is clipped to ``[0, 1]``, so the weight
    reaches (numerically) zero at or beyond the bandwidth.
    """
    scaled = np.clip(distances / bandwidth, 0, 1)
    angle = np.pi / 2 * scaled
    return (np.pi / 4) * np.cos(angle)


def _exponential(distances, bandwidth):
    """Exponential kernel: ``exp(-u)`` with ``u = distances / bandwidth``.

    The scaled distance is not clipped, so weights decay but never reach 0.
    """
    return np.exp(-(distances / bandwidth))


def _boxcar(distances, bandwidth):
    """Boxcar kernel: 1 for distances strictly below the bandwidth, else 0.

    The result is returned as integers rather than booleans.
    """
    within = distances < bandwidth
    return within.astype(int)


# Lookup table from kernel name to its implementation. Every function takes
# ``(distances, bandwidth)`` and returns the corresponding weights.
_kernel_functions = {
    "triangular": _triangular,
    "parabolic": _parabolic,
    "gaussian": _gaussian,
    "bisquare": _bisquare,
    "cosine": _cosine,
    "boxcar": _boxcar,
    "exponential": _exponential,
}

In [None]:
class GWM:
    """Generic geographically weighted modelling wrapper

    Fits one local ``model`` per focal location on the observations of its
    neighbourhood, weighted by a spatial kernel, and combines the out-of-bag
    results of the local models into a pooled and a per-location score.

    Parameters
    ----------
    model :  model class
        Scikit-learn model class. It is instantiated with ``oob_score`` (and
        ``n_jobs`` for the global model), so it has to accept these arguments.
    bandwidth : int | float
        bandwidth value consisting of either a distance or N nearest neighbors
    fixed : bool, optional
        True for distance based bandwidth and False for adaptive (nearest neighbor) bandwidth, by default False
    kernel : str | Callable, optional
        type of kernel function used to weight observations, by default "bisquare".
        With ``fixed=True`` the value is handed to ``Graph.build_kernel``; with
        ``fixed=False`` it is either a key of ``_kernel_functions`` or a callable
        taking ``(distances, bandwidth)`` and returning the weights.
    n_jobs : int, optional
        The number of jobs to run in parallel. ``-1`` means using all processors by default ``-1``
    fit_global_model : bool, optional
        Determines if the global baseline model shall be fitted alongside the geographically weighted, by default True
    strict : bool, optional
        Do not fit any models if at least one neighborhood has invariant ``y``, by default False
    keep_models : bool, optional
        Keep all local models (required for prediction), by default False. Note that for some models,
        like random forests, the objects can be large.
    **kwargs
        Additional keyword arguments passed to ``model`` initialisation

    Attributes
    ----------
    weights : libpysal.graph.Graph
        Kernel-weighted graph used to define the neighbourhoods (set by ``fit``).
    local_models : pd.Series
        Fitted local models indexed by focal location, only if ``keep_models`` is True.
    global_model : model instance
        Fitted global baseline model, only if ``fit_global_model`` is True.
    oob_score_ : float
        Correct out-of-bag predictions pooled over all local models divided by
        the pooled number of predictions.
    local_oob_score_ : pd.Series
        The same ratio computed for each local model, indexed by focal location.
    """

    def __init__(
        self,
        model,
        bandwidth: int | float,
        fixed: bool = False,
        kernel: str | Callable = "bisquare",
        n_jobs: int = -1,
        fit_global_model: bool = True,
        strict: bool = False,
        keep_models: bool = False,
        **kwargs,
    ):
        self.model = model
        self.bandwidth = bandwidth
        self.kernel = kernel
        self.fixed = fixed
        self.model_kwargs = kwargs
        self.n_jobs = n_jobs
        self.fit_global_model = fit_global_model
        self.strict = strict
        self.keep_models = keep_models

    def fit(self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries) -> "GWM":
        """Fit the geographically weighted model

        Parameters
        ----------
        X : pd.DataFrame
            Independent variables
        y : pd.Series
            Dependent variable
        geometry : gpd.GeoSeries
            Geographic location

        Returns
        -------
        GWM
            The fitted instance.

        Raises
        ------
        ValueError
            If ``strict`` is True and ``y`` is invariant in at least one neighbourhood.

        Notes
        -----
        Rows of ``X`` are looked up by the neighbour labels stored in the graph,
        so ``X``, ``y`` and ``geometry`` presumably need to share one index.
        """
        self.weights = self._build_weights(geometry)
        adjacency = self.weights._adjacency
        focal = adjacency.index.get_level_values(0)
        neighbors = adjacency.index.get_level_values(1)

        # one row per (focal, neighbor) pair: the neighbor's data and its weight
        data = X.copy()
        data["_y"] = y
        data = data.loc[neighbors]
        data["_weight"] = adjacency.values
        grouper = data.groupby(focal)

        if self.strict:
            invariant = data["_y"].groupby(focal).nunique() == 1
            if invariant.any():
                raise ValueError(
                    f"y at locations {invariant.index[invariant]} is invariant."
                )

        # models are fit in parallel
        training_output = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_local)(
                self.model, group, name, self.model_kwargs, self.keep_models
            )
            for name, group in grouper
        )
        if self.keep_models:
            names, accuracy_data, models = zip(*training_output)
            self.local_models = pd.Series(models, index=names)
        else:
            names, accuracy_data = zip(*training_output)

        if self.fit_global_model:
            # fit global model as a baseline
            self.global_model = self.model(
                n_jobs=self.n_jobs, oob_score=True, **self.model_kwargs
            )
            self.global_model.fit(X=X, y=y)

        # global GW accuracy: pool the counts rather than average local ratios
        true, n = zip(*accuracy_data)
        self.oob_score_ = sum(true) / sum(n)
        self.local_oob_score_ = pd.Series(np.array(true) / np.array(n), index=names)

        return self

    def _resolve_kernel(self) -> Callable:
        """Return the kernel as a callable, looking string names up in ``_kernel_functions``."""
        if callable(self.kernel):
            return self.kernel
        return _kernel_functions[self.kernel]

    def _build_weights(self, geometry: gpd.GeoSeries):
        """Build the kernel-weighted graph defining the local neighbourhoods."""
        if self.fixed:  # fixed distance
            return graph.Graph.build_kernel(
                geometry, kernel=self.kernel, bandwidth=self.bandwidth
            )

        # adaptive KNN: start from raw distances to the k nearest neighbors
        distances = graph.Graph.build_kernel(
            geometry, kernel="identity", k=self.bandwidth
        )._adjacency
        # post-process identity weights by the selected kernel
        # and kernel bandwidth derived from each neighborhood
        local_bandwidth = distances.groupby(level=0).transform("max")
        kernel_weights = self._resolve_kernel()(distances, local_bandwidth)
        # custom kernels may return a bare array; restore the adjacency labels
        kernel_weights = pd.Series(
            np.asarray(kernel_weights), index=distances.index, name=distances.name
        )
        return graph.Graph(adjacency=kernel_weights, is_sorted=True)

    def _fit_local(
        self,
        model,
        data: pd.DataFrame,
        name: Hashable,
        model_kwargs: dict,
        keep_models: bool,
    ) -> tuple:
        """Fit individual local model

        Parameters
        ----------
        model : model class
            Scikit-learn model class
        data : pd.DataFrame
            data for training, including the ``_y`` and ``_weight`` columns
        name : Hashable
            group name, matching the index of the focal geometry
        model_kwargs : dict
            additional keyword arguments for the model init
        keep_models : bool
            whether the fitted model is part of the returned tuple

        Returns
        -------
        tuple
            ``(name, oob_score_)`` or, if ``keep_models`` is True,
            ``(name, oob_score_, fitted model)``. ``oob_score_`` is the value
            the model stores after being given ``_accuracy_data`` as its
            ``oob_score`` callable.
        """
        if data["_y"].nunique() == 1:
            warnings.warn(f"y at location {name} is invariant.")
        local_model = model(oob_score=self._accuracy_data, **model_kwargs)
        local_model.fit(
            X=data.drop(columns=["_y", "_weight"]),
            y=data["_y"],
            sample_weight=data["_weight"],
        )
        if keep_models:
            return name, local_model.oob_score_, local_model
        return name, local_model.oob_score_

    def _accuracy_data(self, true, pred):
        """Return the number of matching predictions and the number of predictions

        Both inputs are flattened before the element-wise comparison, so a
        column-shaped ``true`` and a flat ``pred`` are compared pairwise.
        """
        hits = np.count_nonzero(np.ravel(true) == np.ravel(pred))
        return int(hits), len(pred)

Try with RF Classifier

In [None]:
# Adaptive bandwidth of 50 nearest neighbours with the default kernel;
# local models are not kept after fitting.
gwrf = GWM(
    RandomForestClassifier, bandwidth=50, fixed=False, n_jobs=-1, keep_models=False
)
# Columns at positions 9-14 are the predictors, the state name is the class label.
gwrf.fit(
    gdf.iloc[:, 9:15],
    gdf["STATE_NAME"],
    gdf.geometry,
)

Global OOB score (accuracy) for the GW model.

In [None]:
gwrf.oob_score_

Local OOB score.

In [None]:
gdf.plot(gwrf.local_oob_score_, legend=True, s=2)

OOB score of the global model.

In [None]:
gwrf.global_model.oob_score_

Define bandwidth search

In [None]:
class BandwidthSearch:
    """Optimal bandwidth search for geographically-weighted models

    Fits a ``GWM`` for every candidate bandwidth and records its pooled
    out-of-bag score.

    Parameters
    ----------
    model :  model class
        Scikit-learn model class
    fixed : bool, optional
        True for distance based bandwidth and False for adaptive (nearest neighbor) bandwidth, by default False
    kernel : str | Callable, optional
        type of kernel function used to weight observations, by default "bisquare"
    n_jobs : int, optional
        The number of jobs to run in parallel. ``-1`` means using all processors by default ``-1``
    search_method : str, optional
        Search strategy, by default "interval". Only "interval" is implemented;
        any other value triggers a warning and falls back to it.
    min_bandwidth : int | float | None, optional
        Smallest bandwidth to evaluate. Required by the interval search.
    max_bandwidth : int | float | None, optional
        Largest bandwidth to evaluate (inclusive). Required by the interval search.
    interval : int | float | None, optional
        Positive step between evaluated bandwidths. Required by the interval search.
    **kwargs
        Additional keyword arguments passed to ``model`` initialisation

    Attributes
    ----------
    oob_scores : pd.Series
        Pooled out-of-bag score per evaluated bandwidth (set by ``fit``);
        ``NaN`` where a neighbourhood had invariant ``y``.
    """

    def __init__(
        self,
        model,
        fixed: bool = False,
        kernel: str | Callable = "bisquare",
        n_jobs: int = -1,
        search_method: str = "interval",
        min_bandwidth: int | float | None = None,
        max_bandwidth: int | float | None = None,
        interval: int | float | None = None,
        **kwargs,
    ) -> None:
        self.model = model
        self.kernel = kernel
        self.fixed = fixed
        self.model_kwargs = kwargs
        self.n_jobs = n_jobs
        self.search_method = search_method
        self.min_bandwidth = min_bandwidth
        self.max_bandwidth = max_bandwidth
        self.interval = interval

    def fit(self, X: pd.DataFrame, y: pd.Series, geometry: gpd.GeoSeries) -> None:
        """Evaluate the model over the range of candidate bandwidths

        Parameters
        ----------
        X : pd.DataFrame
            Independent variables
        y : pd.Series
            Dependent variable
        geometry : gpd.GeoSeries
            Geographic location

        Raises
        ------
        ValueError
            If ``min_bandwidth``, ``max_bandwidth`` or ``interval`` is not set,
            if ``interval`` is not positive, or if fitting fails with a
            ``ValueError`` unrelated to invariant ``y``.
        """
        if self.search_method != "interval":
            warnings.warn(
                f"search_method={self.search_method!r} is not implemented; "
                "falling back to the 'interval' search.",
                stacklevel=2,
            )
        required = (self.min_bandwidth, self.max_bandwidth, self.interval)
        if any(value is None for value in required):
            raise ValueError(
                "min_bandwidth, max_bandwidth and interval must all be set "
                "for the interval search."
            )
        if self.interval <= 0:
            # a non-positive step would never reach max_bandwidth
            raise ValueError("interval must be a positive number.")

        # kept on the instance as a dict while searching, so partial results
        # stay available if the search is interrupted
        self.oob_scores = {}
        bw = self.min_bandwidth
        while bw <= self.max_bandwidth:
            print(bw)
            try:
                self.oob_scores[bw] = (
                    GWM(
                        model=self.model,
                        bandwidth=bw,
                        fixed=self.fixed,
                        kernel=self.kernel,
                        n_jobs=self.n_jobs,
                        fit_global_model=False,
                        strict=True,
                        **self.model_kwargs,
                    )
                    .fit(X=X, y=y, geometry=geometry)
                    .oob_score_
                )
            except ValueError as err:
                # only the strict-mode "invariant" error marks an unusable
                # bandwidth; anything else is a real failure and must surface
                if "invariant" not in str(err):
                    raise
                self.oob_scores[bw] = np.nan
            bw += self.interval
        self.oob_scores = pd.Series(self.oob_scores, name="oob_score")

In [None]:
# Evaluate adaptive bandwidths from 50 to 250 nearest neighbours in steps of 50.
search = BandwidthSearch(
    RandomForestClassifier,
    fixed=False,
    n_jobs=-1,
    min_bandwidth=50,
    max_bandwidth=250,
    interval=50,
)
# Same predictors (columns at positions 9-14) and class label as for the single GWM fit.
search.fit(
    gdf.iloc[:, 9:15],
    gdf["STATE_NAME"],
    gdf.geometry,
)

In [None]:
search.oob_scores