# Exploration of geographically weighted random forest classification modelling

To-do:
- [x] global model
- [x] model evaluation
- [x] bandwidth optimisation
- [x] feature importances
- [x] golden section bandwidth selection
- [x] other metrics than accuracy
- [x] generic support (logistic regression, gradient boosting)
- [x] dedicated classes
- [ ] local performance of models that do not support OOB
    - [x] with logistic regression I guess we can do predict_proba and measure those on the full sample directly
    - [ ] with gradient boosting we can't, as the model has already seen the data - we might need a train/test split to mimic OOB.
- [x] logistic regression local coefficients
- [x] (optionally) predict method

In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd
from geodatasets import get_path
from sklearn import metrics, preprocessing

from core.gw import BandwidthSearch
from core.gw.ensemble import GWGradientBoostingClassifier, GWRandomForestClassifier
from core.gw.linear_model import GWLogisticRegression

Get sample data

In [None]:
# Load the "geoda.ncovr" sample dataset, resolved to a local path by geodatasets.
gdf = gpd.read_file(get_path("geoda.ncovr"))

In [None]:
gdf.shape

In [None]:
# The data are in geographic coordinates covering the US, but we need to work
# with distances. Re-project to EPSG:5070 and keep only representative points,
# as the graph builder will require points anyway.
gdf = gdf.set_geometry(gdf.representative_point()).to_crs(5070)

In [None]:
# Binary target: True where FH90 is strictly above its median.
y = gdf["FH90"].gt(gdf["FH90"].median())

### Random forest

In [None]:
# Fit a geographically weighted random forest classifier.
# fixed=False selects an adaptive (KNN) bandwidth, so bandwidth=250 is
# presumably a number of nearest neighbours rather than a distance - TODO confirm.
gwrf = GWRandomForestClassifier(
    bandwidth=250,
    fixed=False,
    n_jobs=-1,
    keep_models=False,  # local models only need to be retained for prediction
)
gwrf.fit(
    gdf.iloc[:, 9:15],  # explanatory variables: columns at positions 9-14
    y,  # boolean target: FH90 above its median
    gdf.geometry,  # locations of the observations
)

Global OOB accuracy for the GW model measured based on OOB predictions from individual local trees.

In [None]:
gwrf.oob_score_

In [None]:
gwrf.oob_precision_

In [None]:
gwrf.oob_recall_

In [None]:
gwrf.oob_balanced_accuracy_

Local OOB accuracy.

In [None]:
gdf.plot(gwrf.local_oob_score_, legend=True, s=2)

In [None]:
gdf.plot(gwrf.local_oob_precision_, legend=True, s=2)

In [None]:
gdf.plot(gwrf.local_oob_recall_, legend=True, s=2)

In [None]:
gdf.plot(gwrf.local_oob_balanced_accuracy_, legend=True, s=2)

In [None]:
gdf.plot(gwrf.local_oob_f1_macro_, legend=True, s=2)

In [None]:
gdf.plot(gwrf.local_oob_f1_micro_, legend=True, s=2)

In [None]:
gdf.plot(gwrf.local_oob_f1_weighted_, legend=True, s=2)

In [None]:
gdf.plot(gwrf.focal_proba_[True], legend=True, s=2)

In [None]:
gdf.plot(y, legend=True, s=2, cmap="Set1_r")

Global accuracy for the GW model measured based on prediction of focals.

In [None]:
gwrf.score_

F1 scores for the GW model measured based on prediction of focals. 

In [None]:
gwrf.f1_macro_, gwrf.f1_micro_, gwrf.f1_weighted_

OOB score (accuracy) of the global model.

In [None]:
gwrf.global_model.oob_score_

Get local feature importances.

In [None]:
gwrf.feature_importances_

In [None]:
gdf.plot(gwrf.feature_importances_["HC60"], legend=True, s=2)

Compare to global feature importance.

In [None]:
gwrf.global_model.feature_importances_

### Gradient boosting

In [None]:
# Fit a geographically weighted gradient boosting classifier with the same
# settings and inputs as the random forest above.
# fixed=False selects an adaptive (KNN) bandwidth, so bandwidth=250 is
# presumably a number of nearest neighbours rather than a distance - TODO confirm.
gwgb = GWGradientBoostingClassifier(
    bandwidth=250,
    fixed=False,
    n_jobs=-1,
    keep_models=False,  # local models only need to be retained for prediction
)
gwgb.fit(
    gdf.iloc[:, 9:15],  # explanatory variables: columns at positions 9-14
    y,  # boolean target: FH90 above its median
    gdf.geometry,  # locations of the observations
)

Global score (accuracy) for the GW model measured based on prediction of focals.

In [None]:
gwgb.score_

F1 scores for the GW model measured based on prediction of focals. 

In [None]:
gwgb.f1_macro_, gwgb.f1_micro_, gwgb.f1_weighted_

Get local feature importances.

In [None]:
gwgb.feature_importances_

In [None]:
gdf.plot(gwgb.feature_importances_["HR90"], legend=True, s=2)

Compare to global feature importance.

In [None]:
gwgb.global_model.feature_importances_

### Logistic regression

In [None]:
# Fit a geographically weighted logistic regression.
# fixed=True selects a fixed distance bandwidth; 900_000 is presumably in the
# units of the projected CRS - TODO confirm.
gwlr = GWLogisticRegression(
    bandwidth=900_000,
    fixed=True,
    n_jobs=-1,
    keep_models=True,  # retain the local models
    max_iter=500,  # passed to the logistic regression
)
gwlr.fit(
    # Standardise the explanatory variables (columns at positions 9-14) and
    # restore the column names that preprocessing.scale drops.
    pd.DataFrame(
        preprocessing.scale(gdf.iloc[:, 9:15]), columns=gdf.iloc[:, 9:15].columns
    ),
    # Reuse the shared target instead of recomputing it inline, so every model
    # in the notebook is fitted against the same labels.
    y,
    gdf.geometry,
)

In [None]:
gwlr.score_

In [None]:
# Fitted attributes carry a trailing underscore (cf. local_pred_f1_micro_).
gwlr.pred_f1_micro_

In [None]:
gdf.plot(gwlr.local_pred_f1_micro_, legend=True, s=2)

In [None]:
gwlr.f1_macro_, gwlr.f1_micro_, gwlr.f1_weighted_

Local coefficients

In [None]:
gwlr.local_coef_

In [None]:
gdf.plot(gwlr.local_coef_["HR90"], legend=True, s=2)

Local intercepts

In [None]:
gdf.plot(gwlr.local_intercept_, s=2, legend=True)

## Bandwidth search

Golden section search with a fixed distance bandwidth.

In [None]:
# Golden section search for the bandwidth of a GW random forest.
# fixed=True means the candidate bandwidths are distances; the bounds are
# presumably in the units of the projected CRS - TODO confirm.
search = BandwidthSearch(
    GWRandomForestClassifier,
    fixed=True,
    n_jobs=-1,
    search_method="golden_section",
    criterion="aic",  # score used to compare candidate bandwidths
    max_iterations=10,
    min_bandwidth=250_000,
    max_bandwidth=2_000_000,
    verbose=True,
)
search.fit(
    gdf.iloc[:, 9:15],  # explanatory variables: columns at positions 9-14
    y,  # boolean target: FH90 above its median
    gdf.geometry,  # locations of the observations
)

Get the optimal one.

In [None]:
search.optimal_bandwidth

Interval search with an adaptive KNN bandwidth.

In [None]:
# Interval search for the bandwidth of a GW logistic regression.
# fixed=False selects an adaptive (KNN) bandwidth, so the bounds and the
# interval are presumably numbers of neighbours - TODO confirm.
search = BandwidthSearch(
    GWLogisticRegression,
    fixed=False,
    n_jobs=-1,
    search_method="interval",
    min_bandwidth=10,
    max_bandwidth=3084,
    interval=200,  # presumably the step between candidate bandwidths
    criterion="aic",  # score used to compare candidate bandwidths
    verbose=True,
    max_iter=500,  # passed to log regr
)
search.fit(
    # Standardise the explanatory variables (columns at positions 9-14) and
    # restore the column names that preprocessing.scale drops.
    pd.DataFrame(
        preprocessing.scale(gdf.iloc[:, 9:15]), columns=gdf.iloc[:, 9:15].columns
    ),
    y,  # boolean target: FH90 above its median
    gdf.geometry,  # locations of the observations
)

In [None]:
search.scores_.idxmin()

In [None]:
# Plot the criterion score for each tested bandwidth. The search wraps a
# logistic regression, which has no OOB scores, so use the same `scores_`
# attribute that the previous cell reads.
search.scores_.plot()

Get the optimal one.

In [None]:
search.optimal_bandwidth

## Prediction

If you want to use the model for prediction, all the local models need to be retained. That may require significant memory for RF.

In [None]:
# Fit the GW logistic regression that is used for prediction below.
# fixed=False selects an adaptive (KNN) bandwidth, so bandwidth=1210 is
# presumably a number of nearest neighbours - TODO confirm.
gwlr = GWLogisticRegression(
    bandwidth=1210,
    fixed=False,
    n_jobs=-1,
    keep_models=True,  # prediction needs all the local models to be retained
    verbose=True,
    max_iter=500,  # passed to log regr
    measure_performance=False,
)
gwlr.fit(
    # Standardise the explanatory variables (columns at positions 9-14) and
    # restore the column names that preprocessing.scale drops.
    pd.DataFrame(
        preprocessing.scale(gdf.iloc[:, 9:15]), columns=gdf.iloc[:, 9:15].columns
    ),
    # Reuse the shared target instead of recomputing it inline.
    y,
    gdf.geometry,
)

In [None]:
# Standardised explanatory variables (columns at positions 9-14), with the
# original column names restored, to be used as prediction input.
all_data = pd.DataFrame(
    data=preprocessing.scale(gdf.iloc[:, 9:15]),
    columns=gdf.columns[9:15],
)

Predict probabilities

In [None]:
# Class probabilities for the first ten observations; the matching geometries
# are passed alongside the features.
pp = gwlr.predict_proba(all_data.iloc[:10], geometry=gdf.geometry.iloc[:10])
pp

Predict label (taking max of probabilities)

In [None]:
# Predicted labels for the observations at positions 5-9, again with their
# matching geometries.
gwlr.predict(all_data.iloc[5:10], geometry=gdf.geometry.iloc[5:10])