# Train geographically weighted (GW) models based on factor analysis

Train one model per cluster class, using the factor analysis (FA) output as the dimensionality-reduced input.

In [None]:
import pathlib

import geopandas as gpd
import joblib
import pandas as pd

from gwlearn.ensemble import GWRandomForestClassifier
from gwlearn.linear_model import GWLogisticRegression

Prepare folders.

In [None]:
# Create the output directory for the FA-based models. `parents=True` also
# creates every missing ancestor (including `06_models`), so the cell no longer
# fails with FileNotFoundError on a fresh data volume; `exist_ok=True` keeps
# re-running the cell harmless.
pathlib.Path("/data/uscuni-restricted/06_models/fa").mkdir(
    parents=True, exist_ok=True
)

Read data.

In [None]:
# Factor-analysis table, loaded with geopandas.
fas = gpd.read_parquet("/data/uscuni-restricted/05_fa/fa_18.parquet")

# Cluster assignment; the join key is forced to `str` on read.
cluster_csv = "/data/uscuni-restricted/04_spatial_census/cluster_assignment_v10.csv"
clusters = pd.read_csv(cluster_csv, dtype={"kod_nadzsj_d": str})

# Lookup table used below to translate cluster ids (its column `3` is used).
cluster_mapping = pd.read_parquet(
    "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v10.pq"
)

# Attach the cluster assignment to the FA rows (pandas' default inner join).
data = fas.merge(clusters, left_on="nadzsjd", right_on="kod_nadzsj_d")

# Every column except the geometry, the right-hand join key and the cluster
# label is treated as an explanatory variable.
# NOTE(review): if `nadzsjd` is a regular column of `fas` (not its index) it
# stays in `variables` - confirm that is intended.
non_predictors = ["geometry", "kod_nadzsj_d", "final_without_noise"]
variables = data.columns.drop(non_predictors)

# Translate each row's cluster id through column `3` of the mapping table.
level_lookup = cluster_mapping[3]
mapped = data["final_without_noise"].map(level_lookup)

In [None]:
# Bandwidth per cluster label (label -> bandwidth). The training loop looks the
# value up by label and passes it to both models together with `fixed=False`.
bandwidth_dict = {
    1: 1300,
    2: 300,
    3: 2300,
    4: 1500,
    5: 1500,
    6: 500,
    7: 2300,
    8: 2500,
}

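Loop over the cluster labels. For each label, train a random forest (RF) and a logistic regression (LR) on a binary "is this label" target, then save both models.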

In [None]:
# Train and persist one RF and one LR model per cluster label.
#
# For every unique value in `mapped` the loop:
#   1. builds a boolean target (`mapped == label`),
#   2. fits a GWRandomForestClassifier and a GWLogisticRegression with the
#      bandwidth registered for that label in `bandwidth_dict`,
#   3. dumps each fitted model to `<label dir>/<rf|lr>/model.joblib` and passes
#      `<label dir>/<rf|lr>/local` as `keep_models`,
#   4. prints the balanced accuracy reported by the fitted model.

# Inputs shared by every fit. They do not depend on the label, so they are
# computed once here instead of twice per loop iteration.
X = data[variables]
geometry = data.representative_point()

for label in mapped.unique():
    p = pathlib.Path(f"/data/uscuni-restricted/06_models/fa/label_{label}/")
    y = mapped == label  # binary target: this label vs. everything else
    # NOTE(review): a label without an entry in `bandwidth_dict` (including a
    # NaN produced by an unmapped cluster id) raises KeyError here.
    bandwidth = bandwidth_dict[label]

    # RF
    print(f"Label: {label}, training RF.")
    rf_path = p.joinpath("rf")
    rf_lm_path = rf_path.joinpath("local")
    # `parents=True` creates the label and model directories in the same call.
    rf_lm_path.mkdir(parents=True, exist_ok=True)

    rf = GWRandomForestClassifier(
        bandwidth=bandwidth,
        fixed=False,
        n_jobs=-1,
        keep_models=rf_lm_path,
        batch_size=1000,
        min_proportion=0.05,
        class_weight="balanced",
        undersample=True,
        min_samples_split=4,
        min_samples_leaf=2,
        n_estimators=100,
        random_state=42,
    )
    rf.fit(X, y, geometry)
    with open(rf_path.joinpath("model.joblib"), "wb") as f:
        joblib.dump(rf, f, protocol=5)
    print(f"Label: {label}, RF trained. Balanced accuracy: {rf.balanced_accuracy_:.3f}")

    # LR
    print(f"Label: {label}, training LR.")
    lr_path = p.joinpath("lr")
    lr_lm_path = lr_path.joinpath("local")
    lr_lm_path.mkdir(parents=True, exist_ok=True)

    lr = GWLogisticRegression(
        bandwidth=bandwidth,
        fixed=False,
        n_jobs=-1,
        keep_models=lr_lm_path,
        batch_size=1000,
        min_proportion=0.05,
        class_weight="balanced",
        max_iter=1000,
        undersample=True,
        random_state=42,
    )
    lr.fit(X, y, geometry)
    with open(lr_path.joinpath("model.joblib"), "wb") as f:
        joblib.dump(lr, f, protocol=5)
    print(f"Label: {label}, LR trained. Balanced accuracy: {lr.balanced_accuracy_:.3f}")