In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn import ensemble, metrics, model_selection

In [None]:
!pip install shap

# Import and Preprocess Files

In [None]:
# Load the cached statistics table and discard its "cluster" column.
combined_gdf = gpd.read_parquet("/data/processed_data/regression_stats").drop(
    columns="cluster"
)

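Run the following two cells only if a new statistics file needs to be generated; they rebuild `combined_gdf` from the per-variable files and overwrite the saved statistics file.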

In [None]:
# Rebuild the combined statistics table from one parquet file per statistic.
# Every file is read in full; its "STATS" / "STATS_lag" columns are renamed
# after the statistic so they do not clash once the frames are merged.
TEMP_DATA_DIR = "/home/lisa/work/people_places_germany/Notebooks/temp_data"
STAT_NAMES = [
    "%german",
    "%holidayapt",
    "%rented",
    "age",
    "apartmentno",
    "buildingyear",
    "familysize",
    "floorspace",
]

# The file stem doubles as the column name of the statistic.
files = [f"{TEMP_DATA_DIR}/{stat_name}.parquet" for stat_name in STAT_NAMES]

# Stays None until the first file has been read.
combined_gdf = None

for stat_col_name, file in zip(STAT_NAMES, files):
    gdf = gpd.read_parquet(file).rename(
        columns={"STATS": stat_col_name, "STATS_lag": stat_col_name + "_lag"}
    )

    if combined_gdf is None:
        combined_gdf = gdf
    else:
        # Default (inner) merge: only rows whose ID, geometry and cluster agree
        # in both frames are kept.
        # NOTE(review): "cluster" is a join key and is not renamed per
        # statistic - confirm it is identical across all files, otherwise
        # rows are silently dropped here.
        combined_gdf = combined_gdf.merge(gdf, on=["ID", "geometry", "cluster"])

# 'combined_gdf' now holds one column (plus its "_lag" column, where present)
# per statistic.

In [None]:
# Persist the combined table at the path the loading cell reads from.
combined_gdf.to_parquet("/data/processed_data/regression_stats")

In [None]:
# Display the combined table for a visual check.
combined_gdf

In [None]:
# Load the clustering result; its "label" column is what the model predicts later on.
clusters = gpd.read_parquet(
    "/data/cluster_data/clusters_umap_freiburg_100_3_gaussian_euclidean_complete_chebyshev_3.pq"
)

In [None]:
# Display the cluster table for a visual check.
clusters

In [None]:
# Interactive map of the clusters, coloured by their "label" column.
clusters.explore(column="label", cmap="tab20")

In [None]:
# Load the morphotopes; reset_index() turns the stored index into a regular
# column and leaves a fresh 0..n-1 index behind.
morphotopes = gpd.read_parquet(
    "/data/cluster_data/morphotopes_freiburg_100_3_gaussian.pq"
).reset_index()

In [None]:
# Store the current row index as an explicit identifier column per morphotope.
morphotopes["morph_label"] = morphotopes.index

Run the overlay code below (for clusters and/or morphotopes) only if the cells have not yet been assigned the corresponding cluster or morphotope label.

In [None]:
# Give every cell the label of the zone it overlaps most: once for the clusters
# ("label") and once for the morphotopes ("morph_label"). A label column that
# combined_gdf already carries is left untouched, so this cell is safe to
# re-run and works whether or not the cached table already holds the labels.
for zones, label_column in ((clusters, "label"), (morphotopes, "morph_label")):
    if label_column in combined_gdf.columns:
        continue

    # Intersect the cells with the zones and measure every intersection piece.
    overlap = gpd.overlay(combined_gdf, zones, how="intersection")
    overlap["area"] = overlap.geometry.area

    # For each cell ID keep the piece with the largest area.
    largest_overlap = overlap.loc[overlap.groupby(overlap["ID"])["area"].idxmax()]

    # Default (inner) merge: cells that intersect no zone are dropped.
    combined_gdf = combined_gdf.merge(largest_overlap[["ID", label_column]], on="ID")

In [None]:
# Display the table after the labels have been merged in.
combined_gdf

In [None]:
# Interactive map of the cells, coloured by the label assigned to them.
combined_gdf[["geometry", "label"]].explore(column="label", cmap="tab20")

In [None]:
# Share of missing values in every column of combined_gdf, in percent.
nan_percentage = combined_gdf.isna().mean().mul(100)

print(nan_percentage)

In [None]:
# List the available column names.
combined_gdf.columns

# Random Forest

In [None]:
# Draw a reproducible random sample of rows for modelling. The requested size
# is capped at the number of available rows, because sampling without
# replacement raises a ValueError when more rows are requested than exist
# (the inner merges above can shrink the table).
TRAINING_SAMPLE_SIZE = 2119
training_sample = combined_gdf.sample(
    n=min(TRAINING_SAMPLE_SIZE, len(combined_gdf)), random_state=0
)

Run the random forest with or without the spatially lagged variables (the `_lag` columns).

In [None]:
# Predictors for the random forest. Set USE_SPATIAL_LAG to True to also include
# the spatially lagged counterpart ("<name>_lag") of every predictor.
USE_SPATIAL_LAG = False

base_variables = [
    "%german",
    "%rented",
    "age",
    "apartmentno",
    "buildingyear",
    "familysize",
    "floorspace",
]

independent_variables = list(base_variables)
if USE_SPATIAL_LAG:
    independent_variables += [f"{name}_lag" for name in base_variables]

In [None]:
# Disabled alternative predictor set that adds the "_lag" columns. It is wrapped
# in a string literal, so nothing below is executed; remove the triple quotes to
# make this list the active `independent_variables`.
"""
independent_variables = [
    "%german",
    "%rented",
    "age",
    "apartmentno",
    "buildingyear",
    "familysize",
    "floorspace",
    "%german_lag",
    "%rented_lag",
    "age_lag",
    "apartmentno_lag",
    "buildingyear_lag",
    "familysize_lag",
    "floorspace_lag"
]
"""

In [None]:
# Predictor matrix: the selected columns of the training sample.
independent = training_sample[independent_variables]
independent

In [None]:
# Random hold-out: a quarter of the sample is set aside for testing (fixed seed).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    independent,
    training_sample["label"],
    random_state=0,
    test_size=0.25,
)

In [None]:
# Baseline random forest fitted on the random training split;
# n_jobs=-1 uses all available cores, random_state fixes the seed.
model = ensemble.RandomForestClassifier(random_state=0, n_jobs=-1)
model.fit(X_train, y_train)

In [None]:
# Predicted class label for every test row.
pred = model.predict(X_test)

In [None]:
# Predicted class probabilities for every test row.
proba = model.predict_proba(X_test)

In [None]:
# Probabilities as a table: one row per test sample, one column per class.
pd.DataFrame(proba, columns=model.classes_, index=X_test.index)

In [None]:
# Score the baseline model on its test split. scikit-learn metrics expect the
# ground truth first and the predictions second.
accuracy = metrics.accuracy_score(y_test, pred)
kappa = metrics.cohen_kappa_score(y_test, pred)

# Start the running report; later cells append further sections to `summary`.
summary = f"""\
Evaluation metrics
==================
Basic model:
  Accuracy: {round(accuracy, 3)}
  Kappa:    {round(kappa, 3)}
"""

print(summary)

In [None]:
# Feature importances of the baseline model, sorted ascending.
pd.Series(model.feature_importances_, index=model.feature_names_in_).sort_values()

In [None]:
# Five-fold splitter stratified on the class label and grouped by morphotope,
# so the rows of one morphotope never end up in both the train and test part.
gkf = model_selection.StratifiedGroupKFold(n_splits=5)
splits = gkf.split(
    X=training_sample,
    y=training_sample["label"],
    groups=training_sample["morph_label"],
)

In [None]:
# Display the configured splitter.
gkf

In [None]:
# Record, for every sampled row, the index of the fold in which it is held out.
# The folds are generated from `gkf` right here instead of consuming the
# one-shot `splits` generator, so re-running this cell gives the same result
# rather than leaving the array unfilled. NaN marks rows not assigned to a fold.
split_label = np.full(len(training_sample), np.nan)
fold_iterator = gkf.split(
    training_sample,
    training_sample["label"],
    groups=training_sample["morph_label"],
)
for i, (train, test) in enumerate(fold_iterator):
    # `test` holds positional row indices, which match the array positions.
    split_label[test] = i
training_sample["split"] = split_label

In [None]:
# Convex-hull outlines of the rows dissolved by morphotope, drawn over the map.
morphotope_outlines = training_sample.dissolve("morph_label").convex_hull.boundary

# Map of the sampled rows, coloured by their value in the "split" column.
ax = training_sample.plot(
    column="split", categorical=True, legend=True, markersize=0.1, figsize=(9, 9)
)
morphotope_outlines.plot(ax=ax, color="k", linewidth=0.5, markersize=0)
ax.set_axis_off()

In [None]:
# Fold 0 is the hold-out set; every other fold is used for fitting.
test = training_sample["split"] == 0
train = training_sample["split"] != 0

X_train, y_train = independent.loc[train], training_sample.loc[train, "label"]
X_test, y_test = independent.loc[test], training_sample.loc[test, "label"]

In [None]:
# Random forest with the same settings as the baseline, fitted on the
# fold-based training rows.
rf_spatial_cv = ensemble.RandomForestClassifier(random_state=0, n_jobs=-1)
rf_spatial_cv.fit(X_train, y_train)

In [None]:
# Score the fold-based model on the held-out fold. scikit-learn metrics expect
# the ground truth first and the predictions second.
pred = rf_spatial_cv.predict(X_test)

accuracy_spatial_cv = metrics.accuracy_score(y_test, pred)
kappa_spatial_cv = metrics.cohen_kappa_score(y_test, pred)

# Appends to the running report; re-running this cell appends the section again.
summary += f"""\
Basic model with spatial cross-validation:
  Accuracy: {round(accuracy_spatial_cv, 3)}
  Kappa:    {round(kappa_spatial_cv, 3)}
"""

print(summary)

In [None]:
# Feature importances of the spatially cross-validated model, sorted ascending.
# (This previously read from `model`, i.e. the baseline forest, again.)
pd.Series(
    rf_spatial_cv.feature_importances_, index=rf_spatial_cv.feature_names_in_
).sort_values()

In [None]:
# Create a TreeExplainer for the spatially cross-validated forest; it is used
# below to compute SHAP values.
explainer = shap.TreeExplainer(rf_spatial_cv)

In [None]:
# SHAP values for the held-out rows.
shap_values = explainer.shap_values(X_test)

In [None]:
# One SHAP summary plot per class, each saved as "shap_cluster<class>.png".
for i, class_name in enumerate(rf_spatial_cv.classes_):
    print(f"SHAP summary plot for class: {class_name}")

    # Older shap releases return a list with one (samples, features) array per
    # class; newer ones return one (samples, features, classes) array.
    if isinstance(shap_values, list):
        class_shap_values = shap_values[i]
    else:
        class_shap_values = shap_values[:, :, i]

    fig = plt.figure()
    shap.summary_plot(
        class_shap_values, X_test, feature_names=independent_variables, show=False
    )
    plt.savefig("shap_cluster" + str(class_name) + ".png")

    # Render the figure next to its heading, then release it so open figures
    # do not pile up across classes.
    plt.show()
    plt.close(fig)