In [None]:
from glob import glob
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from factor_analyzer import FactorAnalyzer
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler

In [None]:
def process_file(path, path_total):
    """Load census data, convert counts to population shares and standardise them.

    Parameters
    ----------
    path : str or path-like
        Parquet file readable by ``geopandas.read_parquet``. It must contain a
        ``nadzsjd`` column (used as the index) and a ``geometry`` column. The
        first 12 columns that remain after ``nadzsjd`` becomes the index are
        discarded.
    path_total : str or path-like
        CSV file with a ``nadzsjd`` column (read as string). It is joined onto
        the census data by index and is expected to provide the
        ``Obyvatelstvo celkem`` column used as the denominator.

    Returns
    -------
    geopandas.GeoDataFrame
        Indexed by ``nadzsjd``. ``geometry`` and ``Obyvatelstvo celkem`` are
        kept as they are (the latter cast to float); every other column is
        divided by ``Obyvatelstvo celkem`` and then z-scored with
        ``StandardScaler``. Rows containing missing or non-finite values are
        removed.
    """
    # Open data for total population
    total = pd.read_csv(path_total, dtype={"nadzsjd": str}, index_col=0).set_index(
        "nadzsjd"
    )
    # Open data files
    data = gpd.read_parquet(path).set_index("nadzsjd")
    # Merge data (index-on-index left join on `nadzsjd`)
    data_total = data.join(total)
    # Remove unnecessary columns
    data_relative = data_total.drop(data.columns[:12], axis=1)

    # Treat the literal "d" as missing and drop incomplete rows.
    # NOTE(review): "d" is presumably a data-suppression marker -- confirm.
    data_relative = data_relative.replace("d", np.nan).dropna(axis=0)
    value_cols = data_relative.columns.drop("geometry")
    data_relative[value_cols] = data_relative[value_cols].astype(float)

    # Normalize the data: express every variable relative to total population.
    # A zero total with a non-zero count yields +/-inf, which `dropna` would
    # keep and `StandardScaler` would reject, so map it to NaN first.
    cols_to_normalize = data_relative.columns.drop(["Obyvatelstvo celkem", "geometry"])
    data_relative[cols_to_normalize] = (
        data_relative[cols_to_normalize]
        .div(data_relative["Obyvatelstvo celkem"], axis=0)
        .replace([np.inf, -np.inf], np.nan)
    )

    # Drop NaN values (0/0 shares, the inf values mapped above)
    data_relative = data_relative.dropna(axis=0)

    # Standardise each share to zero mean and unit variance
    scaler = StandardScaler()
    data_relative[cols_to_normalize] = scaler.fit_transform(
        data_relative[cols_to_normalize]
    )

    return data_relative

In [None]:
path_total = "/data/uscuni-restricted/04_spatial_census/total.csv"

In [None]:
file = "/data/uscuni-restricted/04_spatial_census_2/_merged_census_2021.parquet"

In [None]:
data_relative = process_file(file, path_total)

In [None]:
clusters = pd.read_csv(
    "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv",
    dtype={"kod_nadzsj_d": str},
)
cluster_mapping = pd.read_parquet(
    "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"
)

In [None]:
data_relative = data_relative.merge(
    clusters, how="left", left_on="nadzsjd", right_on="kod_nadzsj_d"
)

In [None]:
data_relative["cluster"] = data_relative["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

In [None]:
data_relative = data_relative.dropna()

In [None]:
umap = umap.UMAP(
    n_neighbors=5, min_dist=0.0, n_components=20, metric="euclidean", random_state=42
)

In [None]:
umap

In [None]:
embedding = umap.fit_transform(
    data_relative.drop(columns=["Obyvatelstvo celkem", "geometry", "cluster"]),
    y=data_relative["cluster"],
)

In [None]:
plt.figure(figsize=(8, 6))
plt.scatter(
    embedding[:, 0], embedding[:, 1], c=data_relative["cluster"], cmap="Spectral", s=10
)
plt.title("Supervised UMAP Projection")
plt.colorbar()
plt.show()

In [None]:
umap_gdf = gpd.GeoDataFrame(
    umap.embedding_,
    index=data_relative.drop(columns=["Obyvatelstvo celkem", "geometry"]).index,
).set_geometry(data_relative.geometry)
umap_gdf.columns = umap_gdf.columns.astype(str)

umap_gdf.to_parquet("/data/uscuni-restricted/05_umap/umap_new.parquet")