# UMAP Visualisations

In [None]:
from glob import glob

import geopandas as gpd
import jscatter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
import umap.plot

In [None]:
def process_file(path, path_total):
    """Load one census table and rescale its columns by ``Obyvatelstvo celkem``.

    The table at ``path`` is joined (on the ``nadzsjd`` index) with the table
    read from ``path_total``. The first 12 columns of the census table are
    discarded, a few file-specific clean-ups are applied, and every remaining
    column except ``geometry`` and ``Obyvatelstvo celkem`` is divided row-wise
    by ``Obyvatelstvo celkem``.

    File-specific handling is selected by file name, so a table is recognised
    regardless of the directory prefix used to reach it.

    Parameters
    ----------
    path : str or path-like
        Parquet file readable by ``geopandas.read_parquet``; it must provide
        ``nadzsjd`` and ``geometry`` columns.
    path_total : str or path-like
        CSV file with a ``nadzsjd`` column. Presumably this is where
        ``Obyvatelstvo celkem`` comes from -- TODO confirm.

    Returns
    -------
    DataFrame
        Rows indexed by ``nadzsjd`` holding the rescaled columns plus
        ``Obyvatelstvo celkem`` and ``geometry``. Rows with any missing value,
        including ratios that came out undefined or infinite, are removed.
    """
    import os

    total = pd.read_csv(path_total, dtype={"nadzsjd": str}, index_col=0)
    total = total.set_index("nadzsjd")
    data = gpd.read_parquet(path).set_index("nadzsjd")

    data_total = data.join(total)

    # Positional drop: whatever the first 12 columns of the census table are,
    # they are not carried into the result.
    data_relative = data_total.drop(data.columns[:12], axis=1)

    file_name = os.path.basename(str(path))

    if file_name == "nadzsjd_households_2021.parquet":
        # Cells holding the marker "d" are treated as missing; the affected
        # rows are removed before the remaining values are cast to float.
        data_relative = data_relative.replace("d", np.nan).dropna(axis=0)
        value_cols = data_relative.columns.drop("geometry")
        data_relative[value_cols] = data_relative[value_cols].astype(float)
    elif file_name == "nadzsjd_housing_size_facilities_2021.parquet":
        data_relative = data_relative.drop(
            columns=[
                "Průměrná plocha 1 obydleného bytu v m2 v domech celkem",
                "Počet obytných místností(4 m2 a více) obydlených bytů v domech celkem",
                "Celková plocha obydlených bytů v m2 v domech celkem",
            ]
        )
    elif file_name == "nadzsjd_housing_flats_2021.parquet":
        data_relative = data_relative.drop(columns="Neobydlené byty celkem")

    cols_to_normalize = data_relative.columns.drop(["Obyvatelstvo celkem", "geometry"])
    shares = data_relative[cols_to_normalize].div(
        data_relative["Obyvatelstvo celkem"], axis=0
    )
    # A zero denominator yields +/-inf, which dropna() would keep; turn those
    # into NaN so the rows are removed together with the other incomplete ones.
    data_relative[cols_to_normalize] = shares.replace([np.inf, -np.inf], np.nan)

    return data_relative.dropna(axis=0)

## UMAP on one category

In [None]:
# Census table used for the single-category experiment below.
path = "/data/uscuni-restricted/04_spatial_census/nadzsjd_pop_residence_gender_2021.parquet"
# Totals table passed to process_file() as ``path_total`` throughout the notebook.
path_total = "/data/uscuni-restricted/04_spatial_census/total.csv"

In [None]:
# Cluster assignment table; ``kod_nadzsj_d`` is read as text so it can be
# matched against the string ``nadzsjd`` identifiers of the census tables.
clusters = pd.read_csv(
    "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv",
    dtype={"kod_nadzsj_d": str},
)
# Lookup table for cluster labels; its column ``3`` is used further down to
# translate ``final_without_noise`` values into the labels shown in the plots.
cluster_mapping = pd.read_parquet(
    "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"
)

In [None]:
# Rescaled version of the single census table selected above.
data_rel = process_file(path, path_total)

In [None]:
# Display the processed table for a visual check.
data_rel

In [None]:
# Feature matrix for UMAP: everything except the geometry and the column used
# as the denominator, restricted to complete rows.
non_feature_cols = ["geometry", "Obyvatelstvo celkem"]
data_umap = data_rel.drop(columns=non_feature_cols)
data_umap = data_umap.dropna()

In [None]:
# Pairwise correlation matrix of the feature columns (pandas default method).
corr = data_umap.corr()

In [None]:
# Heatmap of the correlation matrix using the "coolwarm" colour map.
sns.heatmap(corr, cmap="coolwarm")

In [None]:
# Scatter-plot matrix over every pair of feature columns.
sns.pairplot(data_umap)

In [None]:
# Two-component UMAP reducer with n_neighbors=5. No random_state is passed, so
# the layout is presumably not reproducible between runs -- confirm if needed.
reducer = umap.UMAP(n_neighbors=5, n_components=2)

In [None]:
# Fit the reducer on the feature matrix. The returned embedding is only shown
# as cell output; later cells read it back from ``reducer.embedding_``.
reducer.fit_transform(data_umap)

In [None]:
# Attach the cluster assignment to each row by matching ``nadzsjd`` against
# ``kod_nadzsj_d``. This rebinds ``data_umap``, so running the cell twice
# merges the cluster columns a second time.
# NOTE(review): the labels derived from this table are later matched to the
# embedding rows by position, which assumes the left merge yields exactly one
# row per input row (no duplicated ``kod_nadzsj_d`` in ``clusters``) -- confirm.
data_umap = data_umap.merge(
    clusters, how="left", left_on="nadzsjd", right_on="kod_nadzsj_d"
)

In [None]:
# Static scatter of the fitted embedding, coloured by the cluster label obtained
# by passing ``final_without_noise`` through column 3 of the mapping table.
label_lookup = cluster_mapping[3].to_dict()
cluster_names = data_umap["final_without_noise"].map(label_lookup)
umap.plot.points(reducer, labels=cluster_names)

In [None]:
# Embedding coordinates plus the mapped cluster label, as input for the
# interactive scatter plot.
label_lookup = cluster_mapping[3].to_dict()
cluster_names = data_umap["final_without_noise"].map(label_lookup)

df = pd.DataFrame(reducer.embedding_, columns=["x", "y"])
df["cluster"] = cluster_names

In [None]:
# Interactive scatter of the embedding: one point per row of ``df``, coloured
# by the ``cluster`` column, with the cluster shown in the hover tooltip.
scatter = jscatter.Scatter(
    data=df,
    x="x",
    y="y",
    color_by="cluster",
    size=8,
    tooltip=True,
    tooltip_properties=["cluster"],
)
# Switch the colour map to "magma" before rendering the widget.
scatter.color(map="magma")
scatter.show()

## UMAP on all raw data

### Merge Raw Data

In [None]:
# Collect every path directly inside the spatial census directory.
files = glob("/data/uscuni-restricted/04_spatial_census/*")

In [None]:
# Display the discovered paths.
files

In [None]:
# Census tables that go into the merged data set. The 14 paths are listed
# explicitly instead of being taken from the ``glob`` result.
file_list = [
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_pop_age_gender_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_pop_ea_gender_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_housing_flats_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_pop_residence_gender_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_emp_type_age_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_education_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_pop_status_gender_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_housing_houses_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_emp_ea_age_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_emp_employed_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_pop_nationality_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_households_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_pop_religion_gender_2021.parquet",
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_housing_size_facilities_2021.parquet",
]

In [None]:
# Process every listed table and keep only its rescaled feature columns
# (the denominator column and the geometry are dropped from each).
# Note that ``i`` remains bound to the last path once the loop has finished.
df_list = []
for i in file_list:
    data_total = process_file(i, path_total)
    dt = data_total.drop(columns=["Obyvatelstvo celkem", "geometry"])

    df_list.append(dt)

In [None]:
# Place all tables side by side, aligned on their index, and keep only the
# rows that have a value in every column.
df_concat = pd.concat(df_list, axis=1).dropna()

In [None]:
# Geometry lookup for the merged table, taken from one processed census table.
# ``file_list[-1]`` is the path the preceding loop finished on, so this reads the
# same table as before without relying on the leaked loop variable ``i`` (which a
# later cell rebinds to an integer).
geometries = process_file(file_list[-1], path_total)[["geometry"]]

In [None]:
# Show the coordinate reference system attached to the geometries.
geometries.crs

In [None]:
# Add the geometry to every row of the merged feature table (left join on the
# shared ``nadzsjd`` key) and wrap the result as a GeoDataFrame.
features_with_geometry = df_concat.merge(geometries, how="left", on="nadzsjd")
gdf_concat = gpd.GeoDataFrame(features_with_geometry)

In [None]:
# Persist the merged features together with their geometry.
gdf_concat.to_parquet(
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_raw_data.parquet"
)

## Colour by cluster

In [None]:
# Merged table with the cluster assignment attached (left join of ``nadzsjd``
# against ``kod_nadzsj_d``); used below only as the source of plot labels.
# NOTE(review): labels are paired with embedding rows by position, so this
# assumes the merge keeps the row count and order of ``gdf_concat`` -- confirm.
gdf_concat_cluster = gdf_concat.merge(
    clusters, how="left", left_on="nadzsjd", right_on="kod_nadzsj_d"
)

### Euclidean metric

In [None]:
# 2-D embedding of the merged table with n_neighbors=5 and the metric left at
# the UMAP default; points are coloured by mapped cluster label.
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(n_components=2, n_neighbors=5)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

In [None]:
# Same embedding as the n_neighbors=5 run, with a neighbourhood of 10.
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(n_components=2, n_neighbors=10)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

In [None]:
# Default-metric embedding with a large neighbourhood (n_neighbors=100).
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(n_components=2, n_neighbors=100)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

### Correlation metric

In [None]:
# 2-D embedding using the "correlation" metric with n_neighbors=15, coloured by
# mapped cluster label.
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(metric="correlation", n_components=2, n_neighbors=15)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

In [None]:
# "correlation" metric again, this time with n_neighbors=100.
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(metric="correlation", n_components=2, n_neighbors=100)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

### Cosine metric

In [None]:
# 2-D embedding using the "cosine" metric with n_neighbors=15, coloured by
# mapped cluster label.
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(metric="cosine", n_components=2, n_neighbors=15)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

### Canberra metric

In [None]:
# 2-D embedding using the "canberra" metric with n_neighbors=15, coloured by
# mapped cluster label.
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(metric="canberra", n_components=2, n_neighbors=15)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

In [None]:
# "canberra" metric again, this time with n_neighbors=100.
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(metric="canberra", n_components=2, n_neighbors=100)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

### Minkowski metric

In [None]:
# 2-D embedding using the "minkowski" metric with n_neighbors=15.
# NOTE(review): no ``metric_kwds`` are passed, so the Minkowski order is left at
# the library default; if that is p=2 this matches the Euclidean run -- confirm.
census_features = gdf_concat.drop(columns="geometry")
cluster_names = gdf_concat_cluster["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(metric="minkowski", n_components=2, n_neighbors=15)
reducer.fit_transform(census_features)
umap.plot.points(reducer, labels=cluster_names)

## Create UMAP-projected data files in a loop

In [None]:
# NOTE(review): this scalar is replaced by a list of the same name in the next
# cell before anything reads it.
n_components = 2

In [None]:
# Grid of UMAP settings; one parquet file is written per combination.
n_components = [5, 10, 20, 30]
n_neighbors = [5, 10, 20, 50, 100]
metrics = ["euclidean", "canberra", "correlation"]

# The input matrix is the same for every run, so build it once instead of
# dropping the geometry column again on each iteration.
umap_input = gdf_concat.drop(columns="geometry")

# The loop variable names are kept as-is because they stay bound at notebook
# scope after the loop: i = number of components, j = n_neighbors, k = metric.
for i in n_components:
    # Column labels "0", "1", ... depend only on the embedding dimensionality.
    embedding_columns = [str(n) for n in range(i)]
    for j in n_neighbors:
        for k in metrics:
            reducer = umap.UMAP(n_neighbors=j, n_components=i, metric=k)
            embedding = reducer.fit_transform(umap_input)
            # Keep the original index and geometry so each embedding row can be
            # traced back to its spatial unit.
            umap_gdf = gpd.GeoDataFrame(
                embedding,
                columns=embedding_columns,
                index=gdf_concat.index,
                geometry=gdf_concat.geometry,
            )
            umap_gdf.to_parquet(
                f"/data/uscuni-restricted/05_umap/umap_dim{i}_nb{j}_{k}.parquet"
            )

In [None]:
%%time
reducer = umap.UMAP(n_neighbors=5, n_components=2, metric=k)
reducer.fit_transform(gdf_concat.drop(columns="geometry"))

## UMAP on PC data

In [None]:
# Load the principal-component table (with geometries) used as UMAP input below.
pcs = gpd.read_parquet("/data/uscuni-restricted/05_pcs/pcs_of_pcs_geometries.parquet")

In [None]:
# Attach cluster assignments to the PC table by matching the values of the PC
# index against ``kod_nadzsj_d``. No ``how`` is given, so pandas' default join
# type applies (inner, per the pandas docs), i.e. rows without a match on both
# sides are presumably dropped -- confirm this is intended.
pcs_merged = pcs.merge(
    clusters,
    left_on=pcs.index,
    right_on="kod_nadzsj_d",
)

In [None]:
# 2-D embedding of the PC columns (geometry, key and cluster columns removed)
# with n_neighbors=10, coloured by mapped cluster label.
non_feature_cols = ["geometry", "kod_nadzsj_d", "final_without_noise"]
pc_features = pcs_merged.drop(columns=non_feature_cols)
pc_cluster_names = pcs_merged["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(n_components=2, n_neighbors=10)
reducer.fit_transform(pc_features)
umap.plot.points(reducer, labels=pc_cluster_names)

In [None]:
# Same PC embedding with a large neighbourhood (n_neighbors=100).
non_feature_cols = ["geometry", "kod_nadzsj_d", "final_without_noise"]
pc_features = pcs_merged.drop(columns=non_feature_cols)
pc_cluster_names = pcs_merged["final_without_noise"].map(
    cluster_mapping[3].to_dict()
)

reducer = umap.UMAP(n_components=2, n_neighbors=100)
reducer.fit_transform(pc_features)
umap.plot.points(reducer, labels=pc_cluster_names)