# UMAP Visualisations

In [None]:
import geopandas as gpd
import jscatter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
import umap.plot

In [None]:
def process_file(
    path,
    path_total,
    path_clusters="/data/uscuni-restricted/geometries/cluster_assignment_v3.csv",
):
    """Load a census table, normalise it by total population and attach clusters.

    Parameters
    ----------
    path : str
        Parquet file read with ``geopandas.read_parquet``. It must contain a
        ``nadzsjd`` column and a ``geometry`` column. Its first 13 columns are
        excluded from normalisation (presumably identifiers/metadata -
        TODO confirm).
    path_total : str
        CSV file with a ``nadzsjd`` column that is left-joined onto the census
        table. ``Obyvatelstvo celkem`` (total population), used as the
        denominator, is presumably supplied by this file - TODO confirm.
    path_clusters : str, optional
        CSV file with the cluster assignment, keyed by ``kod_nadzsj_d``.
        Defaults to the v3 assignment that used to be hard-coded here.

    Returns
    -------
    DataFrame
        The merged table in which every normalised column holds the original
        value divided by ``Obyvatelstvo celkem``, left-joined with the cluster
        assignment.
    """
    total = pd.read_csv(path_total, dtype={"nadzsjd": str}, index_col=0)
    data = gpd.read_parquet(path)

    data_total = pd.merge(data, total, on="nadzsjd", how="left")

    # Everything except the first 13 census columns and the geometry is
    # treated as a numeric variable.
    data_relative = data_total.drop(data.columns[:13], axis=1).drop(
        columns="geometry"
    )
    cols_to_normalize = data_relative.columns.difference(["Obyvatelstvo celkem"])
    data_total[cols_to_normalize] = data_relative[cols_to_normalize].div(
        data_relative["Obyvatelstvo celkem"], axis=0
    )

    clusters = pd.read_csv(path_clusters, dtype={"kod_nadzsj_d": str})

    # NOTE: the join keys are passed as Series, which makes pandas add a
    # leading ``key_0`` column. The notebook cells drop the first 14 columns by
    # position, so switching to column-name keys would shift that slice.
    return data_total.merge(
        clusters, how="left", left_on=data_total.nadzsjd, right_on=clusters.kod_nadzsj_d
    )

## UMAP on one category

In [None]:
# Totals table that is joined onto every census file.
path_total = "/data/uscuni-restricted/04_spatial_census/total.csv"
# Census table explored in this section.
path = (
    "/data/uscuni-restricted/04_spatial_census/"
    "nadzsjd_pop_residence_gender_2021.parquet"
)

In [None]:
# Lookup table; its column ``3`` is used below to translate cluster ids.
cluster_mapping = pd.read_parquet(
    "/data/uscuni-ulce/processed_data/clusters/cluster_mapping_v3.pq"
)
# Cluster assignment per unit; the unit code is kept as a string.
clusters = pd.read_csv(
    "/data/uscuni-restricted/geometries/cluster_assignment_v3.csv",
    dtype={"kod_nadzsj_d": str},
)

In [None]:
# Normalised census table for the single category selected above.
data_rel = process_file(path=path, path_total=path_total)

In [None]:
# Drop the first 14 columns by position, then the geometry and the population
# total, and finally every row that still has a missing value.
data_umap = data_rel.drop(columns=data_rel.columns[:14])
data_umap = data_umap.drop(columns=["geometry", "Obyvatelstvo celkem"])
data_umap = data_umap.dropna()

In [None]:
# Pairwise correlation of the feature columns; the last two columns are left
# out (presumably the cluster columns appended by process_file - TODO confirm).
corr = data_umap.iloc[:, :-2].corr()

In [None]:
# Heatmap of the correlation matrix using the "coolwarm" colour map.
sns.heatmap(corr, cmap="coolwarm")

In [None]:
# Pair plot of every column except the last two.
sns.pairplot(data_umap.iloc[:, :-2])

In [None]:
# Preview the feature matrix that is passed to UMAP below.
data_umap.iloc[:, :-2]

In [None]:
# Two-dimensional UMAP with a neighbourhood of 5 points.
reducer = umap.UMAP(n_components=2, n_neighbors=5)

In [None]:
# Fit the reducer on the feature columns; later cells read the result from
# `reducer.embedding_`.
reducer.fit_transform(data_umap.iloc[:, :-2])

In [None]:
# Plot the fitted embedding, labelling each point with its
# `final_without_noise` value translated through `cluster_mapping[3]`.
umap.plot.points(
    reducer, labels=data_umap["final_without_noise"].map(cluster_mapping[3].to_dict())
)

In [None]:
# Embedding coordinates as a two-column frame for the interactive scatter.
df = pd.DataFrame({"x": reducer.embedding_[:, 0], "y": reducer.embedding_[:, 1]})

In [None]:
# `data_umap` keeps its original row labels after `dropna()`, whereas `df` was
# built from a bare array and has a fresh 0..n-1 index. Assign the labels by
# position so pandas does not align on the mismatching indexes.
df["cluster"] = (
    data_umap["final_without_noise"].map(cluster_mapping[3].to_dict()).to_numpy()
)

In [None]:
# Interactive scatter of the embedding: points are coloured by the mapped
# cluster label, which is also the only property shown in the tooltip.
scatter = jscatter.Scatter(
    data=df,
    x="x",
    y="y",
    color_by="cluster",
    size=8,
    tooltip=True,
    tooltip_properties=["cluster"],
)
# Use the "magma" colour map, then render the widget.
scatter.color(map="magma")
scatter.show()

## UMAP on all raw data

In [None]:
# Census tables combined for the all-variables UMAP. Entries are written
# without the shared "nadzsjd_" prefix and ".parquet" suffix.
# NOTE(review): "households_2021_" (trailing underscore) sits next to
# "households_2021" - confirm it is a separate file and not a typo.
file_list = [
    f"nadzsjd_{topic}.parquet"
    for topic in (
        "education_2021",
        "emp_ea_age_2021",
        "emp_employed_2021",
        "emp_type_age_2021",
        "households_2021",
        "households_2021_",
        "housing_flats_2021",
        "housing_houses_2021",
        "housing_size_facilities_2021",
        "pop_age_gender_2021",
        "pop_ea_gender_2021",
        "pop_nationality_2021",
        "pop_religion_gender_2021",
        "pop_residence_gender_2021",
        "pop_status_gender_2021",
    )
]

In [None]:
def process_file(
    path,
    path_total,
    path_clusters="/data/uscuni-restricted/geometries/cluster_assignment_v3.csv",
):
    """Load a census table, normalise it by total population and attach clusters.

    Parameters
    ----------
    path : str
        Parquet file read with ``geopandas.read_parquet``. It must contain a
        ``nadzsjd`` column and a ``geometry`` column. Its first 13 columns are
        excluded from normalisation (presumably identifiers/metadata -
        TODO confirm).
    path_total : str
        CSV file with a ``nadzsjd`` column that is left-joined onto the census
        table. ``Obyvatelstvo celkem`` (total population), used as the
        denominator, is presumably supplied by this file - TODO confirm.
    path_clusters : str, optional
        CSV file with the cluster assignment, keyed by ``kod_nadzsj_d``.
        Defaults to the v3 assignment that used to be hard-coded here.

    Returns
    -------
    DataFrame
        The merged table in which every normalised column holds the original
        value divided by ``Obyvatelstvo celkem``, left-joined with the cluster
        assignment. Rows that contained a ``"d"`` marker or a missing value in
        any variable are NaN in the normalised columns.
    """
    import os

    total = pd.read_csv(path_total, dtype={"nadzsjd": str}, index_col=0)
    data = gpd.read_parquet(path)

    # Compare the file name only: callers pass a full path, so comparing the
    # whole string against the bare file name never matched and the column
    # below was silently kept.
    if os.path.basename(path) == "nadzsjd_housing_size_facilities_2021.parquet":
        data = data.drop(
            columns="Průměrná plocha 1 obydleného bytu v m2 v domech celkem"
        )

    data_total = pd.merge(data, total, on="nadzsjd", how="left")

    # Everything except the first 13 census columns and the geometry is
    # treated as a variable to normalise.
    data_relative = data_total.drop(data.columns[:13], axis=1).drop(
        columns="geometry"
    )
    # Cells holding the string "d" are treated as missing (presumably a
    # suppression marker - TODO confirm). Rows with any missing value are left
    # out before the remaining values are converted to numbers.
    data_relative = data_relative.replace("d", np.nan)
    data_relative = data_relative.dropna(axis=0).apply(pd.to_numeric)
    cols_to_normalize = data_relative.columns.difference(["Obyvatelstvo celkem"])
    data_total[cols_to_normalize] = data_relative[cols_to_normalize].div(
        data_relative["Obyvatelstvo celkem"], axis=0
    )

    clusters = pd.read_csv(path_clusters, dtype={"kod_nadzsj_d": str})

    # NOTE: the join keys are passed as Series, which makes pandas add a
    # leading ``key_0`` column. The notebook cells drop the first 14 columns by
    # position, so switching to column-name keys would shift that slice.
    return data_total.merge(
        clusters, how="left", left_on=data_total.nadzsjd, right_on=clusters.kod_nadzsj_d
    )

In [None]:
# Build one feature table per census file, indexed by the unit code.
df_list = []
for i in file_list:
    path = f"/data/uscuni-restricted/04_spatial_census/{i}"
    data_total = process_file(path, path_total)

    # Drop the first 14 columns by position, then the population total, the
    # geometry and the cluster id; keep complete rows only.
    dt = data_total.drop(columns=data_total.columns[:14])
    dt = dt.drop(columns=["Obyvatelstvo celkem", "geometry", "final_without_noise"])
    dt = dt.dropna().set_index("kod_nadzsj_d")
    df_list.append(dt)

In [None]:
# Place the per-file tables side by side and keep only rows that are complete
# across all of them.
df_concat = pd.concat(df_list, axis="columns").dropna(axis="index", how="any")

In [None]:
# Preview the combined feature table.
df_concat

In [None]:
# Attach the cluster assignment to the combined table, matching the table's
# index against the `kod_nadzsj_d` column of `clusters`.
df_concat_clusters = pd.merge(
    df_concat,
    clusters,
    left_on=df_concat.index,
    right_on="kod_nadzsj_d",
    how="left",
)

In [None]:
# All census variables: n_neighbors=5, no metric passed (UMAP's default).
reducer = umap.UMAP(n_neighbors=5, n_components=2)
reducer.fit_transform(df_concat)
umap.plot.points(
    reducer,
    labels=df_concat_clusters["final_without_noise"].map(cluster_mapping[3].to_dict()),
)

In [None]:
# All census variables: n_neighbors=15 with the "correlation" metric.
reducer = umap.UMAP(n_neighbors=15, n_components=2, metric="correlation")
reducer.fit_transform(df_concat)
umap.plot.points(
    reducer,
    labels=df_concat_clusters["final_without_noise"].map(cluster_mapping[3].to_dict()),
)

In [None]:
# All census variables: n_neighbors=15 with the "cosine" metric.
reducer = umap.UMAP(n_neighbors=15, n_components=2, metric="cosine")
reducer.fit_transform(df_concat)
umap.plot.points(
    reducer,
    labels=df_concat_clusters["final_without_noise"].map(cluster_mapping[3].to_dict()),
)

In [None]:
# All census variables: n_neighbors=15 with the "canberra" metric.
reducer = umap.UMAP(n_neighbors=15, n_components=2, metric="canberra")
reducer.fit_transform(df_concat)
umap.plot.points(
    reducer,
    labels=df_concat_clusters["final_without_noise"].map(cluster_mapping[3].to_dict()),
)

In [None]:
# All census variables: n_neighbors=15 with the "minkowski" metric.
reducer = umap.UMAP(n_neighbors=15, n_components=2, metric="minkowski")
reducer.fit_transform(df_concat)
umap.plot.points(
    reducer,
    labels=df_concat_clusters["final_without_noise"].map(cluster_mapping[3].to_dict()),
)

In [None]:
# All census variables: "canberra" metric again, with n_neighbors raised to 30.
reducer = umap.UMAP(n_neighbors=30, n_components=2, metric="canberra")
reducer.fit_transform(df_concat)
umap.plot.points(
    reducer,
    labels=df_concat_clusters["final_without_noise"].map(cluster_mapping[3].to_dict()),
)

In [None]:
# All census variables: n_neighbors=50, no metric passed (UMAP's default).
reducer = umap.UMAP(n_neighbors=50, n_components=2)
reducer.fit_transform(df_concat)
umap.plot.points(
    reducer,
    labels=df_concat_clusters["final_without_noise"].map(cluster_mapping[3].to_dict()),
)

In [None]:
# All census variables: n_neighbors=100, no metric passed (UMAP's default).
reducer = umap.UMAP(n_neighbors=100, n_components=2)
reducer.fit_transform(df_concat)
umap.plot.points(
    reducer,
    labels=df_concat_clusters["final_without_noise"].map(cluster_mapping[3].to_dict()),
)

## UMAP on PC data

In [None]:
# Load the precomputed PC table (the file name suggests principal components
# of principal components - TODO confirm).
pcs = pd.read_parquet("/data/uscuni-restricted/05_pcs/pcs_of_pcs.parquet")

In [None]:
# Join the PC table with the cluster assignment on their former indexes.
# NOTE(review): `clusters` is read without an index column, so its "index" is
# the row position - confirm that `pcs` is indexed the same way.
pcs_merged = pd.merge(
    pcs.reset_index(),
    clusters.reset_index(),
    left_on="index",
    right_on="index",
)

In [None]:
# UMAP on the PC table with n_neighbors=50. The join key and the two cluster
# columns are removed before fitting, and points are labelled by the mapped
# cluster.
reducer = umap.UMAP(n_neighbors=50, n_components=2)
reducer.fit_transform(
    pcs_merged.drop(columns=["index", "kod_nadzsj_d", "final_without_noise"])
)
umap.plot.points(
    reducer, labels=pcs_merged["final_without_noise"].map(cluster_mapping[3].to_dict())
)