In [None]:
from glob import glob
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler

In [None]:
def process_file(path, path_total):
    """Load one census table and return per-capita, standardised indicators.

    The table at ``path`` is joined with the table at ``path_total`` on the
    ``nadzsjd`` identifier, every indicator column is divided by
    ``"Obyvatelstvo celkem"`` (Czech for "total population"), incomplete rows
    are dropped, and all non-geometry columns are z-scored with
    ``StandardScaler``.

    Parameters
    ----------
    path : str or os.PathLike
        Parquet file with one census table. It must contain ``nadzsjd`` and
        ``geometry`` columns.
    path_total : str or os.PathLike
        CSV file with a ``nadzsjd`` column, read as string and used as the
        join key. Presumably this is where ``"Obyvatelstvo celkem"`` comes
        from -- TODO confirm.

    Returns
    -------
    The joined frame indexed by ``nadzsjd``, holding the standardised
    indicators, the standardised ``"Obyvatelstvo celkem"`` and ``geometry``.
    """
    # Match the special-cased tables by file name rather than by full path so
    # the clean-up is not silently skipped for ``Path`` objects or when the
    # data directory is moved.
    households_file = "nadzsjd_households_2021.parquet"
    housing_size_file = "nadzsjd_housing_size_facilities_2021.parquet"
    file_name = Path(path).name

    total = pd.read_csv(path_total, dtype={"nadzsjd": str}, index_col=0).set_index(
        "nadzsjd"
    )
    data = gpd.read_parquet(path).set_index("nadzsjd")

    data_total = data.join(total)

    # The first 12 columns of the census table are discarded; presumably they
    # are identifiers/metadata rather than indicators -- TODO confirm.
    data_relative = data_total.drop(data.columns[:12], axis=1)

    if file_name == households_file:
        # This table stores some cells as the string "d" (presumably a
        # suppressed-value marker -- confirm). Rows containing it are dropped
        # and the remaining values are cast to float.
        data_relative = data_relative.replace("d", np.nan).dropna(axis=0)
        value_cols = data_relative.columns.drop("geometry")
        data_relative[value_cols] = data_relative[value_cols].astype(float)
    elif file_name == housing_size_file:
        # The column name describes an average floor area, so presumably it is
        # not a count that can be divided by population -- confirm.
        data_relative = data_relative.drop(
            columns="Průměrná plocha 1 obydleného bytu v m2 v domech celkem"
        )

    cols_to_normalize = data_relative.columns.drop(["Obyvatelstvo celkem", "geometry"])
    shares = data_relative[cols_to_normalize].div(
        data_relative["Obyvatelstvo celkem"], axis=0
    )
    # A zero denominator with a non-zero numerator gives +/-inf. dropna() does
    # not remove inf and StandardScaler rejects it, so map inf to NaN and let
    # the dropna() below discard those rows together with the 0/0 ones.
    data_relative[cols_to_normalize] = shares.replace([np.inf, -np.inf], np.nan)

    data_relative = data_relative.dropna(axis=0)

    # The scaler is fitted on this table only; "Obyvatelstvo celkem" is
    # standardised as well.
    scaler = StandardScaler()
    data_relative[data_relative.columns.drop("geometry")] = scaler.fit_transform(
        data_relative.drop(columns="geometry")
    )
    return data_relative

In [None]:
# CSV handed to process_file() as `path_total`; there it is read with
# `nadzsjd` as a string key and joined onto every census table.
path_total = "/data/uscuni-restricted/04_spatial_census/total.csv"

In [None]:
# Census tables to process, in processing order. Every file follows the
# pattern `nadzsjd_<topic>_2021.parquet` inside the same directory, so only
# the topic part is spelled out.
files = [
    f"/data/uscuni-restricted/04_spatial_census/nadzsjd_{topic}_2021.parquet"
    for topic in (
        "pop_age_gender",
        "pop_ea_gender",
        "housing_flats",
        "pop_residence_gender",
        "emp_type_age",
        "education",
        "pop_status_gender",
        "housing_houses",
        "emp_ea_age",
        "emp_employed",
        "households",
        "pop_nationality",
        "pop_religion_gender",
        "housing_size_facilities",
    )
]

In [None]:
# Processed table per input file, keyed by the file stem (file name without
# the `.parquet` extension).
dfs = {Path(file).stem: process_file(file, path_total) for file in files}

In [None]:
# Number of principal components to keep for each processed table, keyed by
# the same file stems that index `dfs`.
pca_dict = dict(
    (
        ("nadzsjd_pop_age_gender_2021", 30),
        ("nadzsjd_pop_ea_gender_2021", 15),
        ("nadzsjd_housing_flats_2021", 4),
        ("nadzsjd_pop_residence_gender_2021", 5),
        ("nadzsjd_emp_type_age_2021", 9),
        ("nadzsjd_education_2021", 7),
        ("nadzsjd_pop_status_gender_2021", 7),
        ("nadzsjd_housing_houses_2021", 3),
        ("nadzsjd_emp_ea_age_2021", 16),
        ("nadzsjd_emp_employed_2021", 16),
        ("nadzsjd_households_2021", 20),
        ("nadzsjd_pop_nationality_2021", 5),
        ("nadzsjd_pop_religion_gender_2021", 5),
        ("nadzsjd_housing_size_facilities_2021", 5),
    )
)

In [None]:
# Fit one PCA per processed table and write the component scores, together
# with the table's geometry, to one parquet file per table.
for name, n_components in pca_dict.items():
    # PCA input: every column except the last two. Presumably those are
    # `geometry` and "Obyvatelstvo celkem" -- TODO confirm, since the selection
    # is positional. Selected once and reused below.
    features = dfs[name].iloc[:, :-2]

    # Fixed seed so reruns give identical components when scikit-learn picks a
    # randomized solver; it has no effect with the exact solvers.
    pca = PCA(n_components=n_components, random_state=0)
    pca.fit(features)

    # Loadings: component vectors scaled by the square root of the variance
    # each component explains; shape (n_features, n_components).
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

    # Loadings as a labelled table (rows = components, columns = input
    # variables). Not written to disk or used further in this cell.
    loadings_df = pd.DataFrame(
        loadings.T,
        index=[f"PC{k + 1}" for k in range(pca.explained_variance_.shape[0])],
        columns=features.columns,
    )

    # Component scores per spatial unit; column names are prefixed with the
    # table name. Attaching the geometry turns the frame into a GeoDataFrame.
    transformed = pca.transform(features)
    df_pca = pd.DataFrame(
        transformed,
        index=features.index,
        columns=[f"{name}_PC{k + 1}" for k in range(transformed.shape[1])],
    ).set_geometry(dfs[name].geometry)

    # Save to Parquet
    df_pca.to_parquet(f"/data/uscuni-restricted/05_pcs/{name}.parquet")