In [None]:
from glob import glob
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler

In [None]:
def process_file(path, path_total):
    """Load one census table and express its columns relative to the total.

    The table at ``path`` is joined (on ``"nadzsjd"``) with the totals read
    from ``path_total``; every column except ``"geometry"`` and
    ``"Obyvatelstvo celkem"`` is then divided row-wise by
    ``"Obyvatelstvo celkem"`` (Czech: total population).

    Parameters
    ----------
    path : str or path-like
        Parquet file readable by ``geopandas.read_parquet`` that contains a
        ``"nadzsjd"`` column and a ``"geometry"`` column.
    path_total : str or path-like
        CSV file containing a ``"nadzsjd"`` column. After the join the table
        must contain ``"Obyvatelstvo celkem"``.

    Returns
    -------
    The processed table indexed by ``"nadzsjd"``. Rows that contain a missing
    or non-finite ratio are removed.
    """
    total = pd.read_csv(path_total, dtype={"nadzsjd": str}, index_col=0).set_index(
        "nadzsjd"
    )
    data = gpd.read_parquet(path).set_index("nadzsjd")

    data_total = data.join(total)

    # The first 12 columns of the census table are dropped by position.
    # NOTE(review): presumably identifier/metadata columns — confirm that all
    # input tables share this layout.
    data_relative = data_total.drop(data.columns[:12], axis=1)

    # Dispatch on the file name instead of the full path string, so the
    # table-specific handling still applies when ``path`` is a ``Path`` object
    # or the data directory is relocated.
    table = Path(path).name

    if table == "nadzsjd_households_2021.parquet":
        # This table stores the string "d" in some cells; rows containing it
        # are discarded before casting to float.
        # NOTE(review): "d" presumably marks suppressed values — confirm.
        data_relative = data_relative.replace("d", np.nan).dropna(axis=0)
        value_cols = data_relative.columns.drop("geometry")
        data_relative[value_cols] = data_relative[value_cols].astype(float)
    elif table == "nadzsjd_housing_size_facilities_2021.parquet":
        data_relative = data_relative.drop(
            columns=[
                "Průměrná plocha 1 obydleného bytu v m2 v domech celkem",
                "Počet obytných místností(4 m2 a více) obydlených bytů v domech celkem",
                "Celková plocha obydlených bytů v m2 v domech celkem",
            ]
        )
    elif table == "nadzsjd_housing_flats_2021.parquet":
        data_relative = data_relative.drop(columns="Neobydlené byty celkem")

    cols_to_normalize = data_relative.columns.drop(["Obyvatelstvo celkem", "geometry"])
    ratios = data_relative[cols_to_normalize].div(
        data_relative["Obyvatelstvo celkem"], axis=0
    )
    # A zero denominator yields +/-inf (or NaN for 0/0). ``dropna`` ignores
    # infinities, so turn them into NaN first; otherwise non-finite rows would
    # survive and reach the factor analysis.
    data_relative[cols_to_normalize] = ratios.replace([np.inf, -np.inf], np.nan)

    data_relative = data_relative.dropna(axis=0)

    return data_relative

In [None]:
# CSV of totals; process_file() reads it and joins it onto every census table
# using the "nadzsjd" key.
path_total = "/data/uscuni-restricted/04_spatial_census/total.csv"

In [None]:
# Census tables to process, in processing order. Every file follows the
# pattern nadzsjd_<topic>_2021.parquet inside the spatial census directory.
files = [
    f"/data/uscuni-restricted/04_spatial_census/nadzsjd_{topic}_2021.parquet"
    for topic in (
        "pop_age_gender",
        "pop_ea_gender",
        "housing_flats",
        "pop_residence_gender",
        "emp_type_age",
        "education",
        "pop_status_gender",
        "housing_houses",
        "emp_ea_age",
        "emp_employed",
        "households",
        "pop_nationality",
        "pop_religion_gender",
        "housing_size_facilities",
    )
]

In [None]:
# One processed table per census file, keyed by the file stem
# (file name without directory and extension).
dfs = {Path(file).stem: process_file(file, path_total) for file in files}

In [None]:
# Number of factors per table; every entry starts at 0 and is filled in by the
# eigenvalue analysis below.
fa_dict = dict.fromkeys(
    (
        "nadzsjd_pop_age_gender_2021",
        "nadzsjd_pop_ea_gender_2021",
        "nadzsjd_housing_flats_2021",
        "nadzsjd_pop_residence_gender_2021",
        "nadzsjd_emp_type_age_2021",
        "nadzsjd_education_2021",
        "nadzsjd_pop_status_gender_2021",
        "nadzsjd_housing_houses_2021",
        "nadzsjd_emp_ea_age_2021",
        "nadzsjd_emp_employed_2021",
        "nadzsjd_households_2021",
        "nadzsjd_pop_nationality_2021",
        "nadzsjd_pop_religion_gender_2021",
        "nadzsjd_housing_size_facilities_2021",
    ),
    0,
)

In [None]:
# For every table: draw a scree plot of the eigenvalues and derive the number
# of factors from the "eigenvalue > 1" rule.
for i in dfs:
    # Everything except the last two columns is analysed.
    # NOTE(review): presumably the trailing two columns are "geometry" and
    # "Obyvatelstvo celkem" — confirm the column order of every table.
    features = dfs[i].iloc[:, :-2]

    fa = FactorAnalyzer()
    fa.fit(features)
    ev, _ = fa.get_eigenvalues()

    # Explicit figure/axes so each plot is self-contained and can be closed.
    fig, ax = plt.subplots()
    ax.plot(range(1, len(ev) + 1), ev, marker="o")
    ax.axhline(1, color="r", linestyle="--")
    ax.set_title(f"Scree Plot for {i}")
    ax.set_xlabel("Factors")
    ax.set_ylabel("Eigenvalue")
    ax.grid()
    plt.show()
    plt.close(fig)

    # Count as a plain Python int so fa_dict holds ordinary integers instead
    # of NumPy scalars.
    suggested = int(np.count_nonzero(ev > 1))
    print(f"{i}: Suggested number of components (eigenvalue > 1): {suggested}")

    # Store one factor MORE than the eigenvalue > 1 rule suggests.
    fa_dict[i] = suggested + 1

fa_dict

In [None]:
# Fit a factor model per table with the chosen number of factors and write
# the factor scores (plus geometry) to one Parquet file per table.
for i, j in fa_dict.items():
    # Same feature slice as in the eigenvalue step: all but the last two columns.
    features = dfs[i].iloc[:, :-2]

    fa = FactorAnalysis(n_components=j, max_iter=10000)
    fa.fit(features)

    # Loadings: variables in rows, factors in columns.
    loadings = fa.components_.T

    # Readable loadings table: one row per factor, one column per variable.
    factor_labels = [f"Factor{k + 1}" for k in range(j)]
    loadings_df = pd.DataFrame(
        fa.components_, index=factor_labels, columns=features.columns
    )

    # Factor scores for every row, labelled <table>_Factor<n>.
    transformed = fa.transform(features)
    score_labels = [f"{i}_Factor{k + 1}" for k in range(transformed.shape[1])]
    scores = pd.DataFrame(transformed, index=features.index, columns=score_labels)
    df_fa = scores.set_geometry(dfs[i].geometry)

    # Save to Parquet
    df_fa.to_parquet(f"/data/uscuni-restricted/05_fa/{i}.parquet")

In [None]:
# Discover the per-table factor-score files. The merged output
# (merged_fa.parquet) is written into this same directory at the end of the
# notebook, so it is excluded here; otherwise a re-run would feed the merged
# file back into the merge. Sorted because glob order is not guaranteed.
all_files = sorted(
    f
    for f in glob("/data/uscuni-restricted/05_fa/*.parquet")
    if Path(f).name != "merged_fa.parquet"
)

In [None]:
# Explicit list of the per-table factor-score files, in the column order
# wanted for the merged table.
all_files = [
    f"/data/uscuni-restricted/05_fa/nadzsjd_{topic}_2021.parquet"
    for topic in (
        "pop_age_gender",
        "pop_ea_gender",
        "housing_flats",
        "pop_residence_gender",
        "emp_type_age",
        "education",
        "pop_status_gender",
        "housing_houses",
        "emp_ea_age",
        "emp_employed",
        "pop_nationality",
        "households",
        "pop_religion_gender",
        "housing_size_facilities",
    )
]

In [None]:
# Read every factor-score file, keeping the order of all_files.
dfs = [gpd.read_parquet(file) for file in all_files]

In [None]:
# Keep exactly one geometry column in the merged table: the first frame's.
first = dfs[0].set_geometry(dfs[0].geometry)

# Drop the geometry column from every other frame before concatenation.
# errors="ignore" keeps this cell safe to re-run: after the first run the
# frames in dfs[1:] no longer have a "geometry" column.
others = [df.drop(columns=["geometry"], errors="ignore") for df in dfs[1:]]
dfs = [first, *others]

# Concatenate along columns (rows are aligned on the index)
concat_df = pd.concat(dfs, axis=1)

# Reassign geometry column after concatenation
concat_df = gpd.GeoDataFrame(concat_df, geometry=dfs[0].geometry)

In [None]:
# Concatenate all datasets column-wise, keeping only the first frame's
# geometry. Any "geometry" column still present on the other frames is dropped
# here, so the result has a single geometry column regardless of whether the
# frames were already stripped, and it is explicitly a GeoDataFrame.
concat_df = gpd.GeoDataFrame(
    pd.concat(
        [dfs[0], *(df.drop(columns=["geometry"], errors="ignore") for df in dfs[1:])],
        axis=1,
    ),
    geometry=dfs[0].geometry,
)

In [None]:
# Persist the merged factor-score table. It is written into the same
# directory as the per-table factor files.
concat_df.to_parquet("/data/uscuni-restricted/05_fa/merged_fa.parquet")