In [None]:
from glob import glob
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler

In [None]:
def process_file(path, path_total):
    """Load census counts, turn them into population shares and standardise them.

    Parameters
    ----------
    path : str or path-like
        Parquet file read with ``gpd.read_parquet``. It must contain a
        ``nadzsjd`` column (used as the index) and a ``geometry`` column.
    path_total : str or path-like
        CSV file read with ``pd.read_csv``. It must contain a ``nadzsjd``
        column, which is parsed as ``str`` and used as the join key.
        Presumably this file supplies "Obyvatelstvo celkem" (total
        population) -- TODO confirm against the data.

    Returns
    -------
    GeoDataFrame-like
        Indexed by ``nadzsjd``. Every column except "Obyvatelstvo celkem"
        and "geometry" holds the z-scored share of the total population.
        Rows with any missing, suppressed ("d") or non-finite value are
        removed.
    """
    # Open data for total population. The first CSV column is consumed as a
    # throw-away index and immediately replaced by "nadzsjd".
    total = pd.read_csv(path_total, dtype={"nadzsjd": str}, index_col=0).set_index(
        "nadzsjd"
    )
    # Open data files
    data = gpd.read_parquet(path).set_index("nadzsjd")
    # Merge data (index-on-index join, keeps every row of `data`)
    data_total = data.join(total)
    # Remove unnecessary columns: the first 12 columns of the parquet file
    # (counted after "nadzsjd" became the index).
    data_relative = data_total.drop(data.columns[:12], axis=1)
    # "d" marks a value that cannot be used as a number; such rows are
    # dropped together with rows that already contain missing values.
    data_relative = data_relative.replace("d", np.nan).dropna(axis=0)
    value_cols = data_relative.columns.drop("geometry")
    data_relative[value_cols] = data_relative[value_cols].astype(float)

    # Normalize the data: express every count as a share of total population
    cols_to_normalize = data_relative.columns.drop(["Obyvatelstvo celkem", "geometry"])
    shares = data_relative[cols_to_normalize].div(
        data_relative["Obyvatelstvo celkem"], axis=0
    )
    # A zero total yields NaN (0/0) or +/-inf (x/0). dropna() only removes
    # NaN and StandardScaler rejects infinities, so map inf to NaN first.
    data_relative[cols_to_normalize] = shares.replace([np.inf, -np.inf], np.nan)

    # Drop rows whose shares are undefined
    data_relative = data_relative.dropna(axis=0)

    # Standardise every share to zero mean and unit variance
    scaler = StandardScaler()
    data_relative[cols_to_normalize] = scaler.fit_transform(
        data_relative[cols_to_normalize]
    )

    return data_relative

In [None]:
# CSV passed to process_file() as `path_total` (read there with pd.read_csv).
path_total = "/data/uscuni-restricted/04_spatial_census/total.csv"

In [None]:
# Parquet file passed to process_file() as `path` (read there with
# gpd.read_parquet). The file name suggests merged 2021 census data.
file = "/data/uscuni-restricted/04_spatial_census_2/_merged_census_2021.parquet"

In [None]:
# Build the standardised table used by every factor-analysis cell below.
data_relative = process_file(file, path_total)

In [None]:
# Exploratory fit with factor_analyzer's default settings; this model is only
# used to obtain eigenvalues for the scree plot.
# NOTE: `fa` is rebound to a scikit-learn FactorAnalysis further down, so this
# cell must be re-run before the eigenvalue cell when executing out of order.
fa = FactorAnalyzer()
fa.fit(data_relative.drop(["Obyvatelstvo celkem", "geometry"], axis=1))

In [None]:
# get_eigenvalues() returns a pair; only the first element is used below.
# Index the tuple instead of unpacking into `_`: rebinding `_` in IPython
# stops the automatic "last output" variables from being updated.
ev = fa.get_eigenvalues()[0]

In [None]:
# Scree plot: eigenvalue of each factor, numbered from 1
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(ev) + 1), ev, marker="o")

# Reference line at eigenvalue == 1 (the cut-off applied in the next cell)
ax.axhline(1, color="r", linestyle="--")

ax.set_xlabel("Factors")
ax.set_ylabel("Eigenvalue")
ax.grid()
plt.show()

In [None]:
# Count the factors whose eigenvalue exceeds 1
suggested = (ev > 1).sum()
print(f" Suggested number of components (eigenvalue > 1): {suggested}")

In [None]:
# Fit the final scikit-learn model. This rebinds `fa`, replacing the
# exploratory FactorAnalyzer instance used for the scree plot.
fa = FactorAnalysis(max_iter=10000, n_components=35).fit(
    data_relative.drop(["Obyvatelstvo celkem", "geometry"], axis=1)
)

# Loadings with one row per input variable and one column per factor
loadings = np.transpose(fa.components_)

# Readable table: one row per factor, one column per input variable
# (this is components_ itself, i.e. the transpose of `loadings`).
loadings_df = pd.DataFrame(
    fa.components_,
    columns=data_relative.columns.drop(["Obyvatelstvo celkem", "geometry"]),
)
loadings_df

In [None]:
# Project every row onto the fitted factors (one score column per factor)
transformed = fa.transform(
    data_relative.drop(columns=["Obyvatelstvo celkem", "geometry"])
)

# Dropping columns never changes the row index, so data_relative.index labels
# the scores directly. DataFrame.set_geometry is provided by geopandas and
# returns a GeoDataFrame.
df_fa = pd.DataFrame(transformed, index=data_relative.index).set_geometry(
    data_relative.geometry
)

# Factor columns are integers by default; convert all labels to strings
# before writing to Parquet.
df_fa.columns = df_fa.columns.astype(str)

# Save to Parquet. The factor count in the file name is taken from the score
# matrix so it cannot drift from the fitted model when n_components changes
# (35 factors -> "fa_new35.parquet").
n_factors = transformed.shape[1]
df_fa.to_parquet(f"/data/uscuni-restricted/05_fa/fa_new{n_factors}.parquet")