In [None]:
from glob import glob
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler

In [None]:
def process_file(path, path_total):
    """Load a census table and return population-relative, standardized values.

    The table read from ``path`` is indexed by ``nadzsjd`` and joined with the
    table read from ``path_total``. Every remaining non-geometry column except
    ``"Obyvatelstvo celkem"`` is divided by ``"Obyvatelstvo celkem"`` and then
    standardized with ``StandardScaler``.

    Parameters
    ----------
    path
        Parquet source handed unchanged to ``gpd.read_parquet``; it must
        contain an ``nadzsjd`` column, a ``Cluster`` column and ``geometry``.
    path_total
        CSV source handed to ``pd.read_csv``; its first column becomes the
        index used for the join (presumably ``nadzsjd`` - verify against the
        file).

    Returns
    -------
    geopandas.GeoDataFrame
        Rows without missing or non-finite values, holding ``geometry``, the
        unscaled ``"Obyvatelstvo celkem"`` column and the standardized shares.
    """
    # Total population table, keyed by the first CSV column.
    total = pd.read_csv(path_total, dtype={"nadzsjd": str}, index_col=0)
    # Census data file, keyed by the same identifier.
    data = gpd.read_parquet(path).set_index("nadzsjd")
    # Attach the totals to every census row (left join on the index).
    data_total = data.join(total)

    # Discard the first 12 columns of the census table and the "Cluster" column.
    # NOTE(review): the 12 is positional - confirm it matches every input file.
    data_relative = data_total.drop(data.columns[:12], axis=1)
    data_relative = data_relative.drop(columns="Cluster")

    # Everything except the geometry is treated as numeric.
    value_cols = data_relative.columns.drop("geometry")
    data_relative[value_cols] = data_relative[value_cols].astype(float)

    # Express each count as a share of the total population.
    cols_to_normalize = data_relative.columns.drop(["Obyvatelstvo celkem", "geometry"])
    data_relative[cols_to_normalize] = data_relative[cols_to_normalize].div(
        data_relative["Obyvatelstvo celkem"], axis=0
    )

    # A zero total turns a non-zero count into +/-inf, which dropna() keeps.
    # Mark those as missing so the rows are dropped instead of reaching the
    # scaler as enormous finite numbers.
    data_relative[cols_to_normalize] = data_relative[cols_to_normalize].replace(
        [np.inf, -np.inf], np.nan
    )

    # Drop every row that still has a missing value in any column.
    data_relative = data_relative.dropna(axis=0)

    # Standardize the shares column by column.
    scaler = StandardScaler()
    data_relative[cols_to_normalize] = scaler.fit_transform(
        data_relative[cols_to_normalize].to_numpy()
    )

    return data_relative

In [None]:
# CSV that process_file reads as the total-population table and joins to the census data.
path_total = "/data/uscuni-restricted/04_spatial_census/total.csv"

In [None]:
# Collect every parquet file whose name contains "nadzsj".
# glob() returns matches in arbitrary, filesystem-dependent order; sorting keeps
# positional lookups such as path[8] stable between runs and machines.
path = sorted(glob("/data/uscuni-restricted/04_spatial_census/*nadzsj*.parquet"))
path

In [None]:
# Exclude the statni_obcanstvi_narodnost_2021 file from the inputs.
# Filtering instead of list.remove() keeps this cell safe to re-run:
# list.remove() raises ValueError once the entry is already gone.
path = [
    p
    for p in path
    if p
    != "/data/uscuni-restricted/04_spatial_census/nadzsjd_statni_obcanstvi_narodnost_2021.parquet"
]

In [None]:
# Re-display the list to check the result of the removal above.
path

In [None]:
# Inspect the file at position 9: index by nadzsjd, order by that index,
# and show everything from the 13th column onward.
a = gpd.read_parquet(path[9])
a = a.set_index("nadzsjd").sort_index()
a.iloc[:, 12:]

In [None]:
# Same inspection for the file at position 8 (rebinds `a`).
a = gpd.read_parquet(path[8]).set_index("nadzsjd")
a = a.sort_index()
a.iloc[:, 12:]

In [None]:
# Build the standardized table that the PCA cells below consume.
# NOTE(review): `path` is the list produced by glob, and process_file hands it
# straight to gpd.read_parquet - confirm that passing several files at once is intended.
data_relative = process_file(path, path_total)

In [None]:
# Fit a PCA with the default number of components on every column except the
# raw population total and the geometry; fit() returns the estimator itself.
pca = PCA().fit(data_relative.drop(columns=["Obyvatelstvo celkem", "geometry"]))
pca

In [None]:
# Cumulative explained variance against the number of retained components.
# The x values start at 1 so that the axis reads as a component count rather
# than a zero-based array position.
plt.figure(figsize=(20, 10))
plt.plot(
    np.arange(1, pca.explained_variance_ratio_.shape[0] + 1),
    np.cumsum(pca.explained_variance_ratio_),
    marker="o",
)
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative explained variance ratio")
plt.title("Cumulative variance explained by PCA");

In [None]:
# Smallest number of components whose cumulative explained variance reaches 80 %.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# argmax on a boolean array gives the zero-based position of the first True;
# add 1 to turn that position into a component count.
n_components_80 = int(np.argmax(cumulative_variance >= 0.80)) + 1

In [None]:
# Show the value derived from the 80 % cumulative-variance threshold.
n_components_80

In [None]:
# Refit, keeping 29 components, on the same feature columns as before.
# NOTE(review): 29 is hard-coded - presumably taken from the 80 % variance check; confirm.
pca = PCA(n_components=29).fit(
    data_relative.drop(columns=["Obyvatelstvo celkem", "geometry"])
)
pca

In [None]:
# Scale each component vector by the square root of its explained variance;
# `loadings` has one row per input column and one column per component.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

# One row per component (PC1, PC2, ...), one column per input feature.
loadings_df = pd.DataFrame(
    data=loadings.T,
    index=[f"PC{k}" for k in range(1, len(pca.explained_variance_) + 1)],
    columns=data_relative.columns.drop(["Obyvatelstvo celkem", "geometry"]),
)

# Display features as rows, coloured on a fixed -1..1 diverging scale.
loadings_df.T.style.background_gradient(cmap="RdBu", vmin=-1, vmax=1)

In [None]:
# Write the feature-by-component loadings table to a text file in the working directory.
loadings_df.T.to_csv("loadings_pca.txt")

In [None]:
# Project every row onto the fitted components.
transformed = pca.transform(
    data_relative.drop(columns=["Obyvatelstvo celkem", "geometry"])
)

# Re-attach the row index and the geometries to the component scores.
df_pca = pd.DataFrame(transformed, index=data_relative.index)
df_pca = df_pca.set_geometry(data_relative.geometry)

# The score columns are labelled with integers; convert all labels to strings
# before writing the table to Parquet.
df_pca.columns = df_pca.columns.map(str)
df_pca.to_parquet("/data/uscuni-restricted/05_pcs/pcs_29.parquet")

## Merge factors from all files together
Use this section when the analysis was applied to multiple separate files.

In [None]:
# NOTE(review): `dfs` starts out empty, so this cell raises until it is filled
# with the per-file frames to merge.
dfs = []
dfs = [frame.set_geometry(frame.geometry) for frame in dfs]

# Keep the geometry column on the first frame only, so the column-wise
# concatenation does not end up with several "geometry" columns.
dfs[1:] = [frame.drop(columns=["geometry"]) for frame in dfs[1:]]

# Join the frames side by side and rebuild a GeoDataFrame around the first
# frame's geometry.
concat_df = gpd.GeoDataFrame(pd.concat(dfs, axis=1), geometry=dfs[0].geometry)

In [None]:
# Concatenate all datasets
# NOTE(review): this rebinds concat_df, replacing the GeoDataFrame built in the
# previous cell with the direct result of pd.concat - confirm this cell is still needed.
concat_df = pd.concat(dfs, axis=1)

In [None]:
# Persist the merged table as a Parquet file.
concat_df.to_parquet("/data/uscuni-restricted/05_fa/merged_fa.parquet")