# Interpreting PCAs 


In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the preprocessed 2021 census CSV; the "nadzsjd" identifier is read as
# text so it can later serve as a join key without numeric coercion.
total = pd.read_csv(
    filepath_or_buffer="/data/uscuni-restricted/01_preprocessed_census/nadzsjd_vek_pohlavi_2021.csv",
    dtype=dict(nadzsjd=str),
)

In [None]:
# Keep only the two columns at positions 12 and 13 of the census table.
# NOTE(review): the later merge joins on "nadzsjd" and the normalisation divides
# by "Obyvatelstvo celkem", so presumably these two positions hold exactly those
# columns — confirm against the CSV schema, since a positional slice breaks
# silently if the column order changes.
total = total.iloc[:, 12:14]
# Display the reduced table for a visual check.
total

In [None]:
# Spatial census table (read with geopandas).
data = gpd.read_parquet(
    "/data/uscuni-restricted/04_spatial_census/nadzsjd_pop_status_gender_2021.parquet"
)

# Left join: every row of the spatial table is kept and the columns retained in
# `total` are attached by matching on the "nadzsjd" identifier.
data_total = pd.merge(left=data, right=total, how="left", on="nadzsjd")

In [None]:
# Remove the columns named like the first 13 columns of the spatial table,
# leaving the remaining columns of the merged frame.
data_census = data_total.drop(columns=data.columns[:13])

In [None]:
# Row-wise sum over the columns from position 12 up to (but excluding) the last two.
# NOTE(review): `data_sum` is not referenced again in the visible notebook.
data_sum = data_census.iloc[:, 12:-2].sum(axis="columns")

In [None]:
# Work on a geometry-free copy of the table for the numeric analysis below.
data_relative = data_census.drop("geometry", axis="columns")

In [None]:
# Turn every column except the population total into a share of that total
# (row-wise division by "Obyvatelstvo celkem").
cols_to_normalize = data_relative.columns.difference(["Obyvatelstvo celkem"])
data_relative[cols_to_normalize] = data_relative[cols_to_normalize].divide(
    data_relative["Obyvatelstvo celkem"],
    axis="index",
)

# Display the normalised table.
data_relative

In [None]:
# Discard every row that contains at least one missing value.
data_relative = data_relative.dropna(axis="index", how="any")

In [None]:
# Display the table after rows with missing values have been dropped.
data_relative

In [None]:
# Standardise all columns, then wrap the resulting array back into a DataFrame
# that keeps the original row index and column labels.
scaler = StandardScaler().fit(data_relative)
data_relative = pd.DataFrame(
    data=scaler.transform(data_relative),
    index=data_relative.index,
    columns=data_relative.columns,
)

In [None]:
# Display the standardised table.
data_relative

## Perform PCA

In [None]:
# Fit a 7-component PCA on every column except the last one.
# NOTE(review): the excluded last column is presumably the population total
# "Obyvatelstvo celkem" appended by the merge — confirm the column order.
pca = PCA(n_components=7).fit(data_relative.iloc[:, :-1])

# Share of the total variance captured by each component.
pca.explained_variance_ratio_

In [None]:
# Running total of explained variance across the components.
pca.explained_variance_ratio_.cumsum()

In [None]:
# Line plot of the cumulative explained variance, one marker per component.
plt.plot(pca.explained_variance_ratio_.cumsum(), marker="o")

In [None]:
# Loadings: component weights scaled by the square root of each component's
# explained variance. Result has one row per feature, one column per component.
loadings = np.sqrt(pca.explained_variance_) * pca.components_.T

# Labelled view for reading: rows are PC1..PCk, columns are the input features.
loadings_df = pd.DataFrame(
    data=loadings.T,
    columns=data_relative.columns[:-1],
    index=[f"PC{rank}" for rank in range(1, pca.n_components_ + 1)],
)

In [None]:
# Display the loadings table (components as rows, features as columns).
loadings_df

# Plot

In [None]:
# unused but required import for doing 3d projections with matplotlib < 3.2
import mpl_toolkits.mplot3d  # noqa: F401

# 3D scatter of the observations in the space of the first three components.
fig = plt.figure(1, figsize=(8, 6))
ax = fig.add_subplot(111, projection="3d", elev=-150, azim=110)

# Project with the PCA that is already fitted rather than calling
# fit_transform again: re-fitting on the same data is redundant and would
# replace the model the loadings were derived from.
X_reduced = pca.transform(data_relative.iloc[:, :-1])
scatter = ax.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    X_reduced[:, 2],
    s=40,
)

ax.set(
    title="First three PCA dimensions",
    xlabel="1st Eigenvector",
    ylabel="2nd Eigenvector",
    zlabel="3rd Eigenvector",
)
# Tick labels are hidden on all three axes; only the relative layout is shown.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])

plt.show()

In [None]:
# Component scores for every observation. Uses the already fitted PCA
# (transform, not fit_transform) so the scores stay tied to the same model
# the loadings were computed from and no redundant re-fit happens.
pca_scores = pca.transform(data_relative.iloc[:, :-1])

In [None]:
# PC1 against PC2 for every observation, with dashed grey zero lines so the
# four quadrants are easy to read.
plt.figure(figsize=(8, 6))
sns.scatterplot(x=pca_scores[:, 0], y=pca_scores[:, 1])
plt.axhline(0, linestyle="--", color="gray")
plt.axvline(0, linestyle="--", color="gray")
plt.title("PCA: PC1 vs PC2")
plt.ylabel("PC2")
plt.xlabel("PC1")
plt.show()

In [None]:
def biplot(pca_scores, feature_names):
    """Draw a PC1-vs-PC2 biplot: scores as points, feature loadings as arrows.

    Parameters
    ----------
    pca_scores : 2-D array
        Component scores; columns 0 and 1 are plotted as PC1 and PC2.
    feature_names : iterable of str
        One label per feature, in the same order as the rows of ``loadings``.

    Notes
    -----
    The arrow coordinates are read from the notebook-level ``loadings`` array
    (rows are features, columns are components), so that variable must be
    defined before the function is called. The function shows the figure and
    returns nothing.
    """
    plt.figure(figsize=(8, 6))
    plt.scatter(pca_scores[:, 0], pca_scores[:, 1], alpha=0.5)

    # Arrows are stretched so they are visible next to the score cloud; the
    # horizontal and vertical directions use different factors.
    scaling_factor1 = 10  # Adjust the scale of arrows
    scaling_factor2 = 20

    for i, feature in enumerate(feature_names):
        tip_x = loadings[i, 0] * scaling_factor1
        tip_y = loadings[i, 1] * scaling_factor2
        plt.arrow(
            0,
            0,
            tip_x,
            tip_y,
            color="red",
            alpha=1,
            head_width=1,
            head_length=1,
        )
        # Place the label slightly beyond the arrow tip.
        plt.text(
            tip_x * 1.1,
            tip_y * 1.1,
            s=feature,
            color="black",
            ha="center",
            va="center",
        )

    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.axhline(0, color="gray", linestyle="--")
    plt.axvline(0, color="gray", linestyle="--")
    plt.title("PCA Biplot (PC1 vs PC2)")
    plt.show()


# biplot takes (scores, feature names); passing the PCA object as an extra
# first argument raised a TypeError.
biplot(pca_scores, data_relative.iloc[:, :-1].columns)

# Save PCs

In [None]:
# Preview the component scores. The PCA was fitted without the last column,
# so the same column selection must be passed here; transforming the full
# frame fails because of the extra feature.
pca.transform(data_relative.iloc[:, :-1])

In [None]:
# One output column per principal component, numbered from 0.
# NOTE(review): the names contain a space after the underscore ("pca_ 0", ...);
# left untouched because downstream readers of the saved file may rely on it.
columns = [f"pca_ {i}" for i in range(pca.n_components_)]

# Component scores as a DataFrame that shares the row index of the input table.
df_pca = pd.DataFrame(
    data=pca.transform(data_relative.iloc[:, :-1]),
    index=data_relative.index,
    columns=columns,
)

In [None]:
# Display the table of principal-component scores.
df_pca

In [None]:
# Persist the principal-component scores as a parquet file.
df_pca.to_parquet(
    path="/data/uscuni-restricted/05_pcs/nadzsjd_pop_status_gender_2021.parquet"
)