In [None]:
!wget -q https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/src/data_utils.py

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from data_utils import PCA, StandardScaler, object_from_json_url

### Why PCA

In [None]:
# Location of the penguin dataset (JSON) in the course utilities repository.
PENGUIN_URL = (
  "https://raw.githubusercontent.com/DM-GY-9103-2024F-H/9103-utils/refs/heads/main/datasets/json/penguins.json"
)

# Download and parse the JSON, then build a DataFrame from the parsed records.
penguin_data = object_from_json_url(PENGUIN_URL)
penguins_df = pd.DataFrame.from_records(data=penguin_data)

# Bare last expression: the frame is rendered as the cell's output.
penguins_df

### Penguin Example

Explore the penguin data.

Let's encode the species column into integers.
It's a simple encoding, so we can just do it manually using a function and the `Series.apply()` method (a single `DataFrame` column is a `Series`).

In [None]:
species_column = penguins_df["species"]

# Distinct species values; a value's position in this list is its integer label.
species = list(species_column.unique())

def species_to_label(s):
  """Return the integer label of species value `s`: its index in `species`.

  Raises ValueError (from list.index) when `s` is not in `species`.
  """
  return species.index(s)

# Apply the encoder element-wise and store the result as a new "label" column.
penguins_df["label"] = species_column.apply(species_to_label)

penguins_df

### Plot the Data

If we're trying to get some insight about our data, we can look at covariance tables and plots of all of the possible pairs of features.

In [None]:
# Feature-only frame: everything except the species column and its integer label.
penguins_features_df = penguins_df.drop(columns=["label", "species"])

# One scatter plot per unordered pair of distinct feature columns. Pairing each
# column only with the columns that come after it avoids mirrored duplicates
# and self-pairs. Points are coloured by the integer species label.
feature_names = list(penguins_features_df.columns)
for x_pos, x_name in enumerate(feature_names):
  for y_name in feature_names[x_pos + 1:]:
    plt.scatter(penguins_features_df[x_name], penguins_features_df[y_name], c=penguins_df["label"])
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.show()

### PCA

We can try to simplify this data by performing `PCA` and combining some of the original features into _principal components_.

In [None]:
# Scaler from data_utils; it is fitted on the feature columns in the next cell.
# NOTE(review): presumably standardizes each feature (zero mean, unit variance) — confirm in data_utils.
penguinScaler = StandardScaler()
# Ask for 3 components: the 2D plot below uses the first two PCA columns and
# the 3D plot uses all three.
penguinPCA = PCA(n_components=3)

In [None]:
# Fit the scaler on the feature columns and transform them in a single call.
penguins_scaled_df = penguinScaler.fit_transform(penguins_features_df)
# Fit PCA on the scaled features (not the raw ones) and transform them.
penguins_pca_df = penguinPCA.fit_transform(penguins_scaled_df)

# Bare last expression, so the return value is shown as the cell's output.
# NOTE(review): presumably the share of variance captured by each component — confirm in data_utils.
penguinPCA.explained_variance()

### Covariances

Now that we have scaled data, we can look at the covariance tables of both the scaled features and the `PCA` output.

In [None]:
# Show the covariance table (rounded to 3 decimals) of the scaled features
# first, then of the PCA output.
for frame in (penguins_scaled_df, penguins_pca_df):
  display(frame.cov().round(3))

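Hmmm... the covariances between different `PCA` components (the off-diagonal entries of the second table) are all $0$ !! Only the diagonal, which holds each component's variance, is non-zero.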

But that's expected. `PCA` separates our data into new features that are linear combinations of the original features, but that are themselves uncorrelated with each other.

### Plots

In [None]:
# Column labels of the PCA output; also used by the 3D plot cell below.
pca_column_names = penguins_pca_df.columns

# Scatter the first PCA column against the second, coloured by species label.
pc_x, pc_y = pca_column_names[0], pca_column_names[1]
plt.scatter(penguins_pca_df[pc_x], penguins_pca_df[pc_y], c=penguins_df["label"])
plt.xlabel(pc_x)
plt.ylabel(pc_y)
plt.show()

### 3D

In [None]:
# 3D scatter of the first three PCA columns, coloured by species label.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')

# One PCA column per axis, in column order.
pc_x, pc_y, pc_z = pca_column_names[0], pca_column_names[1], pca_column_names[2]

ax.scatter(penguins_pca_df[pc_x], penguins_pca_df[pc_y], penguins_pca_df[pc_z],
           c=penguins_df["label"])
ax.set_xlabel(pc_x)
ax.set_ylabel(pc_y)
ax.set_zlabel(pc_z)
plt.show()