# WK09 Prep

In [None]:
!wget -q https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/src/data_utils.py

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.decomposition import PCA

from data_utils import StandardScaler
from data_utils import object_from_json_url

# Classification / Clustering

In [None]:
## 1. Load Dataset
# URL of the wines dataset (JSON) in the course utils repository
WINE_FILE = "https://raw.githubusercontent.com/DM-GY-9103-2024F-H/9103-utils/main/datasets/json/wines.json"

# Fetch the JSON and build a DataFrame from its records
wines_data = object_from_json_url(WINE_FILE)
wines_df = pd.DataFrame.from_records(wines_data)

## 3. Normalize
# The scaler is fitted on the whole frame, so the "quality" column is passed
# through it together with the feature columns.
wine_scaler = StandardScaler()
wines_scaled = wine_scaler.fit_transform(wines_df)

# Everything except the "quality" column is treated as a feature
features = wines_scaled.drop(columns=["quality"])
# Cell output: covariance of every scaled column with "quality", ascending
wines_scaled.cov()["quality"].sort_values()

In [None]:
# Fit PCA (no n_components given) on the scaled wine features and keep the
# transformed coordinates; later cells plot columns 0 and 1 of wines_pcad.
wine_pca = PCA()
wines_pcad = wine_pca.fit_transform(features.values)

In [None]:
# Palette of ten hex colors; plotting cells index it with a class or cluster id.
colors = [
  '#1f77b4',
  '#ff7f0e',
  '#2ca02c',
  '#d62728',
  '#9467bd',
  '#8c564b',
  '#e377c2',
  '#7f7f7f',
  '#bcbd22',
  '#17becf',
]

In [None]:
# Scatter of two scaled features, one point per wine, colored by quality score.
x = features["alcohol"].values
y = features["density"].values

# Pick colors from the raw scores in wines_df. wines_scaled["quality"] went
# through the scaler with the rest of the frame, so int() on it would truncate
# the scaled values and wrap negative ones to the end of the palette
# (presumably standardized scores - confirm against data_utils.StandardScaler).
# The modulo keeps the index inside the palette whatever the score range is.
c = [colors[int(q) % len(colors)] for q in wines_df["quality"].values]

plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.7)
plt.xlabel("alcohol (scaled)")
plt.ylabel("density (scaled)")
plt.show()

In [None]:
# First two PCA coordinates of every wine, colored by quality score.
x = wines_pcad[:, 0]
y = wines_pcad[:, 1]

# Pick colors from the raw scores in wines_df. wines_scaled["quality"] went
# through the scaler with the rest of the frame, so int() on it would truncate
# the scaled values and wrap negative ones to the end of the palette
# (presumably standardized scores - confirm against data_utils.StandardScaler).
# The modulo keeps the index inside the palette whatever the score range is.
c = [colors[int(q) % len(colors)] for q in wines_df["quality"].values]

# Plot the PCAs
plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.7)
plt.xlabel("PC 0")
plt.ylabel("PC 1")
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Cluster the scaled wine features into 4 groups; "quality" is not part of
# `features`, so the grouping is driven by the feature columns only.
feature_matrix = features.values
km_model = KMeans(n_init=10, n_clusters=4)

# Fit the clustering model on the feature matrix
result = km_model.fit(feature_matrix)

# Cluster index predicted for each row of the same matrix the model was fitted on
predicted_scaled = km_model.predict(feature_matrix)

In [None]:
# Alcohol vs. density (scaled), one point per wine, colored by KMeans cluster index
x, y = features["alcohol"].values, features["density"].values
c = [colors[cluster] for cluster in predicted_scaled]

plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.5)
# Fixed window on both axes; points outside it are not shown
plt.xlim(-2, 3)
plt.ylim(-2, 3)
plt.show()

In [None]:
# First two PCA coordinates of every wine, colored by KMeans cluster index
x, y = wines_pcad[:, 0], wines_pcad[:, 1]
c = [colors[cluster] for cluster in predicted_scaled]

# Plot the PCAs
plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.5)
# Fixed window on both axes; points outside it are not shown
plt.xlim(-4, 4)
plt.ylim(-4, 4)
plt.show()


# Iris

In [None]:
from sklearn import datasets

# Load the iris dataset and reduce iris.data to 3 principal components
iris = datasets.load_iris()
iris_pca = PCA(n_components=3)
X_reduced = iris_pca.fit_transform(iris.data)

In [None]:
# Raw iris data: columns 0 and 1 of iris.data, colored by the class in iris.target
x, y = iris.data[:, 0], iris.data[:, 1]
c = [colors[int(label)] for label in iris.target]

plt.scatter(x, y, color=c, marker='o', linestyle='', alpha=0.5)
plt.show()

In [None]:
# First two PCA coordinates of the iris data and one color per sample,
# taken from the class in iris.target.
x = X_reduced[:, 0]
y = X_reduced[:, 1]
c = [colors[int(label)] for label in iris.target]

# Draw the same projection twice: first in a single color, then colored by
# class. The axes are PCA components of iris.data rather than raw measurements,
# so they are labeled "PC 0" / "PC 1" and not with a feature name.
for point_colors in (colors[0], c):
  plt.figure(figsize=(9, 6.75), dpi=150)
  plt.scatter(x, y, color=point_colors, marker='o', linestyle='', alpha=0.7)
  plt.xlabel("PC 0")
  plt.ylabel("PC 1")
  plt.show()

# WK F Prep

In [None]:
!wget -q https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/src/data_utils.py
!wget -q https://github.com/DM-GY-9103-2024F-H/9103-utils/raw/main/src/image_utils.py

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

from data_utils import PCA, RandomForestClassifier, StandardScaler, SVC
from data_utils import classification_error, object_from_json_url

from image_utils import make_image, open_image

## Distances

### Hilbert Curve

- https://pypi.org/project/hilbertcurve/
- https://github.com/galtay/hilbertcurve

In [None]:
def remap(v, nmax, nmin):
  """Linearly rescale the values of `v` onto the range between `nmin` and `nmax`.

  The smallest element of `v` maps to `nmin` and the largest to `nmax`.
  Note the argument order: the target for the maximum comes first.

  Args:
    v: NumPy array (needs array arithmetic plus .min() and .max()).
    nmax: output value assigned to v.max().
    nmin: output value assigned to v.min().

  Returns:
    Array with the same shape as `v`. When every element of `v` is equal,
    every output element is `nmin`.
  """
  lo = v.min()
  # max - min instead of v.ptp(): the ndarray.ptp method was removed in NumPy 2.0.
  span = v.max() - lo
  if span == 0:
    # Constant input has no range to stretch; avoid the 0/0 that would give NaNs.
    return (v - lo) + nmin
  return ((v - lo) / span) * (nmax - nmin) + nmin

In [None]:
# Read the face images: directories s1..s40, files 1.pgm..10.pgm in each.
# faces[k] holds the .pixels of one image, labels[k] its directory number.
faces, labels = [], []

for l in range(1, 41):
  for i in range(1, 11):
    faces.append(open_image(f"./data/imgs/att-faces/s{l}/{i}.pgm").pixels)
    labels.append(l)

## PCA

### Dimensionality Reduction

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# NOTE: from here on the name PCA refers to scikit-learn's class, replacing the
# PCA imported from data_utils earlier in the notebook.
from sklearn.decomposition import PCA

# 500 random 2-D points; column 1 then gets a scaled copy of column 0 added,
# which makes the two columns correlated
X = np.random.normal([0, 0], [1,0.71], size=(500,2))
X[:, 1] += X[:, 0] / 1.5

# Reduce to a single component, then map the result back into 2-D
pca = PCA(n_components=1)
Xt = pca.fit_transform(X)
Xi = pca.inverse_transform(Xt)

# Three overlays: the original points, their column-0 values drawn along y = 0,
# and the points recovered from the one-component PCA (red)
axis_limits = [-4, 4]
plt.scatter(X[:, 0], X[:, 1], s=2)
plt.scatter(X[:, 0], [0] * len(X), s=2, c='#7DF9FF')
plt.scatter(Xi[:, 0], Xi[:, 1], s=2, c='r')
plt.xlim(axis_limits)
plt.ylim(axis_limits)
plt.show()

In [None]:
# 500 random 3-D points; columns 1 and 2 each get a scaled copy of column 0
# added, which makes all three columns correlated
X = np.random.normal([0, 0, 0], [1, 0.8, 0.8], size=(500, 3))
X[:, 1] += X[:, 0] / 1.25
X[:, 2] += X[:, 0] / 0.80

# Reduce to a single component, then map the result back into 3-D
pca = PCA(n_components=1)
Xt = pca.fit_transform(X)
Xi = pca.inverse_transform(Xt)

# Original points plus the points recovered from the one-component PCA (red)
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')
ax.scatter(*X.T, s=3)
ax.scatter(*Xi.T, s=3, c='r')
for set_limits in (ax.set_xlim, ax.set_ylim, ax.set_zlim):
  set_limits((-4, 4))
plt.show()

In [None]:
import json

# Turn each row of X into an {"X", "Y", "Z"} record rounded to 5 decimals
pca3d = []
for point in X:
  pca3d.append({"X": round(point[0], 5), "Y": round(point[1], 5), "Z": round(point[2], 5)})

# Write the records as JSON next to the notebook
with open("./pca3d.json", "w") as out_file:
  json.dump(pca3d, out_file)

### PCA as Decomposition

In [None]:
# Small 4x3 table used to show PCA as a decomposition: one score per row times
# the component vector, plus the column means, is compared against X_df.
X_df = pd.DataFrame([[8,4,6],[10,6,8],[15,7,13],[20,11,15]], columns=["W","L","H"])

pca = PCA(n_components=1)
X_pca = pca.fit_transform(X_df)

# An earlier cell re-imports PCA from sklearn.decomposition, and that class
# returns a plain ndarray from fit_transform, which has no .values attribute.
# np.asarray() accepts both an ndarray and a DataFrame, so this cell runs with
# whichever PCA is currently bound.
X_pca_values = np.asarray(X_pca)

print(X_pca_values, "\n\n", X_df.mean(), "\n\n", pca.components_)

# Cell output: scores @ components + column means
X_pca_values @ pca.components_ + X_df.mean().values

### ATT Faces

In [None]:
# Keep 10 components here.
# Alternative: n_components=0.80 keeps 80% of the variation
pca = PCA(n_components=10)
pca.fit(faces)

# Total share of the variance covered by the kept components
print(sum(pca.explained_variance_ratio_))

components = pca.components_
# Reduced representation of every face, and the faces rebuilt from it
latent = pca.transform(faces)
pfaces = pca.inverse_transform(latent)

print(latent.shape, pfaces.shape)

# Show the first two components as images.
# NOTE(review): remap's signature is (v, nmax, nmin), so this call maps each
# component's minimum to 255 and its maximum to 0 - confirm the inversion is intended.
for pc in components[:2]:
  display(make_image(remap(pc, 0, 255), width=92))

In [None]:
# First face as loaded, followed by the same face rebuilt from its PCA representation
display(make_image(list(faces[0]), width=92))
display(make_image(list(pfaces[0]), width=92))

In [None]:
# Random face: draw every latent coordinate from a normal distribution with
# that coordinate's mean and standard deviation across `latent`, then map the
# sample back to pixel space with the fitted PCA.
latent_mean = latent.mean(axis=0)
latent_std = latent.std(axis=0)
fake_latent = np.random.normal(latent_mean, latent_std).reshape(1, -1)
fake_face = pca.inverse_transform(fake_latent)
display(make_image(list(fake_face[0]), width=92))

In [None]:
# First three latent coordinates of every face
x, y, z = latent[:, 0], latent[:, 1], latent[:, 2]
# 1..40 with each value repeated 10 times; the plots in this cell color by `labels`
ccs = [subject for subject in range(1, 41) for _ in range(10)]

# Shared point styling: color comes from `labels` through the "tab10" colormap
point_style = dict(c=labels, marker='o', linestyle='', alpha=1, cmap="tab10")

# 2-D view: first two components
plt.scatter(x, y, **point_style)
plt.title("Principal Components")
plt.xlabel("PC 0")
plt.ylabel("PC 1")
plt.show()

# 3-D view: first three components
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(projection='3d')
ax.scatter(x, y, z, **point_style)
ax.set(title="Principal Components", xlabel="PC 0", ylabel="PC 1", zlabel="PC 2")
plt.show()

### Classify with PCA

In [None]:
import random

# Fixed seed so the shuffle, and with it the train/test membership, is the
# same on every run of the notebook.
SPLIT_SEED = 0

# Shuffle (features, label) pairs together so each face keeps its label, then
# hold out the first quarter for testing and train on the rest.
shuffled = random.Random(SPLIT_SEED).sample(list(zip(latent, labels)), len(latent))
n_test = len(shuffled) // 4
train = shuffled[n_test:]
test = shuffled[:n_test]

train_feats = [feats for feats, _ in train]
train_labels = [label for _, label in train]

test_feats = [feats for feats, _ in test]
test_labels = [label for _, label in test]

# Cell output: sizes of the two splits
len(train), len(test)

In [None]:
## 5. Create a Classifier object
# The name `quality_model` does not describe this use: the model is trained on
# the face features below. It is kept because the next cell refers to it.
quality_model = RandomForestClassifier()

# Fit the classifier on the training split: PCA features of the face images,
# labeled with the image's directory number (1-40)
result = quality_model.fit(train_feats, train_labels)

## 6. Run the model on the training data
train_predicted = quality_model.predict(train_feats)

## 7. Measure accuracy on the training data (cell output)
accuracy_score(train_labels, train_predicted)

In [None]:
## 6. Run the model on the held-out test data
test_predicted = quality_model.predict(test_feats)

## 7. Measure accuracy on the test data (cell output)
accuracy_score(test_labels, test_predicted)