In [153]:
import numpy as np
from numpy.linalg import norm
import os
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import plotly.express as px
import plotly.io as pio
import pandas as pd



In [28]:
# Importing necessary data

jumping_1_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/jumping_1.npy"
jumping_2_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/jumping_2.npy"
jumping_3_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/jumping_3.npy"
jumping_4_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/jumping_4.npy"
jumping_5_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/jumping_5.npy"

running_1_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/running_1.npy"
running_2_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/running_2.npy"
running_3_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/running_3.npy"
running_4_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/running_4.npy"
running_5_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/running_5.npy"

walking_1_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/walking_1.npy"
walking_2_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/walking_2.npy"
walking_3_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/walking_3.npy"
walking_4_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/walking_4.npy"
walking_5_train_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/train/walking_5.npy"

jumping_1_train = np.load(jumping_1_train_path)
jumping_2_train = np.load(jumping_2_train_path)
jumping_3_train = np.load(jumping_3_train_path)
jumping_4_train = np.load(jumping_4_train_path)
jumping_5_train = np.load(jumping_5_train_path)

running_1_train = np.load(running_1_train_path)
running_2_train = np.load(running_2_train_path)
running_3_train = np.load(running_3_train_path)
running_4_train = np.load(running_4_train_path)
running_5_train = np.load(running_5_train_path)

walking_1_train = np.load(walking_1_train_path)
walking_2_train = np.load(walking_2_train_path)
walking_3_train = np.load(walking_3_train_path)
walking_4_train = np.load(walking_4_train_path)
walking_5_train = np.load(walking_5_train_path)


In [49]:
arrays = [jumping_1_train, jumping_2_train, jumping_3_train, jumping_4_train, jumping_5_train, 
          running_1_train, running_2_train, running_3_train, running_4_train, running_5_train,
          walking_1_train, walking_2_train, walking_3_train, walking_4_train, walking_5_train]

x_train = np.hstack(arrays)

# Change shape to 1500 x 114
x_train = x_train.T
mean_train = np.mean(x_train, axis = 0)

# Center x_train
x_train = x_train - mean_train

print(x_train.shape)

(1500, 114)


In [58]:
# Testing number of modes

frob_norm_arr = np.zeros(5)

for num_modes in range(1, 6):
    pca = PCA(n_components = num_modes)
    x_train_reduced = pca.fit_transform(x_train)

    x_train_reconstructed = pca.inverse_transform(x_train_reduced)
    frobenius_norm = norm(x_train_reconstructed)/norm(x_train)

    print(f"Number of modes: {num_modes}, Frobenius norm: {frobenius_norm}")


Number of modes: 1, Frobenius norm: 0.6880121582479749
Number of modes: 2, Frobenius norm: 0.8524018928475736
Number of modes: 3, Frobenius norm: 0.9107513037069751
Number of modes: 4, Frobenius norm: 0.9387250708058195
Number of modes: 5, Frobenius norm: 0.9554013904039478


In [157]:
# Plotting cumulative energy
k_values = np.arange(1,115)
norms = np.zeros(114)

for k in k_values:
    pca = PCA(n_components = k)
    x_train_reduced = pca.fit_transform(x_train)

    x_train_reconstructed = pca.inverse_transform(x_train_reduced)
    frobenius_norm = norm(x_train_reconstructed)/norm(x_train)

    norms[k-1] = frobenius_norm

df = pd.DataFrame({
    "k-values": k_values,
    "norm": norms
})

fig = px.scatter(df, x = "k-values", y = "norm", title = "Frobenius norm for different numbers of modes")

pio.write_image(fig, "frob_norms.pdf")

fig.show()

In [73]:
# Perform PCA
pca = PCA(n_components=2)
coefficients = pca.fit_transform(x_train)

# Extract PC1, PC2, PC3
PC1, PC2 = coefficients[:, 0], coefficients[:, 1]

In [151]:
categories = np.zeros(500)
categories = np.append(categories, np.ones(500))
categories = np.append(categories, 2 * np.ones(500))

In [158]:
categories = np.zeros(500)
categories = np.append(categories, np.ones(500))
categories = np.append(categories, 2 * np.ones(500))

df = pd.DataFrame({
    "PC1": PC1,
    "PC2": PC2,
    "category": categories
})

label = {0.0: "jumping", 1.0: "running", 2.0: "walking"}
df["category"] = df["category"].map(label)

fig = px.scatter(df, x = "PC1", y = "PC2", color = "category", title = "2D PCA Projection")

pio.write_image(fig, "PCA_2d.pdf")

fig.show()

In [85]:
# Perform PCA
pca = PCA(n_components=3)
coefficients = pca.fit_transform(x_train)

# Extract PC1, PC2, PC3
PC1, PC2, PC3 = coefficients[:, 0], coefficients[:, 1], coefficients[:, 2]

In [159]:
df = pd.DataFrame({
    "PC1": PC1,
    "PC2": PC2,
    "PC3": PC3,
    "category": categories
})

label = {0.0: "jumping", 1.0: "running", 2.0: "walking"}
df["category"] = df["category"].map(label)

fig = px.scatter_3d(df, x = "PC1", y = "PC2", z = "PC3", color = "category", title = "3D PCA Projection",
                    opacity = 0.8,
                    size_max = 1)

fig.update_traces(marker_size = 3)

pio.write_image(fig, "PCA_3d.pdf")

fig.show()

In [128]:
pca = PCA(n_components = 5)
x_train_reduced = pca.fit_transform(x_train)

centroid_jump = np.mean(x_train_reduced[0:500], axis = 0)
centroid_run = np.mean(x_train_reduced[500:1000], axis = 0)
centroid_walk = np.mean(x_train_reduced[1000:1500], axis = 0)

centroids = np.array([centroid_jump, centroid_run, centroid_walk])

print(centroids)
    

[[ -23.88986635  499.36826149  -72.5000755    28.58077286    1.89232891]
 [  60.77197779 -752.7210869  -103.41194553    1.71007194  -43.51746561]
 [ -36.88211143  253.35282541  175.91202104  -30.2908448    41.62513671]]


In [161]:
k_values = np.arange(1, 115)
accuracies = []

for k in k_values:
    pca = PCA(n_components = k)
    x_train_pca = pca.fit_transform(x_train)

    centroid_jump = np.mean(x_train_pca[0:500], axis = 0)
    centroid_run = np.mean(x_train_pca[500:1000], axis = 0)
    centroid_walk = np.mean(x_train_pca[1000:1500], axis = 0)

    trained_labels = []

    for sample in x_train_pca:
        distance_jump = np.linalg.norm(sample - centroid_jump)
        distance_run = np.linalg.norm(sample - centroid_run)
        distance_walk = np.linalg.norm(sample - centroid_walk)
        assigned_label = np.argmin(np.array([distance_jump, distance_run, distance_walk]))
        trained_labels.append(assigned_label)

    trained_labels = np.array(trained_labels)

    accuracy = accuracy_score(categories, trained_labels)

    accuracies.append(accuracy)

df = pd.DataFrame({
    "k-values": k_values,
    "Accuracy": accuracies
})

fig = px.scatter(df, x = "k-values", y = "Accuracy", title = "Accuracy on training set for different number of modes")

pio.write_image(fig, "training_accuracy.pdf")

fig.show()

In [140]:
for k in range(20):
    print(f"Accuracy score with {k+1}-modes: {accuracies[k]}")

Accuracy score with 1-modes: 0.5073333333333333
Accuracy score with 2-modes: 0.8813333333333333
Accuracy score with 3-modes: 0.756
Accuracy score with 4-modes: 0.73
Accuracy score with 5-modes: 0.7506666666666667
Accuracy score with 6-modes: 0.726
Accuracy score with 7-modes: 0.8706666666666667
Accuracy score with 8-modes: 0.8753333333333333
Accuracy score with 9-modes: 0.8786666666666667
Accuracy score with 10-modes: 0.888
Accuracy score with 11-modes: 0.908
Accuracy score with 12-modes: 0.9093333333333333
Accuracy score with 13-modes: 0.91
Accuracy score with 14-modes: 0.9106666666666666
Accuracy score with 15-modes: 0.9106666666666666
Accuracy score with 16-modes: 0.9106666666666666
Accuracy score with 17-modes: 0.9106666666666666
Accuracy score with 18-modes: 0.9106666666666666
Accuracy score with 19-modes: 0.9106666666666666
Accuracy score with 20-modes: 0.9106666666666666


In [141]:
# Loading and preprocessing test data

jumping_test_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/test/jumping_1t.npy"
running_test_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/test/running_1t.npy"
walking_test_path = "/Users/trevorruggeri/Documents/AMATH482/hw2data/test/walking_1t.npy"

jump_test = np.load(jumping_test_path)
run_test = np.load(running_test_path)
walk_test = np.load(walking_test_path)


arrays = [jump_test, run_test, walk_test]

x_test = np.hstack(arrays)

# Change shape
x_test = x_test.T
mean_test = np.mean(x_test, axis = 0)

# Center x_test
x_test = x_test - mean_test

print(x_test.shape)

(300, 114)


In [166]:
categories = np.zeros(100)
categories = np.append(categories, np.ones(100))
categories = np.append(categories, 2 * np.ones(100))

In [165]:
k_values = np.arange(1, 115)
accuracies = []

for k in k_values:
    pca = PCA(n_components = k)
    x_test_pca = pca.fit_transform(x_test)

    centroid_jump = np.mean(x_test_pca[0:100], axis = 0)
    centroid_run = np.mean(x_test_pca[100:200], axis = 0)
    centroid_walk = np.mean(x_test_pca[200:300], axis = 0)

    trained_labels = []

    for sample in x_test_pca:
        distance_jump = np.linalg.norm(sample - centroid_jump)
        distance_run = np.linalg.norm(sample - centroid_run)
        distance_walk = np.linalg.norm(sample - centroid_walk)
        assigned_label = np.argmin(np.array([distance_jump, distance_run, distance_walk]))
        trained_labels.append(assigned_label)

    trained_labels = np.array(trained_labels)

    accuracy = accuracy_score(categories, trained_labels)

    accuracies.append(accuracy)

df = pd.DataFrame({
    "k-values": k_values,
    "Accuracy": accuracies
})

fig = px.scatter(df, x = "k-values", y = "Accuracy", title = "Accuracy on test set for different number of modes")

pio.write_image(fig, "test_accuracy.pdf")

fig.show()

In [148]:
for k in range(20):
    print(f"Accuracy score with {k+1}-modes: {accuracies[k]}")

Accuracy score with 1-modes: 0.5066666666666667
Accuracy score with 2-modes: 0.9333333333333333
Accuracy score with 3-modes: 1.0
Accuracy score with 4-modes: 1.0
Accuracy score with 5-modes: 1.0
Accuracy score with 6-modes: 1.0
Accuracy score with 7-modes: 1.0
Accuracy score with 8-modes: 1.0
Accuracy score with 9-modes: 1.0
Accuracy score with 10-modes: 1.0
Accuracy score with 11-modes: 1.0
Accuracy score with 12-modes: 1.0
Accuracy score with 13-modes: 1.0
Accuracy score with 14-modes: 1.0
Accuracy score with 15-modes: 1.0
Accuracy score with 16-modes: 1.0
Accuracy score with 17-modes: 1.0
Accuracy score with 18-modes: 1.0
Accuracy score with 19-modes: 1.0
Accuracy score with 20-modes: 1.0


In [168]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [211]:
categories = np.zeros(500)
categories = np.append(categories, np.ones(500))
categories = np.append(categories, 2 * np.ones(500))

In [213]:
k_values = np.arange(1, 21)
accuracy_results = []

for k in k_values:
    pca = PCA(n_components=k)
    x_pca = pca.fit_transform(x_train)

    X_train, X_test, y_train, y_test = train_test_split(x_pca, categories, test_size=0.2, random_state=42)

    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results.append(accuracy)

    print(f'Accuracy for k={k}: {accuracy_results[k-1]}')

df = pd.DataFrame({
    "k-value": k_values,
    "Accuracy": accuracy_results
})

fig = px.scatter(df, x = "k-value", y = "Accuracy", title = "Accuracy of Logistic Regression Classifier in k-mode PCA space")

pio.write_image(fig, "log_reg_train.pdf")

fig.show()

Accuracy for k=1: 0.16
Accuracy for k=2: 0.9033333333333333
Accuracy for k=3: 0.8933333333333333
Accuracy for k=4: 0.9033333333333333
Accuracy for k=5: 0.9333333333333333
Accuracy for k=6: 0.9533333333333334
Accuracy for k=7: 1.0
Accuracy for k=8: 1.0
Accuracy for k=9: 1.0
Accuracy for k=10: 1.0
Accuracy for k=11: 1.0
Accuracy for k=12: 1.0
Accuracy for k=13: 1.0
Accuracy for k=14: 1.0
Accuracy for k=15: 1.0
Accuracy for k=16: 1.0
Accuracy for k=17: 1.0
Accuracy for k=18: 1.0
Accuracy for k=19: 1.0
Accuracy for k=20: 1.0


In [184]:
categories = np.zeros(500)
categories = np.append(categories, np.ones(500))
categories = np.append(categories, 2 * np.ones(500))

In [214]:
pca = PCA(n_components = 2)
x_train_pca = pca.fit_transform(x_train)

model = LogisticRegression(max_iter=10000)
model.fit(x_train_pca, categories)

In [215]:
y_test = np.zeros(100)
y_test = np.append(y_test, np.ones(100))
y_test = np.append(y_test, 2 * np.ones(100))

x_test_pca = pca.fit_transform(x_test)

test_pred = model.predict(x_test_pca)
accuracy = accuracy_score(test_pred, y_test)

print(accuracy)

0.23333333333333334


In [216]:
import zipfile
import os

# Define the paths
notebook_name = "amath_482_homework_2.ipynb"  
output_zip_name = "amath_482_homework_2.zip" 

# Initialize a ZipFile object
with zipfile.ZipFile(output_zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # Add the notebook
    zipf.write(notebook_name)
    
    for file_name in os.listdir():
        if file_name != notebook_name and not file_name.endswith('.zip'):
            zipf.write(file_name)

print(f"Notebook and files zipped into: {output_zip_name}")

Notebook and files zipped into: amath_482_homework_2.zip


In [217]:
import numpy.linalg

In [220]:
A = np.array([[1, 1/2, 1/3], [1/2, 1/3, 1/4], [1/3, 1/4, 1/5]])
b = np.array([0, 1, 1])

A_inv = np.linalg.inv(A)

print(np.dot(A_inv, b))

[-6.00000000e+00  1.20000000e+01 -5.68434189e-14]
