In [4]:
#ex1

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

N = 1000
u = [0, 0, 0]
o = [1, 3, 0]

x1 = np.random.normal(u[0], o[0], N)
x2 = x1 + np.random.normal(u[1], o[1], N)
x3 = 2 * x1 + x2

data = np.column_stack((x1, x2, x3))

cov_matrix = np.cov(data, rowvar=False)

eigenvalues_cov, eigenvectors_cov = np.linalg.eig(cov_matrix)

u_svd, s_svd, vt_svd = np.linalg.svd(data - np.mean(data, axis=0))

print("Eigenvectors match:", np.allclose(np.abs(eigenvectors_cov), np.abs(vt_svd.T[:, :3])))
print("Eigenvalues match:", np.allclose(eigenvalues_cov, s_svd ** 2 / (N - 1)))

explained_variance_ratio = np.cumsum(s_svd ** 2) / np.sum(s_svd ** 2)

num_components_99 = np.argmax(explained_variance_ratio >= 0.99) + 1

pca_result = np.dot(data - np.mean(data, axis=0), u_svd[:, :num_components_99].T)

fig, axs = plt.subplots(2, 3, figsize=(15, 10))

axs[0, 0].scatter(data[:, 0], data[:, 1], alpha=0.5)
axs[0, 1].scatter(data[:, 0], data[:, 2], alpha=0.5)
axs[0, 2].scatter(data[:, 1], data[:, 2], alpha=0.5)

axs[1, 0].scatter(pca_result[:, 0], pca_result[:, 1], alpha=0.5)
axs[1, 1].scatter(pca_result[:, 0], pca_result[:, 2], alpha=0.5)
axs[1, 2].scatter(pca_result[:, 1], pca_result[:, 2], alpha=0.5)

plt.show()


Eigenvectors match: False
Eigenvalues match: False


ValueError: shapes (1000,3) and (2,1000) not aligned: 3 (dim 1) != 2 (dim 0)

In [5]:
#ex2
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

N = 1000
u = [0, 0, 0]
o = [1, 3, 0]

x1 = np.random.normal(u[0], o[0], N)
x2 = x1 + np.random.normal(u[1], o[1], N)
x3 = 2 * x1 + x2

data_original = np.column_stack((x1, x2, x3))

num_noise_variables = 10
noise_std = 0.05  # A smaller standard deviation for noise

noise = np.random.normal(0, noise_std, size=(N, num_noise_variables))

data_with_noise = np.column_stack((data_original, noise))

cov_matrix_with_noise = np.cov(data_with_noise, rowvar=False)

eigenvalues_with_noise, eigenvectors_with_noise = np.linalg.eig(cov_matrix_with_noise)

u_svd_with_noise, s_svd_with_noise, vt_svd_with_noise = np.linalg.svd(data_with_noise - np.mean(data_with_noise, axis=0))

print("Eigenvectors match:", np.allclose(np.abs(eigenvectors_with_noise), np.abs(vt_svd_with_noise.T)))
print("Eigenvalues match:", np.allclose(eigenvalues_with_noise, s_svd_with_noise ** 2 / (N - 1)))

explained_variance_ratio_with_noise = np.cumsum(s_svd_with_noise ** 2) / np.sum(s_svd_with_noise ** 2)

num_components_99_with_noise = np.argmax(explained_variance_ratio_with_noise >= 0.99) + 1

pca_result_with_noise = np.dot(data_with_noise - np.mean(data_with_noise, axis=0), u_svd_with_noise[:, :num_components_99_with_noise])

fig, axs = plt.subplots(2, 3, figsize=(15, 10))

# Original Basis
axs[0, 0].scatter(data_original[:, 0], data_original[:, 1], alpha=0.5)
axs[0, 1].scatter(data_original[:, 0], data_original[:, 2], alpha=0.5)
axs[0, 2].scatter(data_original[:, 1], data_original[:, 2], alpha=0.5)
axs[0, 0].set_title("Original Basis")

# New Basis with Noise
axs[1, 0].scatter(pca_result_with_noise[:, 0], pca_result_with_noise[:, 1], alpha=0.5)
axs[1, 1].scatter(pca_result_with_noise[:, 0], pca_result_with_noise[:, 2], alpha=0.5)
axs[1, 2].scatter(pca_result_with_noise[:, 1], pca_result_with_noise[:, 2], alpha=0.5)
axs[1, 0].set_title("New Basis with Noise")

plt.show()


Eigenvectors match: False
Eigenvalues match: False


ValueError: shapes (1000,13) and (1000,2) not aligned: 13 (dim 1) != 1000 (dim 0)

In [None]:
#3
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

data = pd.read_csv('data/magic04.data', header=None)
X = data.iloc[:, :-1]
y = data.iloc[:, -1]
X_standardized = StandardScaler().fit_transform(X)
pca = PCA()
X_pca = pca.fit_transform(X_standardized)

explained_variance_ratio = pca.explained_variance_ratio_

plt.plot(np.cumsum(explained_variance_ratio))
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance Ratio')
plt.show()

cumulative_variance = np.cumsum(explained_variance_ratio)
num_components = np.argmax(cumulative_variance >= 0.95) + 1

pca = PCA(n_components=num_components)
X_pca = pca.fit_transform(X_standardized)

columns = [f'PC{i}' for i in range(1, num_components + 1)]
df_pca = pd.DataFrame(data=X_pca, columns=columns)
df_final = pd.concat([df_pca, y], axis=1)

print(df_final.head())
