In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib
import matplotlib.pyplot as plt
from numpy.matlib import repmat
from sklearn.preprocessing import normalize

In [2]:
def cov(x, y):
    """Sample covariance of two equally long 1-D arrays.

    Both inputs are centred on their own mean, multiplied element-wise,
    summed, and divided by ``len(x) - 1``.
    """
    dev_x = x - x.mean()
    dev_y = y - y.mean()
    dof = len(x) - 1
    return np.sum(dev_x * dev_y) / dof

In [3]:
def cov_mat(X):
    """Sample covariance matrix of the variables stored in the rows of ``X``.

    Entry ``(i, j)`` is ``sum((X[i] - mean(X[i])) * (X[j] - mean(X[j]))) / (n - 1)``
    where ``n`` is the number of columns, i.e. the same formula as ``cov``
    applied to every pair of rows.

    Parameters
    ----------
    X : array_like, shape (k, n)
        ``k`` variables (rows), each observed ``n`` times (columns).

    Returns
    -------
    ndarray, shape (k, k)
        For ``k == 2`` this is the same 2x2 matrix the previous hard-coded
        version produced; additional rows are no longer silently ignored.
    """
    X = np.asarray(X, dtype=float)
    # Centre each row on its own mean; keepdims makes the subtraction broadcast.
    centered = X - X.mean(axis=1, keepdims=True)
    # One matrix product yields every pairwise sum of products at once.
    return centered @ centered.T / (X.shape[1] - 1)

In [4]:
def eigsort(V, eigvals):
    """Reorder eigenpairs so the eigenvalues run from largest to smallest.

    Parameters
    ----------
    V : array_like, shape (R, M)
        Column ``i`` is the eigenvector paired with ``eigvals[i]``.
    eigvals : array_like, shape (M,)
        Eigenvalues in arbitrary order.

    Returns
    -------
    Vsort : ndarray, shape (R, M)
        Columns of ``V`` rearranged to follow the sorted eigenvalues. The
        dtype of ``V`` is kept, so complex eigenvectors stay complex.
    Dsort : ndarray, shape (M, M)
        Diagonal matrix holding the eigenvalues in descending order.
    """
    V = np.asarray(V)
    eigvals = np.asarray(eigvals)

    # Sort the eigenvalues from largest to smallest: argsort is ascending,
    # so flip it to get the descending order.
    index = np.flip(np.argsort(eigvals))
    lambd = eigvals[index]
    Dsort = np.diag(lambd)

    # Pick the eigenvector columns in the same order. Indexing (instead of
    # copying into a pre-allocated float M x M array) preserves dtype and
    # works when V has a different number of rows than columns.
    Vsort = V[:, index]
    return Vsort, Dsort

In [5]:
# normc(Mat) rescales every column of Mat to unit L2 length.

def normc(Mat):
    """Return ``Mat`` with each column rescaled by its L2 norm.

    Delegates to ``sklearn.preprocessing.normalize`` with ``axis=0``.
    """
    unit_columns = normalize(Mat, axis=0, norm='l2')
    return unit_columns

In [6]:
def PCA(x):
    """Eigen-decompose the Gram matrix ``A.T @ A`` of the row-centred data.

    Every row of ``x`` has its own mean (taken along axis 1) subtracted, and
    the symmetric matrix ``A.T @ A`` -- of size ``x.shape[1]`` squared -- is
    diagonalised.

    NOTE(review): an earlier comment described ``x`` as (N observations,
    D features). The arithmetic below instead centres each ROW, i.e. it
    treats rows as variables and columns as observations. Confirm the
    intended layout against the caller before relying on either reading.

    Parameters
    ----------
    x : ndarray, shape (R, C)
        2-D data array.

    Returns
    -------
    V : ndarray
        Eigenvectors of ``A.T @ A`` as columns, ordered by ``eigsort``
        (largest eigenvalue first).
    D : ndarray
        Diagonal matrix of the eigenvalues in the same order.
    mean_white : ndarray, shape (R,)
        Mean of each row of ``x``.
    A : ndarray, shape (R, C)
        ``x`` with each row's mean removed.
    """
    mean_white = np.mean(x, axis=1)
    # Broadcasting a column vector gives the same result as tiling the mean
    # with numpy.matlib.repmat, without depending on the deprecated
    # numpy.matlib module.
    A = x - mean_white[:, np.newaxis]

    # eigendecomposition: A.T @ A is symmetric, so use eigh, which returns
    # real eigenvalues and orthonormal eigenvectors (general eig gives no
    # such guarantee). Eigenvector signs are arbitrary either way.
    eigValues, eigVectors = np.linalg.eigh(np.dot(A.T, A))

    # sort eigenpairs from largest to smallest eigenvalue
    V, D = eigsort(eigVectors, eigValues)

    return V, D, mean_white, A

In [7]:
# def PCA(x):
#     # data set x of shape (N, D) where N is the number of observations and D is the number of variables or features
#     m = np.mean(x, axis=0)
#     m = m.reshape(12, 1)
#     x_centered = x.T - m
    
#     #calculate covariance
#     covmatrix = cov_mat(x_centered.T)
    
#     #eigendecomposition
#     eigenvals, eigenvecs = np.linalg.eig(covmatrix)
    
#     #sort 
#     eigenvecs, eigenvals = eigsort(eigenvecs, eigenvals)
    
#     return (eigenvecs, eigenvals, m, x_centered)
    

In [8]:
# Read the semicolon-separated CSV and keep only its values as a NumPy array.
wine_white = pd.read_csv('winequality-white.csv', sep=';').to_numpy()

In [9]:
# Decompose the wine array; PCA returns the 4-tuple (V, D, mean_white, A).
PCA_wine_white = PCA(wine_white)

In [10]:
# Unpack the PCA result in return order. Note that `eigenvals` receives the
# diagonal eigenvalue MATRIX (second return value), not a 1-D vector.
eigenvecs, eigenvals, m, x_centered = PCA_wine_white
# One shape per line, same order as the unpacking above.
print(eigenvecs.shape, eigenvals.shape, m.shape, x_centered.shape, sep='\n')

(12, 12)
(12, 12)
(4898,)
(4898, 12)


In [11]:
# Multiply the centred data by the sorted eigenvectors, then rescale every
# resulting column with normc.
U = normc(np.dot(x_centered, eigenvecs))

In [17]:
# Dot every column of U with column 2 of the centred data.
c = U.T.dot(x_centered[:, 2])
c.shape

(12,)