In [1]:
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt

## Read

In [2]:
# Noisy datasets. glob() returns matches in arbitrary, OS-dependent order, so
# sort them: later cells index ndata_list[0] as "Dataset I" and label the result
# rows 'Dataset I'..'Dataset V' purely by position.
path = 'hw3-data/data*.csv'
file_list = sorted(glob(path))
ndata_list = [pd.read_csv(file) for file in file_list]
print(file_list)
print(len(ndata_list))

# Noiseless reference dataset.
path = 'hw3-data/iris.csv'
cdata = pd.read_csv(path)
print(cdata.shape)

['hw3-data\\dataI.csv', 'hw3-data\\dataII.csv', 'hw3-data\\dataIII.csv', 'hw3-data\\dataIV.csv', 'hw3-data\\dataV.csv']
5
(150, 4)


## Model

In [111]:
def pca(ndata, n_comp=2, cdata=None):
    """Reconstruct ``ndata`` from its first ``n_comp`` principal components.

    The mean and covariance that define the principal components are estimated
    from ``cdata`` (the noiseless data) when it is given, otherwise from
    ``ndata`` itself. ``ndata`` is then centred on that mean, projected onto
    the leading ``n_comp`` eigenvectors and mapped back to the original space.

    Parameters
    ----------
    ndata : array-like, shape (n_samples, n_features)
        Data to reconstruct (DataFrame or ndarray).
    n_comp : int, default 2
        Number of principal components to keep. ``0`` yields the mean row
        repeated ``n_samples`` times.
    cdata : array-like, shape (m_samples, n_features), optional
        Data used to estimate the mean and covariance instead of ``ndata``.

    Returns
    -------
    recon : ndarray, shape (n_samples, n_features)
        Reconstruction of ``ndata``.
    eigval : ndarray, shape (n_features,)
        Covariance eigenvalues in descending order.
    eigvec : ndarray, shape (n_features, n_features)
        Matching eigenvectors, one per column.
    """
    # Work on plain float arrays so the column-wise mean does not depend on how
    # a given pandas version reduces a DataFrame passed to np.mean.
    ndata = np.asarray(ndata, dtype=float)
    ref = ndata if cdata is None else np.asarray(cdata, dtype=float)

    # mean and covariance (np.cov centres the data itself)
    mean = ref.mean(axis=0)
    covmat = np.atleast_2d(np.cov(ref, rowvar=False))

    # The covariance matrix is symmetric: eigh guarantees real eigenvalues and
    # orthonormal eigenvectors, returned in ascending order, so reverse them.
    eigval, eigvec = np.linalg.eigh(covmat)
    eigval = eigval[::-1]
    eigvec = eigvec[:, ::-1]

    # Project onto the leading components and map back. With n_comp == 0 the
    # basis has no columns, the projection term is all zeros and the result is
    # the mean repeated for every row, for any number of features.
    basis = eigvec[:, 0:n_comp]
    recon = np.dot(np.dot(ndata - mean, basis), basis.T) + mean

    return recon, eigval, eigvec

In [113]:
# Mean squared reconstruction error of every noisy dataset, measured against
# the noiseless data, for 0..4 principal components.
errors = []
n_comp = 5
target = np.array(cdata)

# Pass 1 ('noiseless data', N columns): PCs come from the noiseless dataset.
# Pass 2 ('noisy data', c columns): PCs come from each noisy dataset itself.
for label, basis_data in (('noiseless data', cdata), ('noisy data', None)):
    print(label)
    for i in np.arange(n_comp):
        print(i)
        for ndata in ndata_list:
            xhat, eigval, eigvec = pca(ndata, i, basis_data)
            # squared distance per row, then averaged over rows
            sq_dist = ((xhat - target) ** 2).sum(axis=1)
            mse = sq_dist.mean()
            print(mse)
            errors.append(mse)

noiseless data
0
4.5424706666666665
4.5424706666666665
4.5424706666666665
4.5424706666666665
4.5424706666666665
1
0.641093184900985
1.290372450759801
0.7999427437338249
1.917767749946061
0.38345031150498454
2
0.7156284875049563
1.967240392379871
0.8280825547067429
3.3317221039403275
0.17556300024433893
3
0.9083929073982753
2.650841135132736
0.9849497682406673
4.548257197249823
0.14178364800457008
4
1.1156578578493088
3.6532797325111104
1.193999999999999
5.13926666666666
0.16083836180763808
noisy data
0
4.549538992715444
4.557472963930547
4.566198666666666
4.919927999999999
4.543119029074548
1
0.6486421084108522
1.3234621480418773
0.840614157257198
2.8356794280264324
0.38461353395761727
2
0.7506211289999836
2.1197480492819554
1.2070897968259071
4.651434502717087
0.17781528266962562
3
0.9419728192850566
3.0273799199753313
1.2711919671860714
4.971247271525594
0.14444050603137706
4
1.1156578578493082
3.653279732511109
1.1940000000000006
5.139266666666663
0.16083836180763805


## Save

In [114]:
# 1. table of numbers
# Columns pair a component count (0-4) with the source of the PCA statistics
# ('N' / 'c'); rows are the five noisy datasets.
col_labels = ['0N','1N','2N','3N','4N', '0c', '1c', '2c', '3c', '4c']
row_labels = ['Dataset I','Dataset II','Dataset III','Dataset IV','Dataset V']

# `errors` holds one run of per-dataset values for each column in turn, so
# shape it as (columns, datasets) and transpose. Using the label lengths makes
# a wrong number of entries raise instead of silently producing another shape.
table = np.array(errors).reshape(len(col_labels), len(row_labels)).T
result = pd.DataFrame(table, columns=col_labels, index=row_labels)

# index=False: the file has a header row but no row labels.
result.to_csv('yjkwon2-numbers.csv', header=True, index=False)
result

Unnamed: 0,0N,1N,2N,3N,4N,0c,1c,2c,3c,4c
Dataset I,4.542471,0.641093,0.715628,0.908393,1.115658,4.549539,0.648642,0.750621,0.941973,1.115658
Dataset II,4.542471,1.290372,1.96724,2.650841,3.65328,4.557473,1.323462,2.119748,3.02738,3.65328
Dataset III,4.542471,0.799943,0.828083,0.98495,1.194,4.566199,0.840614,1.20709,1.271192,1.194
Dataset IV,4.542471,1.917768,3.331722,4.548257,5.139267,4.919928,2.835679,4.651435,4.971247,5.139267
Dataset V,4.542471,0.38345,0.175563,0.141784,0.160838,4.543119,0.384614,0.177815,0.144441,0.160838


In [90]:
# reconstruction of Dataset I
# Uses 2 principal components estimated from the noisy dataset itself.
# Assumes ndata_list[0] is Dataset I, i.e. the file list is in dataI..dataV order.
recon, _, _ = pca(ndata_list[0], 2, None)
recon = pd.DataFrame(recon, columns=["Sepal.Length","Sepal.Width","Petal.Length","Petal.Width"])
# index=False: the file has a header row but no row labels.
recon.to_csv('yjkwon2-recon.csv', header=True, index=False)