In [48]:
# Shiyi (Amy) Qin and Victor Z
# UW-Madison, 2024
# identify data redundancies in Gibbs using SVD

## Start here if working on google collab

---



In [49]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [50]:
# Specify the path of this file (This may differ for each user)
path = '/content/drive/MyDrive/StatsBookCode/Python/'

In [51]:
# This is to check if the speficied path above is correct
!echo $path
import os
os.path.isdir(path)

/content/drive/MyDrive/StatsBookCode/Python/


True

In [52]:
# This is needed so we can import packages from a different path than standard libraries
import sys
sys.path.append(path)

## Start from here if working on local computer

In [53]:
import numpy as np
from numpy.linalg import svd, matrix_rank
import pandas as pd

In [54]:
# Load the datasets
datat = np.loadtxt(path+'Data/gibbs_covariance.dat')
datan = np.loadtxt(path+'Data/gibbs_lowtemp_class.dat')
data = np.hstack((datat, datan[:, [0]]))

In [55]:
# Perturb data with noise
seed = 2024
np.random.seed(seed)  # For reproducibility
S, n = data.shape
data[:, 1] += np.random.normal(0, 0.05, S)

In [56]:
# Construct input-output matrices
Y = data[:, 1]
X = data[:, [0, 2, 3, 4, 5]]
X = np.hstack((X, np.ones((S, 1))))  # Add ones to introduce bias parameter

In [57]:
# Singular Value Decomposition
U, S_matrix, Vt = svd(X, full_matrices=False)
S_values = np.diag(S_matrix)

In [58]:
# Visualize first rows of X matrix
print("First 6 rows of X matrix:")
Xdf=pd.DataFrame(X)
print(Xdf.head(10))

First 6 rows of X matrix:
            0          1           2          3       4    5
0  166.977686  25.561388  451.122776  74.438612  583.15  1.0
1  170.710337  24.753476  449.506952  75.246524  583.15  1.0
2  141.174815  32.221642  464.443284  67.778358  583.15  1.0
3  146.628653  30.640396  461.280791  69.359604  583.15  1.0
4  164.665427  26.079611  452.159221  73.920389  583.15  1.0
5  157.433396  27.793351  455.586701  72.206649  583.15  1.0
6  151.623689  29.279087  458.558173  70.720913  583.15  1.0
7  147.274565  30.459805  460.919609  69.540195  583.15  1.0
8  137.144740  33.457668  466.915336  66.542332  583.15  1.0
9  189.358255  21.193520  442.387040  78.806480  583.15  1.0


In [59]:
# Eigenvalues of X'X
eigenvalues = np.linalg.eigvals(X.T @ X)
print("Eigenvalues of X'X:")
print(pd.DataFrame(eigenvalues))

Eigenvalues of X'X:
              0
0  1.454314e+08
1  2.362216e+05
2  1.973902e+03
3 -4.246330e-08
4  2.230817e-10
5 -4.350750e-12


In [60]:
# Compute rank as we add columns
print("Rank as we add columns:")
for i in range(1, X.shape[1] + 1):
    rank = matrix_rank(X[:, :i])
    print(f"Rank of first {i} columns: {rank}")

Rank as we add columns:
Rank of first 1 columns: 1
Rank of first 2 columns: 2
Rank of first 3 columns: 3
Rank of first 4 columns: 3
Rank of first 5 columns: 3
Rank of first 6 columns: 3


In [61]:
# Print SVD results
print("\nSingular values of X:")
print(pd.DataFrame(S_values))


Singular values of X:
              0           1          2             3             4  \
0  12059.492499    0.000000   0.000000  0.000000e+00  0.000000e+00   
1      0.000000  486.026352   0.000000  0.000000e+00  0.000000e+00   
2      0.000000    0.000000  44.428612  0.000000e+00  0.000000e+00   
3      0.000000    0.000000   0.000000  5.789335e-13  0.000000e+00   
4      0.000000    0.000000   0.000000  0.000000e+00  9.973563e-14   
5      0.000000    0.000000   0.000000  0.000000e+00  0.000000e+00   

              5  
0  0.000000e+00  
1  0.000000e+00  
2  0.000000e+00  
3  0.000000e+00  
4  0.000000e+00  
5  3.598564e-16  
