In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from ucimlrepo import fetch_ucirepo


# Print arrays with 5 digits of precision and without scientific notation
np.set_printoptions(precision=5, suppress=True)

# Seed NumPy's global random number generator with the student number
np.random.seed(46387334)

In [3]:
# Helper functions
def sig_fig(X, sigfigs):
    """Round the values in ``X`` to ``sigfigs`` significant figures.

    Each value is scaled by its own power of ten so that it has a single
    digit before the decimal point, rounded to ``sigfigs - 1`` decimals and
    scaled back.

    Parameters
    ----------
    X : scalar or array-like of numbers
        Value(s) to round.
    sigfigs : int
        Number of significant figures to keep.

    Returns
    -------
    The rounded value(s), with the same shape as ``X``.

    Notes
    -----
    Entries whose base-10 exponent is undefined (zero, NaN, +/-inf) use an
    exponent of 0, i.e. they are simply rounded to ``sigfigs - 1`` decimals.
    """
    magnitude = np.abs(np.asarray(X, dtype=float))

    # log10(0) is -inf and log10(nan) is nan; silence the warnings here and
    # replace those exponents with 0 right below.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_magnitude = np.log10(magnitude)
    exp = np.where(np.isfinite(log_magnitude), np.floor(log_magnitude), 0.0)

    return np.round(X * 10**-exp, sigfigs - 1) * 10**exp

def get_diff(a1, a2):
    """Print the shapes of two 2-D arrays and compare them entry by entry.

    After the two shape lines, one line is printed for every position
    ``[i,j]`` of ``a1``: both entries rounded to 3 decimals, followed by the
    signed difference ``a1 - a2`` expressed as a percentage of the ``a1``
    entry.

    Parameters
    ----------
    a1 : 2-D array
        Reference values; its shape drives the iteration.
    a2 : 2-D array
        Values compared against ``a1`` (indexed at the same positions).
    """
    print(f"{a1.shape=}")
    print(f"{a2.shape=}")

    n_rows, n_cols = a1.shape
    positions = ((r, c) for r in range(n_rows) for c in range(n_cols))
    for r, c in positions:
        ref, other = a1[r, c], a2[r, c]
        # Relative error with respect to the reference entry, in percent
        pct_error = (ref - other) * (100 / ref)
        print(f"[{r},{c}]: {round(ref, 3) :>5} vs {round(other, 3) :>7}  |  Error: {pct_error:.2f}%")

## 2.a)

In [4]:
from ucimlrepo import fetch_ucirepo 

# Fetch dataset from https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records
heart_failure_clinical_records = fetch_ucirepo(id=519)

# Extract data (as pandas dataframes).
# The features are copied so that the derived columns below are added to our
# own frame instead of to the object held by the fetched dataset; this also
# keeps the cell safe to re-run.
X = heart_failure_clinical_records.data.features.copy()
y = heart_failure_clinical_records.data.targets

# Transform the needed columns into the "normal" forms
X['logcp']    = np.log(X['creatinine_phosphokinase'])   # natural log
X['sqrtplat'] = np.sqrt(X['platelets'])                 # square root
X['recipsc']  = 1 / X['serum_creatinine']               # reciprocal

# Redefine X to just be the 5 columns of interest
cols_to_keep = ['logcp', 'ejection_fraction', 'sqrtplat', 'recipsc', 'serum_sodium']
X_trans      = X[cols_to_keep]

# Standardize the selected features with scikit-learn's StandardScaler
scaler = StandardScaler()
X_norm = scaler.fit_transform(X_trans)

# Fit one PCA (all components kept) on each version of the data:
# first the transformed-but-unscaled features, then the standardized ones
pca_original   = PCA().fit(X_trans)
pca_normalized = PCA().fit(X_norm)

# Principal component direction vectors of each fit
principal_components_original   = pca_original.components_
principal_components_normalized = pca_normalized.components_

# Report the direction vectors: a header line followed by the matrix
print("Principal Components (Original Data):", principal_components_original, sep="\n")
print("\nPrincipal Components (Normalized Data):", principal_components_normalized, sep="\n")


Principal Components (Original Data):
[[ 0.00011  0.00903  0.99996  0.00021  0.00264]
 [-0.00669  0.99717 -0.0092   0.00373  0.07422]
 [ 0.0093  -0.07423 -0.00197  0.01673  0.99706]
 [ 0.99973  0.0073  -0.00016  0.02009 -0.00911]
 [-0.02022 -0.00263 -0.00014  0.99965 -0.01678]]

Principal Components (Normalized Data):
[[ 0.04381  0.48677  0.25249  0.57883  0.60195]
 [ 0.87622 -0.41037 -0.04939  0.24117  0.05689]
 [ 0.09024 -0.04778  0.95792 -0.16265 -0.21332]
 [ 0.46437  0.75945 -0.12267 -0.36123 -0.24913]
 [ 0.0809  -0.1249   0.03391 -0.67071  0.72584]]
