In [2]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo
from matplotlib.colors import ListedColormap



# Print NumPy floats with 5 digits of precision and without scientific notation
np.set_printoptions(precision=5, suppress=True)

# Seed NumPy's global random generator with the student number
np.random.seed(46387334)

In [3]:
# Helper functions
def sig_fig(X, sigfigs):
    """Round values to a given number of significant figures.

    Args:
        X: Number or NumPy array of numbers to round.
        sigfigs: Number of significant figures to keep.

    Returns:
        ``X`` rounded element-wise to ``sigfigs`` significant figures.
        Entries whose magnitude has no finite log10 (e.g. zero) are treated
        as having decimal exponent 0.
    """
    # Decimal exponent of each value. The masked log10 masks non-positive
    # magnitudes instead of producing -inf; masked entries are filled with 0.
    # The function form np.ma.filled is used (rather than the .filled method)
    # because it also accepts the plain scalar returned for scalar input.
    exp = np.floor(np.ma.filled(np.ma.log10(np.abs(X)), 0))
    # Scale to a mantissa, round it to sigfigs - 1 decimals, then scale back.
    return np.round(X * 10**-exp, sigfigs - 1) * 10**exp

def get_diff(a1, a2):
    """Print an element-by-element comparison of two 2-D arrays.

    The shapes of both arrays are printed first. Then, for every position of
    ``a1`` in row-major order, one line is printed with both values rounded
    to 3 decimals and their difference as a percentage of the ``a1`` value.

    Args:
        a1: 2-D array used as the reference; its shape drives the iteration.
        a2: Array read at the same ``[i, j]`` positions as ``a1``.

    Returns:
        None. All output goes to stdout.
    """
    print(f"{a1.shape=}")
    print(f"{a2.shape=}")

    n_rows, n_cols = a1.shape
    positions = ((r, c) for r in range(n_rows) for c in range(n_cols))
    for i, j in positions:
        reference = a1[i, j]
        candidate = a2[i, j]
        # Signed difference relative to the reference value, in percent.
        pct_error = (reference - candidate) * (100 / reference)
        print(f"[{i},{j}]: {round(reference, 3) :>5} vs {round(candidate, 3) :>7}  |  Error: {pct_error:.2f}%")

## 2.d)

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import mahalanobis
from ucimlrepo import fetch_ucirepo 

# Fetch dataset from https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records
heart_failure_clinical_records = fetch_ucirepo(id=519)

# Extract data (as pandas dataframes)
features_raw = heart_failure_clinical_records.data.features
y = heart_failure_clinical_records.data.targets

# Add the transformed ("normal"-form) columns on a NEW frame. Writing columns
# straight onto the frame held by the dataset object would mutate it in place,
# and pandas may emit SettingWithCopyWarning if that frame is itself a slice.
X = features_raw.assign(
    logcp=np.log(features_raw['creatinine_phosphokinase']),
    sqrtplat=np.sqrt(features_raw['platelets']),
    recipsc=1 / features_raw['serum_creatinine'],
)

# Restrict the working set to just the 5 columns of interest
cols_to_keep = ['logcp', 'ejection_fraction', 'sqrtplat', 'recipsc', 'serum_sodium']
X_trans      = X[cols_to_keep]

# Sample mean vector of the un-normalized selected columns
mean_vector = X_trans.mean(axis=0).values

# Sample covariance matrix (np.cov treats rows as variables, hence the transpose)
cov_matrix = np.cov(X_trans.T)

# Inverse covariance matrix, needed by the Mahalanobis distance
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Mahalanobis distance of every observation from the sample mean,
# sqrt((x - mean)^T S^-1 (x - mean)), using the SciPy routine imported at the
# top of this cell instead of re-implementing the formula by hand.
mahalanobis_distances = pd.Series(
    [mahalanobis(row, mean_vector, inv_cov_matrix) for row in X_trans.to_numpy()]
)

# The Series has a default 0..n-1 index, so idxmax() is also the row position
max_mahalanobis_index = mahalanobis_distances.idxmax()
max_mahalanobis_value = mahalanobis_distances.iloc[max_mahalanobis_index]

print(f"Point with the largest Mahalanobis distance: {max_mahalanobis_index}")
print(f"Largest Mahalanobis distance value: {max_mahalanobis_value}")
print(f"Point data:\n{X_trans.iloc[max_mahalanobis_index]}")


Point with the largest Mahalanobis distance: 199
Largest Mahalanobis distance value: 5.595096595740964
Point data:
logcp                  7.099202
ejection_fraction     35.000000
sqrtplat             513.184207
recipsc                0.555556
serum_sodium         113.000000
Name: 199, dtype: float64
