In [101]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo
from matplotlib.colors import ListedColormap

np.set_printoptions(precision=5, suppress=True)

# Set random seed to student number
np.random.seed(46387334) # TODO: UNCOMMENT

In [102]:
# Helper functions
def sig_fig(X, sigfigs):
    exp = np.floor(ma.log10(abs(X)).filled(0))
    return np.round(X*10**-exp, sigfigs-1) * 10**exp

def get_diff(a1, a2):
    print(f"{a1.shape=}")
    print(f"{a2.shape=}")

    rows, cols = a1.shape
    for i in range(rows):
        for j in range(cols):
            print(f"[{i},{j}]: {round(a1[i, j], 3) :>5} vs {round(a2[i, j], 3) :>7}  |  Error: {(a1[i, j] - a2[i, j]) * (100 / a1[i, j])  :.2f}%")

## 2.3)

In [103]:
from ucimlrepo import fetch_ucirepo 

# Fetch dataset from https://archive.ics.uci.edu/dataset/519/heart+failure+clinical+records
heart_failure_clinical_records = fetch_ucirepo(id=519) 
  
# Extract data (as pandas dataframes) 
X = heart_failure_clinical_records.data.features 
y = heart_failure_clinical_records.data.targets 

# Transform the needed columns into the "normal" forms
X['logcp']    = np.log(X['creatinine_phosphokinase'])
X['sqrtplat'] = np.sqrt(X['platelets'])
X['recipsc']  = 1 / X['serum_creatinine']

# Redfine X to just be the 5 columns of interest
cols_to_keep           = ['logcp', 'ejection_fraction', 'sqrtplat', 'recipsc', 'serum_sodium']
X_trans                = X[cols_to_keep]

# Normalize the dataset
scaler = StandardScaler()
X_norm = scaler.fit_transform(X_trans)
X_norm = pd.DataFrame(X_norm, columns=X_trans.columns)

# Preform PCA on the data set
pca = PCA()
X_pca = pca.fit_transform(X_norm)
X_pca = pd.DataFrame(X_pca)

### 3. Part 1 - Preform Hotelling’s T2 Test

In [None]:
X_norm['death_event'] = y
X_norm.to_csv('X_norm.csv', index=False)

X_trans['death_event'] = y
X_trans.to_csv('X_trans.csv', index=False)

X_pca['death_event'] = y
X_pca.to_csv('X_pca.csv', index=False)

In [104]:
from rpy2.robjects import pandas2ri
import rpy2.robjects as ro


def run_ht2(data_file):
    pandas2ri.activate()
    r_code = f"""
    library(Hotelling)

    data <- read.csv("{data_file}")

    # Separate the groups
    group0 <- data[data$death_event == 0, 1:5]
    group1 <- data[data$death_event == 1, 1:5]

    # Perform Hotelling's T2 test
    test_result <- hotelling.test(group0, group1)
    test_result
    """
    # Execute the R code
    result = ro.r(r_code)
    print(result)


# Nomormalised
print("\n\nNormalised")
run_ht2('X_norm.csv')

# Un nomormalised
print("\n\nUn Normalised")
run_ht2('X_trans.csv')

# Normalised followed by PCA
print("\n\nNormalised + PCA")
run_ht2('X_pca.csv')




Normalised
Test stat:  68.488 
Numerator df:  5 
Denominator df:  293 
P-value:  7.19e-12 



Un Normalised
Test stat:  68.488 
Numerator df:  5 
Denominator df:  293 
P-value:  7.19e-12 



Normalised + PCA
Test stat:  68.488 
Numerator df:  5 
Denominator df:  293 
P-value:  7.19e-12 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_trans['death_event'] = y


In [110]:
def run_t_tests(data_file):
    pandas2ri.activate()
    r_code = f"""
    library(Hotelling)

    # Load the data
    data <- read.csv("{data_file}")

    # Separate the groups
    group0 <- data[data$death_event == 0, 1:5]
    group1 <- data[data$death_event == 1, 1:5]

    # Initialize a list to store t-test results
    t_tests_results <- list()

    # Perform t-tests for each variable
    for (i in 1:5) {{
        t_test_result <- t.test(group0[[i]], group1[[i]])
        t_tests_results[[names(data)[i]]] <- t_test_result
    }}

    t_tests_results
    """
    # Execute the R code and return results as a Python dictionary
    result = ro.r(r_code)
    print(result)

# Nomormalised
print("\n\n=========>>>>>> Normalised")
run_t_tests('X_norm.csv')

# Un nomormalised
print("\n\n=========>>>>>> Un Normalised")
run_t_tests('X_trans.csv')

# Normalised followed by PCA
print("\n\n=========>>>>>> Normalised + PCA")
run_t_tests('X_pca.csv')



$logcp

	Welch Two Sample t-test

data:  group0[[i]] and group1[[i]]
t = -0.55058, df = 184.19, p-value = 0.5826
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.3150157  0.1775554
sample estimates:
  mean of x   mean of y 
-0.02206720  0.04666293 


$ejection_fraction

	Welch Two Sample t-test

data:  group0[[i]] and group1[[i]]
t = 4.567, df = 164.76, p-value = 9.647e-06
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.3265811 0.8240310
sample estimates:
 mean of x  mean of y 
 0.1847136 -0.3905924 


$sqrtplat

	Welch Two Sample t-test

data:  group0[[i]] and group1[[i]]
t = 0.99282, df = 171.15, p-value = 0.3222
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.1260366  0.3811196
sample estimates:
  mean of x   mean of y 
 0.04094977 -0.08659171 


$recipsc

	Welch Two Sample t-test

data:  group0[[i]] and group1[[i]]
t = 