 # Example File from our Supervisor

In [1]:
import numpy as np
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.utils import resample
from numpy.linalg import svd

In [2]:
data_dir = os.path.join("..", "..", "1_features_selec_project")
dortmund = pd.read_csv(os.path.join(data_dir, "p1_Dortmund.csv"), index_col=0)
lemon = pd.read_csv(os.path.join(data_dir, "p1_lemon.csv"), index_col=0)

In [3]:
def bootstrap_pca(X, n_boot, variance_threshold=0.99):
    n_samples, n_features = X.shape
    components_boot = []
    min_n_comp = n_features
    for i in range(n_boot):
        X_resampled = resample(X, random_state=i)
        pca = PCA(n_components=variance_threshold) 
        pca.fit(X_resampled)
        components_boot.append(pca.components_.T)
        min_n_comp = np.min((min_n_comp, len(pca.components_)))

    return components_boot, min_n_comp

def singular_values(mat1, mat2):
    M = mat1.T @ mat2
    decomp = svd(M)
    return decomp[1]
    
n_boot = 4 # For readability, set 100 at least for your analysis
dort_comp, min_dort = bootstrap_pca(dortmund, n_boot)
lem_comp, min_lem = bootstrap_pca(lemon, n_boot)
min_n_comp = np.min((min_dort, min_lem))
dort_comp = [boot[:, :min_n_comp] for boot in dort_comp]
lem_comp = [boot[:, :min_n_comp] for boot in lem_comp]

In [4]:
results = {}
results["within_Dortmund"] = {}
results["within_Lemon"] = {}
results["between_DL"] = {}

for i, boot1 in enumerate(dort_comp):
    for j, boot2 in enumerate(dort_comp):
        if j > i:
            results["within_Dortmund"][f"boots {i}-{j}"] = singular_values(boot1, boot2)

for i, boot1 in enumerate(lem_comp):
    for j, boot2 in enumerate(lem_comp):
        if j > i:
            results["within_Lemon"][f"boots {i}-{j}"] = singular_values(boot1, boot2)

for i, boot1 in enumerate(dort_comp):
    for j, boot2 in enumerate(lem_comp):
        if j>= i:
            results["between_DL"][f"boots {i}-{j}"] = singular_values(boot1, boot2)


In [5]:
results["within_Dortmund"]

{'boots 0-1': array([0.99999231, 0.99993843, 0.99992603, 0.99989584, 0.99932335,
        0.99880726, 0.99607862, 0.97753644, 0.86309289]),
 'boots 0-2': array([0.9999914 , 0.99995699, 0.99992196, 0.99978033, 0.99938864,
        0.99700079, 0.99342219, 0.96034346, 0.91547356]),
 'boots 0-3': array([0.99999308, 0.99996338, 0.99987577, 0.99936463, 0.99827607,
        0.9969368 , 0.99630074, 0.96692681, 0.95668119]),
 'boots 1-2': array([0.99999627, 0.99994819, 0.99984907, 0.99975486, 0.99927576,
        0.99879729, 0.99786016, 0.96954006, 0.94372354]),
 'boots 1-3': array([0.99999022, 0.99988573, 0.99981138, 0.99934315, 0.99908062,
        0.99856392, 0.99675905, 0.98776022, 0.84858314]),
 'boots 2-3': array([0.99998809, 0.99998046, 0.99987442, 0.99977644, 0.99903267,
        0.99847594, 0.99656191, 0.95835344, 0.92354473])}

In [6]:
results["within_Lemon"]

{'boots 0-1': array([0.99999032, 0.99987554, 0.99958905, 0.99917331, 0.99746035,
        0.99170117, 0.98928208, 0.98226301, 0.64207258]),
 'boots 0-2': array([0.99999054, 0.99986745, 0.99946937, 0.99939314, 0.99813286,
        0.99390618, 0.99218976, 0.98834164, 0.71098852]),
 'boots 0-3': array([0.99998535, 0.99992112, 0.99975063, 0.99922708, 0.99814137,
        0.99646292, 0.99144845, 0.96006584, 0.87487108]),
 'boots 1-2': array([0.99997104, 0.99978243, 0.99866608, 0.99818453, 0.99724103,
        0.99211795, 0.97817851, 0.9669869 , 0.80009055]),
 'boots 1-3': array([0.99998856, 0.99987836, 0.99966367, 0.99879613, 0.99770136,
        0.99185461, 0.98833115, 0.97392643, 0.76419225]),
 'boots 2-3': array([0.99997752, 0.99978164, 0.99944314, 0.99904732, 0.99659444,
        0.99481388, 0.99138033, 0.96204054, 0.81980345])}

In [7]:
results["between_DL"]

{'boots 0-0': array([2.07330722e-01, 1.12645771e-01, 7.27270412e-02, 2.17995003e-02,
        1.54029527e-02, 9.33240792e-03, 2.17160876e-03, 7.01426507e-04,
        1.00690870e-04]),
 'boots 0-1': array([2.22181972e-01, 1.18260666e-01, 6.17746276e-02, 1.71190281e-02,
        1.07935746e-02, 7.04156408e-03, 4.60384523e-03, 7.02183889e-04,
        1.98608793e-04]),
 'boots 0-2': array([2.11364648e-01, 1.26135701e-01, 6.47898838e-02, 1.80439771e-02,
        1.28421133e-02, 7.25083777e-03, 4.80788565e-03, 4.13835308e-04,
        4.68351211e-06]),
 'boots 0-3': array([0.20878962, 0.10642795, 0.05570053, 0.01477429, 0.01234392,
        0.00482361, 0.00173513, 0.00133967, 0.00022344]),
 'boots 1-1': array([2.56884246e-01, 1.24820365e-01, 4.60334913e-02, 2.31775152e-02,
        1.36167677e-02, 8.79363349e-03, 4.71037932e-03, 7.91855300e-04,
        7.32830002e-05]),
 'boots 1-2': array([2.38099796e-01, 1.35591044e-01, 4.43540323e-02, 2.03367039e-02,
        1.49904016e-02, 8.48706963e-03, 4.27

What could be expected is that for each boot x - boot y comparison, the alignement values tend to be high for the first PCs and less stable for the latest (but there also could be PCs explaining a lot of variance that are specific to a dataset, for example due to the recording conditions). The question is what is stable, as assessed by methods that make use of the variance and nice plotting of course.


If you have time, look at the stability of the raw features correlation matrix between datasets. Good luck!