In [1]:
import numpy as np

In [2]:
# Load the .npz archive that the rest of the notebook reads from via `data[...]`.
archive_path = "../datasets/data/phate_from_trajectory/eb_velocity_v5.npz"
data = np.load(archive_path)

In [3]:
# List the names of the arrays stored in the archive.
print(list(data.files))

['pcs', 'phate', 'delta_embedding', 'color', 'ixs', 'sample_labels', 'pcs_delta']


In [4]:
# Shape of the `pcs` array (displayed as the cell's output).
np.shape(data["pcs"])

(16819, 100)

In [5]:
# Print every stored array's name followed by its shape, one entry per block.
for name in data.files:
    array_shape = data[name].shape
    print(f"{name}:\n{array_shape}\n")


pcs:
(16819, 100)

phate:
(16819, 2)

delta_embedding:
(16819, 2)

color:
(16819, 4)

ixs:
(2436,)

sample_labels:
(16819,)

pcs_delta:
(16819, 100)



In [6]:
# Count how many rows carry each distinct value of `sample_labels`.
unique_labels, label_counts = np.unique(data["sample_labels"], return_counts=True)
for position, label in enumerate(unique_labels):
    count = label_counts[position]
    print(f"sample_labels == {label}: {count} occurrences")

sample_labels == 0: 2381 occurrences
sample_labels == 1: 4163 occurrences
sample_labels == 2: 3278 occurrences
sample_labels == 3: 3665 occurrences
sample_labels == 4: 3332 occurrences


In [18]:
# Distinct values found in column index 3 of the `color` array.
color_col3 = data["color"][:, 3]
np.unique(color_col3)

array([1.])

In [7]:
# Full PC matrix and the sorted distinct sample labels; `labels` is reused by later cells.
pcs = data["pcs"]
labels = np.unique(data["sample_labels"])

# Scratch loop: selects the rows of `pcs` belonging to each label. Nothing is
# consumed inside the loop, so after it finishes `mask` and `pcs_label` hold
# the values for the last entry of `labels` only.
for label in labels:
    mask = label == data["sample_labels"]
    pcs_label = pcs[mask, :]


In [9]:
# Maximum number of rows written per label. 2381 equals the count reported for
# label 0 in the label-count output — presumably chosen so every label
# contributes the same number of points; confirm before changing.
max_points = 2381

# Read each array from the archive once instead of once per loop iteration.
sample_labels = data["sample_labels"]
pcs_all = data["pcs"]

for label in labels:
    mask = sample_labels == label
    pcs_label = pcs_all[mask]

    # Keep only the first `max_points` rows of this label. The previous
    # version looped over fixed-size parts but unconditionally broke after
    # part 0, so this slice is exactly what was being written.
    pcs_part = pcs_label[:max_points]

    np.savez(
        f"../datasets/data/phate_from_trajectory/pcs_label_{label}.npz",
        pcs=pcs_part
    )
    # Report the file name that is actually written (there is no "_part" suffix).
    print(f"Saved pcs_label_{label}.npz with shape {pcs_part.shape}")

Saved pcs_label_0_part0.npz with shape (2381, 100)
Saved pcs_label_1_part0.npz with shape (2381, 100)
Saved pcs_label_2_part0.npz with shape (2381, 100)
Saved pcs_label_3_part0.npz with shape (2381, 100)
Saved pcs_label_4_part0.npz with shape (2381, 100)


In [8]:
# Occurrence count of every distinct sample label.
unique, counts = np.unique(data["sample_labels"], return_counts=True)
# Values returned by np.unique are distinct, so pairing them through a dict
# keeps every (label, count) pair in the same order.
for u, c in dict(zip(unique, counts)).items():
    print(f"Label {u}: {c} occurrences")

Label 0: 2381 occurrences
Label 1: 4163 occurrences
Label 2: 3278 occurrences
Label 3: 3665 occurrences
Label 4: 3332 occurrences


In [9]:
# For each label, compute the variance of every PC column over that label's
# rows and print the average of those per-column variances.

# Read each array from the archive once instead of once per loop iteration.
sample_labels = data["sample_labels"]
pcs_all = data["pcs"]

for label in labels:
    mask = sample_labels == label
    pcs_label = pcs_all[mask]
    mean_var = pcs_label.var(axis=0).mean()
    print(f"Label {label}: moyenne de la variance = {mean_var}")

Label 0: moyenne de la variance = 1.0990648156274396
Label 1: moyenne de la variance = 1.6585613701494788
Label 2: moyenne de la variance = 2.1609540340355484
Label 3: moyenne de la variance = 2.296500269439102
Label 4: moyenne de la variance = 2.438311652709521


array([False, False, False, ...,  True,  True,  True])

In [14]:
# Shape of `pcs_label` as left behind by the most recently executed per-label
# loop (i.e. the rows of the last label iterated).
np.shape(pcs_label)

(3332, 100)

In [15]:
# Compute the variance of each dimension (column) of pcs_label, then display
# the per-dimension values and the mean of those variances.
variance_par_dimension = np.var(pcs_label, axis=0)
print("Variance par dimension:", variance_par_dimension)
print("Moyenne de la variance:", np.mean(variance_par_dimension))

Variance par dimension: [ 4.94270315 14.9477124  16.419531   19.01505997 17.32152064  9.32672897
  7.24633024  8.10152261  4.59857964  4.91223166  3.99693644  4.41762381
  5.3421524   2.90322612  3.22124863  3.0118993   3.88482429  2.91645313
  2.36945339  2.51130106  2.81989911  2.15644238  2.42801874  2.01170607
  2.29491397  2.32558521  2.31851378  2.0004661   1.7782521   1.80130878
  1.76150126  1.82643934  1.60224527  1.76641315  1.59942336  1.37881619
  1.64809591  1.51564144  1.54402286  1.33226739  1.42589092  1.26906627
  1.20043176  1.18349078  1.36955848  1.39637323  1.29501948  1.30514584
  1.33444977  1.08502988  1.27314908  1.21840839  1.1660824   1.2473889
  1.18289989  1.19727119  1.12752722  1.15200579  1.21572817  1.08629317
  1.05116902  1.11051648  1.10034682  1.13137511  1.04596907  1.03517596
  1.07784739  1.09267569  1.06768358  1.03062281  1.02634353  1.01236418
  1.0055407   1.04319897  0.9717423   1.05174445  1.00755557  0.99990477
  0.99411668  0.96403091  1.

In [17]:
# Per-dimension (column) MEAN of pcs_label, followed by the average of those means.
# This cell previously stored the means in `variance_par_dimension` and printed
# them under a "Variance" heading; the values are means (some are negative),
# so the variable and the labels now say so. `variance_par_dimension` is no
# longer overwritten here and keeps the actual variances.
moyenne_par_dimension = pcs_label.mean(axis=0)
print("Moyenne par dimension (pcs_label):", moyenne_par_dimension)
print("Moyenne des moyennes (pcs_label):", moyenne_par_dimension.mean())

Variance par dimension (pcs_label): [ 4.88027718e+01  5.88396905e+00  3.73949660e+00  2.12996718e-01
  1.03651328e+00 -1.68206147e+00  6.14391102e-01  1.17511860e+00
 -6.92460510e-01 -6.85646167e-01  2.44485801e-01  3.02721306e-01
 -3.57567451e-01 -5.30569333e-02 -2.18824235e-01 -2.56835498e-01
  3.45308845e-01 -1.55774805e-01  1.57029251e-01 -3.57196674e-01
  2.38776954e-01 -2.41587217e-01  2.98992366e-01  5.96597937e-02
  1.26210282e-01 -4.18937312e-02  4.37822295e-02  2.33623123e-01
  8.95077854e-02 -3.20914290e-02 -2.32093548e-01 -1.13296332e-01
  7.97524850e-02  2.10392999e-01 -3.97532024e-01  5.22482685e-02
  7.78688248e-02  2.03809418e-01 -1.96892021e-01 -8.72497283e-02
 -8.07810863e-02  1.20894501e-01  2.59552834e-02  2.43137859e-02
  9.94572165e-02  4.05094156e-02 -7.06282069e-02  7.19485878e-02
 -2.43957585e-02  4.40003486e-02 -9.45847961e-03  6.06910400e-02
  3.97546751e-02 -6.54957462e-02 -2.10480433e-02 -3.08541171e-02
 -5.19356382e-02  4.80695689e-02  1.06626238e-02 -3.62