# Import

In [254]:
import os
import re
import gc
import sys

from loguru import logger
import numpy as np
import random

import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection


from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA


from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import hdbscan

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# %matplotlib qt
%matplotlib qt

# Detect device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Input Layer

## Definition

In [2]:
READ_RAW_FLAG = False

## Load Data

In [3]:

if not READ_RAW_FLAG:
    # Should always be Seq because not all data is in manifold
    Data_Path = "D:/Baihm/EISNN/Feature/SEQData.npz"
    if os.path.exists(Data_Path):
        AllData = np.load(Data_Path)
        vitro0_data_list = AllData["vitro0_data_list"]
        vitro0_id_list = AllData["vitro0_id_list"]
        vitro0_start_list = AllData["vitro0_start_list"]
        vitro0_start_id_list = AllData["vitro0_start_id_list"]
        vitro0_ele_list = AllData["vitro0_ele_list"]
        
        vitro1_data_list = AllData["vitro1_data_list"]
        vitro1_id_list = AllData["vitro1_id_list"]
        vitro1_start_list = AllData["vitro1_start_list"]
        vitro1_start_id_list = AllData["vitro1_start_id_list"]
        vitro1_ele_list = AllData["vitro1_ele_list"]

        
        vivo0_data_list = AllData["vivo0_data_list"]
        vivo0_id_list = AllData["vivo0_id_list"]
        vivo0_start_list = AllData["vivo0_start_list"]
        vivo0_start_id_list = AllData["vivo0_start_id_list"]
        vivo0_ele_list = AllData["vivo0_ele_list"]

        logger.info(f"Vitro0:\t{vitro0_data_list.shape}\t{vitro0_start_list.shape}")
        logger.info(f"vitro1:\t{vitro1_data_list.shape}\t{vitro1_start_list.shape}")
        logger.info(f"Vivo0:\t{vivo0_data_list.shape}\t{vivo0_start_list.shape}")
        
    else:
        logger.warning(f"{Data_Path} does not exist")


    # Calibrate ID List for concated list

    vitro0_id_list[:,0] = vitro0_id_list[:,0]   + 0
    vitro1_id_list[:,0] = vitro1_id_list[:,0]   + vitro0_ele_list.shape[0]
    vivo0_id_list[:,0]  = vivo0_id_list[:,0]    + vitro0_ele_list.shape[0] + vitro1_ele_list.shape[0]

    vitro0_start_id_list[:,0] = vitro0_start_id_list[:,0]   + 0
    vitro1_start_id_list[:,0] = vitro1_start_id_list[:,0]   + vitro0_ele_list.shape[0]
    vivo0_start_id_list[:,0]  = vivo0_start_id_list[:,0]    + vitro0_ele_list.shape[0] + vitro1_ele_list.shape[0]

[32m2025-05-21 10:22:34.271[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1mVitro0:	(98690, 202)	(12170, 202)[0m
[32m2025-05-21 10:22:34.272[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m26[0m - [1mvitro1:	(81674, 202)	(9708, 202)[0m
[32m2025-05-21 10:22:34.272[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m27[0m - [1mVivo0:	(9406, 202)	(719, 202)[0m


## All Data

In [4]:
all_ele_list = np.concatenate([vitro0_ele_list,vitro1_ele_list,vivo0_ele_list], axis=0)

all_data_list = np.vstack((vitro0_data_list, vitro1_data_list, vivo0_data_list))
all_id_list = np.vstack((vitro0_id_list, vitro1_id_list, vivo0_id_list))
all_start_list = np.vstack((vitro0_start_list, vitro1_start_list, vivo0_start_list))
all_start_id_list = np.vstack((vitro0_start_id_list, vitro1_start_id_list, vivo0_start_id_list))
logger.info(f"All:\t{all_data_list.shape}\t{all_start_list.shape}")



[32m2025-05-21 10:22:34.335[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mAll:	(189770, 202)	(22597, 202)[0m


In [5]:
print(np.unique(vitro0_id_list[:,0]).shape, np.unique(vitro1_id_list[:,0]).shape, np.unique(vivo0_id_list[:,0]).shape, np.unique(all_id_list[:,0]).shape)
print(vitro0_ele_list.shape,vitro1_ele_list.shape,vivo0_ele_list.shape,all_ele_list.shape)

(153,) (128,) (6,) (287,)
(218,) (187,) (6,) (411,)


# VAE

## Data Loader

In [891]:
# Helper
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def load_all2ch(data_list, id_list = None):
    '''==================================================
        Load all data and split into 2 channels
        Parameter: 
            data_list: data list    n x 202
            id_list: id list        n x 2
        Returen:
            ch_data_list: channel data list     n x 101 x 2
            ch_id_list: channel id list         n x 2
        ==================================================
    '''
    ch_data_list = np.array([data_list[:,:101],data_list[:,101:]])
    ch_data_list = ch_data_list.transpose(1,2,0)

    ch_id_list = id_list

    return ch_data_list, ch_id_list

## Model Define

In [892]:
class EISDataset_Manifold(Dataset):
    def __init__(self, data_list, id_list = None):
        # data_list: n x m x k x l x 2 list
        # n: number of electrodes
        # m: number of channels
        # k: number of timestamps
        # l: number of freq as dimensions
        # 2: real and imaginary parts after logrithm
        _data = data_list
        _id = id_list
        _data = [torch.tensor(x, dtype=torch.float32) for x in _data]

        self.data = _data
        self.id = _id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Return [2,101] for Conv1D
        return self.data[idx].permute(1,0)  # [2,101] [in_ch, in_dim]

class Curve2VecEncoder_Ver01(nn.Module):
    def __init__(self, in_ch, in_dim, hid_ch, 
                 z_dim, kernel_size):
        super().__init__()


        _layers = []

        pre_ch = in_ch
        poi_ch = hid_ch
        _layers.append(nn.Conv1d(pre_ch, poi_ch, kernel_size=kernel_size))
        _layers.append(nn.ReLU())
        # _layers.append(nn.BatchNorm1d(poi_ch))
        
        pre_ch = poi_ch
        poi_ch = poi_ch * 2
        _layers.append(nn.Conv1d(pre_ch, poi_ch, kernel_size=kernel_size))
        _layers.append(nn.ReLU())
        # _layers.append(nn.BatchNorm1d(poi_ch))
        
        pre_ch = poi_ch
        poi_ch = poi_ch * 2
        _layers.append(nn.Conv1d(pre_ch, poi_ch, kernel_size=kernel_size))
        _layers.append(nn.ReLU())
        # _layers.append(nn.BatchNorm1d(poi_ch))


        self.conv = nn.Sequential(*_layers)
        self.pool = nn.AdaptiveAvgPool1d(1)


        self.fc_mu = nn.Linear(poi_ch, z_dim)
        self.fc_lv = nn.Linear(poi_ch, z_dim)


    def forward(self, x):
        h = self.conv(x)                # [B,ch,in_dim]
        h = self.pool(h).squeeze(-1)    # [B,ch]
        return self.fc_mu(h), self.fc_lv(h) 


class Curve2VecDecoder_Ver01(nn.Module):
    def __init__(self, out_ch, out_dim, hid_ch, 
                 z_dim, kernel_size):
        super().__init__()
        self.hid_ch = hid_ch
        self.out_dim = out_dim


        self.fc_expand = nn.Linear(z_dim, hid_ch * out_dim)


        _layers = []
        _layers.append(nn.ReLU())

        pre_ch = hid_ch
        poi_ch = hid_ch//2
        _layers.append(nn.ConvTranspose1d(pre_ch, poi_ch, kernel_size=kernel_size, padding=kernel_size//2))
        _layers.append(nn.ReLU())
        # _layers.append(nn.BatchNorm1d(poi_ch))
        
        # pre_ch = poi_ch
        # poi_ch = poi_ch//2
        # _layers.append(nn.ConvTranspose1d(pre_ch, poi_ch, kernel_size=kernel_size, padding=kernel_size//2))
        # _layers.append(nn.ReLU())
        # # _layers.append(nn.BatchNorm1d(poi_ch))

        pre_ch = poi_ch
        poi_ch = out_ch
        _layers.append(nn.Conv1d(pre_ch, poi_ch, kernel_size=kernel_size, padding=kernel_size//2))


        # pre_ch = hid_ch
        # poi_ch = out_ch
        # _layers.append(nn.Conv1d(pre_ch, poi_ch, kernel_size=kernel_size, padding=kernel_size//2))


        
        self.deconv = nn.Sequential(*_layers)


    def forward(self, z):
        h = self.fc_expand(z)           # [B,in_ch*in_dim]
        h = h.view(-1, self.hid_ch, self.out_dim)
        h = self.deconv(h)               # [B,in_ch,in_dim]
        return h                        # [B,in_ch,in_dim]

class Curve2VecVAE_Ver01(nn.Module):
    def __init__(self, in_ch=2, in_dim=101, 
                 enc_hid_ch = 16,
                 dec_hid_ch = 16,
                 z_dim = 16, kernel_size = 13):
        super().__init__()
        self.encoder = Curve2VecEncoder_Ver01(in_ch, in_dim, enc_hid_ch, z_dim, kernel_size)
        self.decoder = Curve2VecDecoder_Ver01(in_ch, in_dim, dec_hid_ch, z_dim, kernel_size)

    def reparam(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        mu, lv = self.encoder(x)
        z = self.reparam(mu, lv)
        x_rec = self.decoder(z)
        return x_rec, mu, lv 



## Load Model

In [893]:
eis2vec_save_path = "D:/Baihm/EISNN/Feature/SeqData_Convx2_z_ConvTx1_Convx1.pt"
vae_model_dick = torch.load(eis2vec_save_path)
vae_model = Curve2VecVAE_Ver01().to(device)
vae_model.load_state_dict(vae_model_dick)
vae_model.eval()


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



Curve2VecVAE_Ver01(
  (encoder): Curve2VecEncoder_Ver01(
    (conv): Sequential(
      (0): Conv1d(2, 16, kernel_size=(13,), stride=(1,))
      (1): ReLU()
      (2): Conv1d(16, 32, kernel_size=(13,), stride=(1,))
      (3): ReLU()
      (4): Conv1d(32, 64, kernel_size=(13,), stride=(1,))
      (5): ReLU()
    )
    (pool): AdaptiveAvgPool1d(output_size=1)
    (fc_mu): Linear(in_features=64, out_features=16, bias=True)
    (fc_lv): Linear(in_features=64, out_features=16, bias=True)
  )
  (decoder): Curve2VecDecoder_Ver01(
    (fc_expand): Linear(in_features=16, out_features=1616, bias=True)
    (deconv): Sequential(
      (0): ReLU()
      (1): ConvTranspose1d(16, 8, kernel_size=(13,), stride=(1,), padding=(6,))
      (2): ReLU()
      (3): Conv1d(8, 2, kernel_size=(13,), stride=(1,), padding=(6,))
    )
  )
)

# Manifold

## Dimensionallity Reduction

### Definition

In [894]:
def VAE_latent(model, ds, batch_size=64):
    loader = DataLoader(ds, batch_size=batch_size, shuffle=False)

    _len_data = ds.__len__()
    _poi = 0

    latent_space_inst = []

    model.eval()
    with torch.no_grad():
        for x in loader:
            x = x.to(device)
            mu, lv = model.encoder(x)
            latent_space_inst.append(mu.cpu().numpy())

            _poi = _poi + x.size(0)
            if _poi % 1000 == 0:
                logger.info(f"[{_poi}]/[{_len_data}]")

    latent_space_inst = np.concatenate(latent_space_inst, axis=0)  # [B,z_dim]

    return latent_space_inst

def VAE_PCA_Plot(_pca_inst, latent_dd, alpha = 0.5, s = 0.001):
    explained = _pca_inst.explained_variance_ratio_
    eff_dim = (explained.cumsum() < 0.90).sum() + 1


    fig, axis = plt.subplots(2,1,
                gridspec_kw={'height_ratios': [4,1]},
                figsize=(9, 9))
    axis[0].scatter(latent_dd[:, 0], latent_dd[:, 1], alpha=alpha, s = s)

    # axis[0].set_xlim(-3, 2)
    # axis[0].set_ylim(-3, 3)
    axis[0].set_aspect('equal', adjustable='box')
    # axis[0].set_box_aspect(1)
    axis[0].set_title("Latent Space")

    axis[1].plot(_pca_inst.explained_variance_ratio_,
                label = f"Valid Dimension = {eff_dim}")
    axis[1].legend()
    fig.show()

    return latent_dd

def PCA_eigen(_pca_inst):
    # 获取每个主成分的解释方差比（即贡献率）
    explained_var = _pca_inst.explained_variance_ratio_
    components = np.arange(1, len(explained_var) + 1)

    # 绘图
    plt.figure(figsize=(6, 6))
    bars = plt.bar(components, explained_var, color='skyblue')

    # 在每个柱子上标注数值（百分比形式）
    for bar, var in zip(bars, explained_var):
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.005, f'{var*100:.2f}%', 
                ha='center', va='bottom', fontsize=10, rotation=45)

    plt.ylim([0,0.35])
    plt.xticks(components)
    plt.xlabel("Principal Component")
    plt.ylabel("Explained Variance Ratio")
    plt.title("PCA Explained Variance per Component")
    plt.grid(axis='y', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

### Run PCA

In [895]:
seg0 = vitro0_ele_list.shape[0]
seg1 = vitro0_ele_list.shape[0] + vitro1_ele_list.shape[0]


# Single Dataset
# ch_data_list, ch_id_list  = load_all2ch(all_data_list[all_id_list[:,0]<seg0], all_id_list[all_id_list[:,0]<seg0])
# ch_data_list, ch_id_list  = load_all2ch(all_data_list[(all_id_list[:,0]>=seg0) & (all_id_list[:,0]<seg1)], all_id_list[(all_id_list[:,0]>=seg0) & (all_id_list[:,0]<seg1)])
# ch_data_list, ch_id_list  = load_all2ch(all_data_list[all_id_list[:,0]>=seg1], all_id_list[all_id_list[:,0]>=seg1])

# Couple Dataset
# ch_data_list, ch_id_list  = load_all2ch(all_data_list[all_id_list[:,0]<seg1], all_id_list[all_id_list[:,0]<seg1])
ch_data_list, ch_id_list  = load_all2ch(all_data_list[:], all_id_list[:])

all_data_ds = EISDataset_Manifold(ch_data_list)
latent_space_inst = VAE_latent(vae_model, all_data_ds, batch_size=64)




[32m2025-05-23 04:39:31.331[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[8000]/[189770][0m
[32m2025-05-23 04:39:31.482[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[16000]/[189770][0m
[32m2025-05-23 04:39:31.531[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[24000]/[189770][0m
[32m2025-05-23 04:39:31.578[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[32000]/[189770][0m
[32m2025-05-23 04:39:31.628[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[40000]/[189770][0m
[32m2025-05-23 04:39:31.677[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[48000]/[189770][0m
[32m2025-05-23 04:39:31.726[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[56000]/[189770][0m
[32m2025-05-23 04:39:31.775[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_lat

In [924]:
_pca_inst = PCA(n_components=latent_space_inst.shape[1],random_state=42)
latent_dd = _pca_inst.fit_transform(latent_space_inst)

### Plot PCA Variation

In [898]:

PCA_eigen(_pca_inst)

In [925]:
plt.figure()
plt.scatter(latent_dd[:,0],latent_dd[:,1], s=0.01)

<matplotlib.collections.PathCollection at 0x230cdd8e890>

## Plot Manifold

In [926]:
FLAGIJ = 23
DATA_mask_0 = all_id_list[:,0]<seg1
latent_dd = _pca_inst.transform(latent_space_inst[DATA_mask_0])
DATA_mask_1 = latent_dd[:,0]>-0.5
latent_dd = latent_dd[DATA_mask_1]

if FLAGIJ == 13:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,0]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 
elif FLAGIJ == 23:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,1]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 


latent_dd = VAE_PCA_Plot(_pca_inst, latent_dd, alpha = 0.5, s = 0.1)


In [None]:

SAVE_FLAG = False
if SAVE_FLAG:
    manifold_fig_save_path = "D:/Baihm/EISNN/Feature/Manifold23"
    if not os.path.exists(manifold_fig_save_path):
        os.makedirs(manifold_fig_save_path)


_poi_id_list= ch_id_list[DATA_mask_0,:]
_poi_id_list= _poi_id_list[DATA_mask_1,:]

uq_id_list = np.unique(_poi_id_list[:,0])
uq_id_max = np.max(uq_id_list)

cmap = plt.colormaps.get_cmap("rainbow_r")

for i in range(len(uq_id_list)):
# for i in range(0,2):
    # if uq_id_list[i] not in white_id_list:
    #     continue
    fig, axis = plt.subplots(1,1, figsize = (9,9))
    axis.scatter(latent_dd[:,0],latent_dd[:,1], color = 'lightgray', s=0.05)
    # plt.scatter(_pca_start[:,0],_pca_start[:,1],s=0.1)


    _ele_id = uq_id_list[i]

    ele_mask = _poi_id_list[:,0] == _ele_id
    _ch_list = np.unique(_poi_id_list[ele_mask,1])
    # for j in _ch_list:
    for j in _ch_list:
        _ch_mask = _poi_id_list[:,:2] == [_ele_id,j]
        _ch_mask = _ch_mask[:,0] & _ch_mask[:,1]


        # _ch_data = latent_dd[_ch_mask,:2]
        # _c = cmap(_ele_id / uq_id_max)
        # axis.plot(_ch_data[:,0],_ch_data[:,1], color = _c, alpha = 0.5)

        _cluster_list = np.unique(_poi_id_list[_ch_mask,2])

        _seq_all_len = _poi_id_list[_ch_mask,2].shape[0]
        _seg_poi = 0

        for k in _cluster_list:
            _cluster_mask = _poi_id_list[:,:3] == [_ele_id,j,k]
            _cluster_mask = _cluster_mask[:,0] & _cluster_mask[:,1] & _cluster_mask[:,2]
            # _cluster_data = latent_dd[_cluster_mask,:2]
            _cluster_data = np.stack([latent_dd[_cluster_mask,0],latent_dd[_cluster_mask,1]], axis=1)

            _seg_data = _cluster_data.reshape(-1,1,2)
            _seg_data = np.concatenate([_seg_data[:-1], _seg_data[1:]], axis=1)

            _dx = np.abs(_seg_data[:,1,0] - _seg_data[:,0,0])
            _seg_data = _seg_data[_dx < 1,:,:]

            _seg_len = _cluster_data.shape[0]
            
            color_range = np.linspace(_seg_poi/_seq_all_len, (_seg_poi+_seg_len)/_seq_all_len, _seg_data.shape[0])
            colors = cmap(color_range)

            _seg_poi = _seg_poi+_seg_len
            lc = LineCollection(_seg_data, colors=colors, linewidth=2)
            axis.add_collection(lc)

    
    if FLAGIJ == 12:
        axis.set_xlim(-2, 3)
        axis.set_ylim(-3, 3)
    else:
        axis.set_xlim(-2, 3)
        axis.set_ylim(-3, 5)
    axis.set_aspect('equal', adjustable='box')
    axis.set_title(f"{all_ele_list[int(_ele_id)]}_Manifold")

    if SAVE_FLAG:
        _fig_name = f"{all_ele_list[int(_ele_id)]}_Manifold.png"
        _fig_save_path = os.path.join(manifold_fig_save_path, _fig_name)

        fig.savefig(_fig_save_path)
        plt.close(fig) 

        logger.info(f"{i}/{len(uq_id_list)} Saved")
    else:
        fig.show()



[32m2025-05-20 13:47:08.008[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m82[0m - [1m0/273 Saved[0m
[32m2025-05-20 13:47:08.508[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m82[0m - [1m1/273 Saved[0m
[32m2025-05-20 13:47:08.954[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m82[0m - [1m2/273 Saved[0m
[32m2025-05-20 13:47:09.257[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m82[0m - [1m3/273 Saved[0m
[32m2025-05-20 13:47:09.569[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m82[0m - [1m4/273 Saved[0m
[32m2025-05-20 13:47:09.982[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m82[0m - [1m5/273 Saved[0m
[32m2025-05-20 13:47:10.208[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m82[0m - [1m6/273 Saved[0m
[32m2025-05-20 13:47:10.306[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m82[0m - [1m7/273 Saved[0m
[32m202

## Plot all manifold

### i vs j

In [None]:


DATA_mask_0 = all_id_list[:,0]<seg1
latent_dd = _pca_inst.transform(latent_space_inst[DATA_mask_0])


clusterer = hdbscan.HDBSCAN(min_cluster_size=300).fit(latent_dd[:,:])
labels = clusterer.labels_
DATA_mask_1 = labels == 0
# DATA_mask_1 = latent_dd[:,0]>-0.5

latent_dd = latent_dd[DATA_mask_1]

FLAGIJ = 23
if FLAGIJ == 13:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,0]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 
elif FLAGIJ == 23:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,1]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 


latent_dd = VAE_PCA_Plot(_pca_inst, latent_dd, alpha = 0.5, s = 0.1)



In [682]:

fig, axis = plt.subplots(1,1, figsize = (9,9))
axis.scatter(latent_dd[:,0],latent_dd[:,1], color = 'lightgray', s=0.05)
# plt.scatter(_pca_start[:,0],_pca_start[:,1],s=0.1)


_poi_id_list= ch_id_list[DATA_mask_0,:]
_poi_id_list= _poi_id_list[DATA_mask_1,:]

uq_id_list = np.unique(_poi_id_list[:,0])
uq_id_max = np.max(uq_id_list)


cmap = plt.colormaps.get_cmap("rainbow_r")

for i in range(len(uq_id_list)):
# for i in range(0,6):
    _ele_id = uq_id_list[i]

    # if _ele_id >= seg0: break
    # if  _ele_id < seg0 or _ele_id >= seg1: continue
    # if _ele_id < seg1: continue

    # if _ele_id >= seg1: break
    

    ele_mask = _poi_id_list[:,0] == _ele_id
    _ch_list = np.unique(_poi_id_list[ele_mask,1])


    for j in _ch_list:
        _ch_mask = _poi_id_list[:,:2] == [_ele_id,j]
        _ch_mask = _ch_mask[:,0] & _ch_mask[:,1]
        # _ch_data = latent_dd[_ch_mask,:2]

        # _c = cmap(_ele_id / uq_id_max)
        # axis.plot(_ch_data[:,0],_ch_data[:,1], color = _c, alpha = 0.5)

        _cluster_list = np.unique(_poi_id_list[_ch_mask,2])

        _seq_all_len = _poi_id_list[_ch_mask,2].shape[0]
        _seg_poi = 0

        for k in _cluster_list:
            _cluster_mask = _poi_id_list[:,:3] == [_ele_id,j,k]
            _cluster_mask = _cluster_mask[:,0] & _cluster_mask[:,1] & _cluster_mask[:,2]
            # _cluster_data = latent_dd[_cluster_mask,:2]
            _cluster_data = np.stack([latent_dd[_cluster_mask,0],latent_dd[_cluster_mask,1]], axis=1)

            _seg_data = _cluster_data.reshape(-1,1,2)
            _seg_data = np.concatenate([_seg_data[:-1], _seg_data[1:]], axis=1)

            _dx = np.abs(_seg_data[:,1,0] - _seg_data[:,0,0])
            _seg_data = _seg_data[_dx < 1,:,:]

            _seg_len = _cluster_data.shape[0]
            
            color_range = np.linspace(_seg_poi/_seq_all_len, (_seg_poi+_seg_len)/_seq_all_len, _seg_data.shape[0])
            colors = cmap(color_range)

            _seg_poi = _seg_poi+_seg_len
            lc = LineCollection(_seg_data, colors=colors, linewidth=1, alpha = 0.05)
            axis.add_collection(lc)

if FLAGIJ == 12:
    axis.set_xlim(-2, 3)
    axis.set_ylim(-3, 3)
else:
    axis.set_xlim(-2, 3)
    axis.set_ylim(-3, 3)

axis.set_aspect('equal', adjustable='box')
# axis[0].set_box_aspect(1)
axis.set_title("Latent Space")

fig.show()



### Lasso Plot

In [None]:
FLAGIJ = 23
DATA_mask_0 = all_id_list[:,0]<seg1
latent_dd = _pca_inst.transform(latent_space_inst[DATA_mask_0])
DATA_mask_1 = latent_dd[:,0]>-0.5
latent_dd = latent_dd[DATA_mask_1]

if FLAGIJ == 13:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,0]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 
elif FLAGIJ == 23:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,1]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 


latent_dd = VAE_PCA_Plot(_pca_inst, latent_dd, alpha = 0.5, s = 0.1)


In [398]:

from matplotlib.widgets import LassoSelector
from matplotlib.path import Path
from matplotlib.colors import ListedColormap

# === 数据准备 ===
lasso_labels = np.full(latent_dd.shape[0], -1)
current_label = 0
label_history = []




In [404]:
lasso_data = np.stack([latent_dd[:,0],latent_dd[:,1]], axis=1)

# === 可调色板（支持最多10类）===
color_list = ['lightgray', 'red', 'blue', 'green', 'orange', 'purple', 'cyan', 'magenta', 'brown', 'yellow']
cmap = ListedColormap(color_list)
# cmap = plt.colormaps.get_cmap('tab20c_r')

# === 参数状态 ===
mode = "new"  # 默认模式：new/add/erase
add_target = 0  # 添加模式时指定的目标簇编号

# === 画图 ===
fig, ax = plt.subplots(figsize = (9,9))
pts = ax.scatter(lasso_data[:, 0], lasso_data[:, 1], c='lightgray', s=0.01)
plt.title("Lasso Cluster")

def update_colors():
    color_indices = np.where(lasso_labels == -1, 0, lasso_labels%8 + 1)
    pts.set_facecolor(cmap(color_indices))
    fig.canvas.draw_idle()
    print(np.unique(lasso_labels))

def on_select(verts):
    global current_label, lasso_labels, label_history
    path = Path(verts)
    ind = np.nonzero(path.contains_points(lasso_data))[0]
    
    # 保存当前状态以供撤销
    label_history.append(lasso_labels.copy())

    # 根据模式操作标签
    if mode == "new":
        lasso_labels[ind] = current_label
        # print(f"→ 新建簇 {current_label}, 包含 {len(ind)} 个点")
        current_label += 1
    elif mode == "add":
        lasso_labels[ind] = add_target
        # print(f"→ 添加到簇 {add_target}, 包含 {len(ind)} 个点")
    elif mode == "erase":
        lasso_labels[ind] = -1
        # print(f"→ 反选，{len(ind)} 个点被置为未分簇")

    update_colors()
update_colors()

# === Lasso 绑定 ===
lasso = LassoSelector(ax, on_select)

# === 按键绑定 ===
def on_key(event):
    global mode, add_target, current_label, lasso_labels
    if event.key == 'n':
        mode = 'new'
    elif event.key == 'a':
        mode = 'add'
    elif event.key == 'e':
        mode = 'erase'
    elif event.key == 'z':
        if label_history:
            lasso_labels[:] = label_history.pop()
            update_colors()
        else:
            pass
    elif mode == 'add' and event.key.isdigit():
        add_target = int(event.key)



fig.canvas.mpl_connect('key_press_event', on_key)

plt.show()

# print(np.unique(lasso_labels))


[-1  0  1  3  4  5  6  7]


In [405]:

_poi_data_list= all_data_list[DATA_mask_0,:]
_poi_data_list= _poi_data_list[DATA_mask_1,:]
_poi_data_list.shape


_n = int(len(np.unique(lasso_labels)) / 5) + 1



fig, axis = plt.subplots(_n,5)
for i in range(0,len(np.unique(lasso_labels))):
    _id = np.unique(lasso_labels)[i]
    _data_mask = _poi_data_list[lasso_labels == _id,:]
    for j in range(_data_mask.shape[0]):
        if j > 500: break
        if _n == 1:
            axis[int(i%5)].semilogy(np.exp(_data_mask[j,:101]), color = cmap(_id%8+1), alpha = 0.1) 
        else:
            axis[int(i/5),int(i%5)].semilogy(np.exp(_data_mask[j,:101]), color = cmap(_id%8+1), alpha = 0.1) 
    
    if _n == 1:
        axis[int(i%5)].sharex(axis[0])
        axis[int(i%5)].sharey(axis[0])

    else:
        axis[int(i/5),int(i%5)].sharex(axis[0,0])
        axis[int(i/5),int(i%5)].sharey(axis[0,0])

fig.show()


## Plot Start

In [847]:
ch_start_list, ch_start_id_list  = load_all2ch(all_start_list[:], all_start_id_list[:])
all_start_ds = EISDataset_Manifold(ch_start_list)
latent_space_start_inst = VAE_latent(vae_model, all_start_ds, batch_size=64)


[32m2025-05-23 01:46:33.028[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[8000]/[22597][0m
[32m2025-05-23 01:46:33.138[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[16000]/[22597][0m


### 2D - 1 vs 2

In [848]:
latent_dd = _pca_inst.transform(latent_space_inst[:])

# latent_dd_tmp = latent_dd.copy()
# latent_dd_tmp[:,1] = latent_dd[:,2]
# latent_dd = latent_dd_tmp 



latent_start_dd = _pca_inst.transform(latent_space_start_inst[all_start_id_list[:,0]<seg0])
# latent_start_dd = _pca_inst.transform(latent_space_start_inst[(all_start_id_list[:,0]>=seg0) & (all_start_id_list[:,0]<seg1)])
# latent_start_dd = _pca_inst.transform(latent_space_start_inst[all_start_id_list[:,0]>=seg1])

# latent_start_dd = _pca_inst.transform(latent_space_start_inst[all_start_id_list[:,0]<seg1])
# latent_start_dd = _pca_inst.transform(latent_space_start_inst[:])


# latent_start_dd_tmp = latent_start_dd.copy()
# latent_start_dd_tmp[:,1] = latent_start_dd[:,2]
# latent_start_dd = latent_start_dd_tmp 



In [850]:


fig, axis = plt.subplots(1,1, figsize = (9,9))
axis.scatter(latent_dd[:,0],latent_dd[:,1], color = 'lightgray',s=0.005)
axis.scatter(latent_start_dd[:,0],latent_start_dd[:,1], color = 'red',s=0.1)

axis.set_xlim(-2, 3)
axis.set_ylim(-3, 3)
axis.set_aspect('equal', adjustable='box')
# axis[0].set_box_aspect(1)
axis.set_title("Latent Space")

fig.show()



### 2D - 2 vs 3

In [293]:
latent_dd = _pca_inst.transform(latent_space_inst[:])

latent_mask = latent_dd[:,0]>-0.5
latent_dd = latent_dd[latent_mask]



latent_dd_tmp = latent_dd.copy()
latent_dd_tmp[:,0] = latent_dd[:,1]
latent_dd_tmp[:,1] = latent_dd[:,2]
latent_dd = latent_dd_tmp 

latent_start_dd = _pca_inst.transform(latent_space_start_inst[:])

latent_mask = latent_start_dd[:,0]>-0.5
latent_start_dd = latent_start_dd[latent_mask]



latent_dd_tmp = latent_start_dd.copy()
latent_dd_tmp[:,0] = latent_start_dd[:,1]
latent_dd_tmp[:,1] = latent_start_dd[:,2]
latent_start_dd = latent_dd_tmp 


latent_dd = VAE_PCA_Plot(_pca_inst, latent_dd, alpha = 0.5, s = 0.001)



In [None]:




fig, axis = plt.subplots(1,1, figsize = (9,9))
axis.scatter(latent_dd[:,0],latent_dd[:,1], color = 'lightgray',s=0.005)
axis.scatter(latent_start_dd[:,0],latent_start_dd[:,1], color = 'red',s=0.005)

axis.set_xlim(-2, 3)
axis.set_ylim(-3, 3)
axis.set_aspect('equal', adjustable='box')
# axis[0].set_box_aspect(1)
axis.set_title("Latent Space")

fig.show()



### 3D

In [290]:
latent_dd = _pca_inst.transform(latent_space_inst[:])
latent_start_dd = _pca_inst.transform(latent_space_start_inst[all_start_id_list[:,0]<seg0])



In [846]:
fig = plt.figure(figsize=(9, 9))
axis = fig.add_subplot(111, projection='3d')

# 3D scatter
axis.scatter(
    latent_dd[:, 0],  # 注意保留你的负号翻转
    latent_dd[:, 1],
    latent_dd[:, 2],
    color='lightgray',
    s=0.05
)

axis.scatter(
    -latent_start_dd[:, 0],
    latent_start_dd[:, 1],
    latent_start_dd[:, 2],
    color='red',
    s=0.05
)

# 设置范围（你可以按实际数据修改）
axis.set_xlim(-2, 3)
axis.set_ylim(-3, 3)
axis.set_zlim(-3, 3)

axis.set_xlabel("Latent Dimension 1")
axis.set_ylabel("Latent Dimension 2")
axis.set_zlabel("Latent Dimension 3")

axis.set_title("Latent Space (3D)")
plt.tight_layout()
plt.show()

In [292]:
import plotly.graph_objects as go
# 降采样数据
# step = 100
# points = latent_dd[::step]
# starts = latent_start_dd[::step]
step = 100
points = latent_dd[:]
starts = latent_start_dd[:]

fig = go.Figure()

# 背景灰色点
fig.add_trace(go.Scatter3d(
    x=-points[:, 0], y=points[:, 1], z=points[:, 2],
    mode='markers',
    marker=dict(size=0.5, color='lightgray'),
    name='All points'
))

# 起始点红色
fig.add_trace(go.Scatter3d(
    x=-starts[:, 0], y=starts[:, 1], z=starts[:, 2],
    mode='markers',
    marker=dict(size=0.5, color='red'),
    name='Start points'
))

fig.update_layout(
    title="Latent Space (3D)",
    scene=dict(
        xaxis_title='Latent Dim 1',
        yaxis_title='Latent Dim 2',
        zaxis_title='Latent Dim 3'
    ),
    height=800,
)
fig.show(renderer="browser")



## Plot Velocity

### Data Selector

In [929]:
ch_start_list, ch_start_id_list  = load_all2ch(all_start_list[:], all_start_id_list[:])
all_start_ds = EISDataset_Manifold(ch_start_list)
latent_space_start_inst = VAE_latent(vae_model, all_start_ds, batch_size=64)


[32m2025-05-23 05:28:29.021[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[8000]/[22597][0m
[32m2025-05-23 05:28:29.126[0m | [1mINFO    [0m | [36m__main__[0m:[36mVAE_latent[0m:[36m18[0m - [1m[16000]/[22597][0m


In [930]:



# DATA_mask_0 = all_id_list[:,0]<seg1     # Vitro 0 + Vitro 1
# # DATA_mask_0 = all_id_list[:,0]<seg0     # Vitro 0
# # DATA_mask_0 = (all_id_list[:,0]>=seg0) & (all_id_list[:,0]<seg1)     # Vitro 1

# latent_dd = _pca_inst.transform(latent_space_inst[DATA_mask_0])
# DATA_mask_1 = latent_dd[:,0]>-0.5
# latent_dd = latent_dd[DATA_mask_1]

# START_mask_0 = all_start_id_list[:,0]<seg1     # Vitro 0 + Vitro 1
# # START_mask_0 = all_start_id_list[:,0]<seg0     # Vitro 0
# # START_mask_0 = (all_start_id_list[:,0]>=seg0) & (all_start_id_list[:,0]<seg1)     # Vitro 1


# latent_start_dd = _pca_inst.transform(latent_space_start_inst[START_mask_0])
# START_mask_1 = latent_start_dd[:,0]>-0.5
# latent_start_dd = latent_start_dd[START_mask_1]




DATA_mask_0 = all_id_list[:,0]<seg1
latent_dd = _pca_inst.transform(latent_space_inst[DATA_mask_0])


clusterer = hdbscan.HDBSCAN(min_cluster_size=300, prediction_data=True).fit(latent_dd[:,:])
labels = clusterer.labels_
# DATA_mask_1 = latent_dd[:,0]>-0.5
DATA_mask_1 = labels == 0
latent_dd = latent_dd[DATA_mask_1]

START_mask_0 = all_start_id_list[:,0]<seg1     # Vitro 0 + Vitro 1
latent_start_dd = _pca_inst.transform(latent_space_start_inst[START_mask_0])

labels= hdbscan.approximate_predict(clusterer, latent_start_dd[:,:])[0]
START_mask_1 = labels == 0
latent_start_dd = latent_start_dd[START_mask_1]




'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [932]:
FLAGIJ = 23

if FLAGIJ == 13:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,0]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 

    latent_dd_tmp = latent_start_dd.copy()
    latent_dd_tmp[:,0] = -latent_start_dd[:,0]
    latent_dd_tmp[:,1] = latent_start_dd[:,2]
    latent_start_dd = latent_dd_tmp 

elif FLAGIJ == 23:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,1]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 
    
    latent_dd_tmp = latent_start_dd.copy()
    latent_dd_tmp[:,0] = -latent_start_dd[:,1]
    latent_dd_tmp[:,1] = latent_start_dd[:,2]
    latent_start_dd = latent_dd_tmp 


latent_dd = VAE_PCA_Plot(_pca_inst, latent_dd, alpha = 0.5, s = 0.1)


### Calculate Velocity

In [934]:

_poi_id_list= ch_id_list[DATA_mask_0,:]
_poi_id_list= _poi_id_list[DATA_mask_1,:]


_poi_data_list= ch_data_list[DATA_mask_0,:,:]
_poi_data_list= _poi_data_list[DATA_mask_1,:,:]


uq_id_list = np.unique(ch_id_list[:,0])
uq_id_max = np.max(uq_id_list)


cmap = plt.colormaps.get_cmap("rainbow_r")

manifold_vector_list    = []
manifold_time_list      = []
manifold_1kImp_list     = []
manifold_10kImp_list    = []
manifold_100kImp_list   = []

for i in range(len(uq_id_list)):
# for i in range(1):
    logger.info(f"[{i}/{len(uq_id_list)}]")
    _ele_id = uq_id_list[i]

    # if _ele_id >= seg0: break
    # if  _ele_id < seg0 or _ele_id >= seg1: continue
    # if _ele_id < seg1: continue

    # if _ele_id >= seg1: break
    

    ele_mask = _poi_id_list[:,0] == _ele_id
    _ch_list = np.unique(_poi_id_list[ele_mask,1])


    for j in _ch_list:
        _ch_mask = _poi_id_list[:,:2] == [_ele_id,j]
        _ch_mask = _ch_mask[:,0] & _ch_mask[:,1]
        # _ch_data = latent_dd[_ch_mask,:2]

        # _c = cmap(_ele_id / uq_id_max)
        # axis.plot(_ch_data[:,0],_ch_data[:,1], color = _c, alpha = 0.5)

        _cluster_list = np.unique(_poi_id_list[_ch_mask,2])
        for k in _cluster_list:
            _cluster_mask = _poi_id_list[:,:3] == [_ele_id,j,k]
            _cluster_mask = _cluster_mask[:,0] & _cluster_mask[:,1] & _cluster_mask[:,2]
            # _cluster_data = latent_dd[_cluster_mask,:2]
            _cluster_data = np.stack([latent_dd[_cluster_mask,0],latent_dd[_cluster_mask,1]], axis=1)

            # Seg Data
            _seg_data = _cluster_data.reshape(-1,1,2)
            _seg_data = np.concatenate([_seg_data[:-1], _seg_data[1:]], axis=1)

            _dx = np.abs(_seg_data[:,1,0] - _seg_data[:,0,0])
            _seg_data = _seg_data[_dx < 1,:,:]


            # Seg Time
            _seg_time = _poi_id_list[_cluster_mask,3]
            _seg_time = np.diff(_seg_time)
            _seg_time = _seg_time[_dx < 1]

            # Seg Imp
            _seg_Imp_1kHz   = _poi_data_list[_cluster_mask,101//2,0]
            _seg_Imp_1kHz   = np.diff(_seg_Imp_1kHz)
            _seg_Imp_1kHz   = _seg_Imp_1kHz[_dx < 1]

            _seg_Imp_10kHz  = _poi_data_list[_cluster_mask,101*2//3,0]
            _seg_Imp_10kHz  = np.diff(_seg_Imp_10kHz)
            _seg_Imp_10kHz  = _seg_Imp_10kHz[_dx < 1]

            _seg_Imp_100kHz = _poi_data_list[_cluster_mask,101*5//6,0]
            _seg_Imp_100kHz = np.diff(_seg_Imp_100kHz)
            _seg_Imp_100kHz = _seg_Imp_100kHz[_dx < 1]

            

            manifold_vector_list.append(_seg_data)
            manifold_time_list.append(_seg_time)
            manifold_1kImp_list.append(_seg_Imp_1kHz)
            manifold_10kImp_list.append(_seg_Imp_10kHz)
            manifold_100kImp_list.append(_seg_Imp_100kHz)

            
manifold_vector_list = np.concatenate(manifold_vector_list, axis=0)
manifold_time_list = np.concatenate(manifold_time_list, axis=0)
manifold_1kImp_list = np.concatenate(manifold_1kImp_list, axis=0)
manifold_10kImp_list = np.concatenate(manifold_10kImp_list, axis=0)
manifold_100kImp_list = np.concatenate(manifold_100kImp_list, axis=0)



[32m2025-05-23 05:31:11.506[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m[0/287][0m
[32m2025-05-23 05:31:11.864[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m[1/287][0m
[32m2025-05-23 05:31:12.238[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m[2/287][0m
[32m2025-05-23 05:31:12.600[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m[3/287][0m
[32m2025-05-23 05:31:12.812[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m[4/287][0m
[32m2025-05-23 05:31:12.953[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m[5/287][0m
[32m2025-05-23 05:31:13.245[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m[6/287][0m
[32m2025-05-23 05:31:13.246[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1m[7/287][0m
[32m2025-05-23 05:31:13.363[0m | [1mI

In [937]:
plt.figure()
plt.scatter(latent_dd[:,0],latent_dd[:,1],s=0.01)

<matplotlib.collections.PathCollection at 0x22f3092c210>

In [936]:
manifold_speed_list = (manifold_vector_list[:,1,:] - manifold_vector_list[:,0,:])
manifold_speed_list = manifold_speed_list/manifold_time_list[:,np.newaxis]
# manifold_speed_list = manifold_time_list[:,np.newaxis]/manifold_speed_list
manifold_speed_list.shape

manifold_speed_1kImp_list   = manifold_1kImp_list/manifold_time_list
manifold_speed_10kImp_list  = manifold_10kImp_list/manifold_time_list
manifold_speed_100kImp_list = manifold_100kImp_list/manifold_time_list




### Plot

#### Plot Velocity Field

In [949]:
# 获取所有点的坐标
all_points = manifold_vector_list.reshape(-1, 2)
x_min, x_max = all_points[:, 0].min(), all_points[:, 0].max()
y_min, y_max = all_points[:, 1].min(), all_points[:, 1].max()

# 定义网格大小
grid_size = 30  # 可根据需要调整
x_bins = np.linspace(x_min, x_max, grid_size + 1)
y_bins = np.linspace(y_min, y_max, grid_size + 1)


# 获取起点坐标
start_points = manifold_vector_list[:, 0, :]

# 计算每个起点所在的网格索引
x_indices = np.digitize(start_points[:, 0], x_bins) - 1
y_indices = np.digitize(start_points[:, 1], y_bins) - 1

# 初始化速度场和计数器
velocity_field = np.zeros((grid_size, grid_size, 2))
count = np.zeros((grid_size, grid_size))

# 累加速度向量
for xi, yi, v in zip(x_indices, y_indices, manifold_speed_list):
    if 0 <= xi < grid_size and 0 <= yi < grid_size:
        velocity_field[yi, xi] += v
        count[yi, xi] += 1

# 计算平均速度
with np.errstate(divide='ignore', invalid='ignore'):
    average_velocity = np.divide(velocity_field, count[:, :, np.newaxis])
    average_velocity[np.isnan(average_velocity)] = 0  # 将 NaN 替换为 0

# 去除噪声样本导致的向量统计
threshold = 20  # 最小样本数量阈值
average_velocity[count < threshold] = 0  # 将低于阈值的单元速度设为零


mag = np.linalg.norm(average_velocity, axis=-1)
vmin = np.percentile(mag, 5)
vmax = np.percentile(mag, 95)
mag_clipped = np.clip(mag, vmin, vmax)
mag_scaled = (mag_clipped - vmin) / (vmax - vmin + 1e-6)
average_velocity = average_velocity * (mag_scaled[:,:,np.newaxis] / (mag[:,:,np.newaxis] + 1e-6))

manifold_vector_field = average_velocity


In [950]:

x_centers = (x_bins[:-1] + x_bins[1:]) / 2
y_centers = (y_bins[:-1] + y_bins[1:]) / 2
alpha = np.clip(np.log(count+1) / np.log(count+1).max(), 0.5, 1.0)
X, Y = np.meshgrid(x_centers, y_centers)

U = average_velocity[:, :, 0]
V = average_velocity[:, :, 1]
speed_mask = (U != 0) | (V != 0)

# Plot Speed Field
fig, axis = plt.subplots(1,1, figsize = (9,9))
# Plot PCA
axis.scatter(latent_dd[:,0],latent_dd[:,1], color = 'lightgray', s=0.01)

# Plot Start Point
axis.scatter(latent_start_dd[:,0],latent_start_dd[:,1], color = 'red',s=0.01)


# Plot Manifold
axis.quiver(X[speed_mask], Y[speed_mask], U[speed_mask], V[speed_mask], alpha = 1, scale=5, scale_units='xy', angles='xy')
# axis.quiver(X, Y, U, V, scale=1, alpha = alpha, scale_units='xy', angles='xy')


if FLAGIJ == 12:
    axis.set_xlim(-2, 3)
    axis.set_ylim(-3, 3)
else:
    axis.set_xlim(-2, 3)
    axis.set_ylim(-3, 3)
    
axis.set_aspect('equal', adjustable='box')
# axis[0].set_box_aspect(1)
axis.set_title("Latent Space")

axis.set_xlabel('Latent Dimension 1')
axis.set_ylabel('Latent Dimension 2')
axis.set_title('Velocity Field in Latent Space')
# plt.grid(True)
fig.show()


#### Plot Speed 2D

In [633]:
from scipy.ndimage import gaussian_filter

def masked_gaussian_filter(data, mask, sigma):
    data_filtered = gaussian_filter(data * mask, sigma=sigma)
    mask_filtered = gaussian_filter(mask.astype(float), sigma=sigma)
    with np.errstate(divide='ignore', invalid='ignore'):
        result = np.divide(data_filtered, mask_filtered)
        result[mask_filtered == 0] = 0  # 保持无效区域为0
    return result

vx = average_velocity[:,:,0]
vy = average_velocity[:,:,1]
# valid_mask = (vx != 0) | (vy != 0)
valid_mask = (count >= threshold)

vx_filtered = masked_gaussian_filter(vx, valid_mask, sigma=1)
vy_filtered = masked_gaussian_filter(vy, valid_mask, sigma=1)

velocity_filtered = np.stack([vx_filtered, vy_filtered], axis=-1)


In [635]:
X, Y = np.meshgrid(x_centers, y_centers)

U = velocity_filtered[:,:,0]
V = velocity_filtered[:,:,1]

mask_nonzero = (np.linalg.norm(velocity_filtered, axis=-1) > 1e-1)

# 只绘制非零向量
fig, axis = plt.subplots(1,1,figsize=(9,9))
axis.quiver(X[mask_nonzero], Y[mask_nonzero], U[mask_nonzero], V[mask_nonzero], scale=5, scale_units='xy', angles='xy')


axis.set_xlim(-2, 3)
axis.set_ylim(-3, 3)
fig.show()


In [636]:


# Calculate Speed
speed = np.linalg.norm(average_velocity, axis=-1)
speed = np.linalg.norm(velocity_filtered, axis=-1)
speed = -speed


# speed_filtered = gaussian_filter(speed, sigma=2.0)  # 高斯滤波
speed_filtered = speed


In [638]:

x_centers = (x_bins[:-1] + x_bins[1:]) / 2
y_centers = (y_bins[:-1] + y_bins[1:]) / 2
alpha = np.clip(count / count.max(), 0.2, 1.0)
X, Y = np.meshgrid(x_centers, y_centers)

U = average_velocity[:, :, 0]
V = average_velocity[:, :, 1]
speed_mask = (U != 0) | (V != 0)

# Plot Speed Field
fig, axis = plt.subplots(1,1, figsize = (9,9))


masked_speed = np.ma.masked_where(speed_filtered == 0, speed_filtered)
c2d = axis.contourf(X,Y,masked_speed, levels=10, cmap='viridis', alpha = 0.5)
fig.colorbar(c2d, ax=axis, label="Speed (filtered)")


# Plot Manifold
axis.quiver(X[speed_mask], Y[speed_mask], U[speed_mask], V[speed_mask], alpha = 0.7, scale=5, scale_units='xy', angles='xy')
# axis.quiver(X, Y, U, V, scale=1, alpha = alpha, scale_units='xy', angles='xy')



axis.set_xlim(-2, 3)
axis.set_ylim(-3, 3)
axis.set_aspect('equal', adjustable='box')
# axis[0].set_box_aspect(1)
axis.set_title("Latent Space")

axis.set_xlabel('Latent Dimension 1')
axis.set_ylabel('Latent Dimension 2')
axis.set_title('Velocity Field in Latent Space')
# plt.grid(True)
fig.show()


#### Plot Velocity 3D

In [123]:

from scipy.ndimage import gaussian_filter


# ==== 1. 对 average_velocity 进行滤波 ====
# 计算速度大小（模）
speed = np.linalg.norm(average_velocity, axis=-1)
# speed[speed > 0] = 1/speed[speed > 0]
# speed[speed == 0] = speed.max()*1.1
speed = -speed


speed_filtered = gaussian_filter(speed, sigma=2.0)  # 高斯滤波
# speed_filtered = speed

# ==== 2. 可视化速度大小的等高线图（2D） ====
fig2d, ax = plt.subplots(figsize=(8, 6))
c2d = ax.contourf(speed_filtered, levels=20, cmap='viridis')
fig2d.colorbar(c2d, ax=ax, label="Speed (filtered)")
ax.set_title("2D Contour of Filtered Speed Field")
ax.set_xlabel("X Grid Index")
ax.set_ylabel("Y Grid Index")

# ==== 3. 可视化速度大小的 3D 等高图 ====
# X, Y = np.meshgrid(np.arange(grid_size), np.arange(grid_size))
fig3d = plt.figure(figsize=(10, 7))
ax3d = fig3d.add_subplot(111, projection='3d')
surf = ax3d.plot_surface(X, Y, speed_filtered, cmap='viridis', edgecolor='none')
fig3d.colorbar(surf, ax=ax3d, shrink=0.5, aspect=10, label="Speed (filtered)")
ax3d.set_title("3D Surface of Filtered Speed Field")
ax3d.set_xlabel("X Grid Index")
ax3d.set_ylabel("Y Grid Index")
ax3d.set_zlabel("Speed Magnitude")

plt.tight_layout()
plt.show()


### Segementation

#### Create Line

In [989]:
# 获取所有点的坐标
all_points = manifold_vector_list.reshape(-1, 2)
x_min, x_max = all_points[:, 0].min(), all_points[:, 0].max()
y_min, y_max = all_points[:, 1].min(), all_points[:, 1].max()

# 定义网格大小
grid_size = 50  # 可根据需要调整
x_bins = np.linspace(x_min, x_max, grid_size + 1)
y_bins = np.linspace(y_min, y_max, grid_size + 1)


# 获取起点坐标
start_points = manifold_vector_list[:, 0, :]

# 计算每个起点所在的网格索引
x_indices = np.digitize(start_points[:, 0], x_bins) - 1
y_indices = np.digitize(start_points[:, 1], y_bins) - 1

# 初始化速度场和计数器
velocity_field = np.zeros((grid_size, grid_size, 2))
count = np.zeros((grid_size, grid_size))

# 累加速度向量
for xi, yi, v in zip(x_indices, y_indices, manifold_speed_list):
    if 0 <= xi < grid_size and 0 <= yi < grid_size:
        velocity_field[yi, xi] += v
        count[yi, xi] += 1

# 计算平均速度
with np.errstate(divide='ignore', invalid='ignore'):
    average_velocity = np.divide(velocity_field, count[:, :, np.newaxis])
    average_velocity[np.isnan(average_velocity)] = 0  # 将 NaN 替换为 0

# 去除噪声样本导致的向量统计
threshold = 30  # 最小样本数量阈值
average_velocity[count < threshold] = 0  # 将低于阈值的单元速度设为零




In [990]:
average_velocity
average_velocity_mean_x = np.mean(average_velocity[:,:,0])
average_velocity_mean_y = np.mean(average_velocity[:,:,1])

average_velocity_weighted_x = np.mean(average_velocity[:,:,0] * count)
average_velocity_weighted_y = np.mean(average_velocity[:,:,1] * count)

logger.info(f"\nmean_vx: {average_velocity_mean_x} mean_vy: {average_velocity_mean_y}\n\
weighted_vx: {average_velocity_weighted_x} weighted_vy: {average_velocity_weighted_y}")

[32m2025-05-23 06:05:20.769[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1m
mean_vx: 0.017014588570163258 mean_vy: 0.016577159640220107
weighted_vx: 2.2103829605658456 weighted_vy: 0.9885546136841449[0m


In [991]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# from skimage.transform import hough_line, hough_line_peaks

# x_bins = np.linspace(-2, 3, grid_size+1)
# y_bins = np.linspace(-3, 5, grid_size+1)

# Grid setup
x_centers = (x_bins[:-1] + x_bins[1:]) / 2
y_centers = (y_bins[:-1] + y_bins[1:]) / 2
X, Y = np.meshgrid(x_centers, y_centers)
U = average_velocity[:, :, 0]
V = average_velocity[:, :, 1]
speed = np.linalg.norm(average_velocity, axis=2)
speed_mask = (U != 0) | (V != 0)


def compute_speed_mask(velocity):
    U, V = velocity[:, :, 0], velocity[:, :, 1]
    return (U != 0) | (V != 0)


def method1_max_mean_speed_diff(velocity, x_coords, y_coords):
    H, W = velocity.shape[:2]
    speed = np.linalg.norm(velocity, axis=2)
    mask = compute_speed_mask(velocity)

    best_line = None
    max_diff = -np.inf

    X, Y = np.meshgrid(x_coords, y_coords)

    for slope in np.linspace(-3, -0.1, 50):
        for intercept in np.linspace(y_coords.min(), y_coords.max(), 50):
            region1 = (Y - slope * X - intercept > 0) & mask
            region2 = (~region1) & mask


            mean1 = speed[region1].mean()
            mean2 = speed[region2].mean()
            diff = abs(mean1 - mean2)
            if diff > max_diff:
                max_diff = diff
                best_line = (slope, intercept)

    return best_line



def method2_kmeans_direction_pca(velocity, x_coords, y_coords, ax=None):
    mask = compute_speed_mask(velocity)
    directions = velocity[mask]
    y_idx, x_idx = np.where(mask)
    positions = np.column_stack([x_coords[x_idx], y_coords[y_idx]])

    if len(directions) < 2:
        return None

    # 归一化方向
    norm = np.linalg.norm(directions, axis=1, keepdims=True)
    norm[norm == 0] = 1
    directions = directions / norm

    # KMeans 聚类
    kmeans = KMeans(n_clusters=2, n_init='auto').fit(directions)
    labels = kmeans.labels_

    # 聚类中心在空间的位置（而非速度空间）
    group_means = [positions[labels == i].mean(axis=0) for i in range(2)]
    mean_pos = np.mean(group_means, axis=0)

    # 主成分分析
    pca = PCA(n_components=2).fit(group_means)
    pc = pca.components_[0]

    slope = -pc[0] / pc[1]
    # slope = pc[1] / pc[0]
    intercept = mean_pos[1] - slope * mean_pos[0]
    
    return (slope, intercept)

def method21_kmeans_speed_only(velocity, x_coords, y_coords, ax=None):
    mask = compute_speed_mask(velocity)
    speed = np.linalg.norm(velocity, axis=2)
    speed_values = speed[mask]
    y_idx, x_idx = np.where(mask)
    positions = np.column_stack([x_coords[x_idx], y_coords[y_idx]])

    if len(speed_values) < 2:
        return None

    # KMeans on speed values
    kmeans = KMeans(n_clusters=2, n_init='auto').fit(speed_values.reshape(-1, 1))
    labels = kmeans.labels_

    # Spatial mean of clusters
    group_means = [positions[labels == i].mean(axis=0) for i in range(2)]
    mean_pos = np.mean(group_means, axis=0)

    # PCA to find boundary line
    pca = PCA(n_components=2).fit(group_means)
    pc = pca.components_[0]

    slope = -pc[0] / pc[1]
    intercept = mean_pos[1] - slope * mean_pos[0]

    # if slope > 0:
    #     slope *= -1
    #     intercept = mean_pos[1] - slope * mean_pos[0]

    # 可视化聚类点
    # if ax is None:
    #     fig,ax = plt.subplots(1,1)
    #     ax.scatter(positions[labels == 0][:, 0], positions[labels == 0][:, 1], s=1, color='purple', label='Cluster 0')
    #     ax.scatter(positions[labels == 1][:, 0], positions[labels == 1][:, 1], s=1, color='orange', label='Cluster 1')

    #     ax.set_aspect('equal')
    # if ax is not None:
    #     ax.scatter(positions[labels == 0][:, 0], positions[labels == 0][:, 1], s=1, color='purple', label='Cluster 0')
    #     ax.scatter(positions[labels == 1][:, 0], positions[labels == 1][:, 1], s=1, color='orange', label='Cluster 1')

    #     ax.set_aspect('equal')

    return (slope, intercept)

def method22_kmeans_speed_direction_combined(velocity, x_coords, y_coords, ax=None):
    mask = compute_speed_mask(velocity)
    vectors = velocity[mask]
    speeds = np.linalg.norm(vectors, axis=1, keepdims=True)
    norm = speeds.copy()
    norm[norm == 0] = 1
    # directions = vectors / norm

    # Combine speed and direction
    # features = np.concatenate([speeds, directions], axis=1)

    
    directions = vectors
    features = np.concatenate([directions], axis=1)

    y_idx, x_idx = np.where(mask)
    positions = np.column_stack([x_coords[x_idx], y_coords[y_idx]])

    if len(features) < 2:
        return None

    kmeans = KMeans(n_clusters=2, n_init='auto').fit(features)
    labels = kmeans.labels_

    group_means = [positions[labels == i].mean(axis=0) for i in range(2)]
    mean_pos = np.mean(group_means, axis=0)

    pca = PCA(n_components=2).fit(group_means)
    pc = pca.components_[0]

    slope = -pc[0] / pc[1]
    intercept = mean_pos[1] - slope * mean_pos[0]

    # if slope > 0:
    #     slope *= -1
    #     intercept = mean_pos[1] - slope * mean_pos[0]

    # 可视化聚类点
    # if ax is None:
    #     fig,ax = plt.subplots(1,1)
    #     ax.scatter(positions[labels == 0][:, 0], positions[labels == 0][:, 1], s=1, color='purple', label='Cluster 0')
    #     ax.scatter(positions[labels == 1][:, 0], positions[labels == 1][:, 1], s=1, color='orange', label='Cluster 1')

    #     ax.set_aspect('equal')
    # if ax is not None:
    #     ax.scatter(positions[labels == 0][:, 0], positions[labels == 0][:, 1], s=1, color='purple', label='Cluster 0')
    #     ax.scatter(positions[labels == 1][:, 0], positions[labels == 1][:, 1], s=1, color='orange', label='Cluster 1')

    #     ax.set_aspect('equal')

    return (slope, intercept)




def method4_min_within_class_var(velocity, x_coords, y_coords):
    H, W = velocity.shape[:2]
    speed = np.linalg.norm(velocity, axis=2)
    mask = compute_speed_mask(velocity)

    best_line = None
    min_var = np.inf

    X, Y = np.meshgrid(x_coords, y_coords)

    for slope in np.linspace(-3, -0.1, 50):
        for intercept in np.linspace(y_coords.min(), y_coords.max(), 50):
            region1 = (Y - slope * X - intercept > 0) & mask
            region2 = (~region1) & mask

            var1 = speed[region1].var()
            var2 = speed[region2].var()
            total_var = var1 + var2

            if total_var < min_var:
                min_var = total_var
                best_line = (slope, intercept)

    return best_line

def method5_max_speed_gradient(velocity, x_coords, y_coords):
    speed = np.linalg.norm(velocity, axis=2)
    mask = compute_speed_mask(velocity)
    speed[~mask] = 0  # 无效点设为0避免污染梯度

    grad_y, grad_x = np.gradient(speed)
    grad_magnitude = np.sqrt(grad_x**2 + grad_y**2)

    # 取梯度最大的几个点，拟合直线
    num_points = 100
    indices = np.dstack(np.unravel_index(np.argsort(grad_magnitude.ravel())[::-1], grad_magnitude.shape))[0]
    selected = indices[:num_points]
    coords = np.array([[x_coords[j], y_coords[i]] for i, j in selected])

    # 用PCA拟合这些点，方向为速度变化最快方向
    if len(coords) < 2:
        return None

    pca = PCA(n_components=2).fit(coords)
    pc = pca.components_[0]
    slope = -pc[0] / pc[1]

    # 加斜率 < 0 限制
    if slope > 0:
        pc = -pc
        slope = -pc[0] / pc[1]

    center = coords.mean(axis=0)
    intercept = center[1] - slope * center[0]

    return (slope, intercept)


def plot_velocity_and_lines(velocity, x_bins, y_bins, lines_dict):
    x_centers = (x_bins[:-1] + x_bins[1:]) / 2
    y_centers = (y_bins[:-1] + y_bins[1:]) / 2
    X, Y = np.meshgrid(x_centers, y_centers)

    U, V = velocity[:, :, 0], velocity[:, :, 1]
    speed_mask = compute_speed_mask(velocity)

    fig, ax = plt.subplots(figsize=(9, 9))
    ax.quiver(X[speed_mask], Y[speed_mask], U[speed_mask], V[speed_mask], alpha=1, scale=2, scale_units='xy', angles='xy')

    colors = ['r', 'g', 'b', 'orange']
    for (name, line), color in zip(lines_dict.items(), colors):
        if line is None:
            continue
        slope, intercept = line
        x_vals = np.array([x_centers.min(), x_centers.max()])
        y_vals = slope * x_vals + intercept
        ax.plot(x_vals, y_vals, label=name, color=color, linestyle='--', linewidth=2)


    if FLAGIJ == 12:
        ax.set_xlim(-2, 3)
        ax.set_ylim(-3, 3)
    else:
        ax.set_xlim(-2, 3)
        ax.set_ylim(-3, 3)

    # ax.set_xlim(x_centers.min(), x_centers.max())
    # ax.set_ylim(y_centers.min(), y_centers.max())
    ax.set_aspect('equal')
    ax.legend()
    ax.set_title("Velocity Field with Boundary Lines")
    plt.show()



# k1, b1 = method1_max_mean_speed_diff(average_velocity)
# k2, b2 = method2_kmeans_direction_pca(average_velocity)
# # k3, b3 = method3_gradient_hough(average_velocity)
# k4, b4 = method4_min_within_class_var(average_velocity)
# figs = [
#     visualize_with_line(k1, b1, "Method 1: Max Mean Speed Difference"),
#     visualize_with_line(k2, b2, "Method 2: K-Means Velocity Direction"),
#     # visualize_with_line(k3, b3, "Method 3: Gradient + Hough Line"),
#     visualize_with_line(k4, b4, "Method 4: Minimize Within-Class Variance")
# ]

# plt.show()


line1 = method1_max_mean_speed_diff(average_velocity, x_centers, y_centers)
line2 = method2_kmeans_direction_pca(average_velocity, x_centers, y_centers)
line4 = method22_kmeans_speed_direction_combined(average_velocity, x_centers, y_centers)
line3 = method21_kmeans_speed_only(average_velocity, x_centers, y_centers)
# line5 = method5_max_speed_gradient(average_velocity, x_centers, y_centers)

line_mean = (average_velocity_mean_y/average_velocity_mean_x, 0)
line_weighted = (average_velocity_weighted_y/average_velocity_weighted_x, 0)

lines = {
    # 'Max Mean Speed Diff': line1,
    # 'KMeans+PCA Direction': line2,
    'KMeans+PCA Speed': line3,
    # 'KMeans+PCA D+R': line4,
    # 'Mean':line_mean,
    # 'Weighted_Mean':line_weighted
    # 'Min Within-Class Var': line4
    # 'Max speed gradient': line5
}



plot_velocity_and_lines(average_velocity, x_bins, y_bins, lines)





Mean of empty slice.


invalid value encountered in scalar divide



In [956]:
Y.shape

(50, 50)

#### Data Segementation

In [182]:
line_seg = method21_kmeans_speed_only(average_velocity, x_centers, y_centers)


if line_seg[0] == 0:
    logger.warning("[LinePosNegError] Slope is 0")

latent_dd_seg = (latent_dd[:,1] - line_seg[1])/line_seg[0] - latent_dd[:,0]
    
latent_dd_pos = latent_dd[latent_dd_seg>0]
latent_dd_neg = latent_dd[latent_dd_seg<0]


_poi_data_list= all_data_list[DATA_mask_0,:]
_poi_data_list= _poi_data_list[DATA_mask_1,:]

pos_data_list = _poi_data_list[latent_dd_seg>0,:]
neg_data_list = _poi_data_list[latent_dd_seg<0,:]


In [184]:
neg_data_list.shape

(22425, 202)

##### Plot Seg

In [920]:



_poi_data_list= all_data_list[DATA_mask_0,:]
_poi_data_list= _poi_data_list[DATA_mask_1,:]
_poi_data_list.shape


_n = int(len(np.unique(lasso_labels)) / 5) + 1



fig, axis = plt.subplots(_n,5)
for i in range(0,len(np.unique(lasso_labels))):
    _id = np.unique(lasso_labels)[i]
    _data_mask = _poi_data_list[lasso_labels == _id,:]
    for j in range(_data_mask.shape[0]):
        if j > 500: break
        if _n == 1:
            axis[int(i%5)].semilogy(np.exp(_data_mask[j,:101]), color = cmap(_id%8+1)) 
        else:
            axis[int(i/5),int(i%5)].semilogy(np.exp(_data_mask[j,:101]), color = cmap(_id%8+1)) 
    
    if _n == 1:
        axis[int(i%5)].sharex(axis[0])
        axis[int(i%5)].sharey(axis[0])

    else:
        axis[int(i/5),int(i%5)].sharex(axis[0,0])
        axis[int(i/5),int(i%5)].sharey(axis[0,0])

fig.show()


IndexError: boolean index did not match indexed array along axis 0; size of axis is 133670 but size of corresponding boolean axis is 131538

# Feature

## Data Loader

In [885]:


DATA_mask_0 = all_id_list[:,0]<seg1
latent_dd = _pca_inst.transform(latent_space_inst[DATA_mask_0])


clusterer = hdbscan.HDBSCAN(min_cluster_size=300).fit(latent_dd[:,:])
labels = clusterer.labels_


# DATA_mask_1 = latent_dd[:,0]>-0.5
DATA_mask_1 = labels == 0
latent_dd = latent_dd[DATA_mask_1]

FLAGIJ = 23
if FLAGIJ == 13:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,0]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 
elif FLAGIJ == 23:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,1]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 


latent_dd = VAE_PCA_Plot(_pca_inst, latent_dd, alpha = 0.5, s = 0.1)




'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



## Feature Extraction

### Z - Imp

In [886]:

_poi_id_list= ch_id_list[DATA_mask_0,:]
_poi_id_list= _poi_id_list[DATA_mask_1,:]


_poi_data_list= ch_data_list[DATA_mask_0,:]
_poi_data_list= _poi_data_list[DATA_mask_1,:]


amp_100Hz = _poi_data_list[:,_poi_data_list.shape[1]*1//3,0]
phz_100Hz = _poi_data_list[:,_poi_data_list.shape[1]*1//3,1]

amp_1kHz = _poi_data_list[:,_poi_data_list.shape[1]//2,0]
phz_1kHz = _poi_data_list[:,_poi_data_list.shape[1]//2,1]
amp_1kHz[amp_1kHz>np.log(1e5)] = np.log(1e5)
 

amp_10kHz = _poi_data_list[:,_poi_data_list.shape[1]*2//3,0]
phz_10kHz = _poi_data_list[:,_poi_data_list.shape[1]*2//3,1]
amp_10kHz[amp_10kHz>np.log(7e4)] = np.log(7e4)


amp_100kHz = _poi_data_list[:,_poi_data_list.shape[1]*5//6,0]
phz_100kHz = _poi_data_list[:,_poi_data_list.shape[1]*5//6,1]





# amp_1kHz_sort = np.sort(amp_1kHz)
# amp_10kHz_sort = np.sort(amp_10kHz)



# values = amp_100Hz 
# values = amp_1kHz 
# values = amp_10kHz
# values = amp_100kHz
# values = phz_100Hz 
# values = phz_1kHz 
# values = phz_10kHz
# values = phz_100kHz

# plt.figure()
# plt.plot(np.sort(values))

feature_list = {
    'amp_100Hz': amp_100Hz,
    'amp_1kHz': amp_1kHz,
    'amp_10kHz': amp_10kHz,
    'amp_100kHz': amp_100kHz,
    'phz_100Hz': phz_100Hz,
    'phz_1kHz': phz_1kHz,
    'phz_10kHz': phz_10kHz,
    'phz_100kHz': phz_100kHz,
}

speed_feature_list = {
    'amp_1kHz_v': manifold_speed_1kImp_list,
    'amp_10kHz_v': manifold_speed_10kImp_list,
    'amp_100kHz_v': manifold_speed_100kImp_list
}



### Calculate Velocity Field

#### Speed Field - dz/dt
这种计算只是统计了流行内的 feature变化,并不是从整个空间考虑的,所以相当于采样不均匀, 所以还是更倾向于用两个vector field乘起来的方案

In [887]:

all_points = manifold_vector_list.reshape(-1, 2)
x_min, x_max = all_points[:, 0].min(), all_points[:, 0].max()
y_min, y_max = all_points[:, 1].min(), all_points[:, 1].max()

# 定义网格大小
grid_size = 30  # 可根据需要调整
x_bins = np.linspace(x_min, x_max, grid_size + 1)
y_bins = np.linspace(y_min, y_max, grid_size + 1) 


# 获取起点坐标
start_points = manifold_vector_list[:, 0, :]

# 计算每个起点所在的网格索引
x_indices = np.digitize(start_points[:, 0], x_bins) - 1
y_indices = np.digitize(start_points[:, 1], y_bins) - 1

speed_field_list = {}

for key in speed_feature_list.keys():
    if speed_field_list.get(key) is not None:
        logger.info(f"Feature: {key} skipped.\t {speed_field_list[key].shape}")
        continue
    vals = speed_feature_list[key]
    speed_field = np.zeros((grid_size, grid_size))
    count = np.zeros((grid_size, grid_size))


    # 累加速度向量
    for xi, yi, v in zip(x_indices, y_indices, vals):
        if 0 <= xi < grid_size and 0 <= yi < grid_size:
            speed_field[yi, xi] += v
            count[yi, xi] += 1

    # 计算平均速度
    with np.errstate(divide='ignore', invalid='ignore'):
        average_speed = np.divide(speed_field, count[:, :])
        average_speed[np.isnan(average_speed)] = 0  # 将 NaN 替换为 0


    threshold = 60  # 最小样本数量阈值
    average_speed[count < threshold] = 0  # 将低于阈值的单元速度设为零


    mag = average_speed
    vmin = np.percentile(mag, 5)
    vmax = np.percentile(mag, 95)
    mag_clipped = np.clip(mag, vmin, vmax)
    mag_scaled = (mag_clipped - vmin) / (vmax - vmin + 1e-6)
    average_speed = average_speed * (mag_scaled[:,:] / (mag[:,:] + 1e-6))

    speed_field_list[key] = average_speed
    logger.info(f"Feature: {key}.\t\t {speed_field_list[key].shape}")


[32m2025-05-23 04:37:26.775[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m53[0m - [1mFeature: amp_1kHz_v.		 (30, 30)[0m
[32m2025-05-23 04:37:26.841[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m53[0m - [1mFeature: amp_10kHz_v.		 (30, 30)[0m
[32m2025-05-23 04:37:26.908[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m53[0m - [1mFeature: amp_100kHz_v.		 (30, 30)[0m


#### Plot Speed Field

In [888]:

from scipy.ndimage import zoom

# 假设 score_map 是你的 30x30 差异图
# 假设 x_bins, y_bins 已存在（W+1, H+1）
x_centers = (x_bins[:-1] + x_bins[1:]) / 2
y_centers = (y_bins[:-1] + y_bins[1:]) / 2
for key in speed_field_list.keys():
    # 插值放大 score_map
    score_map_up = zoom(speed_field_list[key], 3, order=3)  # Cubic interpolation，变成 150x150
    # score_map_up = score_map
    # 同步放大坐标网格
    X_up = np.linspace(x_centers.min(), x_centers.max(), score_map_up.shape[1])
    Y_up = np.linspace(y_centers.min(), y_centers.max(), score_map_up.shape[0])

    fig, axis = plt.subplots(1,1, figsize=(9, 9))

    # 背景灰点
    # axis.scatter(latent_dd[:,0], latent_dd[:,1], color='lightgray', s=0.01)

    # 起始点
    # axis.scatter(latent_start_dd[:,0], latent_start_dd[:,1], color='red', s=0.01)

    # 画出 score_map 的热力图
    vmax = np.abs(score_map_up).max()
    vmin = -vmax
    im = axis.imshow(
        score_map_up, 
        extent=[X_up.min(), X_up.max(), Y_up.min(), Y_up.max()],
        origin='lower',
        cmap='RdBu_r',  # 你也可以换成 'viridis', 'plasma', 'rainbow' 等
        vmin=vmin,
        vmax=vmax,
        alpha=0.8,
        aspect='auto',
    )

    # 添加 colorbar
    cbar = plt.colorbar(im, ax=axis)
    cbar.set_label('Vector Similarity (log-scale projection ratio)')

    # 设置边界
    if FLAGIJ == 12:
        axis.set_xlim(-2, 3)
        axis.set_ylim(-3, 3)
    else:
        axis.set_xlim(-2, 3)
        axis.set_ylim(-3, 3)

    axis.set_aspect('equal', adjustable='box')
    axis.set_xlabel('Latent Dimension 1')
    axis.set_ylabel('Latent Dimension 2')
    axis.set_title(f"dz/dt scaler field {key}")
    plt.show()


#### Vector Field - dz/dx

In [882]:
 

# 网格定义

H, W = 30, 30
x_bins = np.linspace(latent_dd[:, 0].min(), latent_dd[:, 0].max(), W + 1)
y_bins = np.linspace(latent_dd[:, 1].min(), latent_dd[:, 1].max(), H + 1)



# 将样本归属到每个格子
x_idx = np.digitize(latent_dd[:, 0], x_bins) - 1
y_idx = np.digitize(latent_dd[:, 1], y_bins) - 1


# 去掉越界点
valid = (x_idx >= 0) & (x_idx < W) & (y_idx >= 0) & (y_idx < H)
x_idx = x_idx[valid]
y_idx = y_idx[valid]
pts = latent_dd[valid,:2]

vector_field_list = {}

for key in feature_list.keys():
    if vector_field_list.get(key) is not None:
        logger.info(f"Feature: {key} skipped.\t {average_velocity.shape}")
        continue
    vals = feature_list[key]
    vals = vals[valid]
    # 初始化 velocity 场
    average_velocity = np.zeros((H, W, 2))
    count = np.zeros((H, W))

    # Local Weighted Gradient
    for i in range(H):
        for j in range(W):
            mask = (x_idx == j) & (y_idx == i)
            if np.sum(mask) < 5:
                continue
            x_local = pts[mask]
            z_local = vals[mask]
            dz = z_local[:, None] - z_local[None, :]
            dx = x_local[:, None, :] - x_local[None, :, :]
            # weights = np.exp(-np.linalg.norm(dx, axis=2)**2 / 0.1)  # 可调参数
            weights = np.exp(-np.linalg.norm(dx, axis=2)**2 / 0.1)  # 可调参数
            gx = np.sum(weights * dz * dx[..., 0]) / np.sum(weights)
            gy = np.sum(weights * dz * dx[..., 1]) / np.sum(weights)
            average_velocity[i, j, :] = -np.array([gx, gy])
            count[i, j] = np.sum(mask)


    threshold = 60  # 最小样本数量阈值
    average_velocity[count < threshold] = 0  # 将低于阈值的单元速度设为零


    mag = np.linalg.norm(average_velocity, axis=-1)
    vmin = np.percentile(mag, 5)
    vmax = np.percentile(mag, 95)
    mag_clipped = np.clip(mag, vmin, vmax)
    mag_scaled = (mag_clipped - vmin) / (vmax - vmin + 1e-6)
    average_velocity = average_velocity * (mag_scaled[:,:,np.newaxis] / (mag[:,:,np.newaxis] + 1e-6))

    vector_field_list[key] = average_velocity
    logger.info(f"Feature: {key}.\t\t {average_velocity.shape}")


[32m2025-05-23 04:14:06.928[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mFeature: amp_100Hz.		 (30, 30, 2)[0m
[32m2025-05-23 04:14:15.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mFeature: amp_1kHz.		 (30, 30, 2)[0m
[32m2025-05-23 04:14:23.440[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mFeature: amp_10kHz.		 (30, 30, 2)[0m
[32m2025-05-23 04:14:31.701[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mFeature: amp_100kHz.		 (30, 30, 2)[0m
[32m2025-05-23 04:14:39.989[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mFeature: phz_100Hz.		 (30, 30, 2)[0m
[32m2025-05-23 04:14:48.321[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mFeature: phz_1kHz.		 (30, 30, 2)[0m
[32m2025-05-23 04:14:56.693[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m62[0m - [1mFeature: phz_10kH

#### Plot Velocity Field

In [883]:
# _key = 'amp_100Hz'
# _key = 'amp_1kHz'

for _key in feature_list.keys():
    values = feature_list[_key]
    average_velocity = vector_field_list[_key]

    fig, axis = plt.subplots(1,2, figsize = (16,8))
    fig.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)



    ## Plot Latent Space 
    sc = axis[0].scatter(latent_dd[:,0],latent_dd[:,1], c = values, cmap='rainbow',s=0.5)


    axis[0].set_aspect('equal', adjustable='box')
    # axis[0].set_box_aspect(1)
    axis[0].set_title("Latent Space")


    cbar = fig.colorbar(sc, ax=axis[0], fraction=0.05, pad=0.05)
    # cbar.set_label('Impedance')

    ## Plot Vector Field

    x_centers = (x_bins[:-1] + x_bins[1:]) / 2
    y_centers = (y_bins[:-1] + y_bins[1:]) / 2
    # 可视化向量场
    alpha = np.clip(np.log(count+1) / np.log(count+1).max(), 0.5, 1.0)
    X, Y = np.meshgrid(x_centers, y_centers)
    U, V = average_velocity[:, :, 0], average_velocity[:, :, 1]
    speed_mask = (U != 0) | (V != 0)



    # Plot PCA
    axis[1].scatter(latent_dd[:,0],latent_dd[:,1], color = 'lightgray', s=0.01)


    axis[1].quiver(X[speed_mask], Y[speed_mask], U[speed_mask], V[speed_mask], scale=5, scale_units='xy', angles='xy')


    axis[1].set_aspect('equal', adjustable='box')
    # axis[1].set_box_aspect(1)
    axis[1].set_title(f"Vector Space {_key}")





    if FLAGIJ == 12:
        axis[0].set_xlim(-2, 3)
        axis[0].set_ylim(-3, 3)
        axis[1].set_xlim(-2, 3)
        axis[1].set_ylim(-3, 3)
    else:
        axis[0].set_xlim(-2, 3)
        axis[0].set_ylim(-3, 3)
        axis[1].set_xlim(-2, 3)
        axis[1].set_ylim(-3, 3)

    fig.show()






## Feature Compare

### dz/dt = \nabla z dx/dt

In [822]:
def evaluate_feature(dz_dt):
    dz = np.array(dz_dt, dtype=float)
    mask = np.abs(dz) > 1e-8
    if not np.any(mask):
        return {"sign_consistency": 0.0, "std": 0.0, "score": 0.0}
    
    # Valuence
    signs = np.sign(dz[mask])
    sign_consistency = np.abs(np.sum(signs)) / signs.size

    # STD
    std = np.std(dz[mask])

    # Score
    score = sign_consistency / (1.0 + std)

    return {
        "sign_consistency": float(sign_consistency),
        "std": float(std),
        "score": float(score)
    }

In [829]:
v0 = manifold_vector_field
LieDerivative_list = {}
feature_score_list = {}
for key in vector_field_list.keys():
    v1 = vector_field_list[key]
    LieDerivative_list[key] = np.sum(v0 * v1, axis=2)
    feature_score_list[key] = evaluate_feature(LieDerivative_list[key])

### Plot Lie Derivative

In [821]:

from scipy.ndimage import zoom

# 假设 score_map 是你的 30x30 差异图
# 假设 x_bins, y_bins 已存在（W+1, H+1）
x_centers = (x_bins[:-1] + x_bins[1:]) / 2
y_centers = (y_bins[:-1] + y_bins[1:]) / 2
for key in vector_field_list.keys():
    # 插值放大 score_map
    _LieD_up = zoom(LieDerivative_list[key], 5, order=5)  # Cubic interpolation，变成 150x150
    # score_map_up = score_map
    # 同步放大坐标网格
    X_up = np.linspace(x_centers.min(), x_centers.max(), _LieD_up.shape[1])
    Y_up = np.linspace(y_centers.min(), y_centers.max(), _LieD_up.shape[0])

    fig, axis = plt.subplots(1,1, figsize=(9, 9))

    # 背景灰点
    # axis.scatter(latent_dd[:,0], latent_dd[:,1], color='lightgray', s=0.01)

    # 起始点
    # axis.scatter(latent_start_dd[:,0], latent_start_dd[:,1], color='red', s=0.01)

    # 画出 score_map 的热力图
    vmax = np.abs(_LieD_up).max()
    vmin = -vmax
    im = axis.imshow(
        _LieD_up, 
        extent=[X_up.min(), X_up.max(), Y_up.min(), Y_up.max()],
        origin='lower',
        cmap='RdBu_r',  # 你也可以换成 'viridis', 'plasma', 'rainbow' 等
        vmin=vmin,
        vmax=vmax,
        alpha=0.8,
        aspect='auto',
    )

    # 添加 colorbar
    cbar = plt.colorbar(im, ax=axis)
    cbar.set_label('Vector Similarity (log-scale projection ratio)')

    # 设置边界
    if FLAGIJ == 12:
        axis.set_xlim(-2, 3)
        axis.set_ylim(-3, 3)
    else:
        axis.set_xlim(-2, 3)
        axis.set_ylim(-3, 3)

    axis.set_aspect('equal', adjustable='box')
    axis.set_xlabel('Latent Dimension 1')
    axis.set_ylabel('Latent Dimension 2')
    axis.set_title(f"dz/dt scaler field {key}")
    plt.show()


### Plot Score

In [992]:
def plot_ranked_feature_scores(feature_scores, metric='score'):

    # Check metric and extract score
    valid_items = [(feat, scores.get(metric, None)) 
                   for feat, scores in feature_scores.items()]
    
    valid_items = [(feat, val) for feat, val in valid_items if val is not None]
    if not valid_items:
        raise ValueError(f"No valid items found for metric '{metric}'")

    # Ranked score
    sorted_items = sorted(valid_items, key=lambda x: x[1], reverse=True)
    keys, values = zip(*sorted_items)

    # Plot
    fig, ax = plt.subplots(figsize=(8, 5))
    x = np.arange(len(keys))
    bars = ax.bar(x, values, color='skyblue')

    ax.set_xticks(x)
    ax.set_xticklabels(keys, rotation=45, ha='right')

    for bar, val in zip(bars, values):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height(),
            f"{val:.2f}",
            ha='center', va='bottom'
        )

    ax.set_ylabel(metric)
    ax.set_title(f'Feature Ranking by {metric}')
    plt.tight_layout()
    plt.show()



# 根据不同 metric 绘制w
plot_ranked_feature_scores(feature_score_list, metric='sign_consistency')
plot_ranked_feature_scores(feature_score_list, metric='std')
plot_ranked_feature_scores(feature_score_list, metric='score')


In [999]:
a = feature_score_list.copy()
a.pop('amp_100Hz')
a.pop('phz_100Hz')
plot_ranked_feature_scores(a, metric='sign_consistency')
plot_ranked_feature_scores(a, metric='std')
plot_ranked_feature_scores(a, metric='score')

# Cluster

## Data Loader

In [902]:
FLAGIJ = 23

In [864]:
DATA_mask_0 = all_id_list[:,0]>-1
latent_dd = _pca_inst.transform(latent_space_inst[DATA_mask_0])
# DATA_mask_1 = latent_dd[:,0]>-0.5
# latent_dd = latent_dd[DATA_mask_1]

if FLAGIJ == 12:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,0]
    # latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 
elif FLAGIJ == 13:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,0]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 
elif FLAGIJ == 23:
    latent_dd_tmp = latent_dd.copy()
    latent_dd_tmp[:,0] = -latent_dd[:,1]
    latent_dd_tmp[:,1] = latent_dd[:,2]
    latent_dd = latent_dd_tmp 




In [865]:
alpha = 0.5
s = 0.001

explained = _pca_inst.explained_variance_ratio_
eff_dim = (explained.cumsum() < 0.90).sum() + 1


fig, axis = plt.subplots(2,1,
            gridspec_kw={'height_ratios': [4,1]},
            figsize=(9, 9))
axis[0].scatter(latent_dd[:, 0], latent_dd[:, 1], alpha=alpha, s = s)

axis[0].set_xlim(-2, 3)
axis[0].set_ylim(-3, 3)
# axis[0].set_aspect('equal', adjustable='box')
axis[0].set_box_aspect(1)
axis[0].set_title("Latent Space")

axis[1].plot(_pca_inst.explained_variance_ratio_,
            label = f"Valid Dimension = {eff_dim}")
axis[1].legend()
fig.show()


In [None]:

fig, axis = plt.subplots(1,1,
            figsize=(9, 9))
axis.scatter(latent_dd[:, 0], latent_dd[:, 1], alpha=alpha, s = s)

axis.set_xlim(-2, 3)
axis.set_ylim(-3, 3)
# axis[0].set_aspect('equal', adjustable='box')
axis.set_box_aspect(1)
axis.set_title("Latent Space")

Text(0.5, 1.0, 'Latent Space')

## Run Cluster

In [None]:
# 重新执行，因为代码执行状态已重置，需要重新载入数据和库


# 存储结果
results = {
    'KMeans': {},
    'AffinityPropagation': {},
    'OPTICS': {},
    'HDBSCAN': {}
}

# 定义聚类评估函数
def evaluate_clustering(X, labels):
    if len(set(labels)) <= 1 or (len(set(labels)) == 2 and -1 in set(labels)):
        return {"silhouette": -1, "db": np.inf, "ch": 0}
    return {
        # "silhouette": silhouette_score(X, labels),
        "db": davies_bouldin_score(X, labels),
        "ch": calinski_harabasz_score(X, labels)
    }


## KNN

In [230]:

# KMeans 聚类（2~5类）
for n_clusters in range(2, 6):
    logger.info(f"KMeans: {n_clusters} clusters")
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1000, random_state=42).fit(latent_dd[:,:2])
    labels = kmeans.labels_
    logger.info(f"KMeans: {n_clusters} clusters")
    metrics = evaluate_clustering(latent_dd[:,:2], labels)
    results['KMeans'][n_clusters] = {"labels": labels, **metrics}



[32m2025-05-21 22:18:16.200[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mKMeans: 2 clusters[0m
[32m2025-05-21 22:18:16.320[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mKMeans: 2 clusters[0m
[32m2025-05-21 22:18:16.362[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mKMeans: 3 clusters[0m
[32m2025-05-21 22:18:16.466[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mKMeans: 3 clusters[0m
[32m2025-05-21 22:18:16.505[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mKMeans: 4 clusters[0m
[32m2025-05-21 22:18:16.536[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mKMeans: 4 clusters[0m
[32m2025-05-21 22:18:16.568[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1mKMeans: 5 clusters[0m
[32m2025-05-21 22:18:16.739[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:

In [None]:

# 提取聚类结果
cluster_nums = list(range(2, 6))
db_scores = [results['KMeans'][k]['db'] for k in cluster_nums]
ch_scores = [results['KMeans'][k]['ch'] for k in cluster_nums]

# ---------------------------------------------
# 1. 聚类可视化子图（2x2）
fig1, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()

for i, k in enumerate(cluster_nums):
    labels = results['KMeans'][k]['labels']
    db = results['KMeans'][k]['db']
    ch = results['KMeans'][k]['ch']

    sc = axes[i].scatter(latent_dd[:, 0], latent_dd[:, 1], c=labels, cmap='tab10', s=1)
    axes[i].set_title(f"KMeans - {k} Clusters\nDB={db:.2f}, CH={ch:.2f}")
    axes[i].set_xlim(-2, 3)
    axes[i].set_ylim(-3, 3)
    axes[i].set_box_aspect(1)

plt.tight_layout()
plt.show()

# ---------------------------------------------
# 2. 评估指标柱状图（2x1）
fig2, axes = plt.subplots(2, 1, figsize=(10, 8))

# DBI
axes[0].bar(cluster_nums, db_scores, color='salmon')
axes[0].set_title("Davies-Bouldin Index vs. Cluster Number")
axes[0].set_ylabel("DB Index")
axes[0].set_xticks(cluster_nums)
for i, score in enumerate(db_scores):
    axes[0].text(cluster_nums[i], score + 0.02, f"{score:.2f}", ha='center', va='bottom')

# CH
axes[1].bar(cluster_nums, ch_scores, color='skyblue')
axes[1].set_title("Calinski-Harabasz Index vs. Cluster Number")
axes[1].set_ylabel("CH Index")
axes[1].set_xticks(cluster_nums)
for i, score in enumerate(ch_scores):
    axes[1].text(cluster_nums[i], score + 0.02 * score, f"{score:.1f}", ha='center', va='bottom')

plt.tight_layout()
plt.show()


## hdbscan

In [253]:


results['HDBSCAN'] = {}
# cluster_size_list = list(range(2, 6))
# cluster_size_list = [100, 300, 1000, 3000]
cluster_size_list = [50,100, 200, 300]
for sz in cluster_size_list:
    logger.info(f"HDBSCAN: {sz} clusters")
    clusterer = hdbscan.HDBSCAN(min_cluster_size=sz).fit(latent_dd[:,:])
    labels = clusterer.labels_
    metrics = evaluate_clustering(latent_dd[:,:], labels)
    results['HDBSCAN'][sz] = {"labels": labels, **metrics}


[32m2025-05-21 22:38:04.179[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mHDBSCAN: 50 clusters[0m

'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.

[32m2025-05-21 22:38:30.248[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mHDBSCAN: 100 clusters[0m

'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.

[32m2025-05-21 22:38:57.682[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mHDBSCAN: 200 clusters[0m

'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.

[32m2025-05-21 22:39:28.727[0m | [1mINFO    [0m | [

### Plot 2D

In [870]:


# 提取结果
db_scores = [results['HDBSCAN'][sz]['db'] for sz in cluster_size_list]
ch_scores = [results['HDBSCAN'][sz]['ch'] for sz in cluster_size_list]

# ---------------------------------------------
# 1. 聚类可视化（2x2）
fig1, axes = plt.subplots(2, 2, figsize=(9, 9))
axes = axes.flatten()

for i, sz in enumerate(cluster_size_list):
    labels = results['HDBSCAN'][sz]['labels']
    db = results['HDBSCAN'][sz]['db']
    ch = results['HDBSCAN'][sz]['ch']

    axes[i].scatter(latent_dd[:, 0], latent_dd[:, 1], c=labels, cmap='tab10', s=0.01)
    # axes[i].scatter(latent_dd[:, 0], latent_dd[:, 1], c=labels, cmap='tab10', s=1)
    axes[i].set_title(f"HDBSCAN min_size={sz}\nDB={db:.2f}, CH={ch:.2f}")
    axes[i].set_xlim(-2, 3)
    axes[i].set_ylim(-3, 3)
    axes[i].set_box_aspect(1)

plt.tight_layout()
plt.show()


### Plot 3D

In [869]:
import plotly.graph_objects as go
points = latent_dd[:]

fig = go.Figure()

# 背景灰色点
fig.add_trace(go.Scatter3d(
    x=-points[:, 0], y=points[:, 1], z=points[:, 2],
    mode='markers',
    marker=dict(size=0.3, 
                color=results['HDBSCAN'][50]['labels'],
                colorscale='Rainbow'),
    name='All points'
))


fig.update_layout(
    title="Latent Space (3D)",
    scene=dict(
        xaxis_title='Latent Dim 1',
        yaxis_title='Latent Dim 2',
        zaxis_title='Latent Dim 3',
        aspectmode='cube'
    ),
    height=800,
)
fig.show(renderer="browser")



In [283]:
a = results['HDBSCAN'][300]['labels']
np.unique_counts(a)
# np.unique(a)
# a[a==np.unique(a)].shape

UniqueCountsResult(values=array([-1,  0,  1,  2]), counts=array([ 15766, 130803,  34444,   8757]))

In [194]:

# ---------------------------------------------
# 2. DB、CH 柱状图
fig2, axes = plt.subplots(2, 1, figsize=(10, 8))

x = np.arange(len(cluster_size_list))

# DBI 柱状图
axes[0].bar(x, db_scores, color='salmon')
axes[0].set_title("DB Index vs. min_cluster_size (HDBSCAN)")
axes[0].set_ylabel("Davies-Bouldin")
axes[0].set_xticks(x)
axes[0].set_xticklabels(cluster_size_list)
for i, score in enumerate(db_scores):
    axes[0].text(x[i], score + 0.02, f"{score:.2f}", ha='center', va='bottom')

# CH 柱状图
axes[1].bar(x, ch_scores, color='skyblue')
axes[1].set_title("CH Index vs. min_cluster_size (HDBSCAN)")
axes[1].set_ylabel("Calinski-Harabasz")
axes[1].set_xticks(x)
axes[1].set_xticklabels(cluster_size_list)
for i, score in enumerate(ch_scores):
    axes[1].text(x[i], score + 0.02 * score, f"{score:.1f}", ha='center', va='bottom')


plt.tight_layout()
plt.show()
