In [1]:
import matplotlib
import seaborn
import matplotlib
# Select the non-interactive 'agg' backend; alternatives include
# 'nbagg', 'notebook', 'qt5agg', 'tkagg', etc.
matplotlib.use('agg')
# Report the installed matplotlib and seaborn versions, one per line.
print(matplotlib.__version__, seaborn.__version__, sep="\n")

3.7.3
0.11.0


In [None]:
import numpy as np
import umap
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import random
import torch
import os
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every seeding call (default 42).

    Besides seeding, the value is written to ``os.environ['PYTHONHASHSEED']``
    and cuDNN is switched to deterministic mode with benchmarking disabled.
    Returns ``None``.
    """
    # Same order as always: stdlib, NumPy, torch, then the torch CUDA seeders.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)  # any other integer can be used instead

# Load the real and the generated embeddings (expected to be 3-D per the original notes).
real_embeddings = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_all_embedding.npy")
generated_embeddings = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")

# Average over axis 1 so that each sample is reduced to a single vector.
real_embeddings_2d = np.mean(real_embeddings, axis=1)
generated_embeddings_2d = np.mean(generated_embeddings, axis=1)

print(f"real_embeddings_2d shape: {real_embeddings_2d.shape}")
print(f"generated_embeddings_2d shape: {generated_embeddings_2d.shape}")

# Stack both sets and build the labels: 0 = real, 1 = generated.
n_real, n_generated = len(real_embeddings_2d), len(generated_embeddings_2d)
all_embeddings = np.vstack([real_embeddings_2d, generated_embeddings_2d])
labels = np.array([0] * n_real + [1] * n_generated)

# Project the stacked vectors to two dimensions with UMAP.
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
embedding_2d = reducer.fit_transform(all_embeddings)

# One scatter layer per class, drawn in label order (real first, then generated).
plt.figure(figsize=(8, 6))
for class_id, class_name, class_color in ((0, "Real", "#4F8FC6"), (1, "Generated", "#F29E4C")):
    class_mask = labels == class_id
    plt.scatter(
        embedding_2d[class_mask, 0], embedding_2d[class_mask, 1],
        s=30, alpha=0.7, label=class_name, c=class_color, edgecolors='none',
    )
plt.legend(frameon=False, fontsize=14)
plt.title(
    "UMAP Visualization of Real vs Generated Protein Embeddings",
    fontsize=16,
    fontweight='bold',
)
plt.xlabel("UMAP 1", fontsize=14)
plt.ylabel("UMAP 2", fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
# Write the figure to disk as SVG before showing it.
plt.savefig("/exp_data/sjx/star/experiments/experimnet_picture/umap_real_vs_generated.svg", format="svg")
plt.show()

# 5-fold cross-validated logistic regression separating real from generated vectors.
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, all_embeddings, labels, cv=5)
print(f"Classification accuracy to distinguish real vs generated: {scores.mean():.3f} ± {scores.std():.3f}")


2025-07-22 14:26:35.485265: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-22 14:26:35.824227: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753165595.993094 2687566 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753165596.044751 2687566 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753165596.363153 2687566 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

real_embeddings_2d shape: (1654, 1152)
generated_embeddings_2d shape: (2435, 1152)


  warn(


In [None]:
import numpy as np
import umap
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import seaborn as sns
from matplotlib import gridspec
import random
import torch
import os
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed to every seeding call (default 42).

    Besides seeding, the value is written to ``os.environ['PYTHONHASHSEED']``
    and cuDNN is switched to deterministic mode with benchmarking disabled.
    Returns ``None``.
    """
    # Same order as always: stdlib, NumPy, torch, then the torch CUDA seeders.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    os.environ['PYTHONHASHSEED'] = str(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)  # any other integer can be used instead

# Load the real and the generated embeddings (expected to be 3-D per the original notes).
real_embeddings = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_all_embedding.npy")
generated_embeddings = np.load("/exp_data/sjx/star/gan_data/fake_negative_embeddings.npy")

# Average over axis 1 so that each sample is reduced to a single vector.
real_embeddings_2d = real_embeddings.mean(axis=1)
generated_embeddings_2d = generated_embeddings.mean(axis=1)

# Stack both sets and build the labels: 0 = real, 1 = generated.
all_embeddings = np.vstack([real_embeddings_2d, generated_embeddings_2d])
labels = np.array([0]*len(real_embeddings_2d) + [1]*len(generated_embeddings_2d))

# Project the stacked vectors to two dimensions with UMAP.
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
embedding_2d = reducer.fit_transform(all_embeddings)

# Split the projected coordinates by class.
umap1_real = embedding_2d[labels == 0, 0]
umap2_real = embedding_2d[labels == 0, 1]
umap1_gen = embedding_2d[labels == 1, 0]
umap2_gen = embedding_2d[labels == 1, 1]

# Canvas: a 2x2 grid with the scatter at the bottom-left, the UMAP-1 density
# above it and the UMAP-2 density to its right (the top-right cell stays empty).
fig = plt.figure(figsize=(8, 8))
gs = gridspec.GridSpec(2, 2, width_ratios=[4, 1], height_ratios=[1, 4],
                       wspace=0.05, hspace=0.05)

ax_main = plt.subplot(gs[1, 0])
ax_xkde = plt.subplot(gs[0, 0], sharex=ax_main)
ax_ykde = plt.subplot(gs[1, 1], sharey=ax_main)

# Main panel: scatter of both classes.
ax_main.scatter(umap1_real, umap2_real, s=30, alpha=0.7, label="Real", c="#4F8FC6", edgecolors='none')
ax_main.scatter(umap1_gen, umap2_gen, s=30, alpha=0.7, label="Generated", c="#F29E4C", edgecolors='none')
ax_main.legend(frameon=False, fontsize=14)
ax_main.set_xlabel("UMAP 1", fontsize=14)
ax_main.set_ylabel("UMAP 2", fontsize=14)
ax_main.tick_params(axis='both', which='major', labelsize=12)

# Top marginal: density of the UMAP-1 coordinate, data assigned explicitly to x.
sns.kdeplot(x=umap1_real, ax=ax_xkde, color="#4F8FC6", fill=True, alpha=0.5, linewidth=2, label="Real")
sns.kdeplot(x=umap1_gen, ax=ax_xkde, color="#F29E4C", fill=True, alpha=0.5, linewidth=2, label="Generated")
ax_xkde.axis('off')

# Right marginal: density of the UMAP-2 coordinate. The data is assigned to `y`
# instead of passing the deprecated `vertical=True`, which seaborn warns about
# and has scheduled for removal; the resulting plot is the same.
sns.kdeplot(y=umap2_real, ax=ax_ykde, color="#4F8FC6", fill=True, alpha=0.5, linewidth=2, label="Real")
sns.kdeplot(y=umap2_gen, ax=ax_ykde, color="#F29E4C", fill=True, alpha=0.5, linewidth=2, label="Generated")
ax_ykde.axis('off')

plt.suptitle("UMAP Visualization with KDE Marginals", fontsize=16, fontweight='bold')
# Leave the top 3% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.savefig('/exp_data/sjx/star/main_transformer_moe_weight/moe_analysis/gan_umap.pdf',format='pdf')
plt.show()

In [None]:
import numpy as np
import umap
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Load the real and the generated embeddings (expected to be 3-D per the original notes).
real_embeddings = np.load("/exp_data/sjx/star/first_data/ESM-embedding/negative_all_embedding.npy")
generated_embeddings = np.load("/exp_data/sjx/star/gan_data/260_e_fake_negative_embeddings.npy")

# Average over axis 1 so that each sample is reduced to a single vector.
real_embeddings_2d = np.mean(real_embeddings, axis=1)
generated_embeddings_2d = np.mean(generated_embeddings, axis=1)

print(f"real_embeddings_2d shape: {real_embeddings_2d.shape}")
print(f"generated_embeddings_2d shape: {generated_embeddings_2d.shape}")

# Stack both sets and build the labels: 0 = real, 1 = generated.
n_real, n_generated = len(real_embeddings_2d), len(generated_embeddings_2d)
all_embeddings = np.vstack([real_embeddings_2d, generated_embeddings_2d])
labels = np.array([0] * n_real + [1] * n_generated)

# Project the stacked vectors to two dimensions with UMAP.
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean', random_state=42)
embedding_2d = reducer.fit_transform(all_embeddings)

# One scatter layer per class, drawn in label order so the default colour
# cycle assigns the same colours as before.
plt.figure(figsize=(8,6))
for class_id, class_name in ((0, "Real"), (1, "Generated")):
    class_mask = labels == class_id
    plt.scatter(
        embedding_2d[class_mask, 0], embedding_2d[class_mask, 1],
        s=10, alpha=0.6, label=class_name,
    )
plt.legend()
plt.title("UMAP Visualization of Real vs Generated Protein Embeddings")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

# 5-fold cross-validated logistic regression separating real from generated vectors.
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, all_embeddings, labels, cv=5)
print(f"Classification accuracy to distinguish real vs generated: {scores.mean():.3f} ± {scores.std():.3f}")